From a17bfc05cc127dd8d3922d7b79b7ff4754893d49 Mon Sep 17 00:00:00 2001
From: Mark Shields <87091372+mbs-octoml@users.noreply.github.com>
Date: Tue, 28 Jun 2022 16:05:18 -0700
Subject: [PATCH 001/111] [Relay] CaptureIndexInSpans debugging pass (#11926)

* [Relay] CaptureIndexInSpans debugging pass

This pass will update (most) expression nodes to capture their post-dfs
indexes. That makes it easy to connect pretty-printed fragments back to the
overall model, and is very handy for Collage, which uses post-dfs indexes
extensively.

* - rename
  - add header decl
---
 include/tvm/relay/transform.h              |  11 ++
 python/tvm/relay/transform/transform.py    |  19 +++
 .../capture_postdfsindex_in_spans.cc       | 134 ++++++++++++++++++
 .../test_capture_postdfsindex_in_spans.py  |  91 ++++++++++++
 4 files changed, 255 insertions(+)
 create mode 100644 src/relay/transforms/capture_postdfsindex_in_spans.cc
 create mode 100644 tests/python/relay/transform/test_capture_postdfsindex_in_spans.py

diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h
index 1fef02557e098..042ad1ef02da9 100644
--- a/include/tvm/relay/transform.h
+++ b/include/tvm/relay/transform.h
@@ -569,6 +569,17 @@ TVM_DLL Pass FlattenAtrousConv();
  */
 TVM_DLL Pass AnnotateUsedMemory();
 
+/*!
+ * \brief Captures the post-dfs index and dominator post-dfs index of (most) expression nodes in
+ * their span, in the form "index:<post-dfs index>:<dominator post-dfs index>". This is useful for
+ * debugging since a) it helps identify pretty-printed sub-expressions within the overall model
+ * and b) the indexes are heavily used by Collage for its compact representation of sub-graphs.
+ *
+ * Note that Op and Constructor nodes are not changed even though they are assigned a
+ * post-dfs index.
+ */
+TVM_DLL Pass CapturePostDfsIndexInSpans();
+
 }  // namespace transform
 
 /*!
diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py
index 979664f72ca39..c931289d40c60 100644
--- a/python/tvm/relay/transform/transform.py
+++ b/python/tvm/relay/transform/transform.py
@@ -1422,6 +1422,25 @@ def MarkCompilerFunctionsAsExtern(compiler_filter=""):
     return _ffi_api.MarkCompilerFunctionsAsExtern(compiler_filter)
 
 
+def CapturePostDfsIndexInSpans():
+    """Captures the post-dfs index and dominator post-dfs index of (most) expression nodes in
+    their span, in the form "index:<post-dfs index>:<dominator post-dfs index>".
+
+    This is useful for debugging since a) it helps identify pretty-printed sub-expressions within
+    the overall model and b) the indexes are heavily used by Collage for its compact
+    representation of sub-graphs.
+
+    Note that Op and Constructor nodes are not changed even though they are assigned a
+    post-dfs index.
+
+    Returns
+    -------
+    ret : tvm.transform.Pass
+        The pass.
+    """
+    return _ffi_api.CapturePostDfsIndexInSpans()
+
+
 def InlineCompilerFunctionsBoundTo(global_vars):
     """Inlines all global functions bound to a global var in global_vars.
 
diff --git a/src/relay/transforms/capture_postdfsindex_in_spans.cc b/src/relay/transforms/capture_postdfsindex_in_spans.cc
new file mode 100644
index 0000000000000..17c7e59c7f605
--- /dev/null
+++ b/src/relay/transforms/capture_postdfsindex_in_spans.cc
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/transforms/capture_postdfsindex_in_spans.cc
+ * \brief A pass to set spans to capture the post-dfs index of every node.
+ */
+
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/transform.h>
+
+#include "../ir/indexed_graph.h"
+
+namespace tvm {
+namespace relay {
+namespace transform {
+
+namespace {
+
+/*! \brief Update all the spans to capture their post-dfs index. */
+class SpansRewriter : public ExprRewriter {
+ public:
+  explicit SpansRewriter(const IndexedGraph<Expr>* indexed_graph)
+      : source_name_(SourceName::Get("index")), indexed_graph_(indexed_graph) {}
+
+ private:
+  Expr Rewrite_(const VarNode* var_node, const Expr& post) final {
+    return WithFields(Downcast<Var>(post), {}, {}, {}, MakeSpan(GetRef<Var>(var_node)));
+  }
+
+  Expr Rewrite_(const GlobalVarNode* global_var_node, const Expr& post) final {
+    return WithFields(Downcast<GlobalVar>(post), {}, {}, {},
+                      MakeSpan(GetRef<GlobalVar>(global_var_node)));
+  }
+
+  Expr Rewrite_(const ConstantNode* constant_node, const Expr& post) final {
+    return WithFields(Downcast<Constant>(post), {}, {},
+                      MakeSpan(GetRef<Constant>(constant_node)));
+  }
+
+  Expr Rewrite_(const TupleNode* tuple_node, const Expr& post) final {
+    return WithFields(Downcast<Tuple>(post), {}, {}, MakeSpan(GetRef<Tuple>(tuple_node)));
+  }
+
+  Expr Rewrite_(const FunctionNode* function_node, const Expr& post) final {
+    return WithFields(Downcast<Function>(post), {}, {}, {}, {}, {}, {},
+                      MakeSpan(GetRef<Function>(function_node)));
+  }
+
+  Expr Rewrite_(const CallNode* call_node, const Expr& post) final {
+    return WithFields(Downcast<Call>(post), {}, {}, {}, {}, {},
+                      MakeSpan(GetRef<Call>(call_node)));
+  }
+
+  Expr Rewrite_(const LetNode* let_node, const Expr& post) final {
+    return WithFields(Downcast<Let>(post), {}, {}, {}, {}, MakeSpan(GetRef<Let>(let_node)));
+  }
+
+  Expr Rewrite_(const IfNode* if_node, const Expr& post) final {
+    return WithFields(Downcast<If>(post), {}, {}, {}, {}, MakeSpan(GetRef<If>(if_node)));
+  }
+
+  // OpNodes are not rewritten.
+
+  Expr Rewrite_(const TupleGetItemNode* tuple_get_item_node, const Expr& post) final {
+    return WithFields(Downcast<TupleGetItem>(post), {}, {}, {},
+                      MakeSpan(GetRef<TupleGetItem>(tuple_get_item_node)));
+  }
+
+  Expr Rewrite_(const RefCreateNode* ref_create_node, const Expr& post) final {
+    return WithFields(Downcast<RefCreate>(post), {}, {},
+                      MakeSpan(GetRef<RefCreate>(ref_create_node)));
+  }
+
+  Expr Rewrite_(const RefReadNode* ref_read_node, const Expr& post) final {
+    return WithFields(Downcast<RefRead>(post), {}, {},
+                      MakeSpan(GetRef<RefRead>(ref_read_node)));
+  }
+
+  Expr Rewrite_(const RefWriteNode* ref_write_node, const Expr& post) final {
+    return WithFields(Downcast<RefWrite>(post), {}, {}, {},
+                      MakeSpan(GetRef<RefWrite>(ref_write_node)));
+  }
+
+  // ConstructorNodes are not rewritten.
+
+  Expr Rewrite_(const MatchNode* match_node, const Expr& post) final {
+    return WithFields(Downcast<Match>(post), {}, {}, {}, MakeSpan(GetRef<Match>(match_node)));
+  }
+
+  Span MakeSpan(const Expr& expr) {
+    auto node = indexed_graph_->item_to_node(expr);
+    int node_index = static_cast<int>(node->index_);
+    int dominator_index =
+        node->dominator_parent_ ? static_cast<int>(node->dominator_parent_->index_) : -1;
+    Span span(source_name_, /*line=*/node_index, /*end_line=*/node_index,
+              /*column=*/dominator_index, /*end_column=*/dominator_index);
+    return span;
+  }
+
+  SourceName source_name_;
+  const IndexedGraph<Expr>* indexed_graph_;
+};
+
+}  // namespace
+
+tvm::transform::Pass CapturePostDfsIndexInSpans() {
+  auto pass_func = [](Function f, IRModule m, transform::PassContext ctxt) {
+    std::unique_ptr<IndexedGraph<Expr>> indexed_graph = CreateIndexedGraph(f);
+    SpansRewriter rewriter(indexed_graph.get());
+    return Downcast<Function>(PostOrderRewrite(f, &rewriter));
+  };
+  return CreateFunctionPass(pass_func, 0, "CapturePostDfsIndexInSpans", {});
+}
+
+TVM_REGISTER_GLOBAL("relay._transform.CapturePostDfsIndexInSpans")
+    .set_body_typed(CapturePostDfsIndexInSpans);
+
+}  // namespace transform
+}  // namespace relay
+}  // namespace tvm
diff --git a/tests/python/relay/transform/test_capture_postdfsindex_in_spans.py b/tests/python/relay/transform/test_capture_postdfsindex_in_spans.py
new file mode 100644
index 0000000000000..16a7bd447992b
--- /dev/null
+++ b/tests/python/relay/transform/test_capture_postdfsindex_in_spans.py
@@ -0,0 +1,91 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.
See the License for the +# specific language governing permissions and limitations +# under the License +"""Unit tests for the CapturePostDfsIndexInSpans debugging pass.""" + +import tvm +import tvm.testing +import numpy as np + + +def make_const(dtype, shape): + return tvm.relay.const(np.random.rand(*shape).astype(dtype)) + + +def make_consts(dtype, shapes): + return [make_const(dtype, shape) for shape in shapes] + + +metatable = { + "relay.Constant": make_consts( + "float16", + [ + (2304, 768), # 0 + (2304,), # 1 + (600, 32, 64), # 2 + ], + ) +} + + +def input_mod(): + return tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%x0 : Tensor[(1600, 768), float16], %x3 : Tensor[(600, 32, 64), float16]) -> (Tensor[(1600, 2304), float16], Tensor[(600, 32, 32), float16]) { + %0 = nn.dense(%x0, meta[relay.Constant][0], units=2304); + %1 = add(%0, meta[relay.Constant][1]); + %2 = fn(%y_3_i0: Tensor[(600, 32, 64), float16], %y_3_i1: Tensor[(600, 32, 64), float16], + Inline=1, Compiler="cublas", global_symbol="tvmgen_default_cublas_main_3", Primitive=1) -> Tensor[(600, 32, 32), float16] { + %6 = fn (%FunctionVar_0_01: Tensor[(600, 32, 64), float16], %FunctionVar_0_11: Tensor[(600, 32, 64), float16], + PartitionedFromPattern="nn.batch_matmul_", Composite="cublas.batch_matmul") -> Tensor[(600, 32, 32), float16] { + nn.batch_matmul(%FunctionVar_0_01, %FunctionVar_0_11, out_dtype="float16", transpose_b=True) + }; + %6(%y_3_i0, %y_3_i1) + }; + %3 = %2(%x3, meta[relay.Constant][2]); + (%1, %3) + } + """, + "from_string", + None, + metatable, + ) + + +expected_pretty_printed_output_mod = r"""def @main(%x0: Tensor[(1600, 768), float16] /* ty=Tensor[(1600, 768), float16] span=index:0:5 */, %x3: Tensor[(600, 32, 64), float16] /* ty=Tensor[(600, 32, 64), float16] span=index:1:18 */) -> (Tensor[(1600, 2304), float16], Tensor[(600, 32, 32), float16]) { + %0 = nn.dense(%x0, meta[relay.Constant][0] /* ty=Tensor[(2304, 768), float16] span=index:4:5 */, units=2304) /* ty=Tensor[(1600, 2304), float16] span=index:5:7 */; + %2 = fn (%y_3_i0: Tensor[(600, 32, 64), float16] /* ty=Tensor[(600, 32, 64), float16] span=index:8:15 */, %y_3_i1: Tensor[(600, 32, 64), float16] /* ty=Tensor[(600, 32, 64), float16] span=index:9:15 */, Inline=1, Compiler="cublas", global_symbol="tvmgen_default_cublas_main_3", Primitive=1) -> Tensor[(600, 32, 32), float16] { + %1 = fn (%FunctionVar_0_01: Tensor[(600, 32, 64), float16] /* ty=Tensor[(600, 32, 64), float16] span=index:10:13 */, %FunctionVar_0_11: Tensor[(600, 32, 64), float16] /* ty=Tensor[(600, 32, 64), float16] span=index:11:13 */, PartitionedFromPattern="nn.batch_matmul_", Composite="cublas.batch_matmul") -> Tensor[(600, 32, 32), float16] { + nn.batch_matmul(%FunctionVar_0_01, %FunctionVar_0_11, out_dtype="float16", transpose_b=True) /* ty=Tensor[(600, 32, 32), float16] span=index:13:14 */ + } /* ty=fn (Tensor[(600, 32, 64), float16], Tensor[(600, 32, 64), float16]) -> Tensor[(600, 32, 32), float16] span=index:14:15 */; + %1(%y_3_i0, %y_3_i1) /* ty=Tensor[(600, 32, 32), float16] span=index:15:16 */ + } /* ty=fn (Tensor[(600, 32, 64), float16], Tensor[(600, 32, 64), float16]) -> Tensor[(600, 32, 32), float16] span=index:16:18 */; + %3 = add(%0, meta[relay.Constant][1] /* ty=Tensor[(2304), float16] span=index:6:7 */) /* ty=Tensor[(1600, 2304), float16] span=index:7:19 */; + %4 = %2(%x3, meta[relay.Constant][2] /* ty=Tensor[(600, 32, 64), float16] span=index:17:18 */) /* ty=Tensor[(600, 32, 32), float16] span=index:18:19 */; + (%3, %4) /* ty=(Tensor[(1600, 2304), float16], 
Tensor[(600, 32, 32), float16]) span=index:19:20 */ +} + +""" + + +def test_capture_index_in_spans(): + output_mod = str(tvm.relay.transform.CapturePostDfsIndexInSpans()(input_mod())) + assert output_mod == expected_pretty_printed_output_mod + + +if __name__ == "__main__": + tvm.testing.main() From 892ab131129da96e142bcdee48e41757560e686b Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Tue, 28 Jun 2022 16:05:36 -0700 Subject: [PATCH 002/111] Move jenkins/ dir into ci/jenkins and spread docs around. (#11927) --- Jenkinsfile | 2 +- ci/README.md | 97 ++++++++++++++ ci/jenkins/.gitignore | 1 + {jenkins => ci/jenkins}/Build.groovy.j2 | 0 {jenkins => ci/jenkins}/Deploy.groovy.j2 | 0 {jenkins => ci/jenkins}/DockerBuild.groovy.j2 | 0 {jenkins => ci/jenkins}/Jenkinsfile.j2 | 14 +-- {jenkins => ci/jenkins}/Lint.groovy.j2 | 0 ci/jenkins/Makefile | 27 ++++ {jenkins => ci/jenkins}/Prepare.groovy.j2 | 0 {jenkins => ci/jenkins}/README.md | 117 +++-------------- {jenkins => ci/jenkins}/Test.groovy.j2 | 0 {jenkins => ci/jenkins}/generate.py | 8 +- {jenkins => ci/jenkins}/macros.j2 | 0 {jenkins => ci/jenkins}/requirements.txt | 0 docs/contribute/ci.rst | 119 ++++++++++++++++-- docs/contribute/code_guide.rst | 17 +++ tests/scripts/open_docker_update_pr.py | 4 +- tests/scripts/task_lint.sh | 3 +- 19 files changed, 288 insertions(+), 121 deletions(-) create mode 100644 ci/README.md create mode 100644 ci/jenkins/.gitignore rename {jenkins => ci/jenkins}/Build.groovy.j2 (100%) rename {jenkins => ci/jenkins}/Deploy.groovy.j2 (100%) rename {jenkins => ci/jenkins}/DockerBuild.groovy.j2 (100%) rename {jenkins => ci/jenkins}/Jenkinsfile.j2 (93%) rename {jenkins => ci/jenkins}/Lint.groovy.j2 (100%) create mode 100644 ci/jenkins/Makefile rename {jenkins => ci/jenkins}/Prepare.groovy.j2 (100%) rename {jenkins => ci/jenkins}/README.md (62%) rename {jenkins => ci/jenkins}/Test.groovy.j2 (100%) rename {jenkins => ci/jenkins}/generate.py (96%) rename {jenkins => ci/jenkins}/macros.j2 (100%) rename {jenkins => ci/jenkins}/requirements.txt (100%) diff --git a/Jenkinsfile b/Jenkinsfile index 3f82ff184013d..07c7f0c44aa19 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,7 +45,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-06-22T10:07:00.173803 +// Generated at 2022-06-27T17:30:37.779354 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. --> diff --git a/ci/README.md b/ci/README.md new file mode 100644 index 0000000000000..a5cb39016b135 --- /dev/null +++ b/ci/README.md @@ -0,0 +1,97 @@ + + + + + + + + + + + + + + + + + +# Apache TVM Continuous Integration (CI) + +## Overview + +TVM's Continuous Integration is responsible for verifying the code in `apache/tvm` and testing PRs +before they merge to inform TVM contributors and committers. These jobs are essential to keeping the +TVM project in a healthy state and preventing breakages. CI in TVM is broken into these pieces: + - Lint scripts in [`tests/lint`](../tests/lint). + - The tests themselves, all of which live underneath [`tests`](../tests). + - Definitions of test suites, with each suite defined as a separate `task_` script in + [`tests/scripts`](../tests/scripts). + - The linux test sequence (in [`Jenkinsfile`](../Jenkinsfile)), which lints and builds TVM and runs test + suites using Docker on Linux. 
+ - The Windows and Mac test sequences (in [`.github/actions`](../.github/actions)). + - GitHub Actions that support the code review process (in [`.github/actions`](../.github/actions)). + - Tools to reproduce the CI locally (in `tests/scripts`). + - Infrastructure-as-Code that configures the cloud services that provide Jenkins for the TVM CI (in the + [`tlc-pack/ci`](https://github.com/tlc-pack/ci) repo). + +## CI Documentation Index + +The CI documentation belongs with the implementation it describes. To make that concrete, the +documentation is split like so: +1. An overview of the CI is in this file. +1. User-facing documentation lives in `apache/tvm`'s `docs/contribute` sub-directory and is served on the + [TVM docs site](https://tvm.apache.org/docs/contribute/ci.html). +2. Documentation of the tools that run TVM's various regression tests locally and the test suites + are in this sub-directory. +3. Documentation of the cloud services and their configuration lives in the + [`tlc-pack/ci`](https://github.com/tlc-pack/ci) repo. + +## Jenkins + +Jenkins runs all of the linux-based TVM CI-enabled regression tests. This includes tests against accelerated hardware such as GPUs. It excludes those regression tests that run against hardware not available in the cloud (those tests aren't currently exercised in TVM CI). The tests run by Jenkins represent most of the merge-blocking tests (and passing Jenkins should mostly correlate with passing the remaining Windows/Mac builds). + +## GitHub Actions + +GitHub Actions is used to run Windows jobs, MacOS jobs, and various on-GitHub automations. These are defined in [`.github/workflows`](../.github/workflows/). These automations include bots to: +* [cc people based on subscribed teams/topics](https://github.com/apache/tvm/issues/10317) +* [allow non-committers to merge approved / CI passing PRs](https://discuss.tvm.apache.org/t/rfc-allow-merging-via-pr-comments/12220) +* [add cc-ed people as reviewers on GitHub](https://discuss.tvm.apache.org/t/rfc-remove-codeowners/12095) +* [ping languishing PRs after no activity for a week (currently opt-in only)](https://github.com/apache/tvm/issues/9983) +* [push a `last-successful` branch to GitHub with the last `main` commit that passed CI](https://github.com/apache/tvm/tree/last-successful) + +https://github.com/apache/tvm/actions has the logs for each of these workflows. Note that when debugging these workflows changes from PRs from forked repositories won't be reflected in the PR. These should be tested in the forked repository first and linked in the PR body. + +## Docker Images + +Each CI job runs most of its work inside a Docker container, built from files +in the [`docker/`](../docker) folder. These +files are built nightly in Jenkins via the [docker-images-ci](https://ci.tlcpack.ai/job/docker-images-ci/>) job. +The images for these containers are hosted in the [tlcpack Docker Hub](https://hub.docker.com/u/tlcpack>) +and referenced in the [`Jenkinsfile.j2`](Jenkinsfile.j2). These can be inspected and run +locally via standard Docker commands. + +### `ci-docker-staging` + +The [ci-docker-staging](https://github.com/apache/tvm/tree/ci-docker-staging>) +branch is used to test updates to Docker images and `Jenkinsfile` changes. When +running a build for a normal PR from a forked repository, Jenkins uses the code +from the PR except for the `Jenkinsfile` itself, which comes from the base branch. 
+When branches are built, the `Jenkinsfile` in the branch is used, so a committer +with write access must push PRs to a branch in apache/tvm to properly test +`Jenkinsfile` changes. If your PR makes changes to the `Jenkinsfile`, make sure +to @ a [committer](../CONTRIBUTORS.md>) +and ask them to push your PR as a branch to test the changes. + +# Jenkins CI + +TVM uses Jenkins for running Linux continuous integration (CI) tests on +[branches](https://ci.tlcpack.ai/job/tvm/) and +[pull requests](https://ci.tlcpack.ai/job/tvm/view/change-requests/) through a +build configuration specified in a [`Jenkinsfile`](../Jenkinsfile). +Other jobs run in GitHub Actions for Windows and MacOS jobs. + +## `Jenkinsfile` + +The template files in this directory are used to generate the [`Jenkinsfile`](../Jenkinsfile) used by Jenkins to run CI jobs for each commit to PRs and branches. + +To regenerate the `Jenkinsfile`, run `make` in the `ci/jenkins` dir. diff --git a/ci/jenkins/.gitignore b/ci/jenkins/.gitignore new file mode 100644 index 0000000000000..187a72392cc8c --- /dev/null +++ b/ci/jenkins/.gitignore @@ -0,0 +1 @@ +/_venv \ No newline at end of file diff --git a/jenkins/Build.groovy.j2 b/ci/jenkins/Build.groovy.j2 similarity index 100% rename from jenkins/Build.groovy.j2 rename to ci/jenkins/Build.groovy.j2 diff --git a/jenkins/Deploy.groovy.j2 b/ci/jenkins/Deploy.groovy.j2 similarity index 100% rename from jenkins/Deploy.groovy.j2 rename to ci/jenkins/Deploy.groovy.j2 diff --git a/jenkins/DockerBuild.groovy.j2 b/ci/jenkins/DockerBuild.groovy.j2 similarity index 100% rename from jenkins/DockerBuild.groovy.j2 rename to ci/jenkins/DockerBuild.groovy.j2 diff --git a/jenkins/Jenkinsfile.j2 b/ci/jenkins/Jenkinsfile.j2 similarity index 93% rename from jenkins/Jenkinsfile.j2 rename to ci/jenkins/Jenkinsfile.j2 index 0a83549da1477..6f2f6a437044d 100644 --- a/jenkins/Jenkinsfile.j2 +++ b/ci/jenkins/Jenkinsfile.j2 @@ -48,7 +48,7 @@ // Generated at {{ generated_time }} import org.jenkinsci.plugins.pipeline.modeldefinition.Utils -{% import 'jenkins/macros.j2' as m with context -%} +{% import 'ci/jenkins/macros.j2' as m with context -%} // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. 
--> ci_lint = 'tlcpack/ci-lint:20220513-055910-fa834f67e' @@ -106,12 +106,12 @@ s3_prefix = "tvm-jenkins-artifacts-prod/tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBE // General note: Jenkins has limits on the size of a method (or top level code) // that are pretty strict, so most usage of groovy methods in these templates // are purely to satisfy the JVM -{% include "jenkins/Prepare.groovy.j2" %} -{% include "jenkins/DockerBuild.groovy.j2" %} -{% include "jenkins/Lint.groovy.j2" %} -{% include "jenkins/Build.groovy.j2" %} -{% include "jenkins/Test.groovy.j2" %} -{% include "jenkins/Deploy.groovy.j2" %} +{% include "ci/jenkins/Prepare.groovy.j2" %} +{% include "ci/jenkins/DockerBuild.groovy.j2" %} +{% include "ci/jenkins/Lint.groovy.j2" %} +{% include "ci/jenkins/Build.groovy.j2" %} +{% include "ci/jenkins/Test.groovy.j2" %} +{% include "ci/jenkins/Deploy.groovy.j2" %} cancel_previous_build() diff --git a/jenkins/Lint.groovy.j2 b/ci/jenkins/Lint.groovy.j2 similarity index 100% rename from jenkins/Lint.groovy.j2 rename to ci/jenkins/Lint.groovy.j2 diff --git a/ci/jenkins/Makefile b/ci/jenkins/Makefile new file mode 100644 index 0000000000000..5c9e0ac540578 --- /dev/null +++ b/ci/jenkins/Makefile @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +_venv: requirements.txt + rm -rf _venv + python3 -mvenv _venv + _venv/bin/pip3 install -r requirements.txt + +all: _venv + _venv/bin/python3 generate.py + +.PHONY: all venv +.DEFAULT_GOAL=all diff --git a/jenkins/Prepare.groovy.j2 b/ci/jenkins/Prepare.groovy.j2 similarity index 100% rename from jenkins/Prepare.groovy.j2 rename to ci/jenkins/Prepare.groovy.j2 diff --git a/jenkins/README.md b/ci/jenkins/README.md similarity index 62% rename from jenkins/README.md rename to ci/jenkins/README.md index d06672518ac26..d2a29838b6d5c 100644 --- a/jenkins/README.md +++ b/ci/jenkins/README.md @@ -17,7 +17,11 @@ # TVM CI -TVM runs CI jobs on every commit to an open pull request and to branches in the apache/tvm repo (such as `main`). These jobs are essential to keeping the TVM project in a healthy state and preventing breakages. Jenkins does most of the work in running the TVM tests, though some smaller jobs are also run on GitHub Actions. +TVM runs CI jobs on every commit to an open pull request and to branches in the apache/tvm repo (such as `main`). These jobs are essential to keeping the TVM project in a healthy state and preventing breakages. + +## Jenkins + +Jenkins runs all of the linux-based TVM CI-enabled regression tests. This includes tests against accelerated hardware such as GPUs. It excludes those regression tests that run against hardware not available in the cloud (those tests aren't currently exercised in TVM CI). 
The tests run by Jenkins represent most of the merge-blocking tests (and passing Jenkins should mostly correlate with passing the remaining Windows/Mac builds). ## GitHub Actions @@ -33,17 +37,20 @@ https://github.com/apache/tvm/actions has the logs for each of these workflows. ## Keeping CI Green -Developers rely on the TVM CI to get signal on their PRs before merging. -Occasionally breakages slip through and break `main`, which in turn causes -the same error to show up on an PR that is based on the broken commit(s). Broken -commits can be identified [through GitHub](https://github.com/apache/tvm/commits/main>) -via the commit status icon or via [Jenkins](https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/activity?branch=main>). -In these situations it is possible to either revert the offending commit or -submit a forward fix to address the issue. It is up to the committer and commit -author which option to choose, keeping in mind that a broken CI affects all TVM -developers and should be fixed as soon as possible. +Developers rely on the TVM CI to get signal on their PRs before merging. Occasionally breakages +slip through and break `main`, which in turn causes the same error to show up on an unrelated PR +that is based on the broken commit(s). Broken commits can be identified [through +GitHub](https://github.com/apache/tvm/commits/main>) via the commit status icon or via +[Jenkins](https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/activity?branch=main>). In these +situations it is possible to either revert the offending commit or submit a forward fix to address +the issue. It is up to the committer and commit author which option to choose. A broken CI affects +all TVM developers and should be fixed as soon as possible, while a revert may be especially painful +for the author of the offending PR when that PR is large. -Some tests are also flaky and fail for reasons unrelated to the PR. The [CI monitoring rotation](https://github.com/apache/tvm/wiki/CI-Monitoring-Runbook) watches for these failures and disables tests as necessary. It is the responsibility of those who wrote the test to ultimately fix and re-enable the test. +Some tests are also flaky and occasionally fail for reasons unrelated to the PR. The [CI monitoring +rotation](https://github.com/apache/tvm/wiki/CI-Monitoring-Runbook) watches for these failures and +disables tests as necessary. It is the responsibility of those who wrote the test to ultimately fix +and re-enable the test. ## Dealing with Flakiness @@ -85,7 +92,7 @@ a name, hash, and path in S3, using the `workflow_dispatch` event on The sha256 must match the file or it will not be uploaded. The upload path is user-defined so it can be any path (no trailing or leading slashes allowed) but be careful not to collide with existing resources on accident. - + ## Skipping CI For reverts and trivial forward fixes, adding `[skip ci]` to the revert's @@ -153,88 +160,4 @@ _venv/bin/python3 jenkins/generate.py # Infrastructure -Jenkins runs in AWS on an EC2 instance fronted by an ELB which makes it available at https://ci.tlcpack.ai. These definitions are declared via Terraform in the [tlc-pack/ci-terraform](https://github.com/tlc-pack/ci-terraform) repository. The Terraform code references custom AMIs built in [tlc-pack/ci-packer](https://github.com/tlc-pack/ci-packer). [tlc-pack/ci](https://github.com/tlc-pack/ci) contains Ansible scripts to deploy the Jenkins head node and set it up to interact with AWS. 
- -The Jenkins head node has a number of autoscaling groups with labels that are used to run jobs (e.g. `CPU`, `GPU` or `ARM`) via the [EC2 Fleet](https://plugins.jenkins.io/ec2-fleet/) plugin. - -## Deploying - -Deploying Jenkins can disrupt developers so it must be done with care. Jobs that are in-flight will be cancelled and must be manually restarted. Follow the instructions [here](https://github.com/tlc-pack/ci/issues/10) to run a deploy. - -## Monitoring - -Dashboards of CI data can be found: -* within Jenkins at https://ci.tlcpack.ai/monitoring (HTTP / JVM stats) -* at https://monitoring.tlcpack.ai (job status, worker status) - -## CI Diagram - -This details the individual parts that interact in TVM's CI. For details on operations, see https://github.com/tlc-pack/ci. - -```mermaid -graph TD - Commit --> GitHub - GitHub --> |`push` webhook| WebhookServer(Webhook Server) - JobExecutor(Job Executor) - WebhookServer --> JobExecutor - JobExecutor --> EC2Fleet(EC2 Fleet Plugin) - EC2Fleet --> |capacity request| EC2(EC2 Autoscaler) - JobExecutor --> WorkerEC2Instance - Docker --> |build cache, artifacts| S3 - WorkerEC2Instance --> Docker - Docker --> |docker pull| G(Docker Hub) - Docker --> |docker push / pull| ECR - Docker --> |Execute jobs| CIScripts(CI Scripts) - RepoCITerraform(ci-terraform repo) --> |terraform| ECR - RepoCITerraform(ci-terraform repo) --> |terraform| EC2 - RepoCITerraform(ci-terraform repo) --> |terraform| S3 - RepoCI(ci repo) --> |configuration via Ansible| WorkerEC2Instance - RepoCIPacker(ci-packer) --> |AMIs| EC2 - Monitoring_Scrapers(Jenkins Scraper) --> Monitoring_DB(Postrgres) - Grafana --> Monitoring_DB - GitHub --> Windows - GitHub --> MacOS - - Developers --> |check PR status|JenkinsUI(Jenkins Web UI) - Monitoring_Scrapers --> |fetch job data| JenkinsUI - Developers --> |git push| Commit - Developers --> |create PR| GitHub - - subgraph Jenkins Head Node - WebhookServer - JobExecutor - EC2Fleet - JenkinsUI - end - - subgraph GitHub Actions - Windows - MacOS - end - - subgraph Configuration / Terraform - RepoCITerraform - RepoCI - RepoCIPacker - end - - subgraph Monitoring - Monitoring_DB - Grafana - Monitoring_Scrapers - end - - subgraph AWS - subgraph Jenkins Workers - WorkerEC2Instance(Worker EC2 Instance) - subgraph "Worker EC2 Instance" - Docker - CIScripts - end - end - EC2 - ECR - S3 - end - -``` +While all TVM tests are contained within the apache/tvm repository, the infrastructure used to run the tests is donated by the TVM Community. To encourage collaboration, the configuration for TVM's CI infrastructure is stored in a public GitHub repository. TVM community members are encouraged to contribute improvements. The configuration, along with documentation of TVM's CI infrastructure, is in the [tlc-pack/ci](https://github.com/tlc-pack/ci) repo. 
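For reference, the `Jenkinsfile` regeneration that the new `ci/jenkins/Makefile` drives is just a
Jinja2 render of `Jenkinsfile.j2`. The following is a minimal sketch of that step, not the actual
`ci/jenkins/generate.py` (which also passes extra template data and supports the `--check` mode
used by `task_lint.sh`):

```python
# Minimal sketch, assuming it lives in ci/jenkins/: render the Jenkinsfile
# from its Jinja2 template, with the loader rooted at the repo root so the
# "ci/jenkins/..." include paths inside Jenkinsfile.j2 resolve.
import datetime
from pathlib import Path

import jinja2

REPO_ROOT = Path(__file__).resolve().parent.parent.parent  # ci/jenkins/ -> repo root

environment = jinja2.Environment(loader=jinja2.FileSystemLoader(str(REPO_ROOT)))
template = environment.get_template("ci/jenkins/Jenkinsfile.j2")

# The template stamps this value into its "Generated at ..." header line;
# the real script supplies additional context as well.
content = template.render(generated_time=datetime.datetime.now().isoformat())
(REPO_ROOT / "Jenkinsfile").write_text(content)
```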
diff --git a/jenkins/Test.groovy.j2 b/ci/jenkins/Test.groovy.j2
similarity index 100%
rename from jenkins/Test.groovy.j2
rename to ci/jenkins/Test.groovy.j2
diff --git a/jenkins/generate.py b/ci/jenkins/generate.py
similarity index 96%
rename from jenkins/generate.py
rename to ci/jenkins/generate.py
index ba7f165925133..686e44e14dd52 100644
--- a/jenkins/generate.py
+++ b/ci/jenkins/generate.py
@@ -25,8 +25,8 @@
 from pathlib import Path
 
-REPO_ROOT = Path(__file__).resolve().parent.parent
-JENKINSFILE_TEMPLATE = REPO_ROOT / "jenkins" / "Jenkinsfile.j2"
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent
+JENKINSFILE_TEMPLATE = REPO_ROOT / "ci" / "jenkins" / "Jenkinsfile.j2"
 JENKINSFILE = REPO_ROOT / "Jenkinsfile"
 
@@ -111,10 +111,10 @@ def lines_without_generated_tag(content):
             Newly generated Jenkinsfile did not match the one on disk! If you have made
             edits to the Jenkinsfile, move them to 'jenkins/Jenkinsfile.j2' and
             regenerate the Jenkinsfile from the template with
-
+
                 python3 -m pip install -r jenkins/requirements.txt
                 python3 jenkins/generate.py
-
+
             Diffed changes:
         """
     ).strip()
diff --git a/jenkins/macros.j2 b/ci/jenkins/macros.j2
similarity index 100%
rename from jenkins/macros.j2
rename to ci/jenkins/macros.j2
diff --git a/jenkins/requirements.txt b/ci/jenkins/requirements.txt
similarity index 100%
rename from jenkins/requirements.txt
rename to ci/jenkins/requirements.txt
diff --git a/docs/contribute/ci.rst b/docs/contribute/ci.rst
index 0cc1bf9dd992b..9a2876220fc7e 100644
--- a/docs/contribute/ci.rst
+++ b/docs/contribute/ci.rst
@@ -23,14 +23,21 @@ Using TVM's CI
 .. contents::
   :local:
 
-TVM uses Jenkins for running Linux continuous integration (CI) tests on
-`branches <https://ci.tlcpack.ai/job/tvm/>`_ and
+TVM primarily uses Jenkins for running Linux continuous integration (CI) tests on
+`branches <https://ci.tlcpack.ai/job/tvm/>`_ and
 `pull requests <https://ci.tlcpack.ai/job/tvm/view/change-requests/>`_ through a
 build configuration specified in a `Jenkinsfile
 <https://github.com/apache/tvm/blob/main/Jenkinsfile>`_.
-Non-critical jobs run in GitHub Actions for Windows and MacOS jobs.
+Jenkins is the only CI step that is codified to block merging. TVM is also tested minimally
+against Windows and MacOS using GitHub Actions.
+
+This page describes how contributors and committers can use TVM's CI to verify their code. You can
+read more about the design of TVM CI in the `ci README <https://github.com/apache/tvm/blob/main/ci/README.md>`_.
+
+For Contributors
+----------------
 
 A standard CI run looks something like this viewed in `Jenkins' BlueOcean viewer
 <https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/activity>`_.
-CI runs usually take several hours to complete and pull requests (PRs) cannot be merged before CI
+CI runs usually take a couple of hours to complete and pull requests (PRs) cannot be merged before CI
 has successfully completed. To diagnose failing steps, click through to the failing
 pipeline stage then to the failing step to see the output logs.
 
@@ -40,12 +47,12 @@ pipeline stage then to the failing step to see the output logs.
 
 Debugging Failures
-******************
+^^^^^^^^^^^^^^^^^^
 
 When CI fails for some reason, there are several methods to diagnose the issue.
 
 Jenkins Logs
-------------
+""""""""""""
 
 .. |pytest| replace:: ``pytest``
 .. _pytest: https://docs.pytest.org/en/6.2.x/
 
@@ -59,13 +66,109 @@ the failing job to view the logs.
 Note: you may need to scroll up to view the actual failure.
 
 Reproduce Failures
-------------------
+""""""""""""""""""
 
 Most TVM Python tests run under |pytest|_ and can be run as described in :ref:`pr-testing`.
 
 Reporting Issues
-****************
+^^^^^^^^^^^^^^^^
 
 Issues with CI should be `reported on GitHub
 <https://github.com/apache/tvm/issues/new?assignees=&labels=&template=ci-problem.md&title=%5BCI+Problem%5D+>`_
 with a link to the relevant jobs, commits, or PRs.
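As a concrete illustration of the "Reproduce Failures" step above, a single failing test can
usually be re-run locally once its path is known from the failing step's logs. This is only a
sketch; the test file and ``-k`` filter below are arbitrary examples, not taken from a real
failure:

.. code-block:: python

    # Sketch: re-run one test file under pytest, narrowing to a single test.
    # Substitute the path and -k expression from the failing CI step's logs.
    import pytest

    pytest.main(["tests/python/relay/test_op_level1.py", "-v", "-k", "test_add"])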
+
+
+For Maintainers
+---------------
+
+This section discusses processes run by TVM maintainers.
+
+
+Procedures for Keeping CI Green
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This section describes common procedures used to keep CI passing.
+
+Broken CI due to Simultaneous Merge
+"""""""""""""""""""""""""""""""""""
+
+Developers rely on the TVM CI to get signal on their PRs before merging. Occasionally, two
+different PRs can pass CI individually but break ``main`` when both land. This in turn causes an
+error to show up on an unrelated PR that is based on the broken commit(s). Broken commits can be
+identified `through GitHub <https://github.com/apache/tvm/commits/main>`_ via the commit status
+icon or via `Jenkins <https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/activity?branch=main>`_.
+
+In these situations it is ultimately the responsibility of the TVM committer who merged the PR to
+fix CI (others are encouraged to help). Typical responses to this situation are:
+
+1. Revert the offending commit.
+2. Submit a forward fix to address the issue.
+
+It is up to the committer and commit author which option to choose. A broken CI affects all TVM
+developers and should be fixed as soon as possible, while a revert may be especially painful for
+the author of the offending PR when that PR is large.
+
+
+Dealing with Flakiness
+^^^^^^^^^^^^^^^^^^^^^^
+
+If you notice a failure on your PR that seems unrelated to your change, you should
+search `recent GitHub issues related to flaky tests
+<https://github.com/apache/tvm/issues?q=is%3Aissue+%5BCI+Problem%5D+Flaky+>`_ and
+`file a new issue
+<https://github.com/apache/tvm/issues/new?assignees=&labels=&template=ci-problem.md&title=%5BCI+Problem%5D+>`_
+if you don't see any reports of the failure. If a certain test or class of tests affects
+several PRs or commits on ``main`` with flaky failures, the test should be disabled via
+`pytest's @xfail decorator
+<https://docs.pytest.org/en/6.2.x/skipping.html#xfail-mark-test-functions-as-expected-to-fail>`_
+with `strict=False <https://docs.pytest.org/en/6.2.x/skipping.html#strict-parameter>`_ and the
+relevant issue linked in the disabling PR.
+
+.. code-block:: python
+
+   @pytest.mark.xfail(strict=False, reason="Flaky test: https://github.com/apache/tvm/issues/1234")
+   def test_something_flaky():
+       pass
+
+Then submit a PR as usual:
+
+.. code-block:: bash
+
+   git add <test file>
+   git commit -m'[skip ci][ci] Disable flaky test: <test name>
+
+   See #<issue number>
+   '
+   gh pr create
+
+
+Skipping CI
+^^^^^^^^^^^
+
+For reverts and trivial forward fixes, adding ``[skip ci]`` to the revert's
+PR title will cause CI to shortcut and only run lint. Committers should
+take care that they only merge CI-skipped PRs to fix a failure on ``main`` and
+not in cases where the submitter wants to shortcut CI to merge a change faster.
+The PR title is checked when the build is first run (specifically during the lint
+step, so changes after that step has run do not affect CI and will require the job to
+be re-triggered by another ``git push``).
+
+.. code-block:: bash
+
+   # Revert HEAD commit, make sure to insert '[skip ci]' at the beginning of
+   # the commit subject
+   git revert HEAD
+   git checkout -b my_fix
+   # After you have pushed your branch, create a PR as usual.
+   git push my_repo
+   # Example: Skip CI on a branch with an existing PR
+   # Adding this commit to an existing branch will cause a new CI run where
+   # Jenkins is skipped
+   git commit --allow-empty --message "[skip ci] Trigger skipped CI"
+   git push my_repo
+
+
+CI Monitoring Rotation
+^^^^^^^^^^^^^^^^^^^^^^
+
+Some tests are flaky and occasionally fail for reasons unrelated to the PR. The
+`CI monitoring rotation <https://github.com/apache/tvm/wiki/CI-Monitoring-Runbook>`_ watches for
+these failures and disables tests as necessary. It is the responsibility of those who wrote the
+test to ultimately fix and re-enable the test.
diff --git a/docs/contribute/code_guide.rst b/docs/contribute/code_guide.rst
index 3849b795f667f..d404ba63794c4 100644
--- a/docs/contribute/code_guide.rst
+++ b/docs/contribute/code_guide.rst
@@ -139,6 +139,23 @@
 If you want your test to run over a variety of targets, use the :py:func:`tvm.testing.parametrize_targets` decorator. It
 will run ``test_mytest`` with ``target="llvm"``, ``target="cuda"``, and a few others. This also ensures that your test is run on the
 correct hardware by the CI. If you only want to test against a couple targets use ``@tvm.testing.parametrize_targets("target_1", "target_2")``. If you want to test on a single target, use the associated decorator from :py:func:`tvm.testing`. For example, CUDA tests use the ``@tvm.testing.requires_cuda`` decorator.
+
+Network Resources
+-----------------
+
+In CI, downloading files from the Internet is a big source of flaky test failures (e.g. a remote
+server can go down or be slow), so try to avoid using the network at all during tests. In some
+cases this isn't a reasonable proposition (e.g. the docs tutorials, which need to download models).
+
+In these cases you can re-host files in S3 for fast access in CI. A committer can upload a file,
+specified by a name, hash, and path in S3, using the ``workflow_dispatch`` event on `the
+upload_ci_resource.yml GitHub Actions workflow
+<https://github.com/apache/tvm/actions/workflows/upload_ci_resource.yml>`_. The sha256 must match
+the file or it will not be uploaded. The upload path is user-defined so it can be any path (no
+trailing or leading slashes allowed) but be careful not to collide with existing resources by
+accident.
+
+
 Handle Integer Constant Expression
 ----------------------------------
 We often need to handle constant integer expressions in TVM. Before we do so, the first question to ask is whether it is really necessary to get a constant integer. If a symbolic expression also works and lets the logic flow, we should use the symbolic expression as much as possible, so that the generated code works for shapes that are not known ahead of time.
diff --git a/tests/scripts/open_docker_update_pr.py b/tests/scripts/open_docker_update_pr.py
index 2f85a50461028..f583f00d5cbbe 100755
--- a/tests/scripts/open_docker_update_pr.py
+++ b/tests/scripts/open_docker_update_pr.py
@@ -28,9 +28,9 @@
 from cmd_utils import REPO_ROOT, init_log, Sh
 from should_rebuild_docker import docker_api
 
-JENKINSFILE = REPO_ROOT / "jenkins" / "Jenkinsfile.j2"
+JENKINSFILE = REPO_ROOT / "ci" / "jenkins" / "Jenkinsfile.j2"
 GENERATED_JENKINSFILE = REPO_ROOT / "Jenkinsfile"
-GENERATE_SCRIPT = REPO_ROOT / "jenkins" / "generate.py"
+GENERATE_SCRIPT = REPO_ROOT / "ci" / "jenkins" / "generate.py"
 
 GITHUB_TOKEN = os.environ["GITHUB_TOKEN"]
 BRANCH = "nightly-docker-update"
diff --git a/tests/scripts/task_lint.sh b/tests/scripts/task_lint.sh
index 80cfc00ff7be4..a05f7ca36bcca 100755
--- a/tests/scripts/task_lint.sh
+++ b/tests/scripts/task_lint.sh
@@ -32,7 +32,7 @@ function shard1 {
   tests/scripts/task_convert_scripts_to_python.sh
 
   echo "Check Jenkinsfile generation"
-  python3 jenkins/generate.py --check
+  python3 ci/jenkins/generate.py --check
 
   echo "Checking file types..."
  python3 tests/lint/check_file_type.py
@@ -90,4 +90,3 @@ else
   shard1
   shard2
 fi
-

From ae33f408c4658ad38a959489f3fd5d3b7fc37dfc Mon Sep 17 00:00:00 2001
From: Wuwei Lin
Date: Tue, 28 Jun 2022 20:14:11 -0700
Subject: [PATCH 003/111] [MetaSchedule] Refactor MultiLevelTiling state to
 allow subclassing (#11931)

This PR made `State` in `MultiLevelTiling` inherit `Object`, to allow future
subclassing of `State`. Making `State` an `Object` allows instances of `State`
and its subclasses to be stored in `std::vector<State>`.
---
 .../schedule_rule/multi_level_tiling.cc       | 70 +++++++++++--------
 .../schedule_rule/multi_level_tiling.h        | 25 +++++--
 .../multi_level_tiling_with_intrin.cc         |  2 +-
 3 files changed, 60 insertions(+), 37 deletions(-)

diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling.cc b/src/meta_schedule/schedule_rule/multi_level_tiling.cc
index 07c5ddd7ae70d..28c1a0fdb66e2 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling.cc
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling.cc
@@ -61,6 +61,20 @@ using tir::IterVarType;
 using tir::LoopRV;
 using tir::Schedule;
 
+State::State(tir::Schedule sch, tir::BlockRV block_rv, Array<Array<tir::LoopRV>> tiles) {
+  ObjectPtr<StateNode> node = make_object<StateNode>();
+  node->sch = std::move(sch);
+  node->block_rv = std::move(block_rv);
+  node->tiles = std::move(tiles);
+  data_ = std::move(node);
+}
+
+State StateNode::Copy() const {
+  ObjectPtr<StateNode> node = make_object<StateNode>(*this);
+  node->sch = sch->Copy();
+  return State(node);
+}
+
 // Do nothing; Inherited from ScheduleRuleNode
 void MultiLevelTilingNode::InitializeWithTuneContext(const TuneContext& context) {
   if (Optional<Integer> v = context->target.value()->GetAttr<Integer>("max_threads_per_block")) {
@@ -82,15 +96,15 @@ Array<Schedule> MultiLevelTilingNode::Apply(const Schedule& sch, const BlockRV&
 
   Array<Schedule> results;
   for (auto&& state : ApplySubRules({State(sch, block_rv)})) {
-    results.push_back(std::move(state.sch));
+    results.push_back(std::move(state->sch));
   }
   return results;
 }
 
 std::vector<State> MultiLevelTilingNode::ApplySubRules(std::vector<State> states) {
-  states = SubRule(std::move(states), [&](State state) { return TileLoopNest(state); });
-  states = SubRule(std::move(states), [&](State state) { return AddWriteReuse(state); });
-  states = SubRule(std::move(states), [&](State state) { return AddReadReuse(state); });
+  states = SubRule(std::move(states), [&](State state) { return TileLoopNest(std::move(state)); });
+  states = SubRule(std::move(states), [&](State state) { return AddWriteReuse(std::move(state)); });
+  states = SubRule(std::move(states), [&](State state) { return AddReadReuse(std::move(state)); });
   return states;
 }
 
@@ -102,53 +116,49 @@ std::vector<State> MultiLevelTilingNode::AddWriteReuse(State state) const {
   std::vector<int> levels = config.levels;
   ReuseType req = config.req;
   if (Optional<Array<Integer>> ann = tir::GetAnn<Array<Integer>>(
-          state.sch->GetSRef(state.block_rv), "meta_schedule.write_cache_level")) {
+          state->sch->GetSRef(state->block_rv), "meta_schedule.write_cache_level")) {
     req = ReuseType::kMustReuse;
     levels = std::vector<int>(ann.value().begin(), ann.value().end());
   }
   std::vector<State> results;
   if (req == ReuseType::kMayReuse) {
     // Case 1. If the write cache is already there, we don't need to add another.
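    // (A single consumer that is already a cache-write block is reused as-is;
    // the code below then only enumerates the tiling level at which that
    // existing block is anchored via ReverseComputeAt.)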
-    Array<BlockRV> consumer_rvs = state.sch->GetConsumers(state.block_rv);
-    if (consumer_rvs.size() == 1 && IsWriteCache(state.sch->GetSRef(consumer_rvs[0]))) {
+    Array<BlockRV> consumer_rvs = state->sch->GetConsumers(state->block_rv);
+    if (consumer_rvs.size() == 1 && IsWriteCache(state->sch->GetSRef(consumer_rvs[0]))) {
       for (int level : levels) {
-        State new_state = state;
-        new_state.sch = state.sch->Copy();
-        new_state.sch->Seed(state.sch->ForkSeed());
-        const LoopRV& loop_rv = new_state.tiles[level - 1].back();
-        new_state.sch->ReverseComputeAt(consumer_rvs[0], loop_rv, true);
+        State new_state = state->Copy();
+        const LoopRV& loop_rv = new_state->tiles[level - 1].back();
+        new_state->sch->ReverseComputeAt(consumer_rvs[0], loop_rv, true);
         results.push_back(std::move(new_state));
       }
       results.push_back(state);
       return results;
     } else {
       // Case 2. No write cache is added
-      State new_state(/*sch=*/state.sch->Copy(), /*block_rv=*/state.block_rv);
-      new_state.sch->Seed(state.sch->ForkSeed());
+      State new_state = state->Copy();
       results.emplace_back(std::move(new_state));
     }
   }
 
   // Case 3. Add one write cache
-  BlockRV write_cache = state.sch->CacheWrite(/*block_rv=*/state.block_rv, /*read_buffer_index=*/0,
-                                              /*storage_scope=*/config.scope);
+  BlockRV write_cache =
+      state->sch->CacheWrite(/*block_rv=*/state->block_rv, /*read_buffer_index=*/0,
+                             /*storage_scope=*/config.scope);
   for (int level : levels) {
-    State new_state = state;
-    new_state.sch = state.sch->Copy();
-    new_state.sch->Seed(state.sch->ForkSeed());
-    const LoopRV& loop_rv = new_state.tiles[level - 1].back();
-    new_state.sch->ReverseComputeAt(write_cache, loop_rv, true);
+    State new_state = state->Copy();
+    const LoopRV& loop_rv = new_state->tiles[level - 1].back();
+    new_state->sch->ReverseComputeAt(write_cache, loop_rv, true);
     results.push_back(std::move(new_state));
   }
   return results;
 }
 
 std::vector<State> MultiLevelTilingNode::TileLoopNest(State state) const {
-  Schedule& sch = state.sch;
-  const BlockRV& block_rv = state.block_rv;
+  Schedule& sch = state->sch;
+  const BlockRV& block_rv = state->block_rv;
   // Step 1. Assuming trivial binding, pair the loops and their iter-var-types
   Array<LoopRV> loops = sch->GetLoops(block_rv);
-  std::vector<IterVarType> iter_types = GetBlockVarTypes(sch->GetSRef(state.block_rv));
+  std::vector<IterVarType> iter_types = GetBlockVarTypes(sch->GetSRef(state->block_rv));
   ICHECK_EQ(loops.size(), iter_types.size());
   // Step 2. For each loop axis, tile it
   int64_t spatial_loop_product = 1;
@@ -192,7 +202,7 @@ std::vector<State> MultiLevelTilingNode::TileLoopNest(State state) const {
     sch->Bind(fused, tile_binds[i]);
     tiles[i] = {fused};
   }
-  state.tiles = Array<Array<LoopRV>>{tiles.begin(), tiles.end()};
+  state->tiles = Array<Array<LoopRV>>{tiles.begin(), tiles.end()};
   if (this->thread_warp_size_ != -1) {
     int64_t low_inclusive = 1;
     int64_t high_inclusive = this->max_threads_per_block_;
@@ -213,13 +223,13 @@ std::vector<State> MultiLevelTilingNode::AddReadReuse(State state) const {
     return {std::move(state)};
   }
   ICHECK(config.req != ReuseType::kMayReuse);
-  const BlockRV& block_rv = state.block_rv;
+  const BlockRV& block_rv = state->block_rv;
   std::vector<State> results;
   results.reserve(config.levels.size());
   for (int level : config.levels) {
-    Schedule sch = state.sch->Copy();
-    sch->Seed(state.sch->ForkSeed());
-    const LoopRV& loop_rv = state.tiles[level - 1].back();
+    State new_state = state->Copy();
+    Schedule& sch = new_state->sch;
+    const LoopRV& loop_rv = state->tiles[level - 1].back();
     // Enumerate all buffers that are read but not written
     std::vector<int> read_buffer_ndims = tir::GetReadBufferNDims(sch->GetSRef(block_rv));
     for (int i = 0, n_reads = read_buffer_ndims.size(); i < n_reads; ++i) {
@@ -246,8 +256,6 @@ std::vector<State> MultiLevelTilingNode::AddReadReuse(State state) const {
                              vector_load_len);
       }
     }
-    State new_state = state;
-    new_state.sch = sch;
     results.push_back(std::move(new_state));
   }
   return results;
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling.h b/src/meta_schedule/schedule_rule/multi_level_tiling.h
index f260c4856e364..05179318d0b38 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling.h
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling.h
@@ -81,8 +81,12 @@ struct ReuseConfig {
   }
 };
 
+// Forward declaration
+class State;
+
 /*! \brief The state of auto scheduling for the multi-level tiling rule */
-struct State {
+class StateNode : public Object {
+ public:
   /*! \brief The schedule to date */
   tir::Schedule sch;
   /*! \brief The block to be tiled */
@@ -90,11 +94,22 @@ struct State {
   /*! \brief The loop tiles */
   Array<Array<tir::LoopRV>> tiles;
 
+  /*!
+   * \brief Create a copy of the state. The underlying schedule is copied. Schedule rules that
+   * produce multiple states should use this method to create new states.
+   */
+  virtual State Copy() const;
+
+  static constexpr const char* _type_key = "meta_schedule.State";
+  TVM_DECLARE_BASE_OBJECT_INFO(StateNode, Object);
+};
+
+/*! \brief Managed reference to StateNode */
+class State : public ObjectRef {
+ public:
   /*! \brief Default constructor */
-  explicit State(tir::Schedule sch, tir::BlockRV block_rv,
-                 Optional<tir::BlockRV> write_cache = NullOpt, bool write_cache_is_added = false,
-                 Array<Array<tir::LoopRV>> tiles = {})
-      : sch(sch), block_rv(block_rv), tiles(tiles) {}
+  explicit State(tir::Schedule sch, tir::BlockRV block_rv, Array<Array<tir::LoopRV>> tiles = {});
+  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(State, ObjectRef, StateNode);
 };
 
 /*!
diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_with_intrin.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_with_intrin.cc
index da3ea2484e6e5..9dd720db4a2d5 100644
--- a/src/meta_schedule/schedule_rule/multi_level_tiling_with_intrin.cc
+++ b/src/meta_schedule/schedule_rule/multi_level_tiling_with_intrin.cc
@@ -45,7 +45,7 @@ class MultiLevelTilingWithIntrinNode : public MultiLevelTilingNode {
   // tile the outer loops.
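  // (TileForIntrin rewrites the block so that its innermost loops match the
  // tensor intrinsic's description; the generic multi-level tiling sub-rules
  // from the base class are then applied on top of that.)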
  virtual std::vector<State> ApplySubRules(std::vector<State> states) {
    states = SubRule(std::move(states), [&](State state) {
-      state.block_rv = TileForIntrin(state.sch, state.block_rv, intrin_name);
+      state->block_rv = TileForIntrin(state->sch, state->block_rv, intrin_name);
      return std::vector<State>(1, state);
    });
    return MultiLevelTilingNode::ApplySubRules(states);

From 54f8176b4749d53f94edd0812c3f5a1e536fc6f6 Mon Sep 17 00:00:00 2001
From: Florin Blanaru
Date: Wed, 29 Jun 2022 05:36:29 +0100
Subject: [PATCH 004/111] [CI] Docs bot now edits previous comments (#11909)

This PR improves the docs bot to edit a previous comment instead of making
new comments. Fixes #11837
---
 tests/python/ci/test_ci.py           |  2 +-
 tests/scripts/git_utils.py           |  3 +++
 tests/scripts/github_docs_comment.py | 33 ++++++++++++++++++++++++----
 3 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/tests/python/ci/test_ci.py b/tests/python/ci/test_ci.py
index d8bcad0151558..27297e165fd64 100644
--- a/tests/python/ci/test_ci.py
+++ b/tests/python/ci/test_ci.py
@@ -41,7 +41,7 @@ def parameterize_named(*values):
             "https://pr-docs.tlcpack.ai",
             "SHA",
             "issues/11594/comments",
-            "Built docs for commit SHA can be found "
+            "<!---docs-bot-comment-->\n\nBuilt docs for commit SHA can be found "
             "[here](https://pr-docs.tlcpack.ai/PR-11594/3/docs/index.html).",
         )
     ],
diff --git a/tests/scripts/git_utils.py b/tests/scripts/git_utils.py
index aeaca164c2c2e..7df8c0b93cd97 100644
--- a/tests/scripts/git_utils.py
+++ b/tests/scripts/git_utils.py
@@ -97,6 +97,9 @@ def _request(self, full_url: str, body: Dict[str, Any], method: str) -> Dict[str, Any]:
     def put(self, url: str, data: Dict[str, Any]) -> Dict[str, Any]:
         return self._request(self.base + url, data, method="PUT")
 
+    def patch(self, url: str, data: Dict[str, Any]) -> Dict[str, Any]:
+        return self._request(self.base + url, data, method="PATCH")
+
     def post(self, url: str, data: Dict[str, Any]) -> Dict[str, Any]:
         return self._request(self.base + url, data, method="POST")
 
diff --git a/tests/scripts/github_docs_comment.py b/tests/scripts/github_docs_comment.py
index 5da32746df3d1..64377b632c48d 100755
--- a/tests/scripts/github_docs_comment.py
+++ b/tests/scripts/github_docs_comment.py
@@ -25,11 +25,22 @@
 from git_utils import git, GitHubRepo, parse_remote
 from cmd_utils import init_log
 
+DOCS_BOT_MARKER = "<!---docs-bot-comment-->\n\n"
+GITHUB_ACTIONS_BOT_LOGIN = "github-actions[bot]"
+
 
 def build_docs_url(base_url_docs, pr_number, build_number):
     return f"{base_url_docs}/PR-{str(pr_number)}/{str(build_number)}/docs/index.html"
 
 
+def get_pr_comments(github, url):
+    try:
+        return github.get(url)
+    except error.HTTPError as e:
+        logging.exception(f"Failed to retrieve PR comments: {url}: {e}")
+        return []
+
+
 def get_pr_and_build_numbers(target_url):
     target_url = target_url[target_url.find("PR-") : len(target_url)]
     split = target_url.split("/")
@@ -38,6 +49,16 @@ def get_pr_and_build_numbers(target_url):
     return {"pr_number": pr_number, "build_number": build_number}
 
 
+def search_for_docs_comment(comments):
+    for comment in comments:
+        if (
+            comment["user"]["login"] == GITHUB_ACTIONS_BOT_LOGIN
+            and DOCS_BOT_MARKER in comment["body"]
+        ):
+            return comment
+    return None
+
+
 if __name__ == "__main__":
     help = "Add comment with link to docs"
     parser = argparse.ArgumentParser(description=help)
@@ -65,7 +86,7 @@ def get_pr_and_build_numbers(target_url):
     )
 
     url = f'issues/{pr_and_build["pr_number"]}/comments'
-    body = f"Built docs for commit {commit_sha} can be found [here]({docs_url})."
+ body = f"{DOCS_BOT_MARKER}Built docs for commit {commit_sha} can be found [here]({docs_url})." if not args.dry_run: github = GitHubRepo(token=os.environ["GITHUB_TOKEN"], user=user, repo=repo) @@ -77,9 +98,13 @@ def get_pr_and_build_numbers(target_url): logging.info(f"Skipping this action for user {author}") sys.exit(0) - try: + pr_comments = get_pr_comments(github, url) + comment = search_for_docs_comment(pr_comments) + + if comment is not None: + comment_url = comment["url"] + github.patch(comment_url, {"body": body}) + else: github.post(url, {"body": body}) - except error.HTTPError as e: - logging.exception(f"Failed to add docs comment {docs_url}: {e}") else: logging.info(f"Dry run, would have posted {url} with data {body}.") From 9ea681f618d3f580659f3a7fae54b5691da55384 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Tue, 28 Jun 2022 23:33:52 -0700 Subject: [PATCH 005/111] [MetaSchedule] Improve Error Message in JSON Database (#11940) --- src/meta_schedule/database/json_database.cc | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/meta_schedule/database/json_database.cc b/src/meta_schedule/database/json_database.cc index 9bb7ee1027b99..23ecb121f4999 100644 --- a/src/meta_schedule/database/json_database.cc +++ b/src/meta_schedule/database/json_database.cc @@ -194,14 +194,19 @@ Database Database::JSONDatabase(String path_workload, String path_tuning_record, support::parallel_for_dynamic( 0, json_objs.size(), num_threads, [&](int thread_id, int task_id) { const ObjectRef& json_obj = json_objs[task_id]; + Workload workload{nullptr}; try { const ArrayNode* arr = json_obj.as(); ICHECK_EQ(arr->size(), 2); - records[task_id] = TuningRecord::FromJSON(arr->at(1), // - workloads[Downcast(arr->at(0))]); + workload = workloads[Downcast(arr->at(0))]; + records[task_id] = TuningRecord::FromJSON(arr->at(1), workload); } catch (std::runtime_error& e) { - LOG(FATAL) << "ValueError: Unable to parse the JSON object: " << json_obj - << "\nThe error is: " << e.what(); + LOG(FATAL) << "ValueError: Unable to parse TuningRecord, on line " << (task_id + 1) + << " of file " << path_tuning_record << ". The workload is:\n" + << (workload.defined() ? tir::AsTVMScript(workload) : "(null)") + << "\nThe JSONObject of TuningRecrod is:\n" + << json_obj << "\nThe error message is:\n" + << e.what(); } }); for (const TuningRecord& record : records) { From c9d0d253f0f966d490f4a438b2469b593a0a0497 Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Wed, 29 Jun 2022 12:26:44 +0100 Subject: [PATCH 006/111] [microNPU] increase workspace sizes for network tests (#11943) The network tests with striping were reported to be flaky. This commit increases the workspace size to be generous and also repeats the test case to make sure its not flaky. 
Change-Id: I134f504250c8fa0bbbcf5f673acec7ffa2ec2f55 --- tests/python/contrib/test_ethosu/test_networks.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/python/contrib/test_ethosu/test_networks.py b/tests/python/contrib/test_ethosu/test_networks.py index 9b09132a9eae8..075565cd92a6c 100644 --- a/tests/python/contrib/test_ethosu/test_networks.py +++ b/tests/python/contrib/test_ethosu/test_networks.py @@ -142,8 +142,18 @@ def test_networks_with_usmp_and_cascader_wo_striping(accel_type, model_url, work @pytest.mark.parametrize( "accel_type, model_url, workspace_size", [ + # Checks the same test case multiple times to make sure its not flaky ("ethos-u55-256", MOBILENET_V1_URL, 1010000), - ("ethos-u55-256", MOBILENET_V2_URL, 1180000), + ("ethos-u55-256", MOBILENET_V1_URL, 1010000), + ("ethos-u55-256", MOBILENET_V1_URL, 1010000), + ("ethos-u55-256", MOBILENET_V1_URL, 1010000), + ("ethos-u55-256", MOBILENET_V1_URL, 1010000), + # Checks the same test case multiple times to make sure its not flaky + ("ethos-u55-256", MOBILENET_V2_URL, 1400000), + ("ethos-u55-256", MOBILENET_V2_URL, 1400000), + ("ethos-u55-256", MOBILENET_V2_URL, 1400000), + ("ethos-u55-256", MOBILENET_V2_URL, 1400000), + ("ethos-u55-256", MOBILENET_V2_URL, 1400000), ], ) def test_networks_with_usmp_and_cascader_with_striping(accel_type, model_url, workspace_size): From a7b89071c7605afd020f70635beabd78107b483a Mon Sep 17 00:00:00 2001 From: Yuanjing Shi Date: Wed, 29 Jun 2022 12:02:18 -0700 Subject: [PATCH 007/111] [PyTorch][Relay] Add aten::cross_entropy_loss (#11935) * add cross entropy loss * fix cross entropy args * fix typo * add class indices * fix CI * fix naming * fix typo --- python/tvm/relay/frontend/pytorch.py | 29 +++++++++++++++++++ tests/python/frontend/pytorch/test_forward.py | 18 ++++++++++++ 2 files changed, 47 insertions(+) diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index 4f10130196a42..9558ad1b6ec02 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -867,6 +867,34 @@ def log_sigmoid(self, inputs, input_types): data = inputs[0] return _op.log(_op.tensor.sigmoid(data)) + def cross_entropy_loss_with_logits(self, inputs, input_types): + input = inputs[0] + target = inputs[1] + weights = inputs[2] + reduction = inputs[3] + ignore_index = inputs[4] + label_smoothing = inputs[5] + input_shape = self.infer_shape(input) + target_shape = self.infer_shape(target) + if input_shape != target_shape: + if reduction == 0: + reduction = "none" + elif reduction == 1: + reduction = "mean" + else: + reduction = "sum" + num_class = self.infer_shape(input)[1] + if weights is None: + weights = _op.full(_expr.const(1), (num_class,), dtype=input_types[0]) + return _op.nn.nll_loss( + _op.nn.log_softmax(input), target, weights, reduction, ignore_index + ) + assert reduction == 1, "reduction not supported in cross_entropy_loss" + assert ignore_index == -100, "ignore_index not supported in cross_entropy_loss" + assert label_smoothing == 0.0, "label_smoothing not supported in cross_entropy_loss" + assert weights is None, "weight not supported in cross_entropy_loss" + return _op.nn.cross_entropy_with_logits(_op.nn.log_softmax(input), target) + def hard_sigmoid(self, inputs, input_types): def _relu6(x): return _op.tensor.clip(x, 0.0, 6.0) @@ -3119,6 +3147,7 @@ def create_convert_map(self): "aten::silu": self.silu, "aten::glu": self.glu, "aten::log_sigmoid": self.log_sigmoid, + "aten::cross_entropy_loss": 
self.cross_entropy_loss_with_logits, "aten::adaptive_avg_pool1d": functools.partial( self.adaptive_avg_pool, _op.nn.adaptive_avg_pool1d ), diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index e4cb6354c0173..1bb4517f0198f 100644 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -4083,6 +4083,24 @@ def test_forward_nll_loss(): verify_model(torch.nn.NLLLoss(reduction="none").eval(), input_data=[predictions, targets]) +def test_cross_entropy_loss(): + torch.set_grad_enabled(False) + N, C = 10, 3 + # class indices + predictions = torch.rand((N, C)).float() + targets = torch.randint(0, 3, (N,)) + weights = torch.tensor([1, 2, 3]).float() + verify_model(torch.nn.CrossEntropyLoss().eval(), input_data=[predictions, targets]) + verify_model( + torch.nn.CrossEntropyLoss(weight=weights).eval(), input_data=[predictions, targets] + ) + + # class probabilities + predictions = torch.randn(N, C).float() + targets = torch.randn(N, C) + verify_model(torch.nn.CrossEntropyLoss().eval(), input_data=[predictions, targets]) + + @tvm.testing.uses_gpu def test_forward_flip(): torch.set_grad_enabled(False) From 84c66da617e12c6690de064c09bf242f4a095dfc Mon Sep 17 00:00:00 2001 From: Alexey Gladyshev Date: Wed, 29 Jun 2022 22:02:35 +0300 Subject: [PATCH 008/111] export VirtualMachine for Windows (#11947) --- include/tvm/runtime/vm/vm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/tvm/runtime/vm/vm.h b/include/tvm/runtime/vm/vm.h index 139c8ba5fcc8d..e58fe5eeb3ac2 100644 --- a/include/tvm/runtime/vm/vm.h +++ b/include/tvm/runtime/vm/vm.h @@ -145,7 +145,7 @@ struct VMFrame { * multiple threads, or serialize them to disk or over the * wire. */ -class VirtualMachine : public runtime::ModuleNode { +class TVM_DLL VirtualMachine : public runtime::ModuleNode { public: /*! * \brief Get a PackedFunc from module. From b552bcf1d0584a8ba64915be2efa7daf8907b8e3 Mon Sep 17 00:00:00 2001 From: Christian Convey Date: Wed, 29 Jun 2022 17:20:31 -0400 Subject: [PATCH 009/111] [testing][hexagon] Better subproc errors (#11853) When a subprocess completes with a non-zero exit code, include its stdout and stderr text in the Python exception's error message. --- python/tvm/contrib/hexagon/build.py | 33 +++++++++++++++++++---------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/python/tvm/contrib/hexagon/build.py b/python/tvm/contrib/hexagon/build.py index 7e29f645cea57..080b9828777ac 100644 --- a/python/tvm/contrib/hexagon/build.py +++ b/python/tvm/contrib/hexagon/build.py @@ -40,6 +40,19 @@ ANDROID_BASH_FILE_NAME = "android_bash.sh" +def _check_call_verbose(cmd, **kwargs) -> None: + """ + Similar to subprocess.check_call(cmd), but if the exit code is non-zero + then the raised Exception's message provides more detail, including + the stdout/stderr provided by the subprocess. + """ + try: + subprocess.run(cmd, capture_output=True, check=True, text=True, **kwargs) + except Exception as err: + error_msg = f"{err}\nstdout:\n{err.stdout}\nstderr:\n{err.stderr}" + raise Exception(error_msg) + + def _get_hexagon_rpc_lib_dir() -> pathlib.Path: """Find the Hexagon API binaries. @@ -356,13 +369,11 @@ def _copy_to_remote( self, local_path: Union[str, pathlib.Path], remote_path: Union[str, pathlib.Path] ): """Abstract method implementation. 
See description in HexagonLauncherRPC.""" - subprocess.check_call( - self._adb_device_sub_cmd + ["push", str(local_path), str(remote_path)] - ) + _check_call_verbose(self._adb_device_sub_cmd + ["push", str(local_path), str(remote_path)]) def _create_remote_directory(self, remote_path: Union[str, pathlib.Path]) -> pathlib.Path: """Abstract method implementation. See description in HexagonLauncherRPC.""" - subprocess.check_call(self._adb_device_sub_cmd + ["shell", "mkdir", "-p", str(remote_path)]) + _check_call_verbose(self._adb_device_sub_cmd + ["shell", "mkdir", "-p", str(remote_path)]) return pathlib.Path(remote_path) def _copy_binaries(self): @@ -418,14 +429,14 @@ def _forward_ports(self, rpc_server_port, existing_forwards): port = rpc_server_port while len(self.forwarded_ports_) < 10: if port not in existing_forwards and not _is_port_in_use(port): - subprocess.check_call( + _check_call_verbose( self._adb_device_sub_cmd + ["forward", f"tcp:{port}", f"tcp:{port}"] ) self.forwarded_ports_.append(port) port += 1 def _reverse_ports(self, rpc_tracker_port): - subprocess.check_call( + _check_call_verbose( self._adb_device_sub_cmd + ["reverse", f"tcp:{rpc_tracker_port}", f"tcp:{rpc_tracker_port}"] ) @@ -455,11 +466,11 @@ def _run_server_script(self): def _cleanup_port_forwarding(self): # Removed pre-defined forward/reverse rules rpc_tracker_port = self._rpc_info["rpc_tracker_port"] - subprocess.check_call( + _check_call_verbose( self._adb_device_sub_cmd + ["reverse", "--remove", f"tcp:{rpc_tracker_port}"] ) for port in self.forwarded_ports_: - subprocess.check_call(self._adb_device_sub_cmd + ["forward", "--remove", f"tcp:{port}"]) + _check_call_verbose(self._adb_device_sub_cmd + ["forward", "--remove", f"tcp:{port}"]) def _terminate_remote(self): # Send interupt to main and child processes @@ -519,11 +530,11 @@ def _copy_to_remote( self, local_path: Union[str, pathlib.Path], remote_path: Union[str, pathlib.Path] ): """Abstract method implementation. See description in HexagonLauncherRPC.""" - subprocess.check_call(["cp", str(local_path), str(remote_path)]) + _check_call_verbose(["cp", str(local_path), str(remote_path)]) def _create_remote_directory(self, remote_path: Union[str, pathlib.Path]) -> pathlib.Path: """Abstract method implementation. See description in HexagonLauncherRPC.""" - subprocess.check_call(["mkdir", "-p", str(remote_path)]) + _check_call_verbose(["mkdir", "-p", str(remote_path)]) return pathlib.Path(os.path.abspath(remote_path)) def _copy_libcxx(self, dest_dir: Union[str, pathlib.Path]): @@ -547,7 +558,7 @@ def _copy_libcxx(self, dest_dir: Union[str, pathlib.Path]): # links is to save disk space. tar_in = f"tar -cf - -C {lib_dir} " + " ".join(libcxx_files) tar_out = f"tar -xf - -C {str(dest_dir)}" - subprocess.check_call(tar_in + " | " + tar_out, shell=True) + _check_call_verbose(tar_in + " | " + tar_out, shell=True) def start_server(self): """Abstract method implementation. 
See description in HexagonLauncherRPC.""" From a84c54efe5edf57f6af8324f3608a2f835a07cc9 Mon Sep 17 00:00:00 2001 From: arangasa <76030063+arangasa@users.noreply.github.com> Date: Thu, 30 Jun 2022 02:52:35 +0530 Subject: [PATCH 010/111] [TOPI][Hexagon] Implement Argmax Slice Op (#11847) * [TOPI][Hexagon] Implement Argmax Slice Op * run through black * Address initial review comments * Fix variable names in tests * Fix lint issue Co-authored-by: arangasa (generated by with_the_same_user script) --- python/tvm/topi/hexagon/slice_ops/__init__.py | 1 + python/tvm/topi/hexagon/slice_ops/argmax.py | 46 +++++++ python/tvm/topi/hexagon/utils.py | 7 ++ .../contrib/test_hexagon/infrastructure.py | 8 ++ .../test_hexagon/topi/test_argmax_slice.py | 116 ++++++++++++++++++ 5 files changed, 178 insertions(+) create mode 100644 python/tvm/topi/hexagon/slice_ops/argmax.py create mode 100644 tests/python/contrib/test_hexagon/topi/test_argmax_slice.py diff --git a/python/tvm/topi/hexagon/slice_ops/__init__.py b/python/tvm/topi/hexagon/slice_ops/__init__.py index 87af3a767c38e..3340f835200b9 100755 --- a/python/tvm/topi/hexagon/slice_ops/__init__.py +++ b/python/tvm/topi/hexagon/slice_ops/__init__.py @@ -19,5 +19,6 @@ from .avg_pool2d import avg_pool2d_compute, avg_pool2d_STIR_schedule from .add_subtract_multiply import * +from .argmax import argmax_compute, argmax_schedule from .softmax_slice import * from .clip import * diff --git a/python/tvm/topi/hexagon/slice_ops/argmax.py b/python/tvm/topi/hexagon/slice_ops/argmax.py new file mode 100644 index 0000000000000..4d34cb50a0b09 --- /dev/null +++ b/python/tvm/topi/hexagon/slice_ops/argmax.py @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+""" Hexagon slice argmax compute and schedule""" + +from tvm import tir +from tvm import topi +from ..utils import get_layout_transform_fn + + +def argmax_compute(in_tensor, axis): + out_tensor = topi.argmax(in_tensor, axis) + return out_tensor + + +def argmax_stir_schedule_nhwc(func, in_layout, out_layout): + """Schedule for nhwc argmax""" + sch = tir.Schedule(func, debug_mask="all") + sch.transform_layout("A_red_temp", "A", in_layout) + sch.transform_layout("A_red", "A_red", out_layout) + return sch + + +def argmax_schedule(argmax_func, in_layout_str, out_layout_str): + """Schedule for argmax: top level function""" + if (in_layout_str == "nhwc-8h2w32c2w-2d") and (out_layout_str == "nhw-32h16w-2d"): + fp16_layout_transform = get_layout_transform_fn(in_layout_str) + int32_layout_transform = get_layout_transform_fn(out_layout_str) + tir_s = argmax_stir_schedule_nhwc( + argmax_func, fp16_layout_transform, int32_layout_transform + ) + return tir_s + raise RuntimeError(f"Unexpected input_layout, output_layout '{in_layout_str, out_layout_str}'") diff --git a/python/tvm/topi/hexagon/utils.py b/python/tvm/topi/hexagon/utils.py index 3efc48c4d04fa..95b25cc5a73b5 100644 --- a/python/tvm/topi/hexagon/utils.py +++ b/python/tvm/topi/hexagon/utils.py @@ -42,6 +42,11 @@ def nhwc_8h2w32c2w_1d(n, h, w, c): return [n, h // 8, w // 4, c // 32, h % 8, (w % 4) // 2, c % 32, w % 2] +def nhw_32h16w_2d(n, h, w): + """Return index map for nhw_32h16w 2d layout""" + return [n, h // 32, w // 16, te.AXIS_SEPARATOR, h % 32, w % 16] + + def nhwc_4h4w32c_1d(n, h, w, c): """Return index map for nhwc_4h4232c 1d layout""" return [n, h // 4, w // 4, c // 32, h % 4, w % 4, c % 32] @@ -72,6 +77,8 @@ def get_layout_transform_fn(layout): return n11c_1024c_2d if layout == "n11c-1024c-1d": return n11c_1024c_1d + if layout == "nhw-32h16w-2d": + return nhw_32h16w_2d if layout == "nhwc-4h4w32c-2d": return nhwc_4h4w32c_2d if layout == "nhwc-4h4w32c-1d": diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py index 57a9dff8b4246..c1d2b4046372f 100644 --- a/tests/python/contrib/test_hexagon/infrastructure.py +++ b/tests/python/contrib/test_hexagon/infrastructure.py @@ -247,4 +247,12 @@ def transform_numpy(arr_np, current_layout: str, new_layout: str): return arr_np.reshape([n, 1, 1, c // 1024, 1024]) raise RuntimeError(f"Unexpected new_layout '{new_layout}'") + + if current_layout == "nhw": + if new_layout in ["nhw-32h16w-2d"]: + n, h, w = arr_np.shape + return arr_np.reshape([n, h // 32, 32, w // 16, 16]).transpose(0, 1, 3, 2, 4) + + raise RuntimeError(f"Unexpected new_layout '{new_layout}'") + raise RuntimeError(f"Unexpected current_layout '{current_layout}'") diff --git a/tests/python/contrib/test_hexagon/topi/test_argmax_slice.py b/tests/python/contrib/test_hexagon/topi/test_argmax_slice.py new file mode 100644 index 0000000000000..4cbd524f4abfc --- /dev/null +++ b/tests/python/contrib/test_hexagon/topi/test_argmax_slice.py @@ -0,0 +1,116 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" Tests for Hexagon slice argmax op """ +import numpy as np + +import tvm +import tvm.testing +from tvm import te +import tvm.topi.hexagon.slice_ops as sl +import tvm.contrib.hexagon +from ..infrastructure import allocate_hexagon_array, transform_numpy + + +class TestArgMaxSlice: + """Argmax Slice Op Tests""" + + ( + input_shape, + input_layout, + output_layout, + in_axis, + in_axis_sep, + out_axis_sep, + ) = tvm.testing.parameters( + ((1, 64, 64, 32), "nhwc-8h2w32c2w-2d", "nhw-32h16w-2d", [3], [4], [3]), + ((3, 32, 16, 32), "nhwc-8h2w32c2w-2d", "nhw-32h16w-2d", [3], [4], [3]), + ((1, 32, 32, 64), "nhwc-8h2w32c2w-2d", "nhw-32h16w-2d", [3], [4], [3]), + ) + dtype = tvm.testing.parameter("float16") + working_scope = tvm.testing.parameter("global.vtcm") + + @tvm.testing.fixture + def input_np(self, input_shape, dtype): + return np.random.uniform(size=input_shape).astype(dtype) + + @tvm.testing.fixture + def transformed_input_np(self, input_np, input_layout): + return transform_numpy(input_np, "nhwc", input_layout) + + @tvm.testing.fixture + def expected_output_np(self, input_np, in_axis): + ref_np = np.argmax(input_np, *in_axis).astype("int32") + return ref_np + + @tvm.testing.fixture + def transformed_expected_output_np(self, expected_output_np, output_layout): + return transform_numpy(expected_output_np, "nhw", output_layout) + + @tvm.testing.requires_hexagon + def test_argmax_slice( + self, + input_shape, + dtype, + input_layout, + output_layout, + in_axis, + transformed_input_np, + transformed_expected_output_np, + in_axis_sep, + out_axis_sep, + hexagon_session, + working_scope, + ): + """Top level testing function for argmax""" + target_hexagon = tvm.target.hexagon("v69") + target = tvm.target.Target(target_hexagon, host=target_hexagon) + argmax_input = te.placeholder(input_shape, name="A", dtype=dtype) + output = sl.argmax.argmax_compute(argmax_input, in_axis) + argmax_func = te.create_prim_func([argmax_input, output]) + tir_s = sl.argmax_schedule(argmax_func, input_layout, output_layout) + input_data = allocate_hexagon_array( + hexagon_session.device, + data=transformed_input_np, + axis_separators=in_axis_sep, + mem_scope=working_scope, + ) + output_data = allocate_hexagon_array( + hexagon_session.device, + tensor_shape=transformed_expected_output_np.shape, + dtype=transformed_expected_output_np.dtype, + axis_separators=out_axis_sep, + mem_scope=working_scope, + ) + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_assert": True}): + tir_irm = tvm.lower(tir_s.mod, [argmax_input, output], name="argmax") + runtime_module = tvm.build( + tir_irm, [argmax_input, output], target=target, name="argmax" + ) + mod = hexagon_session.load_module(runtime_module) + + mod(input_data, output_data) + output_np = output_data.numpy() + tvm.testing.assert_allclose( + output_np, + transformed_expected_output_np, + 1e-3, + 1e-3, + ) + + +if __name__ == "__main__": + tvm.testing.main() From da2ad2d44b9c35fad46883de391a6f45c9ca2efc Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Wed, 29 Jun 2022 15:07:38 -0700 Subject: [PATCH 011/111] [RPC] Add Data & Time For RPC Tracker / Server 
 Logging (#11950)

---
 python/tvm/rpc/server.py  | 9 +++++++++
 python/tvm/rpc/tracker.py | 9 +++++++++
 2 files changed, 18 insertions(+)

diff --git a/python/tvm/rpc/server.py b/python/tvm/rpc/server.py
index aa8e04248b060..a1a8d8de92880 100644
--- a/python/tvm/rpc/server.py
+++ b/python/tvm/rpc/server.py
@@ -49,6 +49,15 @@
 from .base import TrackerCode

 logger = logging.getLogger("RPCServer")
+console_handler = logging.StreamHandler()
+console_handler.setFormatter(
+    logging.Formatter(
+        fmt="%(asctime)s.%(msecs)03d %(levelname)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
+    )
+)
+logger.addHandler(console_handler)
+logger.setLevel(logging.INFO)
+logger.propagate = False


 def _server_env(load_library, work_path=None):
diff --git a/python/tvm/rpc/tracker.py b/python/tvm/rpc/tracker.py
index 5a576a705e8a5..5440addac0239 100644
--- a/python/tvm/rpc/tracker.py
+++ b/python/tvm/rpc/tracker.py
@@ -64,6 +64,15 @@
 from .base import RPC_TRACKER_MAGIC, TrackerCode

 logger = logging.getLogger("RPCTracker")
+console_handler = logging.StreamHandler()
+console_handler.setFormatter(
+    logging.Formatter(
+        fmt="%(asctime)s.%(msecs)03d %(levelname)s %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
+    )
+)
+logger.addHandler(console_handler)
+logger.setLevel(logging.INFO)
+logger.propagate = False


 class Scheduler(object):

From 7ef6811000a3752e843b333fd5743ea9ef6c049e Mon Sep 17 00:00:00 2001
From: Andrey Malyshev
Date: Thu, 30 Jun 2022 02:24:10 +0400
Subject: [PATCH 012/111] [Relay] Handle memory scope during lowering from
 relay level (#11874)

Relay expressions can have assigned virtual devices with a certain
memory scope. This change lands that memory scope information from
the Relay level into TIR.

---
 include/tvm/driver/driver_api.h        |  1 -
 include/tvm/relay/expr.h               |  3 ++-
 include/tvm/tir/buffer.h               | 17 +++++++++++++++
 src/driver/driver_api.cc               | 30 ++------------------------
 src/relay/backend/te_compiler.cc       | 30 +++++++++++++++++++++++++-
 src/relay/backend/te_compiler_cache.cc |  3 ++-
 src/relay/backend/te_compiler_cache.h  |  7 +++++-
 src/tir/ir/buffer.cc                   | 27 +++++++++++++++++++++++
 8 files changed, 85 insertions(+), 33 deletions(-)

diff --git a/include/tvm/driver/driver_api.h b/include/tvm/driver/driver_api.h
index 45a938247cc82..48800b193cb4d 100644
--- a/include/tvm/driver/driver_api.h
+++ b/include/tvm/driver/driver_api.h
@@ -165,7 +165,6 @@ TVM_DLL runtime::Module build(const Map<Target, IRModule>& input, const Target&
 * \return The built module that contains code for different processors.
 */
 TVM_DLL runtime::Module build(const Map<Target, IRModule>& input, const Target& target_host);
-
 }  // namespace tvm

 #endif  // TVM_DRIVER_DRIVER_API_H_
diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h
index 6b014c8478d8a..bd094a7f69055 100644
--- a/include/tvm/relay/expr.h
+++ b/include/tvm/relay/expr.h
@@ -218,7 +218,8 @@ class VarNode : public ExprNode {
   bool SEqualReduce(const VarNode* other, SEqualReducer equal) const {
     equal->MarkGraphNode();
-    return equal(type_annotation, other->type_annotation) && equal(vid, other->vid);
+    return equal(type_annotation, other->type_annotation) && equal(vid, other->vid) &&
+           equal(virtual_device_, other->virtual_device_);
   }

   void SHashReduce(SHashReducer hash_reduce) const {
diff --git a/include/tvm/tir/buffer.h b/include/tvm/tir/buffer.h
index ca7faf1cdefb8..d7a2aec0b9725 100644
--- a/include/tvm/tir/buffer.h
+++ b/include/tvm/tir/buffer.h
@@ -295,6 +295,23 @@ class DataProducer : public ObjectRef {
   TVM_DEFINE_OBJECT_REF_METHODS(DataProducer, ObjectRef, DataProducerNode);
 };

+/*! 
+ * \brief Creates TIR Buffer for provided parameters + * \param shape shape of the buffer + * \param dtype data type + * \param name buffer name + * \param data_alignment alignment requirement of data pointer in bytes + * \param offset_factor Factor of elem_offset field, elem_offset is guaranteed to be + * multiple of offset_factor + User can specify data_alignment and offset_factor to be 0 + * A default value will be picked. + * \param compact If the statement has already bound to a compact buffer. + * \param memory_scope memory scope of the buffer + */ +TVM_DLL tir::Buffer BufferWithOffsetAlignment(Array shape, DataType dtype, + std::string name, int data_alignment, + int offset_factor, bool compact, + std::string memory_scope = ""); } // namespace tir } // namespace tvm #endif // TVM_TIR_BUFFER_H_ diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index 7f015e7ca2b90..0446347eca2c1 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -83,32 +83,6 @@ Target DefaultTargetHost(Target target) { } } -tir::Buffer BufferWithOffsetAlignment(Array shape, DataType dtype, std::string name, - int data_alignment, int offset_factor, bool compact) { - DataType storage_dtype = (dtype == DataType::Bool() ? DataType::Int(8) : dtype); - auto data = tir::Var(name, PointerType(PrimType(storage_dtype))); - bool has_any = false; - if (!compact) { - for (const auto& it : shape) { - if (it.as()) { - has_any = true; - break; - } - } - } - tir::BufferType buffer_type = has_any ? tir::kAutoBroadcast : tir::kDefault; - - PrimExpr elem_offset; - if (offset_factor != 0) { - elem_offset = tir::Var(name + "_elem_offset", shape[0].dtype()); - } else { - elem_offset = PrimExpr(); - } - - return tir::Buffer(data, dtype, shape, Array(), elem_offset, name, data_alignment, - offset_factor, buffer_type); -} - void GetBinds(const Array& args, bool compact, const std::unordered_map& binds, Map* out_binds, Array* out_arg_list) { @@ -118,8 +92,8 @@ void GetBinds(const Array& args, bool compact, if (const te::TensorNode* tensor_node = x.as()) { te::Tensor x_ref = GetRef(tensor_node); if (out_binds->find(x_ref) == out_binds->end()) { - tir::Buffer buf = - BufferWithOffsetAlignment(x_ref->shape, x_ref->dtype, x_ref->op->name, -1, 0, compact); + tir::Buffer buf = tir::BufferWithOffsetAlignment(x_ref->shape, x_ref->dtype, + x_ref->op->name, -1, 0, compact); out_binds->Set(x_ref, buf); out_arg_list->push_back(buf); } else { diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc index e9491b0a89010..08fa18b61e164 100644 --- a/src/relay/backend/te_compiler.cc +++ b/src/relay/backend/te_compiler.cc @@ -414,6 +414,33 @@ class TECompilerImpl : public TECompilerNode { } // lower the function std::unordered_map binds; + + // If we have memory scopes, need to create tir::Buffer knowing this info + size_t i = 0; // for corresponding from tensor array + for (Var param : key->source_func->params) { + if (!param->virtual_device()->memory_scope.empty()) { + for (const auto& ttype : FlattenTupleType(param->checked_type())) { + te::Tensor x_ref = value->cached_func->inputs[i]; + // verification if we have synced params and tensors + ICHECK(ttype->dtype == x_ref->dtype && ttype->shape.size() == x_ref->shape.size()) + << "function parameter does not correspond to prepared tensor"; + binds[x_ref] = + tir::BufferWithOffsetAlignment(x_ref->shape, x_ref->dtype, x_ref->op->name, -1, 0, + false, param->virtual_device()->memory_scope); + } + } + i++; + } + if (key->virtual_device != 
VirtualDevice::FullyUnconstrained() && + !key->virtual_device->memory_scope.empty() && + key->virtual_device->memory_scope != "global") { + ICHECK(value->cached_func->outputs.size() == 1) + << "Expect only one output for defined memory scope"; + te::Tensor x_ref = value->cached_func->outputs[0]; + binds[x_ref] = + tir::BufferWithOffsetAlignment(x_ref->shape, x_ref->dtype, x_ref->op->name, -1, 0, + false, key->virtual_device->memory_scope); + } auto func_name = value->cached_func->prim_fn_var->name_hint; VLOG(1) << "scheduling"; IRModule scheduled_module = @@ -895,7 +922,8 @@ class LowerTensorExprMutator : public DeviceAwareExprMutator { } else { // Cases 1 and 2: lower the primitive function for the desired target, possibly using external // codegen. - CCacheKey key(Downcast(primitive_func), target); + CCacheKey key(Downcast(primitive_func), target, + GetVirtualDevice(GetRef(call_node))); CachedFunc cfunc = compiler_->Lower(key, module_name_); ICHECK(cfunc.defined()); return MakeLoweredCall(primitive_func, cfunc->prim_fn_var, std::move(new_args), diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc index 0f519721b0b55..bfb351f82b783 100644 --- a/src/relay/backend/te_compiler_cache.cc +++ b/src/relay/backend/te_compiler_cache.cc @@ -79,10 +79,11 @@ LoweredOutput::LoweredOutput(tvm::Array outputs, OpImplementation im data_ = std::move(n); } -CCacheKey::CCacheKey(Function source_func, Target target) { +CCacheKey::CCacheKey(Function source_func, Target target, VirtualDevice vd) { auto n = make_object(); n->source_func = std::move(source_func); n->target = std::move(target); + n->virtual_device = std::move(vd); data_ = std::move(n); } diff --git a/src/relay/backend/te_compiler_cache.h b/src/relay/backend/te_compiler_cache.h index 55f221ac8ba02..ac2619826019c 100644 --- a/src/relay/backend/te_compiler_cache.h +++ b/src/relay/backend/te_compiler_cache.h @@ -82,10 +82,13 @@ class CCacheKeyNode : public Object { Function source_func; /*! \brief The hardware target.*/ Target target; + /*! \brief The virtual device constrains.*/ + VirtualDevice virtual_device; void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("source_func", &source_func); v->Visit("target", &target); + v->Visit("virtual_device", &virtual_device); } /*! \return The hash value of CCacheKey. */ inline size_t Hash() const; @@ -117,7 +120,8 @@ class CCacheKey : public ObjectRef { * \param source_func The source function. * \param target The target device. 
*/ - TVM_DLL CCacheKey(Function source_func, Target target); + TVM_DLL CCacheKey(Function source_func, Target target, + VirtualDevice virtual_device = VirtualDevice::FullyUnconstrained()); const CCacheKeyNode* operator->() const { return static_cast(get()); } // comparator @@ -244,6 +248,7 @@ inline size_t CCacheKeyNode::Hash() const { inline bool CCacheKeyNode::Equal(const CCacheKeyNode* other) const { if (Hash() != other->Hash()) return false; return this->target->str() == other->target->str() && + this->virtual_device == other->virtual_device && tvm::StructuralEqual()(this->source_func, other->source_func); } diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc index dffb8b4992851..1ac0f1f1705ee 100644 --- a/src/tir/ir/buffer.cc +++ b/src/tir/ir/buffer.cc @@ -585,6 +585,33 @@ Buffer::Buffer(Var data, DataType dtype, Array shape, Array data_ = std::move(n); } +tir::Buffer BufferWithOffsetAlignment(Array shape, DataType dtype, std::string name, + int data_alignment, int offset_factor, bool compact, + std::string memory_scope) { + DataType storage_dtype = (dtype == DataType::Bool() ? DataType::Int(8) : dtype); + auto data = tir::Var(name, PointerType(PrimType(storage_dtype), memory_scope)); + bool has_any = false; + if (!compact) { + for (const auto& it : shape) { + if (it.as()) { + has_any = true; + break; + } + } + } + tir::BufferType buffer_type = has_any ? tir::kAutoBroadcast : tir::kDefault; + + PrimExpr elem_offset; + if (offset_factor != 0) { + elem_offset = tir::Var(name + "_elem_offset", shape[0].dtype()); + } else { + elem_offset = PrimExpr(); + } + + return tir::Buffer(data, dtype, shape, Array(), elem_offset, name, data_alignment, + offset_factor, buffer_type); +} + TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { auto* op = static_cast(node.get()); From 41c94b27ef5f10ad70af211dd25c4837dad53f64 Mon Sep 17 00:00:00 2001 From: Yuanjing Shi Date: Wed, 29 Jun 2022 16:53:36 -0700 Subject: [PATCH 013/111] [Relay][Pytorch] Add aten::new_ones, aten::new_full, aten::fill_, aten::pad, aten::reshape_as and atem::empty_like (#11896) * add new ops * fix pad * fix pad * remove pad * fix CI * remove doc * fix fill_ * add tests --- python/tvm/relay/frontend/pytorch.py | 55 ++++++++++++++ tests/python/frontend/pytorch/test_forward.py | 75 +++++++++++++++++++ 2 files changed, 130 insertions(+) diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index 9558ad1b6ec02..6fe8c89e3c2df 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -701,6 +701,21 @@ def ones_like(self, inputs, input_types): return out + def new_ones(self, inputs, input_types): + size = inputs[1] + + import torch + + if not isinstance(size, (_expr.Expr, list, tuple, torch.Size, np.ndarray)): + msg = "Data type %s could not be parsed in ones op" % (type(size)) + raise AssertionError(msg) + + if inputs[2] is not None: + dtype = _convert_dtype_value(inputs[2]) + else: + dtype = input_types[0] + return self.full_impl(size, 1, dtype) + def zeros(self, inputs, input_types): data = inputs[0] @@ -765,6 +780,28 @@ def full_like(self, inputs, input_types): return out + def new_full(self, inputs, input_types): + data = inputs[1] + fill_value = inputs[2] + import torch + + if not isinstance(data, (_expr.Expr, list, tuple, torch.Size)): + msg = "Data type %s could not be parsed in full op" % (type(data)) + raise AssertionError(msg) + + if inputs[3] is not None: # dtype given + dtype = 
_convert_dtype_value(inputs[3]) + else: + # if dtype is None, use the dtype of the input tensor + dtype = self.infer_type(input[0]) + + return self.full_impl(data, fill_value, dtype) + + def fill_(self, inputs, input_types): + data = inputs[0] + fill_value = inputs[1] + return self.full_impl(self.infer_shape(data), fill_value, input_types[0]) + def linspace(self, inputs, input_types): start = inputs[0] stop = inputs[1] @@ -1425,6 +1462,11 @@ def reshape(self, inputs, input_types): new_shape = tmp_shape return _op.transform.reshape(data, new_shape) + def reshape_as(self, inputs, input_types): + data = inputs[0] + new_shape = self.infer_shape(inputs[1]) + return _op.transform.reshape(data, new_shape) + def pixel_shuffle(self, inputs, input_types): data = inputs[0] upscale_factor = inputs[1] @@ -2400,6 +2442,14 @@ def empty(self, inputs, input_types): shape = inputs[0] return _op.zeros(shape, _convert_dtype_value(inputs[1])) + def empty_like(self, inputs, input_types): + shape = self.infer_shape(inputs[0]) + if inputs[1] is not None: + dtype = _convert_dtype_value(inputs[1]) + else: + dtype = input_types[0] + return _op.zeros(shape, dtype) + def bincount(self, inputs, input_types): data = inputs[0] weights = inputs[1] @@ -3119,8 +3169,11 @@ def create_convert_map(self): "aten::ones_like": self.ones_like, "aten::zeros": self.zeros, "aten::zeros_like": self.zeros_like, + "aten::new_ones": self.new_ones, "aten::full": self.full, "aten::full_like": self.full_like, + "aten::new_full": self.new_full, + "aten::fill_": self.fill_, "aten::linspace": self.linspace, "aten::reciprocal": self.reciprocal, "aten::repeat": self.repeat, @@ -3186,6 +3239,7 @@ def create_convert_map(self): "aten::size": self.size, "aten::view": self.view, "aten::reshape": self.reshape, + "aten::reshape_as": self.reshape_as, "aten::clone": self.clone, "aten::log_softmax": self.log_softmax, "aten::sigmoid": self.sigmoid, @@ -3305,6 +3359,7 @@ def create_convert_map(self): "aten::tensor": self.identity, # used for example in tensor(1.0) "aten::numel": self.numel, "aten::empty": self.empty, + "aten::empty_like": self.empty_like, "aten::bincount": self.bincount, "aten::scatter_add": self.scatter_add, "aten::__not__": self.logical_not, diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index 1bb4517f0198f..f039a00f5d91d 100644 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -199,6 +199,28 @@ def visit(op): torch.cuda.empty_cache() +def verify_model_with_input(test_func, input_data, input_dict={}): + baseline_outputs = test_func(*input_data) + trace = torch.jit.trace(test_func, [input.clone() for input in input_data]) + input_names = ["input{}".format(idx) for idx, inp in enumerate(input_data)] + input_shapes = list(zip(input_names, [inp.shape for inp in input_data])) + mod, params = relay.frontend.from_pytorch(trace, input_shapes, {}) + with tvm.transform.PassContext(opt_level=3): + for target in ["llvm", "cuda"]: + if not tvm.runtime.enabled(target): + continue + dev = tvm.device(target, 0) + lib = relay.build(mod, target=target, params=params) + relay_model = graph_executor.GraphModule(lib["default"](dev)) + for name, value in input_dict.items(): + relay_model.set_input(name, value) + relay_model.run() + + compiled_output = relay_model.get_output(0).numpy() + assert_shapes_match(baseline_outputs, compiled_output) + tvm.testing.assert_allclose(baseline_outputs, compiled_output, rtol=1e-5, atol=1e-5) + + # Single 
operator tests @tvm.testing.uses_gpu def test_forward_pixel_shuffle(): @@ -1275,6 +1297,16 @@ def forward(self, x): verify_model(Reshape3(), input_data=torch.randn(2, 3, 4)) +@tvm.testing.uses_gpu +def test_forward_reshape_as(): + def test_func(input_tensor, other_tensor): + return input_tensor.reshape_as(other_tensor) + + input_data = [torch.rand([2, 1, 10, 1, 10]), torch.rand([2, 1, 10, 10])] + + verify_model_with_input(test_func, input_data, {"input0": input_data[0]}) + + @tvm.testing.uses_gpu def test_flatten(): def _test_flatten(start_dim, end_dim): @@ -2961,6 +2993,17 @@ def forward(self, *args): verify_model(OnesLike3().float().eval(), input_data=input_data) +@tvm.testing.uses_gpu +def test_forward_new_ones(): + torch.set_grad_enabled(False) + input_shape = [1, 3, 10, 10] + + def test_func(input_tensor): + return input_tensor.new_ones([3, 10, 10]) + + verify_model_with_input(test_func, [torch.rand(input_shape).float()]) + + @tvm.testing.uses_gpu def test_forward_zeros(): torch.set_grad_enabled(False) @@ -3034,6 +3077,24 @@ def forward(self, *args): verify_model(FullLike3().float().eval(), input_data=input_data) +@tvm.testing.uses_gpu +def test_forward_new_full(): + torch.set_grad_enabled(False) + input_shape = [1, 3, 10, 10] + + def test_func(input_tensor): + return input_tensor.new_full([2, 3], 1) + + verify_model_with_input(test_func, [torch.rand(input_shape).float()]) + + +def test_forward_fill_(): + def test_func(x): + return x.fill_(3) + + verify_model_with_input(test_func, [torch.rand([1, 3, 10, 10]).float()]) + + @tvm.testing.uses_gpu def test_forward_linspace(): torch.set_grad_enabled(False) @@ -3752,6 +3813,20 @@ def forward(self, data): verify_script_model(Numel(), [(3, 5, 8)], targets) +def test_empty(): + def test_func(): + return torch.empty([1, 3, 10, 10]) + + verify_model_with_input(test_func, []) + + +def test_empty_like(): + def test_func(data): + return torch.empty_like(data) + + verify_model_with_input(test_func, [torch.rand([1, 3, 10, 10]).float()]) + + def test_forward_pretrained_bert_base_uncased(): ###################################################################### # This is an example how to run BERT models using TVM From 3bd83e0c9f2e4c6cc50fd86d5691449417487a93 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Wed, 29 Jun 2022 17:03:32 -0700 Subject: [PATCH 014/111] [MetaSchedule] Handle 'warp_execution' implied extend of threadIdx.x in VerifyGpuCode (#11949) --- include/tvm/tir/stmt.h | 6 + src/meta_schedule/postproc/verify_gpu_code.cc | 18 +- ..._meta_schedule_postproc_verify_gpu_code.py | 438 ++++++++++++++++-- 3 files changed, 417 insertions(+), 45 deletions(-) diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h index 4c8a3076a20b1..ac35c0b41e0ec 100644 --- a/include/tvm/tir/stmt.h +++ b/include/tvm/tir/stmt.h @@ -1525,6 +1525,12 @@ constexpr const char* meta_schedule_auto_tensorize = "meta_schedule.auto_tensori /*! \brief Mark that a block is a preprocessor block for layout rewrite. */ constexpr const char* meta_schedule_layout_rewrite_preproc = "meta_schedule.layout_rewrite_preproc"; +/*! + * \brief Mark that a block is executed by a warp. This implies the extend of threadIdx.x is + * warp size. + */ +constexpr const char* warp_execution = "warp_execution"; + /*! 
* \brief Check if attr_key is a pragma key extension * \param attr_key The attr key to be compared diff --git a/src/meta_schedule/postproc/verify_gpu_code.cc b/src/meta_schedule/postproc/verify_gpu_code.cc index 674359803880b..57e58e6a79ffb 100644 --- a/src/meta_schedule/postproc/verify_gpu_code.cc +++ b/src/meta_schedule/postproc/verify_gpu_code.cc @@ -25,9 +25,11 @@ namespace tir { class ThreadExtentChecker : private StmtVisitor { public: - static bool Check(const Stmt& stmt) { + static bool Check(const Stmt& stmt, int thread_warp_size) { try { - ThreadExtentChecker().VisitStmt(stmt); + ICHECK(thread_warp_size > 0); + ThreadExtentChecker checker(thread_warp_size); + checker.VisitStmt(stmt); return true; } catch (const dmlc::Error& e) { return false; @@ -35,6 +37,8 @@ class ThreadExtentChecker : private StmtVisitor { } private: + explicit ThreadExtentChecker(int thread_warp_size) : thread_warp_size_(thread_warp_size) {} + void VisitStmt_(const ForNode* loop) { runtime::ThreadScope thread_scope = GetThreadScope(loop); if (IsThreadIdx(thread_scope)) { @@ -64,6 +68,10 @@ class ThreadExtentChecker : private StmtVisitor { } void VisitStmt_(const BlockNode* block) { + int old_thread_idx_x = thread_idx_x; + if (block->annotations.count(attr::warp_execution)) { + thread_idx_x = thread_warp_size_; + } if (Optional low_inclusive = GetAnn(block, attr::meta_schedule_thread_extent_low_inclusive)) { if (Optional high_inclusive = @@ -77,11 +85,13 @@ class ThreadExtentChecker : private StmtVisitor { } } StmtVisitor::VisitStmt_(block); + thread_idx_x = old_thread_idx_x; } int64_t thread_idx_x = 1; int64_t thread_idx_y = 1; int64_t thread_idx_z = 1; + int thread_warp_size_ = -1; }; } // namespace tir @@ -104,6 +114,7 @@ Integer Extract(const Target& target, const char* name) { class VerifyGPUCodeNode : public PostprocNode { public: Map target_constraints_{nullptr}; + int thread_warp_size_ = -1; void InitializeWithTuneContext(const TuneContext& context) final { ICHECK(context->target.defined()); @@ -114,6 +125,7 @@ class VerifyGPUCodeNode : public PostprocNode { {"max_vthread", Integer(8)}, {"max_vector_bytes", Integer(16)}, }; + thread_warp_size_ = Extract(target, "thread_warp_size"); } bool Verify(const IRModule& mod) const { @@ -133,7 +145,7 @@ class VerifyGPUCodeNode : public PostprocNode { const GlobalVar& g_var = kv.first; const BaseFunc& base_func = kv.second; if (const auto* prim_func = base_func.as()) { - if (!tir::ThreadExtentChecker::Check(prim_func->body)) { + if (!tir::ThreadExtentChecker::Check(prim_func->body, thread_warp_size_)) { return false; } IRModule lowered{nullptr}; diff --git a/tests/python/unittest/test_meta_schedule_postproc_verify_gpu_code.py b/tests/python/unittest/test_meta_schedule_postproc_verify_gpu_code.py index aacb889cb5771..0b1e0f402b9d3 100644 --- a/tests/python/unittest/test_meta_schedule_postproc_verify_gpu_code.py +++ b/tests/python/unittest/test_meta_schedule_postproc_verify_gpu_code.py @@ -393,58 +393,412 @@ def GmmCuda2(X: T.Buffer[(1, 128, 128), "float32"], Y: T.Buffer[(1, 128, 128), " T.writes(Z[v0, v1, v2]) Z[v0, v1, v2] = Z_local[v0, v1, v2] -# fmt: on -# pylint: enable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument,not-callable,misplaced-comparison-constant - - -def test_postproc_verify_gpu_0(): - mod = Conv2dCuda0 - ctx = _create_context(mod, target=_target()) - sch = tir.Schedule(mod, debug_mask="all") - assert ctx.postprocs[0].apply(sch) - -def test_postproc_verify_gpu_1(): - mod = Conv2dCuda1 - ctx = _create_context(mod, 
target=_target()) - sch = tir.Schedule(mod, debug_mask="all") - assert ctx.postprocs[0].apply(sch) - - -def test_postproc_verify_gpu_2(): - mod = Conv2dCuda2 - ctx = _create_context(mod, target=_target()) - sch = tir.Schedule(mod, debug_mask="all") - # Should fail due to too much local memory per block (large - # Apad_shared allocation). - assert not ctx.postprocs[0].apply(sch) +@T.prim_func +def GMMCUDATensorCore( + X: T.Buffer[(1024, 1024), "float16"], + Y: T.Buffer[(1024, 1024), "float16"], + Z: T.Buffer[(1024, 1024), "float32"], +) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + s0 = T.var("int32") + s0_1 = T.var("int32") + s0_2 = T.var("int32") + s1 = T.var("int32") + s1_1 = T.var("int32") + s1_2 = T.var("int32") + # body + # with T.block("root") + Z_wmma_accumulator = T.alloc_buffer([1024, 1024], dtype="float32", scope="wmma.accumulator") + X_shared = T.alloc_buffer([1024, 1024], dtype="float16", scope="shared") + Y_shared = T.alloc_buffer([1024, 1024], dtype="float16", scope="shared") + X_shared_wmma_matrix_a = T.alloc_buffer([1024, 1024], dtype="float16", scope="wmma.matrix_a") + Y_shared_wmma_matrix_b = T.alloc_buffer([1024, 1024], dtype="float16", scope="wmma.matrix_b") + for ax0_0_ax1_0_0_ax2_0_0_fused in T.thread_binding(64, thread="blockIdx.x"): + for ax0_1_ax1_0_1_ax2_0_1_fused in T.thread_binding(2, thread="blockIdx.y"): + for ax0_2_ax1_0_2_ax2_0_2_fused in T.thread_binding(2, thread="threadIdx.y"): + for ax1_0_3_init, ax2_0_3_init, ax1_0_4_init, ax2_0_4_init in T.grid(2, 1, 2, 4): + with T.block("Z_o_init"): + v0 = T.axis.spatial(1, 0) + v1_o = T.axis.spatial( + 64, + ax0_0_ax1_0_0_ax2_0_0_fused % 64 // 16 * 16 + + ax0_1_ax1_0_1_ax2_0_1_fused % 2 * 8 + + ax0_2_ax1_0_2_ax2_0_2_fused % 2 * 4 + + ax1_0_3_init * 2 + + ax1_0_4_init, + ) + v2_o = T.axis.spatial( + 64, + (ax0_0_ax1_0_0_ax2_0_0_fused % 16 + 0 + 0 + ax2_0_3_init) * 4 + + ax2_0_4_init, + ) + T.reads() + T.writes( + Z_wmma_accumulator[ + v1_o * 16 : v1_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16 + ] + ) + T.block_attr( + { + "meta_schedule.thread_extent_high_inclusive": 1024, + "meta_schedule.thread_extent_low_inclusive": 32, + "warp_execution": 1, + } + ) + C = T.match_buffer( + Z_wmma_accumulator[ + v1_o * 16 : v1_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16 + ], + [16, 16], + dtype="float32", + scope="wmma.accumulator", + offset_factor=16, + ) + T.evaluate( + T.tvm_fill_fragment( + C.data, + 16, + 16, + 16, + C.elem_offset // 256 + C.elem_offset % 256 // 16, + T.float32(0), + dtype="handle", + ) + ) + for ax3_0_0 in T.serial(32): + for ax0_ax1_fused_0 in T.serial(16): + for ax0_ax1_fused_1 in T.thread_binding(2, thread="threadIdx.y"): + for ax0_ax1_fused_2 in T.thread_binding(32, thread="threadIdx.x"): + for ax0_ax1_fused_3 in T.vectorized(4): + with T.block("X_shared"): + v0 = T.axis.spatial( + 1024, + ax0_0_ax1_0_0_ax2_0_0_fused // 16 * 256 + + ax0_1_ax1_0_1_ax2_0_1_fused * 128 + + ( + ax0_ax1_fused_0 * 256 + + ax0_ax1_fused_1 * 128 + + ax0_ax1_fused_2 * 4 + + ax0_ax1_fused_3 + ) + // 32, + ) + v1 = T.axis.spatial( + 1024, + ax3_0_0 * 32 + + ( + ax0_ax1_fused_0 * 256 + + ax0_ax1_fused_1 * 128 + + ax0_ax1_fused_2 * 4 + + ax0_ax1_fused_3 + ) + % 32, + ) + T.reads(X[v0, v1]) + T.writes(X_shared[v0, v1]) + T.block_attr({"buffer_dim_align": [[0, 0, 32, 8]]}) + X_shared[v0, v1] = X[v0, v1] + for ax0_ax1_fused_0 in T.serial(8): + for ax0_ax1_fused_1 in T.thread_binding(2, thread="threadIdx.y"): + for ax0_ax1_fused_2 in T.thread_binding(32, thread="threadIdx.x"): + for 
ax0_ax1_fused_3 in T.vectorized(4): + with T.block("Y_shared"): + v0 = T.axis.spatial( + 1024, + ax3_0_0 * 32 + + ( + ax0_ax1_fused_0 * 256 + + ax0_ax1_fused_1 * 128 + + ax0_ax1_fused_2 * 4 + + ax0_ax1_fused_3 + ) + // 64, + ) + v1 = T.axis.spatial( + 1024, + ax0_0_ax1_0_0_ax2_0_0_fused % 16 * 64 + + ( + ax0_ax1_fused_0 * 256 + + ax0_ax1_fused_1 * 128 + + ax0_ax1_fused_2 * 4 + + ax0_ax1_fused_3 + ) + % 64, + ) + T.reads(Y[v0, v1]) + T.writes(Y_shared[v0, v1]) + T.block_attr({"buffer_dim_align": [[0, 0, 32, 8]]}) + Y_shared[v0, v1] = Y[v0, v1] + for ax3_0_1 in T.serial(2): + for ax0_0, ax1_0 in T.grid(4, 1): + with T.block("X_shared_wmma.matrix_a_o"): + v0_o = T.axis.spatial( + 64, + ax0_0_ax1_0_0_ax2_0_0_fused // 16 * 16 + + ax0_1_ax1_0_1_ax2_0_1_fused * 8 + + ax0_2_ax1_0_2_ax2_0_2_fused * 4 + + ax0_0, + ) + v1_o = T.axis.spatial(64, ax3_0_0 * 2 + ax3_0_1) + T.reads( + X_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16] + ) + T.writes( + X_shared_wmma_matrix_a[ + v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16 + ] + ) + A = T.match_buffer( + X_shared[ + v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16 + ], + [16, 16], + dtype="float16", + strides=[s1, s0], + scope="shared", + offset_factor=16, + ) + C_1 = T.match_buffer( + X_shared_wmma_matrix_a[ + v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16 + ], + [16, 16], + dtype="float16", + scope="wmma.matrix_a", + offset_factor=16, + ) + T.evaluate( + T.tvm_load_matrix_sync( + C_1.data, + 16, + 16, + 16, + C_1.elem_offset // 256 + C_1.elem_offset % 256 // 16, + T.tvm_access_ptr( + T.type_annotation(dtype="float16"), + A.data, + A.elem_offset, + s1 * 16, + 1, + dtype="handle", + ), + s1, + "row_major", + dtype="handle", + ) + ) + for ax0_0, ax1_0 in T.grid(1, 4): + with T.block("Y_shared_wmma.matrix_b_o"): + v0_o = T.axis.spatial(64, ax3_0_0 * 2 + ax3_0_1) + v1_o = T.axis.spatial( + 64, ax0_0_ax1_0_0_ax2_0_0_fused % 16 * 4 + ax1_0 + ) + T.reads( + Y_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16] + ) + T.writes( + Y_shared_wmma_matrix_b[ + v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16 + ] + ) + A_1 = T.match_buffer( + Y_shared[ + v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16 + ], + [16, 16], + dtype="float16", + strides=[s1_1, s0_1], + scope="shared", + offset_factor=16, + ) + C_2 = T.match_buffer( + Y_shared_wmma_matrix_b[ + v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16 + ], + [16, 16], + dtype="float16", + scope="wmma.matrix_b", + offset_factor=16, + ) + T.evaluate( + T.tvm_load_matrix_sync( + C_2.data, + 16, + 16, + 16, + C_2.elem_offset // 256 + C_2.elem_offset % 256 // 16, + T.tvm_access_ptr( + T.type_annotation(dtype="float16"), + A_1.data, + A_1.elem_offset, + s1_1 * 16, + 1, + dtype="handle", + ), + s1_1, + "row_major", + dtype="handle", + ) + ) + for ax0_3, ax1_0_3, ax2_0_3, ax3_0_2, ax0_4, ax1_0_4, ax2_0_4 in T.grid( + 1, 2, 1, 1, 1, 2, 4 + ): + with T.block("Z_o_update"): + v0 = T.axis.spatial(1, 0) + v1_o = T.axis.spatial( + 64, + ax0_0_ax1_0_0_ax2_0_0_fused % 64 // 16 * 16 + + ax0_1_ax1_0_1_ax2_0_1_fused % 2 * 8 + + ax0_2_ax1_0_2_ax2_0_2_fused % 2 * 4 + + ax1_0_3 * 2 + + ax1_0_4, + ) + v2_o = T.axis.spatial( + 64, + (ax0_0_ax1_0_0_ax2_0_0_fused % 16 + 0 + 0 + ax2_0_3) * 4 + + ax2_0_4, + ) + v3_o = T.axis.reduce(64, ax3_0_0 * 2 + ax3_0_1 + ax3_0_2) + T.reads( + Z_wmma_accumulator[ + v1_o * 16 : v1_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16 + ], + X_shared_wmma_matrix_a[ + v1_o * 16 : v1_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16 + ], + Y_shared_wmma_matrix_b[ + v3_o * 
16 : v3_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16 + ], + ) + T.writes( + Z_wmma_accumulator[ + v1_o * 16 : v1_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16 + ] + ) + T.block_attr( + { + "meta_schedule.thread_extent_high_inclusive": 1024, + "meta_schedule.thread_extent_low_inclusive": 32, + "warp_execution": 1, + } + ) + A_2 = T.match_buffer( + X_shared_wmma_matrix_a[ + v1_o * 16 : v1_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16 + ], + [16, 16], + dtype="float16", + scope="wmma.matrix_a", + offset_factor=16, + ) + B = T.match_buffer( + Y_shared_wmma_matrix_b[ + v3_o * 16 : v3_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16 + ], + [16, 16], + dtype="float16", + scope="wmma.matrix_b", + offset_factor=16, + ) + C_3 = T.match_buffer( + Z_wmma_accumulator[ + v1_o * 16 : v1_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16 + ], + [16, 16], + dtype="float32", + scope="wmma.accumulator", + offset_factor=16, + ) + T.evaluate( + T.tvm_mma_sync( + C_3.data, + C_3.elem_offset // 256 + C_3.elem_offset % 256 // 16, + A_2.data, + A_2.elem_offset // 256, + B.data, + B.elem_offset // 256, + C_3.data, + C_3.elem_offset // 256 + C_3.elem_offset % 256 // 16, + dtype="handle", + ) + ) + for ax0_0, ax1_0 in T.grid(4, 4): + with T.block("Z_wmma.accumulator_o"): + v0_o = T.axis.spatial( + 64, + ax0_0_ax1_0_0_ax2_0_0_fused // 16 * 16 + + ax0_1_ax1_0_1_ax2_0_1_fused * 8 + + ax0_2_ax1_0_2_ax2_0_2_fused * 4 + + ax0_0, + ) + v1_o = T.axis.spatial(64, ax0_0_ax1_0_0_ax2_0_0_fused % 16 * 4 + ax1_0) + T.reads( + Z_wmma_accumulator[ + v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16 + ] + ) + T.writes(Z[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + A_3 = T.match_buffer( + Z_wmma_accumulator[ + v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16 + ], + [16, 16], + dtype="float32", + scope="wmma.accumulator", + offset_factor=16, + ) + C_4 = T.match_buffer( + Z[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16], + [16, 16], + dtype="float32", + strides=[s1_2, s0_2], + offset_factor=16, + ) + T.evaluate( + T.tvm_store_matrix_sync( + A_3.data, + 16, + 16, + 16, + A_3.elem_offset // 256 + A_3.elem_offset % 256 // 16, + T.tvm_access_ptr( + T.type_annotation(dtype="float32"), + C_4.data, + C_4.elem_offset, + s1_2 * 16, + 2, + dtype="handle", + ), + s1_2, + "row_major", + dtype="handle", + ) + ) -def test_postproc_verify_gpu_3(): - mod = Conv2dCuda3 - ctx = _create_context(mod, target=_target()) - sch = tir.Schedule(mod, debug_mask="all") - # Should fail due to too many threads per block (large - # threadIdx.x extent). 
- assert not ctx.postprocs[0].apply(sch) +# fmt: on +# pylint: enable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument,not-callable,misplaced-comparison-constant -def test_postproc_verify_gpu_4(): - mod = GmmCuda0 +@pytest.mark.parametrize("mod", [Conv2dCuda0, Conv2dCuda1, GmmCuda0, GMMCUDATensorCore]) +def test_postproc_check_pass(mod): ctx = _create_context(mod, target=_target()) sch = tir.Schedule(mod, debug_mask="all") assert ctx.postprocs[0].apply(sch) -def test_postproc_verify_gpu_5(): - mod = GmmCuda1 - ctx = _create_context(mod, target=_target()) - sch = tir.Schedule(mod, debug_mask="all") - assert not ctx.postprocs[0].apply(sch) - - -def test_postproc_verify_gpu_6(): - mod = GmmCuda2 +@pytest.mark.parametrize( + "mod", + [ + Conv2dCuda2, # Should fail due to too much local memory per block (large Apad_shared allocation) + Conv2dCuda3, # Should fail due to too many threads per block (large threadIdx.x extent) + GmmCuda1, + GmmCuda2, + ], +) +def test_postproc_check_fail(mod): ctx = _create_context(mod, target=_target()) sch = tir.Schedule(mod, debug_mask="all") assert not ctx.postprocs[0].apply(sch) From 898946fec60898b8fa753d6f0cdf8ebc86c9827a Mon Sep 17 00:00:00 2001 From: Altan Haan <3124994+altanh@users.noreply.github.com> Date: Wed, 29 Jun 2022 17:43:48 -0700 Subject: [PATCH 015/111] support any shape and axis for log softmax (#11951) --- python/tvm/topi/nn/softmax.py | 42 ++++++++--- python/tvm/topi/testing/softmax_python.py | 28 +++---- python/tvm/topi/x86/nn.py | 2 +- tests/python/relay/test_op_level1.py | 74 ++++++++++--------- tests/python/topi/python/test_topi_softmax.py | 2 +- 5 files changed, 84 insertions(+), 64 deletions(-) diff --git a/python/tvm/topi/nn/softmax.py b/python/tvm/topi/nn/softmax.py index cb6d5b321eacb..2d6921b26dfad 100644 --- a/python/tvm/topi/nn/softmax.py +++ b/python/tvm/topi/nn/softmax.py @@ -136,16 +136,38 @@ def log_softmax(x, axis=-1): output : tvm.te.Tensor 2-D output with same shape """ - assert len(x.shape) == 2, "only support 2-dim log softmax" - # pylint: disable=R1714 - assert axis == -1 or axis == len(x.shape) - 1, "only support last axis log softmax" - m, n = x.shape - k = te.reduce_axis((0, n), name="k") - max_elem = te.compute((m,), lambda i: tvm.te.max(x[i, k], axis=k)) - k = te.reduce_axis((0, n), name="k") - expsum = te.compute((m,), lambda i: te.sum(te.exp(x[i, k] - max_elem[i]), axis=k)) + shape = x.shape + if axis < 0: + axis = len(shape) + axis + if axis >= len(shape): + ValueError("axis parameter should be less than input dim") + + k1 = te.reduce_axis((0, shape[axis]), name="k") + k2 = te.reduce_axis((0, shape[axis]), name="k") + + def insert_reduce_index(indices, reduce_index): + return indices[:axis] + (reduce_index,) + indices[axis:] + + def get_non_reduce_indices(indices): + return tuple([var for (i, var) in enumerate(indices) if i != axis]) + + def _compute_max(*indices): + eval_range = insert_reduce_index(indices, k1) + return tvm.te.max(x[eval_range], axis=k1) + + def _compute_expsum(max_elem, *indices): + eval_range = insert_reduce_index(indices, k2) + return te.sum(te.exp(x[eval_range] - max_elem[indices]), axis=k2) + + def _normalize(max_elem, expsum, *indices): + non_reduce_indices = get_non_reduce_indices(indices) + return x[indices] - max_elem[non_reduce_indices] - te.log(expsum[non_reduce_indices]) + + reduced_shape = tuple([dim for (i, dim) in enumerate(shape) if i != axis]) + max_elem = te.compute(reduced_shape, _compute_max, name="T_softmax_maxelem") + expsum = 
te.compute(reduced_shape, lambda *indices: _compute_expsum(max_elem, *indices)) return te.compute( - x.shape, - lambda i, j: x[i, j] - max_elem[i] - te.log(expsum[i]), + shape, + lambda *indices: _normalize(max_elem, expsum, *indices), attrs={"axis": axis}, ) diff --git a/python/tvm/topi/testing/softmax_python.py b/python/tvm/topi/testing/softmax_python.py index da2893d1fa7b8..6be5d48a671a0 100644 --- a/python/tvm/topi/testing/softmax_python.py +++ b/python/tvm/topi/testing/softmax_python.py @@ -19,43 +19,39 @@ import numpy as np -def softmax_python(a_np): +def softmax_python(a_np, axis=1): """Softmax operator. Parameters ---------- a_np : numpy.ndarray - 2-D input data + N-D input data Returns ------- output_np : numpy.ndarray - 2-D output with same shape + N-D output with same shape """ - assert len(a_np.shape) == 2, "only support 2-dim softmax" - max_elem = np.amax(a_np, axis=1) - max_elem = max_elem.reshape(max_elem.shape[0], 1) + max_elem = np.amax(a_np, axis=axis, keepdims=True) e = np.exp(a_np - max_elem) - expsum = np.sum(e, axis=1) - out_np = e / expsum[:, None] + expsum = np.sum(e, axis=axis, keepdims=True) + out_np = e / expsum return out_np -def log_softmax_python(a_np): +def log_softmax_python(a_np, axis=1): """Log_softmax operator. Parameters ---------- a_np : numpy.ndarray - 2-D input data + N-D input data Returns ------- output_np : numpy.ndarray - 2-D output with same shape + N-D output with same shape """ - assert len(a_np.shape) == 2, "only support 2-dim log_softmax" - max_elem = np.amax(a_np, axis=1) - max_elem = max_elem.reshape(max_elem.shape[0], 1) + max_elem = np.amax(a_np, axis=axis, keepdims=True) e = np.exp(a_np - max_elem) - expsum = np.sum(e, axis=1) - out_np = a_np - max_elem - np.log(expsum[:, None]) + expsum = np.sum(e, axis=axis, keepdims=True) + out_np = a_np - max_elem - np.log(expsum) return out_np diff --git a/python/tvm/topi/x86/nn.py b/python/tvm/topi/x86/nn.py index 9b6754c5e8472..5475fc772e77c 100644 --- a/python/tvm/topi/x86/nn.py +++ b/python/tvm/topi/x86/nn.py @@ -39,7 +39,7 @@ def _schedule_softmax(softmax_op, s, outs): delta = None max_elem = softmax_op.input_tensors[1] expsum = softmax_op.input_tensors[2] - axis = 1 + axis = int(softmax_op.attrs["axis"]) else: raise ValueError( "Tag is expected to be softmax_output or log_softmax_output. 
\ diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 44df40d3b0bd4..4ce422ae8893e 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -249,46 +249,48 @@ def test_expand_dims_infer_type(): @tvm.testing.uses_gpu def test_softmax(): - for dtype in ["float16", "float32"]: - # Softmax accuracy for float16 is poor - if dtype == "float16": - return - shape = (10, 4) - x = relay.var("x", shape=shape, dtype=dtype) - y = relay.nn.softmax(x, axis=1) - assert "nn.softmax" in y.astext() - yy = run_infer_type(y) - assert yy.checked_type == relay.TensorType(shape, dtype) - func = relay.Function([x], y) - x_data = np.random.uniform(size=shape).astype(dtype) - ref_res = tvm.topi.testing.softmax_python(x_data) - for target, dev in tvm.testing.enabled_targets(): - op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)( - x_data - ) - np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + for shape in [(10, 4), (10, 5, 4)]: + for dtype in ["float16", "float32"]: + # Softmax accuracy for float16 is poor + if dtype == "float16": + continue + x = relay.var("x", shape=shape, dtype=dtype) + y = relay.nn.softmax(x, axis=1) + assert "nn.softmax" in y.astext() + yy = run_infer_type(y) + assert yy.checked_type == relay.TensorType(shape, dtype) + func = relay.Function([x], y) + x_data = np.random.uniform(size=shape).astype(dtype) + ref_res = tvm.topi.testing.softmax_python(x_data, axis=1) + for target, dev in tvm.testing.enabled_targets(): + op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)( + x_data + ) + np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) @tvm.testing.uses_gpu def test_log_softmax(): - for dtype in ["float16", "float32"]: - # Softmax accuracy for float16 is poor - if dtype == "float16": - return - shape = (10, 4) - x = relay.var("x", shape=shape, dtype=dtype) - y = relay.nn.log_softmax(x, axis=1) - assert "nn.log_softmax" in y.astext() - yy = run_infer_type(y) - assert yy.checked_type == relay.TensorType(shape, dtype) - func = relay.Function([x], y) - x_data = np.random.uniform(size=shape).astype(dtype) - ref_res = tvm.topi.testing.log_softmax_python(x_data) - for target, dev in tvm.testing.enabled_targets(): - op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)( - x_data - ) - np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) + for shape in [(10, 4), (10, 5, 4)]: + for dtype in ["float16", "float32"]: + # Softmax accuracy for float16 is poor + if dtype == "float16": + continue + x = relay.var("x", shape=shape, dtype=dtype) + y = relay.nn.log_softmax(x, axis=1) + assert "nn.log_softmax" in y.astext() + yy = run_infer_type(y) + assert yy.checked_type == relay.TensorType(shape, dtype) + func = relay.Function([x], y) + x_data = np.random.uniform(size=shape).astype(dtype) + ref_res = tvm.topi.testing.log_softmax_python(x_data, axis=1) + for target, dev in tvm.testing.enabled_targets(): + if target == "nvptx": + continue + op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)( + x_data + ) + np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=1e-5) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_softmax.py b/tests/python/topi/python/test_topi_softmax.py index 8243211a86746..8e5e039b1448d 100644 --- a/tests/python/topi/python/test_topi_softmax.py +++ b/tests/python/topi/python/test_topi_softmax.py @@ -50,7 +50,7 @@ "log_softmax": { "topi": 
topi.nn.log_softmax, "ref": tvm.topi.testing.log_softmax_python, - "dimensions": [2], + "dimensions": [2, 3], "axis": [1], }, } From 558ba99c7cad6fa5f01cfdb2bd6bdd2cec6087db Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Wed, 29 Jun 2022 21:11:41 -0700 Subject: [PATCH 016/111] [MetaSchedule] Tuning Script Upgrade (#11797) * Support uint8. * Modify tuning functions. * Follow legacy setting, use int32 for uint8. * Add vm support. * Fix vm usage. * Use vm in rpc run module. * Fix lint & stuff. * Fix backend. * Fix ftimer. * Fix lint. * Limit backend choice. * Add try catch. * Display name in rpc try catch. * Support ahb from tune_relay. * Modify scripts. * Fix typo. * Minor fix. * Fix try catch & func name. * Fix utils. * Move utils to tune_utils. * Fix tune_utils. --- .../tvm/auto_scheduler/testing/tune_onnx.py | 150 ++++++-------- .../tvm/auto_scheduler/testing/tune_relay.py | 145 +++++-------- python/tvm/auto_scheduler/testing/tune_te.py | 97 +++++---- .../meta_schedule/cost_model/cost_model.py | 2 +- .../testing/custom_builder_runner.py | 14 +- python/tvm/meta_schedule/testing/tune_onnx.py | 86 +++----- .../tvm/meta_schedule/testing/tune_relay.py | 84 +++----- python/tvm/meta_schedule/testing/tune_te.py | 16 +- .../tvm/meta_schedule/testing/tune_utils.py | 194 ++++++++++++++++++ python/tvm/meta_schedule/testing/utils.py | 3 +- python/tvm/meta_schedule/tune.py | 20 +- 11 files changed, 448 insertions(+), 363 deletions(-) create mode 100644 python/tvm/meta_schedule/testing/tune_utils.py diff --git a/python/tvm/auto_scheduler/testing/tune_onnx.py b/python/tvm/auto_scheduler/testing/tune_onnx.py index 5444794cf1aa3..a3299c05bb821 100644 --- a/python/tvm/auto_scheduler/testing/tune_onnx.py +++ b/python/tvm/auto_scheduler/testing/tune_onnx.py @@ -15,18 +15,18 @@ # specific language governing permissions and limitations # under the License. 
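All of the tuning scripts touched by this patch share the same strtobool-based pattern for
boolean command-line flags. A minimal, self-contained sketch of that pattern (the flag name
mirrors --cpu-flush below; the argument list passed to parse_args is illustrative):

import argparse
from distutils.util import strtobool

parser = argparse.ArgumentParser()
parser.add_argument(
    "--cpu-flush",
    type=lambda x: bool(strtobool(x)),  # accepts "True"/"False", "yes"/"no", "1"/"0", ...
    help="example: True / False",
    required=True,
)
args = parser.parse_args(["--cpu-flush", "True"])  # illustrative input
assert args.cpu_flush is True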
# pylint: disable=missing-docstring +from distutils.util import strtobool import argparse import json import os - -from distutils.util import strtobool -import numpy as np # type: ignore import onnx # type: ignore + import tvm from tvm import auto_scheduler from tvm import meta_schedule as ms from tvm import relay from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc +from tvm.meta_schedule.testing.tune_utils import generate_input_data, create_timer from tvm.meta_schedule.utils import cpu_count from tvm.relay.frontend import from_onnx from tvm.support import describe @@ -96,17 +96,23 @@ def _parse_args(): default=100, ) args.add_argument( - "--cpu-flush", + "--adaptive-training", type=lambda x: bool(strtobool(x)), - required=True, help="example: True / False", + default=True, ) args.add_argument( - "--adaptive-training", + "--cpu-flush", type=lambda x: bool(strtobool(x)), - required=False, help="example: True / False", - default=True, + required=True, + ) + args.add_argument( + "--backend", + type=str, + choices=["graph", "vm"], + help="example: graph / vm", + required=True, ) parsed = args.parse_args() parsed.target = tvm.target.Target(parsed.target) @@ -135,6 +141,7 @@ def main(): repeat=ARGS.repeat, min_repeat_ms=ARGS.min_repeat_ms, enable_cpu_cache_flush=ARGS.cpu_flush, + timeout=ARGS.rpc_config.session_timeout_sec, ) if ARGS.target.kind.name == "llvm": @@ -163,102 +170,63 @@ def main(): onnx_model = onnx.load(ARGS.onnx_path) shape_dict = {} for item in ARGS.input_shape: - print(f" input_name: {item['name']}") + print(f" input_name : {item['name']}") print(f" input_shape: {item['shape']}") print(f" input_dtype: {item['dtype']}") shape_dict[item["name"]] = item["shape"] mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True) - tasks, task_weights = auto_scheduler.extract_tasks( - mod["main"], - params, - target=ARGS.target, - hardware_params=hardware_params, - ) - for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)): - print(f"==== Task {idx}: {task.desc} (weight {task_weight} key: {task.workload_key}) =====") - print(task.compute_dag) - - tuner = auto_scheduler.TaskScheduler(tasks, task_weights) - tuner.tune( - auto_scheduler.TuningOptions( - num_measure_trials=ARGS.num_trials, - runner=runner, - measure_callbacks=[ - auto_scheduler.RecordToFile(log_file), - ], - ), - adaptive_training=ARGS.adaptive_training, - ) - - with auto_scheduler.ApplyHistoryBest(log_file): - with tvm.transform.PassContext( - opt_level=3, - config={"relay.backend.use_auto_scheduler": True}, - ): - lib = relay.build( - mod, - target=ARGS.target, - params=params, + input_data = { + item["name"]: generate_input_data(item["shape"], item["dtype"]) for item in ARGS.input_shape + } + + with ms.Profiler() as profiler: + tasks, task_weights = auto_scheduler.extract_tasks( + mod["main"], + params, + target=ARGS.target, + hardware_params=hardware_params, + ) + for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)): + print( + f"==== Task {idx}: {task.desc} " + f"(weight {task_weight} key: {task.workload_key}) =====" ) - graph, rt_mod, params = lib.graph_json, lib.lib, lib.params - input_data = {} - for item in ARGS.input_shape: - input_name, input_shape, input_dtype = item["name"], item["shape"], item["dtype"] - if input_dtype.startswith("float"): - input_data[input_name] = np.random.uniform(size=input_shape).astype(input_dtype) - else: - input_data[input_name] = np.random.randint( - low=0, high=10000, size=input_shape, dtype=input_dtype + 
print(task.compute_dag) + + if ARGS.num_trials > 0: + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) + tuner.tune( + auto_scheduler.TuningOptions( + num_measure_trials=ARGS.num_trials, + runner=runner, + measure_callbacks=[ + auto_scheduler.RecordToFile(log_file), + ], + ), + adaptive_training=ARGS.adaptive_training, ) - def f_timer(rt_mod, dev, input_data): - # pylint: disable=import-outside-toplevel - from tvm.contrib.graph_executor import GraphModule - - # pylint: enable=import-outside-toplevel - - mod = GraphModule(rt_mod["default"](dev)) - for input_name, input_value in input_data.items(): - mod.set_input(input_name, input_value) - ftimer = mod.module.time_evaluator( - "run", - dev, - min_repeat_ms=500, - repeat=3, - ) - results = list(np.array(ftimer().results) * 1000.0) # type: ignore - print("Running time in time_evaluator: ", results) + relay_build = {"graph": relay.build, "vm": relay.vm.compile}[ARGS.backend] + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext( + opt_level=3, + config={"relay.backend.use_auto_scheduler": True}, + ): + lib = relay_build( + mod, + target=ARGS.target, + params=params, + ) + print("Tuning Time:") + print(profiler.table()) run_module_via_rpc( rpc_config=ARGS.rpc_config, lib=lib, dev_type=ARGS.target.kind.name, args=input_data, - continuation=f_timer, - ) - - def f_per_layer(rt_mod, dev, input_data): - # pylint: disable=import-outside-toplevel - from tvm.contrib.debugger.debug_executor import create - - # pylint: enable=import-outside-toplevel - mod = create(graph, rt_mod, dev) - for input_name, input_value in input_data.items(): - mod.set_input(input_name, input_value) - graph_nodes = [n["name"] for n in json.loads(graph)["nodes"]] - graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000) - print("|graph_nodes| = ", len(graph_nodes)) - print("|graph_time| = ", len(graph_time)) - graph_nodes_time = {k: float(v) for k, v in zip(graph_nodes, graph_time)} - for k, v in graph_nodes_time.items(): - print(f"{k} : {v:.3f}") - - run_module_via_rpc( - rpc_config=ARGS.rpc_config, - lib=rt_mod, - dev_type=ARGS.target.kind.name, - args=input_data, - continuation=f_per_layer, + continuation=create_timer(ARGS.backend), + backend=ARGS.backend, ) diff --git a/python/tvm/auto_scheduler/testing/tune_relay.py b/python/tvm/auto_scheduler/testing/tune_relay.py index fedb27281a44e..fe747af7972cb 100644 --- a/python/tvm/auto_scheduler/testing/tune_relay.py +++ b/python/tvm/auto_scheduler/testing/tune_relay.py @@ -15,18 +15,18 @@ # specific language governing permissions and limitations # under the License. 
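The graph/vm backend dispatch introduced above can be exercised in isolation. A minimal
sketch, assuming an llvm target and a toy softmax module (both are illustrative, not part
of this patch):

import tvm
from tvm import relay

x = relay.var("x", shape=(1, 4), dtype="float32")
mod = tvm.IRModule.from_expr(relay.Function([x], relay.nn.softmax(x)))

backend = "graph"  # or "vm"
relay_build = {"graph": relay.build, "vm": relay.vm.compile}[backend]
with tvm.transform.PassContext(opt_level=3):
    lib = relay_build(mod, target="llvm")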
# pylint: disable=missing-docstring +from distutils.util import strtobool import argparse import json import os -from distutils.util import strtobool -import numpy as np # type: ignore import tvm from tvm import auto_scheduler from tvm import meta_schedule as ms from tvm import relay from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc from tvm.meta_schedule.testing.relay_workload import get_network +from tvm.meta_schedule.testing.tune_utils import generate_input_data, create_timer from tvm.meta_schedule.utils import cpu_count from tvm.support import describe @@ -94,17 +94,23 @@ def _parse_args(): default=100, ) args.add_argument( - "--cpu-flush", + "--adaptive-training", type=lambda x: bool(strtobool(x)), - required=True, help="example: True / False", + default=True, ) args.add_argument( - "--adaptive-training", + "--cpu-flush", type=lambda x: bool(strtobool(x)), - required=False, help="example: True / False", - default=True, + required=True, + ) + args.add_argument( + "--backend", + type=str, + choices=["graph", "vm"], + help="example: graph / vm", + required=True, ) parsed = args.parse_args() parsed.target = tvm.target.Target(parsed.target) @@ -133,6 +139,7 @@ def main(): repeat=ARGS.repeat, min_repeat_ms=ARGS.min_repeat_ms, enable_cpu_cache_flush=ARGS.cpu_flush, + timeout=ARGS.rpc_config.session_timeout_sec, ) if ARGS.target.kind.name == "llvm": @@ -164,100 +171,62 @@ def main(): cache_dir=ARGS.cache_dir, ) input_info = {input_name: input_shape} - input_data = {} + input_data = { + item["name"]: generate_input_data(item["shape"], item["dtype"]) for item in ARGS.input_shape + } for input_name, input_shape in input_info.items(): - print(f" input_name: {input_name}") + print(f" input_name : {input_name}") print(f" input_shape: {input_shape}") print(f" input_dtype: {input_dtype}") - tasks, task_weights = auto_scheduler.extract_tasks( - mod["main"], - params, - target=ARGS.target, - hardware_params=hardware_params, - ) - for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)): - print(f"==== Task {idx}: {task.desc} (weight {task_weight} key: {task.workload_key}) =====") - print(task.compute_dag) - tuner = auto_scheduler.TaskScheduler(tasks, task_weights) - tuner.tune( - auto_scheduler.TuningOptions( - num_measure_trials=ARGS.num_trials, - runner=runner, - measure_callbacks=[ - auto_scheduler.RecordToFile(log_file), - ], - ), - adaptive_training=ARGS.adaptive_training, - ) - - with auto_scheduler.ApplyHistoryBest(log_file): - with tvm.transform.PassContext( - opt_level=3, - config={"relay.backend.use_auto_scheduler": True}, - ): - lib = relay.build( - mod, - target=ARGS.target, - params=params, + with ms.Profiler() as profiler: + tasks, task_weights = auto_scheduler.extract_tasks( + mod["main"], + params, + target=ARGS.target, + hardware_params=hardware_params, + ) + for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)): + print( + f"==== Task {idx}: {task.desc} " + f"(weight {task_weight} key: {task.workload_key}) =====" ) - graph, rt_mod, params = lib.graph_json, lib.lib, lib.params - for input_name, input_shape in input_info.items(): - if input_dtype.startswith("float"): - input_data[input_name] = np.random.uniform(size=input_shape).astype(input_dtype) - else: - input_data[input_name] = np.random.randint( - low=0, high=10000, size=input_shape, dtype=input_dtype + print(task.compute_dag) + + if ARGS.num_trials > 0: + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) + tuner.tune( + auto_scheduler.TuningOptions( + 
num_measure_trials=ARGS.num_trials, + runner=runner, + measure_callbacks=[ + auto_scheduler.RecordToFile(log_file), + ], + ), + adaptive_training=ARGS.adaptive_training, ) - def f_timer(rt_mod, dev, input_data): - # pylint: disable=import-outside-toplevel - from tvm.contrib.graph_executor import GraphModule - - # pylint: enable=import-outside-toplevel - - mod = GraphModule(rt_mod["default"](dev)) - for input_name, input_value in input_data.items(): - mod.set_input(input_name, input_value) - ftimer = mod.module.time_evaluator( - "run", - dev, - min_repeat_ms=500, - repeat=3, - ) - results = list(np.array(ftimer().results) * 1000.0) # type: ignore - print("Running time in time_evaluator: ", results) + relay_build = {"graph": relay.build, "vm": relay.vm.compile}[ARGS.backend] + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext( + opt_level=3, + config={"relay.backend.use_auto_scheduler": True}, + ): + lib = relay_build( + mod, + target=ARGS.target, + params=params, + ) + print("Tuning Time:") + print(profiler.table()) run_module_via_rpc( rpc_config=ARGS.rpc_config, lib=lib, dev_type=ARGS.target.kind.name, args=input_data, - continuation=f_timer, - ) - - def f_per_layer(rt_mod, dev, input_data): - # pylint: disable=import-outside-toplevel - from tvm.contrib.debugger.debug_executor import create - - # pylint: enable=import-outside-toplevel - mod = create(graph, rt_mod, dev) - for input_name, input_value in input_data.items(): - mod.set_input(input_name, input_value) - graph_nodes = [n["name"] for n in json.loads(graph)["nodes"]] - graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000) - print("|graph_nodes| = ", len(graph_nodes)) - print("|graph_time| = ", len(graph_time)) - graph_nodes_time = {k: float(np.mean(v)) for k, v in zip(graph_nodes, graph_time)} - for k, v in graph_nodes_time.items(): - print(f"{k} : {v:.3f}") - - run_module_via_rpc( - rpc_config=ARGS.rpc_config, - lib=rt_mod, - dev_type=ARGS.target.kind.name, - args=input_data, - continuation=f_per_layer, + continuation=create_timer(ARGS.backend), + backend=ARGS.backend, ) diff --git a/python/tvm/auto_scheduler/testing/tune_te.py b/python/tvm/auto_scheduler/testing/tune_te.py index c6a5ab27cfd88..da3584512dd02 100644 --- a/python/tvm/auto_scheduler/testing/tune_te.py +++ b/python/tvm/auto_scheduler/testing/tune_te.py @@ -15,12 +15,13 @@ # specific language governing permissions and limitations # under the License. 
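These scripts wrap the tuning phase in ms.Profiler so that a per-scope time breakdown can be
printed afterwards. A minimal standalone sketch (the scope name and the sleep call stand in
for real tuning work):

import time
from tvm import meta_schedule as ms

with ms.Profiler() as profiler:
    with ms.Profiler.timeit("ExampleScope"):  # named scope, as tune.py does for "ApplyHistoryBest"
        time.sleep(0.1)  # stand-in for tuning work
print(profiler.table())  # per-scope time breakdown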
# pylint: disable=missing-docstring +from distutils.util import strtobool import argparse import os -from distutils.util import strtobool import tvm from tvm import auto_scheduler +from tvm import meta_schedule as ms from tvm.meta_schedule.testing.te_workload import CONFIGS from tvm.meta_schedule.utils import cpu_count from tvm.support import describe @@ -79,20 +80,26 @@ def _parse_args(): default=100, ) args.add_argument( - "--cpu-flush", + "--adaptive-training", type=lambda x: bool(strtobool(x)), - required=True, + required=False, help="example: True / False", + default=True, ) args.add_argument( - "--adaptive-training", + "--cpu-flush", type=lambda x: bool(strtobool(x)), - required=False, help="example: True / False", - default=True, + required=True, ) parsed = args.parse_args() parsed.target = tvm.target.Target(parsed.target) + parsed.rpc_config = ms.runner.RPCConfig( + tracker_host=parsed.rpc_host, + tracker_port=parsed.rpc_port, + tracker_key=parsed.rpc_key, + session_timeout_sec=60, + ) return parsed @@ -100,12 +107,19 @@ def _parse_args(): def main(): - describe() - print(f"Workload: {ARGS.workload}") log_file = os.path.join(ARGS.work_dir, f"{ARGS.workload}.json") - workload_func, params = CONFIGS[ARGS.workload] - params = params[0] # type: ignore - workload_func = auto_scheduler.register_workload(workload_func) + + runner = auto_scheduler.RPCRunner( + key=ARGS.rpc_key, + host=ARGS.rpc_host, + port=ARGS.rpc_port, + n_parallel=cpu_count(logical=True), + number=ARGS.number, + repeat=ARGS.repeat, + min_repeat_ms=ARGS.min_repeat_ms, + enable_cpu_cache_flush=ARGS.cpu_flush, + timeout=ARGS.rpc_config.session_timeout_sec, + ) if ARGS.target.kind.name == "llvm": hardware_params = auto_scheduler.HardwareParams( @@ -127,37 +141,42 @@ def main(): ) else: raise NotImplementedError(f"Unsupported target {ARGS.target}") - task = auto_scheduler.SearchTask( - func=workload_func, - args=params, - target=ARGS.target, - hardware_params=hardware_params, - ) - runner = auto_scheduler.RPCRunner( - key=ARGS.rpc_key, - host=ARGS.rpc_host, - port=ARGS.rpc_port, - n_parallel=cpu_count(logical=True), - number=ARGS.number, - repeat=ARGS.repeat, - min_repeat_ms=ARGS.min_repeat_ms, - enable_cpu_cache_flush=ARGS.cpu_flush, - # todo(zxybazh): set session timeout to 60 same as MS - ) - # Inspect the computational graph - print("Computational DAG:") - print(task.compute_dag) - tune_option = auto_scheduler.TuningOptions( - num_measure_trials=ARGS.num_trials, - measure_callbacks=[auto_scheduler.RecordToFile(log_file)], - verbose=2, - runner=runner, - ) - print("Running AutoTuning:") - task.tune(tune_option, adaptive_training=ARGS.adaptive_training) + describe() + print(f"Workload: {ARGS.workload}") + with ms.Profiler() as profiler: + # Same as MetaSchedule Tune TE + # Does not count ApplyHistoryBest time + + workload_func, params = CONFIGS[ARGS.workload] + params = params[0] # type: ignore + workload_func = auto_scheduler.register_workload(workload_func) + + task = auto_scheduler.SearchTask( + func=workload_func, + args=params, + target=ARGS.target, + hardware_params=hardware_params, + ) + # Inspect the computational graph + print("Computational DAG:") + print(task.compute_dag) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=ARGS.num_trials, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + verbose=2, + runner=runner, + ) + if ARGS.num_trials > 0: + print("Running AutoTuning:") + task.tune(tune_option, adaptive_training=ARGS.adaptive_training) + + print("Tuning Time:") + 
print(profiler.table()) + print("History Best:") print(task.print_best(log_file)) + sch, args = task.apply_best(log_file) print("Lowered TIR:") print(tvm.lower(sch, args, simple_mode=True)) diff --git a/python/tvm/meta_schedule/cost_model/cost_model.py b/python/tvm/meta_schedule/cost_model/cost_model.py index e479cb725428a..2fdb9b93494f9 100644 --- a/python/tvm/meta_schedule/cost_model/cost_model.py +++ b/python/tvm/meta_schedule/cost_model/cost_model.py @@ -73,7 +73,7 @@ def update( _ffi_api.CostModelUpdate(self, context, candidates, results) # type: ignore # pylint: disable=no-member def predict(self, context: TuneContext, candidates: List[MeasureCandidate]) -> np.ndarray: - """Update the cost model given running results. + """Predict normalized score with the cost model. Parameters ---------- diff --git a/python/tvm/meta_schedule/testing/custom_builder_runner.py b/python/tvm/meta_schedule/testing/custom_builder_runner.py index 3ba007d9a4d37..e203848c2cbb8 100644 --- a/python/tvm/meta_schedule/testing/custom_builder_runner.py +++ b/python/tvm/meta_schedule/testing/custom_builder_runner.py @@ -17,7 +17,7 @@ """Customized builder and runner methods""" # pylint: disable=import-outside-toplevel -from typing import TYPE_CHECKING, Callable, Dict, List +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union if TYPE_CHECKING: import numpy as np # type: ignore @@ -25,6 +25,7 @@ from tvm.meta_schedule.runner import EvaluatorConfig, RPCConfig from tvm.runtime import Device, Module, NDArray from tvm.target import Target + from tvm.runtime.vm import Executable def build_relay( @@ -143,10 +144,11 @@ def run_with_graph_executor( def run_module_via_rpc( rpc_config: "RPCConfig", - lib: "Module", + lib: Union["Module", "Executable"], dev_type: str, args: Dict[str, "np.ndarray"], continuation: Callable, + backend: Optional[str] = "graph", ): """Execute a tvm.runtime.Module on RPC remote""" # pylint: disable=import-outside-toplevel @@ -160,13 +162,15 @@ def run_module_via_rpc( with tempfile.TemporaryDirectory() as tmp_dir: filename = os.path.join(tmp_dir, "tvm_tmp_mod." + tar.output_format) + if backend == "vm": + code, lib = lib.save() lib.export_library(filename, tar) session = rpc_config.connect_server() session.upload(filename) _, filename = os.path.split(filename) rt_mod = session.load_module(filename) + if backend == "vm": + rt_mod = session.get_function("runtime.Load_Executable")(code, rt_mod) dev = session.device(dev_type=dev_type, dev_id=0) - nd_args = {} - for arg_key, arg_value in args.items(): - nd_args[arg_key] = ndarray.array(arg_value, dev) + nd_args = {k: ndarray.array(v, dev) for k, v in args.items()} return continuation(rt_mod, dev, nd_args) diff --git a/python/tvm/meta_schedule/testing/tune_onnx.py b/python/tvm/meta_schedule/testing/tune_onnx.py index 8ae9ab1ed07df..6d473ed3237c6 100644 --- a/python/tvm/meta_schedule/testing/tune_onnx.py +++ b/python/tvm/meta_schedule/testing/tune_onnx.py @@ -15,18 +15,18 @@ # specific language governing permissions and limitations # under the License. 
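run_module_via_rpc above ships a VM Executable by saving it as bytecode plus a kernel library
and reassembling it on the remote. The same round trip can be sketched locally; the toy module
and llvm target here are illustrative:

import numpy as np
import tvm
from tvm import relay
from tvm.runtime import vm as rt_vm

x = relay.var("x", shape=(1, 4), dtype="float32")
mod = tvm.IRModule.from_expr(relay.Function([x], x + relay.const(1.0, "float32")))
exe = relay.vm.compile(mod, target="llvm")

code, lib = exe.save()  # bytecode + kernel library, shipped separately over RPC
loaded = rt_vm.Executable.load_exec(code, lib)
dev = tvm.cpu()
out = rt_vm.VirtualMachine(loaded, dev).invoke("main", tvm.nd.array(np.zeros((1, 4), "float32")))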
# pylint: disable=missing-docstring +from distutils.util import strtobool import argparse import json import logging - -from distutils.util import strtobool -import numpy as np # type: ignore import onnx # type: ignore + import tvm from tvm import meta_schedule as ms from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc from tvm.relay.frontend import from_onnx from tvm.support import describe +from .tune_utils import generate_input_data, create_timer def _parse_args(): @@ -93,17 +93,23 @@ def _parse_args(): default=100, ) args.add_argument( - "--cpu-flush", + "--adaptive-training", type=lambda x: bool(strtobool(x)), - required=True, help="example: True / False", + default=True, ) args.add_argument( - "--adaptive-training", + "--cpu-flush", type=lambda x: bool(strtobool(x)), - required=False, help="example: True / False", - default=True, + required=True, + ) + args.add_argument( + "--backend", + type=str, + choices=["graph", "vm"], + help="example: graph / vm", + required=True, ) parsed = args.parse_args() parsed.target = tvm.target.Target(parsed.target) @@ -127,14 +133,19 @@ def _parse_args(): def main(): describe() print(f"Workload: {ARGS.model_name}") + onnx_model = onnx.load(ARGS.onnx_path) shape_dict = {} for item in ARGS.input_shape: - print(f" input_name: {item['name']}") + print(f" input_name : {item['name']}") print(f" input_shape: {item['shape']}") print(f" input_dtype: {item['dtype']}") shape_dict[item["name"]] = item["shape"] mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True) + input_data = { + item["name"]: generate_input_data(item["shape"], item["dtype"]) for item in ARGS.input_shape + } + runner = ms.runner.RPCRunner( rpc_config=ARGS.rpc_config, evaluator_config=ms.runner.EvaluatorConfig( @@ -145,6 +156,7 @@ def main(): ), alloc_repeat=1, ) + with ms.Profiler() as profiler: lib = ms.tune_relay( mod=mod, @@ -159,68 +171,18 @@ def main(): runner=runner, # type: ignore work_dir=ARGS.work_dir, params=params, + backend=ARGS.backend, ) print("Tuning Time:") print(profiler.table()) - graph, rt_mod, params = lib.graph_json, lib.lib, lib.params - input_data = {} - for item in ARGS.input_shape: - input_name, input_shape, input_dtype = item["name"], item["shape"], item["dtype"] - if input_dtype.startswith("float"): - input_data[input_name] = np.random.uniform(size=input_shape).astype(input_dtype) - else: - input_data[input_name] = np.random.randint( - low=0, high=10000, size=input_shape, dtype=input_dtype - ) - - def f_timer(rt_mod, dev, input_data): - # pylint: disable=import-outside-toplevel - from tvm.contrib.graph_executor import GraphModule - - # pylint: enable=import-outside-toplevel - - mod = GraphModule(rt_mod["default"](dev)) - for input_name, input_value in input_data.items(): - mod.set_input(input_name, input_value) - ftimer = mod.module.time_evaluator( - "run", - dev, - min_repeat_ms=500, - repeat=3, - ) - results = list(np.array(ftimer().results) * 1000.0) # type: ignore - print("Running time in time_evaluator: ", results) run_module_via_rpc( rpc_config=ARGS.rpc_config, lib=lib, dev_type=ARGS.target.kind.name, args=input_data, - continuation=f_timer, - ) - - def f_per_layer(rt_mod, dev, input_data): - # pylint: disable=import-outside-toplevel - from tvm.contrib.debugger.debug_executor import create - - # pylint: enable=import-outside-toplevel - mod = create(graph, rt_mod, dev) - for input_name, input_value in input_data.items(): - mod.set_input(input_name, input_value) - graph_nodes = [n["name"] for n in json.loads(graph)["nodes"]] 
- graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000) - print("|graph_nodes| = ", len(graph_nodes)) - print("|graph_time| = ", len(graph_time)) - graph_nodes_time = {k: float(v) for k, v in zip(graph_nodes, graph_time)} - for k, v in graph_nodes_time.items(): - print(f"{k} : {v:.3f}") - - run_module_via_rpc( - rpc_config=ARGS.rpc_config, - lib=rt_mod, - dev_type=ARGS.target.kind.name, - args=input_data, - continuation=f_per_layer, + continuation=create_timer(ARGS.backend), + backend=ARGS.backend, ) diff --git a/python/tvm/meta_schedule/testing/tune_relay.py b/python/tvm/meta_schedule/testing/tune_relay.py index daef48daa22f3..8010e36fd656f 100644 --- a/python/tvm/meta_schedule/testing/tune_relay.py +++ b/python/tvm/meta_schedule/testing/tune_relay.py @@ -15,16 +15,16 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=missing-docstring +from distutils.util import strtobool import argparse import json import logging -from distutils.util import strtobool -import numpy as np # type: ignore import tvm from tvm import meta_schedule as ms from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc from tvm.meta_schedule.testing.relay_workload import get_network +from tvm.meta_schedule.testing.tune_utils import generate_input_data, create_timer from tvm.support import describe @@ -91,17 +91,23 @@ def _parse_args(): default=100, ) args.add_argument( - "--cpu-flush", + "--adaptive-training", type=lambda x: bool(strtobool(x)), - required=True, help="example: True / False", + default=True, ) args.add_argument( - "--adaptive-training", + "--cpu-flush", type=lambda x: bool(strtobool(x)), - required=False, help="example: True / False", - default=True, + required=True, + ) + args.add_argument( + "--backend", + type=str, + choices=["graph", "vm"], + help="example: graph / vm", + required=True, ) parsed = args.parse_args() parsed.target = tvm.target.Target(parsed.target) @@ -125,17 +131,21 @@ def _parse_args(): def main(): describe() print(f"Workload: {ARGS.workload}") + mod, params, (input_name, input_shape, input_dtype) = get_network( ARGS.workload, ARGS.input_shape, cache_dir=ARGS.cache_dir, ) input_info = {input_name: input_shape} - input_data = {} + input_data = { + item["name"]: generate_input_data(item["shape"], item["dtype"]) for item in ARGS.input_shape + } for input_name, input_shape in input_info.items(): - print(f" input_name: {input_name}") + print(f" input_name : {input_name}") print(f" input_shape: {input_shape}") print(f" input_dtype: {input_dtype}") + runner = ms.runner.RPCRunner( rpc_config=ARGS.rpc_config, evaluator_config=ms.runner.EvaluatorConfig( @@ -146,6 +156,7 @@ def main(): ), alloc_repeat=1, ) + with ms.Profiler() as profiler: lib = ms.tune_relay( mod=mod, @@ -160,66 +171,19 @@ def main(): runner=runner, # type: ignore work_dir=ARGS.work_dir, params=params, + backend=ARGS.backend, ) + print("Tuning Time:") print(profiler.table()) - graph, rt_mod, params = lib.graph_json, lib.lib, lib.params - for input_name, input_shape in input_info.items(): - if input_dtype.startswith("float"): - input_data[input_name] = np.random.uniform(size=input_shape).astype(input_dtype) - else: - input_data[input_name] = np.random.randint( - low=0, high=10000, size=input_shape, dtype=input_dtype - ) - - def f_timer(rt_mod, dev, input_data): - # pylint: disable=import-outside-toplevel - from tvm.contrib.graph_executor import GraphModule - - # pylint: enable=import-outside-toplevel - - mod = 
GraphModule(rt_mod["default"](dev)) - for input_name, input_value in input_data.items(): - mod.set_input(input_name, input_value) - ftimer = mod.module.time_evaluator( - "run", - dev, - min_repeat_ms=500, - repeat=3, - ) - results = list(np.array(ftimer().results) * 1000.0) # type: ignore - print("Running time in time_evaluator: ", results) run_module_via_rpc( rpc_config=ARGS.rpc_config, lib=lib, dev_type=ARGS.target.kind.name, args=input_data, - continuation=f_timer, - ) - - def f_per_layer(rt_mod, dev, input_data): - # pylint: disable=import-outside-toplevel - from tvm.contrib.debugger.debug_executor import create - - # pylint: enable=import-outside-toplevel - mod = create(graph, rt_mod, dev) - for input_name, input_value in input_data.items(): - mod.set_input(input_name, input_value) - graph_nodes = [n["name"] for n in json.loads(graph)["nodes"]] - graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000) - print("|graph_nodes| = ", len(graph_nodes)) - print("|graph_time| = ", len(graph_time)) - graph_nodes_time = {k: float(np.mean(v)) for k, v in zip(graph_nodes, graph_time)} - for k, v in graph_nodes_time.items(): - print(f"{k} : {v:.3f}") - - run_module_via_rpc( - rpc_config=ARGS.rpc_config, - lib=rt_mod, - dev_type=ARGS.target.kind.name, - args=input_data, - continuation=f_per_layer, + continuation=create_timer(ARGS.backend), + backend=ARGS.backend, ) diff --git a/python/tvm/meta_schedule/testing/tune_te.py b/python/tvm/meta_schedule/testing/tune_te.py index e579c561adaa1..d54d92048ee62 100644 --- a/python/tvm/meta_schedule/testing/tune_te.py +++ b/python/tvm/meta_schedule/testing/tune_te.py @@ -15,14 +15,14 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=missing-docstring +from distutils.util import strtobool import argparse import logging -from distutils.util import strtobool from typing import Optional import tvm -from tvm import meta_schedule as ms from tvm import tir +from tvm import meta_schedule as ms from tvm.meta_schedule.testing.te_workload import create_te_workload from tvm.support import describe @@ -80,17 +80,17 @@ def _parse_args(): default=100, ) args.add_argument( - "--cpu-flush", + "--adaptive-training", type=lambda x: bool(strtobool(x)), - required=True, + required=False, help="example: True / False", + default=True, ) args.add_argument( - "--adaptive-training", + "--cpu-flush", type=lambda x: bool(strtobool(x)), - required=False, help="example: True / False", - default=True, + required=True, ) parsed = args.parse_args() parsed.target = tvm.target.Target(parsed.target) @@ -138,8 +138,10 @@ def main(): task_name=ARGS.workload, work_dir=ARGS.work_dir, ) + print("Tuning Time:") print(profiler.table()) + if sch is None: print("No valid schedule found!") else: diff --git a/python/tvm/meta_schedule/testing/tune_utils.py b/python/tvm/meta_schedule/testing/tune_utils.py new file mode 100644 index 0000000000000..aad8496a4661f --- /dev/null +++ b/python/tvm/meta_schedule/testing/tune_utils.py @@ -0,0 +1,194 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Testing utility functions in meta schedule"""
+from typing import Callable, Optional, Union, List, Dict
+from statistics import median
+import json
+import warnings
+import numpy as np  # type: ignore
+
+import tvm
+from tvm.runtime import NDArray
+
+
+def generate_input_data(
+    input_shape: List[int],
+    input_dtype: str,
+    *,
+    low: Optional[int] = None,
+    high: Optional[int] = None,
+) -> np.ndarray:
+    """Generate input data with the given shape and data type.
+
+    Parameters
+    ----------
+    input_shape : List[int]
+        The shape of the input data.
+    input_dtype : str
+        The data type of the input data.
+
+    Returns
+    -------
+    input_data : np.ndarray
+        The generated input data with the given shape and data type as a numpy ndarray.
+    """
+    if input_dtype.startswith("float"):
+        return np.random.uniform(size=input_shape).astype(input_dtype)
+    if input_dtype in ["uint8", "int8"]:
+        return np.random.randint(
+            low=0,
+            high=127,
+            size=input_shape,
+            dtype="int32",  # TODO(zxybazh): fix the datatype when int8 / uint8 is better supported
+        )
+    if input_dtype in ["int32", "int64"]:
+        if low is None or high is None:
+            warnings.warn(
+                f"Model input value range for shape {input_shape} of {input_dtype} is not set!"
+            )
+        return np.random.randint(
+            low=0 if low is None else low,
+            high=10000 if high is None else high,
+            size=input_shape,
+            dtype=input_dtype,
+        )
+    raise ValueError(f"Unsupported input datatype: {input_dtype}")
+
+
+def create_timer(backend: str) -> Callable:
+    """Create a function to run and benchmark the performance of the whole given runtime
+    module, or of an Executable in the Relay VM.
+
+    Parameters
+    ----------
+    backend : str
+        The backend to use, graph / vm.
+
+    Returns
+    -------
+    func : Callable
+        The function to benchmark the workload.
+    """
+
+    def f_timer(
+        rt_mod: Union[tvm.runtime.Module, tvm.runtime.vm.Executable],
+        dev: tvm.device,
+        input_data: Dict[str, NDArray],
+    ) -> None:
+        """Run and benchmark the given runtime module, and print out the result.
+
+        Parameters
+        ----------
+        rt_mod : Union[tvm.runtime.Module, tvm.runtime.vm.Executable]
+            The runtime module or vm executable.
+        dev : tvm.device
+            The device to run the workload on.
+        input_data : Dict[str, np.ndarray]
+            The input data as a dictionary.
+ """ + from tvm.contrib.graph_executor import GraphModule # pylint:disable=import-outside-toplevel + from tvm.runtime.vm import VirtualMachine # pylint:disable=import-outside-toplevel + + try: + if backend == "vm": + vm = VirtualMachine(rt_mod, dev) # pylint: disable=invalid-name + ftimer = vm.benchmark( + dev, min_repeat_ms=500, repeat=5, number=1, end_to_end=False, **input_data + ) + elif backend == "graph": + mod = GraphModule(rt_mod["default"](dev)) + for input_name, input_value in input_data.items(): + mod.set_input(input_name, input_value) + ftimer = mod.module.time_evaluator( + "run", dev, min_repeat_ms=500, repeat=5, number=1 + )() + else: + raise ValueError(f"Backend {backend} not supported in f_timer!") + + results = list(np.array(ftimer.results) * 1000.0) # type: ignore + + print("Running time in time_evaluator: ", results) + print("-------------------------------") + print(f" Min (ms) : {min(results)}") + print(f" Max (ms) : {max(results)}") + print(f" Median (ms) : {median(results)}") + print(f"Average (ms) : {sum(results) / len(results)}") + except Exception as exc: # pylint: disable=broad-except + print( + f"Run module f_timer via RPC failed, exception: {exc}", + ) + + return f_timer + + +def create_time_per_layer(graph: str) -> Callable: + """Create a function to run and benchmark the per-layer performance of given runtime module, + given the graph output of the module from graph compiler. + + Parameters + ---------- + graph : str + The json format graph output of the module from graph compiler. + + Returns + ------- + func : Callable + The function using the json format graph. + """ + + def f_time_per_layer( + rt_mod: tvm.runtime.Module, + dev: tvm.device, + input_data: Dict[str, NDArray], + ) -> None: + """Run and benchmark the per-layer performance of given runtime module, + print out the result. + + Parameters + ---------- + rt_mod : tvm.runtime.Module + The runtime module. + dev : tvm.device + The device type to run workload. + input_data : Dict[str, np.ndarray] + The input data as a dictionary. + """ + # pylint:disable=import-outside-toplevel + from tvm.contrib.debugger.debug_executor import create + + # pylint:enable=import-outside-toplevel + + try: + mod = create(graph, rt_mod, dev) + for input_name, input_value in input_data.items(): + mod.set_input(input_name, input_value) + graph_nodes = [n["name"] for n in json.loads(graph)["nodes"]] + graph_time = mod.run_individual(number=10, repeat=1, min_repeat_ms=5000) + + print("Running time of each layer:") + print("---------------------------") + print("|graph_nodes| = ", len(graph_nodes)) + print("|graph_time| = ", len(graph_time)) + + for k, v in zip(graph_nodes, graph_time): + print(k, float(v) * 1e6, "us") + except Exception as exc: # pylint: disable=broad-except + print( + f"Run module f_time_per_layer via RPC failed, exception: {exc}", + ) + + return f_time_per_layer diff --git a/python/tvm/meta_schedule/testing/utils.py b/python/tvm/meta_schedule/testing/utils.py index bdd3852e40a32..0d011b726473b 100644 --- a/python/tvm/meta_schedule/testing/utils.py +++ b/python/tvm/meta_schedule/testing/utils.py @@ -16,13 +16,12 @@ # under the License. 
"""Testing utility functions in meta schedule""" from typing import Callable, Dict, Optional, Union - -from tvm import meta_schedule as ms from tvm.ir import IRModule from tvm.relay import Function as RelayFunc from tvm.runtime import NDArray from tvm.target import Target from tvm.tir import Schedule +from tvm import meta_schedule as ms def apply_fixed_schedules( diff --git a/python/tvm/meta_schedule/tune.py b/python/tvm/meta_schedule/tune.py index fabf14ab23c74..cd40429d16840 100644 --- a/python/tvm/meta_schedule/tune.py +++ b/python/tvm/meta_schedule/tune.py @@ -24,7 +24,7 @@ from tvm.ir import IRModule from tvm.ir.transform import PassContext -from tvm.runtime import Module, NDArray +from tvm.runtime import Module, NDArray, vm from tvm.target import Target from tvm.te import Tensor, create_prim_func from tvm.tir import PrimFunc, Schedule @@ -346,8 +346,9 @@ def tune_extracted_tasks( cost_model=cost_model, measure_callbacks=measure_callbacks, ) - task_scheduler.tune() - cost_model.save(osp.join(work_dir, "cost_model.xgb")) + if config.max_trials_global > 0: + task_scheduler.tune() + cost_model.save(osp.join(work_dir, "cost_model.xgb")) return database @@ -516,6 +517,7 @@ def tune_relay( config: TuneConfig, work_dir: str, *, + backend: str = "graph", params: Optional[Dict[str, NDArray]] = None, builder: Optional[Builder] = None, runner: Optional[Runner] = None, @@ -527,7 +529,7 @@ def tune_relay( postprocs: Optional[FnPostproc] = None, mutator_probs: Optional[FnMutatorProb] = None, num_threads: Optional[int] = None, -) -> Module: +) -> Union[Module, vm.Executable]: """Tune a TIR IRModule with a given target. Parameters @@ -552,15 +554,16 @@ def tune_relay( The database to use. measure_callbacks : Optional[List[MeasureCallback]] The callbacks used during tuning. + backend : str = "graph" + The backend to use for relay compilation(graph / vm). Returns ------- - lib : Module - The built runtime module for the given relay workload. + lib : Union[Module, tvm.runtime.vm.Executable] + The built runtime module or vm Executable for the given relay workload. 
""" # pylint: disable=import-outside-toplevel - from tvm.relay import build as relay_build - + from tvm import relay from .relay_integration import extract_task_from_relay # pylint: disable=protected-access, enable=import-outside-toplevel @@ -584,6 +587,7 @@ def tune_relay( mutator_probs=mutator_probs, num_threads=num_threads, ) + relay_build = {"graph": relay.build, "vm": relay.vm.compile}[backend] with Profiler.timeit("ApplyHistoryBest"): with target, autotvm_silencer(), ApplyHistoryBest(database): with PassContext( From 522c8cc955895e8d01eb1d8f387b2cb614349450 Mon Sep 17 00:00:00 2001 From: TerranceLiang <11499470+terrance-liang@users.noreply.github.com> Date: Thu, 30 Jun 2022 15:15:26 +0800 Subject: [PATCH 017/111] typo fix (#11958) Co-authored-by: Terrance Liang --- python/tvm/relay/testing/yolo_detection.py | 2 +- src/relay/op/tensor/transform.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/testing/yolo_detection.py b/python/tvm/relay/testing/yolo_detection.py index 949d024bd86fa..f486e0d7e874c 100644 --- a/python/tvm/relay/testing/yolo_detection.py +++ b/python/tvm/relay/testing/yolo_detection.py @@ -273,7 +273,7 @@ def show_detections(im, dets, thresh, names, classes): valid, detection = get_detections(im, det, thresh, names, classes) if valid: print( - "class:{} left:{} right:{} top:{} bottom:{}".format( + "class:{} left:{} top:{} right:{} bottom:{}".format( detection["labelstr"], detection["left"], detection["top"], diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 3c0451a953aae..4d5f52e61cf0d 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -792,8 +792,8 @@ bool ReshapeRel(const Array& types, int num_inputs, const Attrs& attrs, data_shape_str << (iter != data_shape.begin() ? 
"," : "") << *iter; } ICHECK_EQ(oshape_sum, data_shape_sum) - << "Input tensor shape(" << oshape_str.str() << ") and reshaped shape(" - << data_shape_str.str() << ") are not compatible!"; + << "Input tensor shape(" << data_shape_str.str() << ") and reshaped shape(" + << oshape_str.str() << ") are not compatible!"; } reporter->Assign(types[1], TensorType(oshape, data->dtype)); From e7851ed763cd9e7e64c1e298908297d3f4ba93c7 Mon Sep 17 00:00:00 2001 From: wrongtest Date: Thu, 30 Jun 2022 15:16:49 +0800 Subject: [PATCH 018/111] fix print attr of null node (#11959) --- src/printer/tvmscript_printer.cc | 72 ++++++++++++++++---------------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc index 7949ee15a54c3..725e105c016a2 100644 --- a/src/printer/tvmscript_printer.cc +++ b/src/printer/tvmscript_printer.cc @@ -945,47 +945,49 @@ Doc TVMScriptPrinter::VisitStmt_(const LetStmtNode* op) { Doc TVMScriptPrinter::VisitStmt_(const AttrStmtNode* op) { Doc doc; - // merge attr with realize when possible - if (op->node->IsInstance() && op->attr_key == "realize_scope" && - op->body->IsInstance()) { - const auto* realize = Downcast(op->body).get(); - if (realize->buffer.same_as(op->node)) { - if (current_num_ != num_child_ - 1) { - doc << "with " << tir_prefix_ << ".realize(" << Print(realize->buffer) - << Print(realize->bounds) << ", " << Print(op->value); - if (!is_one(realize->condition)) { - doc << ", " << Print(realize->condition); + if (op->node.defined()) { + // merge attr with realize when possible + if (op->node->IsInstance() && op->attr_key == "realize_scope" && + op->body->IsInstance()) { + const auto* realize = Downcast(op->body).get(); + if (realize->buffer.same_as(op->node)) { + if (current_num_ != num_child_ - 1) { + doc << "with " << tir_prefix_ << ".realize(" << Print(realize->buffer) + << Print(realize->bounds) << ", " << Print(op->value); + if (!is_one(realize->condition)) { + doc << ", " << Print(realize->condition); + } + doc << "):" << Doc::Indent(4, Doc::NewLine() << PrintBody(realize->body)); + } else { + doc << tir_prefix_ << ".realize(" << Print(realize->buffer) << Print(realize->bounds) + << ", " << Print(op->value); + if (!is_one(realize->condition)) { + doc << ", " << Print(realize->condition); + } + doc << ")" << Doc::NewLine() << PrintBody(realize->body); } - doc << "):" << Doc::Indent(4, Doc::NewLine() << PrintBody(realize->body)); + return doc; + } + } + // concise thread env + if (op->node->IsInstance() && + (op->attr_key == "thread_extent" || op->attr_key == "virtual_thread")) { + const auto* iter_var = Downcast(op->node).get(); + var_not_in_headers_.insert(iter_var->var.get()); + var_env_map_[iter_var->var] = iter_var->thread_tag; + if (current_num_ != num_child_ - 1) { + doc << "with " << tir_prefix_ << ".launch_thread(" << Print(iter_var->var) << ", " + << Print(op->value) << "):"; + doc << Doc::Indent(4, Doc::NewLine() << PrintBody(op->body)); } else { - doc << tir_prefix_ << ".realize(" << Print(realize->buffer) << Print(realize->bounds) - << ", " << Print(op->value); - if (!is_one(realize->condition)) { - doc << ", " << Print(realize->condition); - } - doc << ")" << Doc::NewLine() << PrintBody(realize->body); + doc << tir_prefix_ << ".launch_thread(" << Print(iter_var->var) << ", " << Print(op->value) + << ")"; + doc << Doc::NewLine() << PrintBody(op->body); } + TryDeallocVar(iter_var->var); return doc; } } - // concise thread env - if (op->node->IsInstance() && - (op->attr_key == 
"thread_extent" || op->attr_key == "virtual_thread")) { - const auto* iter_var = Downcast(op->node).get(); - var_not_in_headers_.insert(iter_var->var.get()); - var_env_map_[iter_var->var] = iter_var->thread_tag; - if (current_num_ != num_child_ - 1) { - doc << "with " << tir_prefix_ << ".launch_thread(" << Print(iter_var->var) << ", " - << Print(op->value) << "):"; - doc << Doc::Indent(4, Doc::NewLine() << PrintBody(op->body)); - } else { - doc << tir_prefix_ << ".launch_thread(" << Print(iter_var->var) << ", " << Print(op->value) - << ")"; - doc << Doc::NewLine() << PrintBody(op->body); - } - TryDeallocVar(iter_var->var); - return doc; - } // default if (current_num_ != num_child_ - 1) { doc << "with " << tir_prefix_ << ".attr(" << Print(op->node) << ", " From 80a0c6c53dc7e3aca2bc52755fabbad76cbac35a Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Thu, 30 Jun 2022 13:40:09 +0100 Subject: [PATCH 019/111] [microNPU] Fix offloading incompatible average pool (#11469) Fixes offloading a few corner cases of average pooling. Specifically not offloading nn.avg_pool2d when: * The attribute count_include_pad=True * Padding exceeds the dimensions [3, 3, 4, 4] * The pool size is greater than [8, 8] when the pool uses padding Change-Id: I7be546e28ebe1f17482f3ed3cee56996a71bfcd1 --- python/tvm/relay/op/contrib/ethosu.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/op/contrib/ethosu.py b/python/tvm/relay/op/contrib/ethosu.py index 806bf6dce2e89..4c3dcc2fc45a8 100644 --- a/python/tvm/relay/op/contrib/ethosu.py +++ b/python/tvm/relay/op/contrib/ethosu.py @@ -613,7 +613,7 @@ class AvgPool2DParams: composite_name = "ethos-u.avgpool2d" # The hardware only supports padding upto the numbers as follows - padding_bounds = [127, 127, 128, 128] + padding_bounds = [3, 3, 4, 4] def __init__(self, func_body: Call): clip = None @@ -632,6 +632,7 @@ def __init__(self, func_body: Call): self.pool_shape = attrs.pool_size self.strides = attrs.strides self.padding = attrs.padding + self.count_include_pad = attrs.count_include_pad self.activation = clip self.pooling_type = "AVG" @@ -648,10 +649,17 @@ def is_valid(self): return False if not check_batch_size(self.ifm): return False + if self.count_include_pad: + return False if not check_padding(self.padding, self.padding_bounds): return False if not check_pool_shape(self.pool_shape): return False + # Averge pool with padding only supports 1 <= pool_shape <= 8 + if list(self.padding) != [0, 0, 0, 0] and ( + self.pool_shape[0] > 8 or self.pool_shape[1] > 8 + ): + return False return True From 915c23b61b34604b19217759f320c84d3aa60605 Mon Sep 17 00:00:00 2001 From: abhikran-quic <63697863+abhikran-quic@users.noreply.github.com> Date: Thu, 30 Jun 2022 20:06:27 +0530 Subject: [PATCH 020/111] [TOPI] [Hexagon] Batch flatten slice op initial version (#11522) * [TOPI] [Hexagon] Batch flatten slice op initial version * Fix lint errors * Fix more lint errors * Fix lint warnings * Fix review comments * Update tests to use util functions * Update __init__.py * Fix review comments --- python/tvm/topi/hexagon/slice_ops/__init__.py | 1 + .../topi/hexagon/slice_ops/batch_flatten.py | 77 +++++++++++++ python/tvm/topi/hexagon/utils.py | 14 +++ .../contrib/test_hexagon/infrastructure.py | 6 ++ .../test_hexagon/topi/test_batch_flatten.py | 101 ++++++++++++++++++ 5 files changed, 199 insertions(+) create mode 100644 python/tvm/topi/hexagon/slice_ops/batch_flatten.py create mode 100644 tests/python/contrib/test_hexagon/topi/test_batch_flatten.py diff 
--git a/python/tvm/topi/hexagon/slice_ops/__init__.py b/python/tvm/topi/hexagon/slice_ops/__init__.py index 3340f835200b9..5b5c0b84214eb 100755 --- a/python/tvm/topi/hexagon/slice_ops/__init__.py +++ b/python/tvm/topi/hexagon/slice_ops/__init__.py @@ -20,5 +20,6 @@ from .avg_pool2d import avg_pool2d_compute, avg_pool2d_STIR_schedule from .add_subtract_multiply import * from .argmax import argmax_compute, argmax_schedule +from .batch_flatten import batch_flatten_compute, batch_flatten_stir_schedule from .softmax_slice import * from .clip import * diff --git a/python/tvm/topi/hexagon/slice_ops/batch_flatten.py b/python/tvm/topi/hexagon/slice_ops/batch_flatten.py new file mode 100644 index 0000000000000..6dc0914e91b42 --- /dev/null +++ b/python/tvm/topi/hexagon/slice_ops/batch_flatten.py @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Hexagon slice batch flatten compute and schedule""" +from tvm import te, tir, topi +from ..utils import get_layout_transform_fn + + +def batch_flatten_compute(inp: te.Tensor) -> te.Tensor: + """Compute for slice batch flatten op for hexagon. + This op makes the following assumptions: + 1. This op is written for a sliced batch flatten operation. + 2. The input is assumed to be in NHWC layout. + + Parameters + ---------- + Input : te.Tensor + Input activations padded for inner dimension size + Returns + ------- + Output : te.Tensor + Output of applying batch flatten operation on input + """ + return topi.nn.flatten(inp) + + +def batch_flatten_stir_schedule( + out: te.Tensor, + inp: te.Tensor, + out_layout: str, + in_layout: str, +) -> tir.Schedule: + """STIR schedule definition for the compute of batch flatten compute. 
+ Parameters + ---------- + outputs : te.Tensor + The output tensor as returned by a call to batch_flatten_compute + input : te.Tensor + Input tensor to batch_flatten + out_layout: typing.Callable + The transformation function definition for the expected output layout + in_layout: typing.Callable + The transformation function definition for the input layout + Returns + ------- + sch : tvm.tir.Schedule + The STIR schedule for slice batch flatten compute + """ + + batch_flatten_func = te.create_prim_func([inp, out]) + sch = tir.Schedule(batch_flatten_func, debug_mask="all") + compute = sch.get_block("compute") + + sch.transform_layout(compute, inp.name, get_layout_transform_fn(in_layout)) + sch.transform_layout(compute, out.name, get_layout_transform_fn(out_layout)) + i, j = sch.get_loops(compute) + jout, channel = sch.split(j, [None, inp.shape[3]]) + height, width = sch.split(jout, [inp.shape[1], inp.shape[2]]) + channelo, channeli = sch.split(channel, [None, 1024]) + channelio, channelii = sch.split(channeli, [None, 64]) + sch.reorder(i, height, width, channelo, channelio, channelii) + sch.vectorize(channelii) + return sch diff --git a/python/tvm/topi/hexagon/utils.py b/python/tvm/topi/hexagon/utils.py index 95b25cc5a73b5..092bce87119ac 100644 --- a/python/tvm/topi/hexagon/utils.py +++ b/python/tvm/topi/hexagon/utils.py @@ -67,6 +67,16 @@ def nc_512c_2d(n, c): return [n, c // 512, te.AXIS_SEPARATOR, c % 512] +def nhwc_1024c_2d(n, h, w, c): + """Return index map for nhwc_1024 2d layout""" + return [n, h, w, c // 1024, te.AXIS_SEPARATOR, c % 1024] + + +def nc_1024_2d(n, c): + """Return index map for nc_1024 2d layout""" + return [n, c // 1024, te.AXIS_SEPARATOR, c % 1024] + + def get_layout_transform_fn(layout): """Return index map function as per the layout string""" if layout == "nhwc-8h2w32c2w-2d": @@ -77,6 +87,10 @@ def get_layout_transform_fn(layout): return n11c_1024c_2d if layout == "n11c-1024c-1d": return n11c_1024c_1d + if layout == "nhwc-1024c-2d": + return nhwc_1024c_2d + if layout == "nc-1024-2d": + return nc_1024_2d if layout == "nhw-32h16w-2d": return nhw_32h16w_2d if layout == "nhwc-4h4w32c-2d": diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py index c1d2b4046372f..53351854a06a3 100644 --- a/tests/python/contrib/test_hexagon/infrastructure.py +++ b/tests/python/contrib/test_hexagon/infrastructure.py @@ -245,6 +245,12 @@ def transform_numpy(arr_np, current_layout: str, new_layout: str): n, h, w, c = arr_np.shape assert h == 1 and w == 1, "The size of h and w must be 1" return arr_np.reshape([n, 1, 1, c // 1024, 1024]) + if new_layout == "nc-1024-2d": + N, C = arr_np.shape + return arr_np.reshape([N, C // 1024, 1024]) + if new_layout == "nhwc-1024c-2d": + N, H, W, C = arr_np.shape + return arr_np.reshape([N, H, W, C // 1024, 1024]) raise RuntimeError(f"Unexpected new_layout '{new_layout}'") diff --git a/tests/python/contrib/test_hexagon/topi/test_batch_flatten.py b/tests/python/contrib/test_hexagon/topi/test_batch_flatten.py new file mode 100644 index 0000000000000..3a056116d45c1 --- /dev/null +++ b/tests/python/contrib/test_hexagon/topi/test_batch_flatten.py @@ -0,0 +1,101 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import numpy as np +import pytest + +import tvm +import tvm.testing +import tvm.topi.hexagon.slice_ops as sl +from tvm import te, topi +from tvm.contrib.hexagon.build import HexagonLauncher +from tvm.topi import testing + +from ..infrastructure import allocate_hexagon_array, transform_numpy + + +class BaseTestBatchFlatten: + input_shape = tvm.testing.parameter( + (1, 1, 1, 2048), + (1, 2, 4, 2048), + (1, 8, 8, 1024), + (2, 4, 8, 1024), + (2, 3, 5, 2048), + ) + input_layout, input_axis_sep = tvm.testing.parameters(("nhwc-1024c-2d", [4])) + output_layout, output_axis_sep = tvm.testing.parameters(("nc-1024-2d", [2])) + data_type = tvm.testing.parameter("float16") + + +class TestBatchFlatten(BaseTestBatchFlatten): + @tvm.testing.fixture + def output_shape(self, input_shape): + return input_shape[0], input_shape[1] * input_shape[2] * input_shape[3] + + @tvm.testing.requires_hexagon + def test_batch_flatten( + self, + data_type, + input_shape, + input_layout, + input_axis_sep, + output_shape, + output_layout, + output_axis_sep, + hexagon_session, + ): + target_hexagon = tvm.target.hexagon("v69") + target = tvm.target.Target(target_hexagon, host=target_hexagon) + A = te.placeholder(input_shape, name="A", dtype=data_type) + D = sl.batch_flatten_compute(A) + tir_s = sl.batch_flatten_stir_schedule( + D, + A, + output_layout, + input_layout, + ) + func_name = "batch_flatten" + with tvm.transform.PassContext(opt_level=3): + runtime_module = tvm.build(tir_s.mod, target=target, name=func_name) + + mod = hexagon_session.load_module(runtime_module) + + a_numpy = (np.random.uniform(-1, 1, input_shape)).astype(data_type) + ref = np.reshape(a_numpy, output_shape) + + input_np_transformed = transform_numpy(a_numpy, "nhwc", input_layout) + ref_np_transformed = transform_numpy(ref, "nhwc", output_layout) + + a_tvm = allocate_hexagon_array( + hexagon_session.device, + data=input_np_transformed, + axis_separators=input_axis_sep, + mem_scope="global.vtcm", + ) + output = allocate_hexagon_array( + hexagon_session.device, + ref_np_transformed.shape, + data_type, + axis_separators=output_axis_sep, + mem_scope="global.vtcm", + ) + mod(a_tvm, output) + np.testing.assert_allclose(output.numpy(), ref_np_transformed, atol=1e-07, rtol=0) + + +if __name__ == "__main__": + tvm.testing.main(pytest.main(sys.argv)) From 3425ed846308a456f98404c79f6df1693bed6377 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Thu, 30 Jun 2022 17:53:36 +0300 Subject: [PATCH 021/111] [VM] class Executable does not export symbols to dll (#11963) * class Executable of VM exports symbols to dll * restart CI Co-authored-by: Valery Chernov --- include/tvm/runtime/vm/executable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/tvm/runtime/vm/executable.h b/include/tvm/runtime/vm/executable.h index 774bca1e2d280..2405b3c0ba8c5 100644 --- a/include/tvm/runtime/vm/executable.h +++ b/include/tvm/runtime/vm/executable.h @@ -54,7 +54,7 @@ struct VMFunction; * 
used by the virtual machine. * - Code section, handling the VM functions and bytecode. */ -class Executable : public ModuleNode { +class TVM_DLL Executable : public ModuleNode { public: /*! * \brief Get a PackedFunc from an executable module. From 1aec41aa0c2d88d248cfd437e50bfe9a48e47988 Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Thu, 30 Jun 2022 16:39:42 +0100 Subject: [PATCH 022/111] [ETHOSN][CPP-RPC] Link NPU runtime in CPP RPC build (#11946) When building the CPP RPC package with the NPU enabled, `link_directories` fails to find the NPU runtime libraries. This is presumably because the TVM runtime is linked with the PRIVATE option in: https://github.com/apache/tvm/blob/main/CMakeLists.txt#L601. Therefore working around this by following the precedent of other libraries such as Hexagon and Open CL. Change-Id: Iba2fbc245df18147e3b564ba807ca78c9cc8461d --- apps/cpp_rpc/CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/apps/cpp_rpc/CMakeLists.txt b/apps/cpp_rpc/CMakeLists.txt index 2fb8923d39c31..fc3aafcc4443a 100644 --- a/apps/cpp_rpc/CMakeLists.txt +++ b/apps/cpp_rpc/CMakeLists.txt @@ -57,6 +57,14 @@ if (BUILD_FOR_ANDROID AND USE_HEXAGON) list(APPEND TVM_RPC_LINKER_LIBS cdsprpc log) endif() +if(USE_ETHOSN) + if (ETHOSN_RUNTIME_LIBRARY) + list(APPEND TVM_RPC_LINKER_LIBS ${ETHOSN_RUNTIME_LIBRARY}) + else() + message(WARNING "Could not locate Arm(R) Ethos(TM)-N runtime library components") + endif() +endif() + if(BUILD_STATIC_RUNTIME) list(APPEND TVM_RPC_LINKER_LIBS -Wl,--whole-archive tvm_runtime -Wl,--no-whole-archive) else() From c0f4bf72b6ee30648ef78ce865afc733c95fe98c Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Thu, 30 Jun 2022 09:10:12 -0700 Subject: [PATCH 023/111] [ci] Redirect sphinx-gallery URLs to S3 (#11839) Co-authored-by: driazati --- gallery/how_to/compile_models/from_coreml.py | 6 ++ gallery/how_to/compile_models/from_darknet.py | 6 ++ gallery/how_to/compile_models/from_keras.py | 6 ++ gallery/how_to/compile_models/from_mxnet.py | 6 ++ gallery/how_to/compile_models/from_oneflow.py | 6 ++ gallery/how_to/compile_models/from_onnx.py | 6 ++ gallery/how_to/compile_models/from_paddle.py | 6 ++ gallery/how_to/compile_models/from_pytorch.py | 6 ++ .../how_to/compile_models/from_tensorflow.py | 6 ++ gallery/how_to/compile_models/from_tflite.py | 6 ++ .../deploy_models/deploy_model_on_android.py | 6 ++ .../deploy_models/deploy_model_on_rasp.py | 6 ++ .../deploy_object_detection_pytorch.py | 6 ++ .../deploy_models/deploy_prequantized.py | 6 ++ .../deploy_prequantized_tflite.py | 6 ++ .../how_to/deploy_models/deploy_quantized.py | 6 ++ gallery/how_to/deploy_models/deploy_sparse.py | 6 ++ .../deploy_models/deploy_ssd_gluoncv.py | 6 ++ .../extend_tvm/bring_your_own_datatypes.py | 6 ++ .../extend_tvm/low_level_custom_pass.py | 6 ++ gallery/how_to/extend_tvm/use_pass_infra.py | 6 ++ .../how_to/extend_tvm/use_pass_instrument.py | 6 ++ .../optimize_operators/opt_conv_cuda.py | 6 ++ .../optimize_operators/opt_conv_tensorcore.py | 6 ++ gallery/how_to/optimize_operators/opt_gemm.py | 6 ++ .../tune_conv2d_layer_cuda.py | 6 ++ .../tune_network_arm.py | 6 ++ .../tune_network_cuda.py | 6 ++ .../tune_network_mali.py | 6 ++ .../tune_network_x86.py | 6 ++ .../tune_sparse_x86.py | 6 ++ .../tune_with_autotvm/tune_conv2d_cuda.py | 6 ++ .../tune_with_autotvm/tune_relay_arm.py | 6 ++ .../tune_with_autotvm/tune_relay_cuda.py | 6 ++ .../tune_relay_mobile_gpu.py | 6 ++ .../tune_with_autotvm/tune_relay_x86.py | 6 ++ 
.../work_with_microtvm/micro_autotune.py | 6 ++ .../how_to/work_with_microtvm/micro_ethosu.py | 6 ++ .../work_with_microtvm/micro_reference_vm.py | 6 ++ .../how_to/work_with_microtvm/micro_tflite.py | 6 ++ gallery/how_to/work_with_relay/build_gcn.py | 6 ++ .../work_with_relay/using_external_lib.py | 6 ++ .../how_to/work_with_relay/using_relay_viz.py | 6 ++ .../how_to/work_with_schedules/extern_op.py | 6 ++ .../how_to/work_with_schedules/intrin_math.py | 8 +- .../how_to/work_with_schedules/reduction.py | 6 ++ gallery/how_to/work_with_schedules/scan.py | 6 ++ .../schedule_primitives.py | 6 ++ gallery/how_to/work_with_schedules/tedd.py | 6 ++ .../how_to/work_with_schedules/tensorize.py | 6 ++ .../work_with_schedules/tuple_inputs.py | 6 ++ gallery/tutorial/auto_scheduler_matmul_x86.py | 6 ++ gallery/tutorial/autotvm_matmul_x86.py | 6 ++ gallery/tutorial/autotvm_relay_x86.py | 6 ++ gallery/tutorial/cross_compilation_and_rpc.py | 6 ++ gallery/tutorial/install.py | 6 ++ gallery/tutorial/intro_topi.py | 6 ++ gallery/tutorial/introduction.py | 6 ++ gallery/tutorial/relay_quick_start.py | 6 ++ gallery/tutorial/tensor_expr_get_started.py | 6 ++ gallery/tutorial/tensor_ir_blitz_course.py | 6 ++ gallery/tutorial/tvmc_command_line_driver.py | 6 ++ gallery/tutorial/tvmc_python.py | 6 ++ python/tvm/testing/utils.py | 47 ++++++++++ tests/lint/check_request_hook.py | 92 +++++++++++++++++++ tests/scripts/request_hook/request_hook.py | 61 ++++++++++++ tests/scripts/task_lint.sh | 3 + 67 files changed, 582 insertions(+), 1 deletion(-) create mode 100644 tests/lint/check_request_hook.py create mode 100644 tests/scripts/request_hook/request_hook.py diff --git a/gallery/how_to/compile_models/from_coreml.py b/gallery/how_to/compile_models/from_coreml.py index 98d1969f3639d..96d2967947f6e 100644 --- a/gallery/how_to/compile_models/from_coreml.py +++ b/gallery/how_to/compile_models/from_coreml.py @@ -34,6 +34,12 @@ or please refer to official site https://github.com/apple/coremltools """ + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore import tvm from tvm import te import tvm.relay as relay diff --git a/gallery/how_to/compile_models/from_darknet.py b/gallery/how_to/compile_models/from_darknet.py index 232058641ab00..c12a9e7e1574b 100644 --- a/gallery/how_to/compile_models/from_darknet.py +++ b/gallery/how_to/compile_models/from_darknet.py @@ -31,6 +31,12 @@ pip install opencv-python """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + # numpy and matplotlib import numpy as np import matplotlib.pyplot as plt diff --git a/gallery/how_to/compile_models/from_keras.py b/gallery/how_to/compile_models/from_keras.py index 1db27799fe4c1..895a601ada0ad 100644 --- a/gallery/how_to/compile_models/from_keras.py +++ b/gallery/how_to/compile_models/from_keras.py @@ -34,6 +34,12 @@ or please refer to official site https://keras.io/#installation """ + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore import tvm from tvm import te import tvm.relay as relay diff --git a/gallery/how_to/compile_models/from_mxnet.py b/gallery/how_to/compile_models/from_mxnet.py index 027e9e6eb757c..38084618628fd 100644 --- a/gallery/how_to/compile_models/from_mxnet.py +++ b/gallery/how_to/compile_models/from_mxnet.py @@ -35,6 +35,12 @@ or please refer to official installation guide. 
https://mxnet.apache.org/versions/master/install/index.html """ + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore # some standard imports import mxnet as mx import tvm diff --git a/gallery/how_to/compile_models/from_oneflow.py b/gallery/how_to/compile_models/from_oneflow.py index f92f0b0f1e220..eb27c4b3e34b7 100644 --- a/gallery/how_to/compile_models/from_oneflow.py +++ b/gallery/how_to/compile_models/from_oneflow.py @@ -35,6 +35,12 @@ Currently, TVM supports OneFlow 0.7.0. Other versions may be unstable. """ + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore import os, math from matplotlib import pyplot as plt import numpy as np diff --git a/gallery/how_to/compile_models/from_onnx.py b/gallery/how_to/compile_models/from_onnx.py index 586c811aa627a..f0256bc7d3ae5 100644 --- a/gallery/how_to/compile_models/from_onnx.py +++ b/gallery/how_to/compile_models/from_onnx.py @@ -32,6 +32,12 @@ or please refer to official site. https://github.com/onnx/onnx """ + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore import onnx import numpy as np import tvm diff --git a/gallery/how_to/compile_models/from_paddle.py b/gallery/how_to/compile_models/from_paddle.py index 9d67cbcdf9ff5..fecb1c48dafbf 100644 --- a/gallery/how_to/compile_models/from_paddle.py +++ b/gallery/how_to/compile_models/from_paddle.py @@ -30,6 +30,12 @@ or please refer to official site. https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html """ + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore import tarfile import paddle import numpy as np diff --git a/gallery/how_to/compile_models/from_pytorch.py b/gallery/how_to/compile_models/from_pytorch.py index e8d0b4998f9e5..98b531fa6d6e3 100644 --- a/gallery/how_to/compile_models/from_pytorch.py +++ b/gallery/how_to/compile_models/from_pytorch.py @@ -41,6 +41,12 @@ be unstable. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + import tvm from tvm import relay diff --git a/gallery/how_to/compile_models/from_tensorflow.py b/gallery/how_to/compile_models/from_tensorflow.py index 4563e245c0cf0..9a32397815efb 100644 --- a/gallery/how_to/compile_models/from_tensorflow.py +++ b/gallery/how_to/compile_models/from_tensorflow.py @@ -24,6 +24,12 @@ Please refer to https://www.tensorflow.org/install """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + # tvm, relay import tvm from tvm import te diff --git a/gallery/how_to/compile_models/from_tflite.py b/gallery/how_to/compile_models/from_tflite.py index b720402366542..712269381f840 100644 --- a/gallery/how_to/compile_models/from_tflite.py +++ b/gallery/how_to/compile_models/from_tflite.py @@ -52,6 +52,12 @@ Below you can find an example on how to compile TFLite model using TVM. 
""" + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore ###################################################################### # Utils for downloading and extracting zip files # ---------------------------------------------- diff --git a/gallery/how_to/deploy_models/deploy_model_on_android.py b/gallery/how_to/deploy_models/deploy_model_on_android.py index c7b610d5d5034..10e108239ee75 100644 --- a/gallery/how_to/deploy_models/deploy_model_on_android.py +++ b/gallery/how_to/deploy_models/deploy_model_on_android.py @@ -25,6 +25,12 @@ This is an example of using Relay to compile a keras model and deploy it on Android device. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + import os import numpy as np from PIL import Image diff --git a/gallery/how_to/deploy_models/deploy_model_on_rasp.py b/gallery/how_to/deploy_models/deploy_model_on_rasp.py index de4ed9aff074c..ab5374d93dbf5 100644 --- a/gallery/how_to/deploy_models/deploy_model_on_rasp.py +++ b/gallery/how_to/deploy_models/deploy_model_on_rasp.py @@ -26,6 +26,12 @@ it on Raspberry Pi. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + import tvm from tvm import te import tvm.relay as relay diff --git a/gallery/how_to/deploy_models/deploy_object_detection_pytorch.py b/gallery/how_to/deploy_models/deploy_object_detection_pytorch.py index b5b0e4acf1f60..0d8d0f2867a2d 100644 --- a/gallery/how_to/deploy_models/deploy_object_detection_pytorch.py +++ b/gallery/how_to/deploy_models/deploy_object_detection_pytorch.py @@ -40,6 +40,12 @@ be unstable. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + import tvm from tvm import relay from tvm import relay diff --git a/gallery/how_to/deploy_models/deploy_prequantized.py b/gallery/how_to/deploy_models/deploy_prequantized.py index caee2b3b415ae..fdb4de289d913 100644 --- a/gallery/how_to/deploy_models/deploy_prequantized.py +++ b/gallery/how_to/deploy_models/deploy_prequantized.py @@ -28,6 +28,12 @@ Once loaded, we can run compiled, quantized models on any hardware TVM supports. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + ################################################################################# # First, necessary imports from PIL import Image diff --git a/gallery/how_to/deploy_models/deploy_prequantized_tflite.py b/gallery/how_to/deploy_models/deploy_prequantized_tflite.py index 830e2ab074662..494b4a9e219b4 100644 --- a/gallery/how_to/deploy_models/deploy_prequantized_tflite.py +++ b/gallery/how_to/deploy_models/deploy_prequantized_tflite.py @@ -42,6 +42,12 @@ """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + ############################################################################### # Necessary imports # ----------------- diff --git a/gallery/how_to/deploy_models/deploy_quantized.py b/gallery/how_to/deploy_models/deploy_quantized.py index 2d9275796eb5c..24c7ce3331f54 100644 --- a/gallery/how_to/deploy_models/deploy_quantized.py +++ b/gallery/how_to/deploy_models/deploy_quantized.py @@ -27,6 +27,12 @@ Relay, quantize the Relay model and then perform the inference. 
""" +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + import tvm from tvm import te from tvm import relay diff --git a/gallery/how_to/deploy_models/deploy_sparse.py b/gallery/how_to/deploy_models/deploy_sparse.py index 56a5f1aafd1ce..b9a26e0d30532 100644 --- a/gallery/how_to/deploy_models/deploy_sparse.py +++ b/gallery/how_to/deploy_models/deploy_sparse.py @@ -70,6 +70,12 @@ sparse speed using fake weights to see the benefit of structured sparsity. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + ############################################################################### # Load Required Modules # --------------------- diff --git a/gallery/how_to/deploy_models/deploy_ssd_gluoncv.py b/gallery/how_to/deploy_models/deploy_ssd_gluoncv.py index ebe18670c6a38..f39244a2eb037 100644 --- a/gallery/how_to/deploy_models/deploy_ssd_gluoncv.py +++ b/gallery/how_to/deploy_models/deploy_ssd_gluoncv.py @@ -23,6 +23,12 @@ This article is an introductory tutorial to deploy SSD models with TVM. We will use GluonCV pre-trained SSD model and convert it to Relay IR """ + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore import tvm from tvm import te diff --git a/gallery/how_to/extend_tvm/bring_your_own_datatypes.py b/gallery/how_to/extend_tvm/bring_your_own_datatypes.py index 1a48781e24336..479269a224a3f 100644 --- a/gallery/how_to/extend_tvm/bring_your_own_datatypes.py +++ b/gallery/how_to/extend_tvm/bring_your_own_datatypes.py @@ -52,6 +52,12 @@ ctypes.CDLL('my-datatype-lib.so', ctypes.RTLD_GLOBAL) """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + ###################### # A Simple TVM Program # -------------------- diff --git a/gallery/how_to/extend_tvm/low_level_custom_pass.py b/gallery/how_to/extend_tvm/low_level_custom_pass.py index ee96d8220cac3..0f99c72cee9cc 100644 --- a/gallery/how_to/extend_tvm/low_level_custom_pass.py +++ b/gallery/how_to/extend_tvm/low_level_custom_pass.py @@ -40,6 +40,12 @@ take a look at ``python/tvm/build_module.py`` to get some basics. """ + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore import tvm from tvm import te import numpy as np diff --git a/gallery/how_to/extend_tvm/use_pass_infra.py b/gallery/how_to/extend_tvm/use_pass_infra.py index e38383e69011a..a41a26fc0b1e2 100644 --- a/gallery/how_to/extend_tvm/use_pass_infra.py +++ b/gallery/how_to/extend_tvm/use_pass_infra.py @@ -40,6 +40,12 @@ The same approach can be used for tir as well. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + import numpy as np import tvm from tvm import te diff --git a/gallery/how_to/extend_tvm/use_pass_instrument.py b/gallery/how_to/extend_tvm/use_pass_instrument.py index 036aa63e374f0..3079e2f0e7639 100644 --- a/gallery/how_to/extend_tvm/use_pass_instrument.py +++ b/gallery/how_to/extend_tvm/use_pass_instrument.py @@ -33,6 +33,12 @@ This tutorial demonstrates how developers can use ``PassContext`` to instrument passes. Please also refer to the :ref:`pass-infra`. 
""" + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore import tvm import tvm.relay as relay from tvm.relay.testing import resnet diff --git a/gallery/how_to/optimize_operators/opt_conv_cuda.py b/gallery/how_to/optimize_operators/opt_conv_cuda.py index 3d2caa0d31214..e5b452af66a95 100644 --- a/gallery/how_to/optimize_operators/opt_conv_cuda.py +++ b/gallery/how_to/optimize_operators/opt_conv_cuda.py @@ -30,6 +30,12 @@ """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + ################################################################ # Preparation and Algorithm # ------------------------- diff --git a/gallery/how_to/optimize_operators/opt_conv_tensorcore.py b/gallery/how_to/optimize_operators/opt_conv_tensorcore.py index ccfc7b9743aaa..4cc2b40b7b8c8 100644 --- a/gallery/how_to/optimize_operators/opt_conv_tensorcore.py +++ b/gallery/how_to/optimize_operators/opt_conv_tensorcore.py @@ -27,6 +27,12 @@ """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + ################################################################ # TensorCore Introduction # ----------------------- diff --git a/gallery/how_to/optimize_operators/opt_gemm.py b/gallery/how_to/optimize_operators/opt_gemm.py index 920d7a87fabf9..d2ec711c2b29a 100644 --- a/gallery/how_to/optimize_operators/opt_gemm.py +++ b/gallery/how_to/optimize_operators/opt_gemm.py @@ -48,6 +48,12 @@ Intel i7-4770HQ CPU. The cache line size should be 64 bytes for all the x86 CPUs. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + ################################################################################################ # Preparation and Baseline # ------------------------ diff --git a/gallery/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.py b/gallery/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.py index a4f7e22d89c44..5d173e38128eb 100644 --- a/gallery/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.py +++ b/gallery/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.py @@ -37,6 +37,12 @@ __name__ == "__main__":` block. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + import os import numpy as np diff --git a/gallery/how_to/tune_with_autoscheduler/tune_network_arm.py b/gallery/how_to/tune_with_autoscheduler/tune_network_arm.py index 9c5820c991e8a..09a1d0cea5208 100644 --- a/gallery/how_to/tune_with_autoscheduler/tune_network_arm.py +++ b/gallery/how_to/tune_with_autoscheduler/tune_network_arm.py @@ -46,6 +46,12 @@ __name__ == "__main__":` block. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + import numpy as np import os diff --git a/gallery/how_to/tune_with_autoscheduler/tune_network_cuda.py b/gallery/how_to/tune_with_autoscheduler/tune_network_cuda.py index b403c0aa84fbb..cc29f27ba22be 100644 --- a/gallery/how_to/tune_with_autoscheduler/tune_network_cuda.py +++ b/gallery/how_to/tune_with_autoscheduler/tune_network_cuda.py @@ -44,6 +44,12 @@ __name__ == "__main__":` block. 
""" +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + import numpy as np import tvm diff --git a/gallery/how_to/tune_with_autoscheduler/tune_network_mali.py b/gallery/how_to/tune_with_autoscheduler/tune_network_mali.py index 2d1e515209528..8ac0b235d72ef 100644 --- a/gallery/how_to/tune_with_autoscheduler/tune_network_mali.py +++ b/gallery/how_to/tune_with_autoscheduler/tune_network_mali.py @@ -44,6 +44,12 @@ __name__ == "__main__":` block. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + import numpy as np import tvm diff --git a/gallery/how_to/tune_with_autoscheduler/tune_network_x86.py b/gallery/how_to/tune_with_autoscheduler/tune_network_x86.py index 6cb8d6f14cb9b..5a321104c8e42 100644 --- a/gallery/how_to/tune_with_autoscheduler/tune_network_x86.py +++ b/gallery/how_to/tune_with_autoscheduler/tune_network_x86.py @@ -45,6 +45,12 @@ __name__ == "__main__":` block. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + import numpy as np import tvm diff --git a/gallery/how_to/tune_with_autoscheduler/tune_sparse_x86.py b/gallery/how_to/tune_with_autoscheduler/tune_sparse_x86.py index 55ee76ef6c4f0..0a2ddbd1bd817 100644 --- a/gallery/how_to/tune_with_autoscheduler/tune_sparse_x86.py +++ b/gallery/how_to/tune_with_autoscheduler/tune_sparse_x86.py @@ -35,6 +35,12 @@ __name__ == "__main__":` block. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + import os import numpy as np diff --git a/gallery/how_to/tune_with_autotvm/tune_conv2d_cuda.py b/gallery/how_to/tune_with_autotvm/tune_conv2d_cuda.py index e3072773bf593..95d6dcb0a19c4 100644 --- a/gallery/how_to/tune_with_autotvm/tune_conv2d_cuda.py +++ b/gallery/how_to/tune_with_autotvm/tune_conv2d_cuda.py @@ -28,6 +28,12 @@ __name__ == "__main__":` block. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + ###################################################################### # Install dependencies # -------------------- diff --git a/gallery/how_to/tune_with_autotvm/tune_relay_arm.py b/gallery/how_to/tune_with_autotvm/tune_relay_arm.py index f072c5ddac935..ab278021d2cad 100644 --- a/gallery/how_to/tune_with_autotvm/tune_relay_arm.py +++ b/gallery/how_to/tune_with_autotvm/tune_relay_arm.py @@ -41,6 +41,12 @@ __name__ == "__main__":` block. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + ###################################################################### # Install dependencies # -------------------- diff --git a/gallery/how_to/tune_with_autotvm/tune_relay_cuda.py b/gallery/how_to/tune_with_autotvm/tune_relay_cuda.py index b2af2e13f4fe4..459b2798c295a 100644 --- a/gallery/how_to/tune_with_autotvm/tune_relay_cuda.py +++ b/gallery/how_to/tune_with_autotvm/tune_relay_cuda.py @@ -39,6 +39,12 @@ __name__ == "__main__":` block. 
""" +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + ###################################################################### # Install dependencies # -------------------- diff --git a/gallery/how_to/tune_with_autotvm/tune_relay_mobile_gpu.py b/gallery/how_to/tune_with_autotvm/tune_relay_mobile_gpu.py index d3f4ec62fafc9..5a4f0c56d2e75 100644 --- a/gallery/how_to/tune_with_autotvm/tune_relay_mobile_gpu.py +++ b/gallery/how_to/tune_with_autotvm/tune_relay_mobile_gpu.py @@ -39,6 +39,12 @@ __name__ == "__main__":` block. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + ###################################################################### # Install dependencies # -------------------- diff --git a/gallery/how_to/tune_with_autotvm/tune_relay_x86.py b/gallery/how_to/tune_with_autotvm/tune_relay_x86.py index 771220bb33149..6e46fbd8ffc88 100644 --- a/gallery/how_to/tune_with_autotvm/tune_relay_x86.py +++ b/gallery/how_to/tune_with_autotvm/tune_relay_x86.py @@ -28,6 +28,12 @@ get it to run, you will need to wrap the body of this tutorial in a :code:`if __name__ == "__main__":` block. """ + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore import os import numpy as np diff --git a/gallery/how_to/work_with_microtvm/micro_autotune.py b/gallery/how_to/work_with_microtvm/micro_autotune.py index 613d92e1413e4..58c52508b7c2a 100644 --- a/gallery/how_to/work_with_microtvm/micro_autotune.py +++ b/gallery/how_to/work_with_microtvm/micro_autotune.py @@ -27,6 +27,12 @@ This tutorial explains how to autotune a model using the C runtime. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + import os import json import numpy as np diff --git a/gallery/how_to/work_with_microtvm/micro_ethosu.py b/gallery/how_to/work_with_microtvm/micro_ethosu.py index f55fad71dda10..8e37a0ea5ec4e 100644 --- a/gallery/how_to/work_with_microtvm/micro_ethosu.py +++ b/gallery/how_to/work_with_microtvm/micro_ethosu.py @@ -37,6 +37,12 @@ TVM to offload operators to the Ethos(TM)-U55 where possible. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + ################################################################################ # Obtaining TVM # ------------- diff --git a/gallery/how_to/work_with_microtvm/micro_reference_vm.py b/gallery/how_to/work_with_microtvm/micro_reference_vm.py index 9eacd9a963e1f..b87a7265649f2 100644 --- a/gallery/how_to/work_with_microtvm/micro_reference_vm.py +++ b/gallery/how_to/work_with_microtvm/micro_reference_vm.py @@ -157,3 +157,9 @@ """ + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore diff --git a/gallery/how_to/work_with_microtvm/micro_tflite.py b/gallery/how_to/work_with_microtvm/micro_tflite.py index 3d871ba783ad8..dfe33eedac75b 100644 --- a/gallery/how_to/work_with_microtvm/micro_tflite.py +++ b/gallery/how_to/work_with_microtvm/micro_tflite.py @@ -25,6 +25,12 @@ model with Relay. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + ###################################################################### # .. 
note:: # If you want to run this tutorial on the microTVM Reference VM, download the Jupyter diff --git a/gallery/how_to/work_with_relay/build_gcn.py b/gallery/how_to/work_with_relay/build_gcn.py index fcffbd77ff86b..8953ffc2e474c 100644 --- a/gallery/how_to/work_with_relay/build_gcn.py +++ b/gallery/how_to/work_with_relay/build_gcn.py @@ -118,6 +118,12 @@ def evaluate(data, logits): num_classes: int dimension of model output (Number of classes) """ + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore dataset = "cora" g, data = load_dataset(dataset) diff --git a/gallery/how_to/work_with_relay/using_external_lib.py b/gallery/how_to/work_with_relay/using_external_lib.py index 8b6957d1dbf6f..c018ee13c724e 100644 --- a/gallery/how_to/work_with_relay/using_external_lib.py +++ b/gallery/how_to/work_with_relay/using_external_lib.py @@ -31,6 +31,12 @@ To begin with, we import Relay and TVM. """ + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore import tvm from tvm import te import numpy as np diff --git a/gallery/how_to/work_with_relay/using_relay_viz.py b/gallery/how_to/work_with_relay/using_relay_viz.py index b0132f40b9b51..2e68ce902899c 100644 --- a/gallery/how_to/work_with_relay/using_relay_viz.py +++ b/gallery/how_to/work_with_relay/using_relay_viz.py @@ -35,6 +35,12 @@ For more details, please refer to :py:mod:`tvm.contrib.relay_viz`. """ + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore from typing import ( Dict, Union, diff --git a/gallery/how_to/work_with_schedules/extern_op.py b/gallery/how_to/work_with_schedules/extern_op.py index a0aa5d72450c0..ad741a08d54c5 100644 --- a/gallery/how_to/work_with_schedules/extern_op.py +++ b/gallery/how_to/work_with_schedules/extern_op.py @@ -31,6 +31,12 @@ """ from __future__ import absolute_import, print_function + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore import tvm from tvm import te import numpy as np diff --git a/gallery/how_to/work_with_schedules/intrin_math.py b/gallery/how_to/work_with_schedules/intrin_math.py index 535563bfb5306..5a8732abd7764 100644 --- a/gallery/how_to/work_with_schedules/intrin_math.py +++ b/gallery/how_to/work_with_schedules/intrin_math.py @@ -29,7 +29,13 @@ the interface via TVM's intrinsic API. 
""" from __future__ import absolute_import, print_function -import numpy as np + + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignoreimport numpy as np import tvm from tvm import te diff --git a/gallery/how_to/work_with_schedules/reduction.py b/gallery/how_to/work_with_schedules/reduction.py index 164f36dafc798..432e9cd143b18 100644 --- a/gallery/how_to/work_with_schedules/reduction.py +++ b/gallery/how_to/work_with_schedules/reduction.py @@ -27,6 +27,12 @@ """ from __future__ import absolute_import, print_function + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore import tvm import tvm.testing from tvm import te diff --git a/gallery/how_to/work_with_schedules/scan.py b/gallery/how_to/work_with_schedules/scan.py index 3f3d7e91ee1c1..d21673acd9e49 100644 --- a/gallery/how_to/work_with_schedules/scan.py +++ b/gallery/how_to/work_with_schedules/scan.py @@ -24,6 +24,12 @@ """ from __future__ import absolute_import, print_function + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore import tvm import tvm.testing from tvm import te diff --git a/gallery/how_to/work_with_schedules/schedule_primitives.py b/gallery/how_to/work_with_schedules/schedule_primitives.py index 65fdeda57c3be..af67ed1527a0c 100644 --- a/gallery/how_to/work_with_schedules/schedule_primitives.py +++ b/gallery/how_to/work_with_schedules/schedule_primitives.py @@ -28,6 +28,12 @@ """ from __future__ import absolute_import, print_function + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore import tvm from tvm import te import numpy as np diff --git a/gallery/how_to/work_with_schedules/tedd.py b/gallery/how_to/work_with_schedules/tedd.py index 34ad43c220da1..7cb24f433587e 100644 --- a/gallery/how_to/work_with_schedules/tedd.py +++ b/gallery/how_to/work_with_schedules/tedd.py @@ -37,6 +37,12 @@ how to use TEDD and how to interpret the rendered graphs. 
""" + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore import tvm from tvm import te from tvm import topi diff --git a/gallery/how_to/work_with_schedules/tensorize.py b/gallery/how_to/work_with_schedules/tensorize.py index 40e68074adc87..45eaf349f37b0 100644 --- a/gallery/how_to/work_with_schedules/tensorize.py +++ b/gallery/how_to/work_with_schedules/tensorize.py @@ -34,6 +34,12 @@ """ from __future__ import absolute_import, print_function + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore import tvm from tvm import te import tvm.testing diff --git a/gallery/how_to/work_with_schedules/tuple_inputs.py b/gallery/how_to/work_with_schedules/tuple_inputs.py index 73db7b90a7d62..86ec8b2d196b7 100644 --- a/gallery/how_to/work_with_schedules/tuple_inputs.py +++ b/gallery/how_to/work_with_schedules/tuple_inputs.py @@ -27,6 +27,12 @@ """ from __future__ import absolute_import, print_function + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore import tvm from tvm import te import numpy as np diff --git a/gallery/tutorial/auto_scheduler_matmul_x86.py b/gallery/tutorial/auto_scheduler_matmul_x86.py index b9f89f6723c9b..279987f00d819 100644 --- a/gallery/tutorial/auto_scheduler_matmul_x86.py +++ b/gallery/tutorial/auto_scheduler_matmul_x86.py @@ -38,6 +38,12 @@ __name__ == "__main__":` block. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + import os import numpy as np diff --git a/gallery/tutorial/autotvm_matmul_x86.py b/gallery/tutorial/autotvm_matmul_x86.py index b84a6193cde6e..ebdbacb221534 100644 --- a/gallery/tutorial/autotvm_matmul_x86.py +++ b/gallery/tutorial/autotvm_matmul_x86.py @@ -45,6 +45,12 @@ :code:`if __name__ == "__main__":` block. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + ################################################################################ # Install dependencies # -------------------- diff --git a/gallery/tutorial/autotvm_relay_x86.py b/gallery/tutorial/autotvm_relay_x86.py index 4e5714a6db328..b7dfbe28f462e 100644 --- a/gallery/tutorial/autotvm_relay_x86.py +++ b/gallery/tutorial/autotvm_relay_x86.py @@ -42,6 +42,12 @@ how to use them through the Python API. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + ################################################################################ # TVM is a deep learning compiler framework, with a number of different modules # available for working with deep learning models and operators. In this diff --git a/gallery/tutorial/cross_compilation_and_rpc.py b/gallery/tutorial/cross_compilation_and_rpc.py index 25208369f74dd..3f74899f7b1de 100644 --- a/gallery/tutorial/cross_compilation_and_rpc.py +++ b/gallery/tutorial/cross_compilation_and_rpc.py @@ -31,6 +31,12 @@ and the Firefly-RK3399 for an OpenCL example. 
""" +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + ###################################################################### # Build TVM Runtime on Device # --------------------------- diff --git a/gallery/tutorial/install.py b/gallery/tutorial/install.py index 0eb3ccc94c064..a499b037940cd 100644 --- a/gallery/tutorial/install.py +++ b/gallery/tutorial/install.py @@ -28,6 +28,12 @@ * Installing from third-party binary package. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + ################################################################################ # Installing From Source # ---------------------- diff --git a/gallery/tutorial/intro_topi.py b/gallery/tutorial/intro_topi.py index 17fa3ff370e54..e10a74c849c03 100644 --- a/gallery/tutorial/intro_topi.py +++ b/gallery/tutorial/intro_topi.py @@ -26,6 +26,12 @@ In this tutorial, we will see how TOPI can save us from writing boilerplate code in TVM. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + import tvm import tvm.testing from tvm import te diff --git a/gallery/tutorial/introduction.py b/gallery/tutorial/introduction.py index 5fe4b4e5f775d..908a8e52c751f 100644 --- a/gallery/tutorial/introduction.py +++ b/gallery/tutorial/introduction.py @@ -45,6 +45,12 @@ #. :doc:`Compiling Deep Learning Models for GPUs ` """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + ################################################################################ # An Overview of TVM and Model Optimization # ========================================= diff --git a/gallery/tutorial/relay_quick_start.py b/gallery/tutorial/relay_quick_start.py index fd7f5aa9d7563..8910817c21177 100644 --- a/gallery/tutorial/relay_quick_start.py +++ b/gallery/tutorial/relay_quick_start.py @@ -26,6 +26,12 @@ Notice that you need to build TVM with cuda and llvm enabled. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + ###################################################################### # Overview for Supported Hardware Backend of TVM # ---------------------------------------------- diff --git a/gallery/tutorial/tensor_expr_get_started.py b/gallery/tutorial/tensor_expr_get_started.py index 25ea4e8a55ee5..11186d2f1458d 100644 --- a/gallery/tutorial/tensor_expr_get_started.py +++ b/gallery/tutorial/tensor_expr_get_started.py @@ -39,6 +39,12 @@ features of TVM. 
""" +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + ################################################################################ # Example 1: Writing and Scheduling Vector Addition in TE for CPU # --------------------------------------------------------------- diff --git a/gallery/tutorial/tensor_ir_blitz_course.py b/gallery/tutorial/tensor_ir_blitz_course.py index 11edc7ae9f3b9..a62fa3979393e 100644 --- a/gallery/tutorial/tensor_ir_blitz_course.py +++ b/gallery/tutorial/tensor_ir_blitz_course.py @@ -29,6 +29,12 @@ """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + import tvm from tvm.ir.module import IRModule from tvm.script import tir as T diff --git a/gallery/tutorial/tvmc_command_line_driver.py b/gallery/tutorial/tvmc_command_line_driver.py index 48e3703beb75d..ad5b37190c103 100644 --- a/gallery/tutorial/tvmc_command_line_driver.py +++ b/gallery/tutorial/tvmc_command_line_driver.py @@ -41,6 +41,12 @@ capabilities, and set the stage for understanding how TVM works. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + ################################################################################ # Using TVMC # ---------- diff --git a/gallery/tutorial/tvmc_python.py b/gallery/tutorial/tvmc_python.py index 6efc565f0a391..28b0a97450461 100644 --- a/gallery/tutorial/tvmc_python.py +++ b/gallery/tutorial/tvmc_python.py @@ -36,6 +36,12 @@ Let's start editing the python file in your favorite text editor. """ +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + ################################################################################ # Step 0: Imports # ~~~~~~~~~~~~~~~ diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index 5a6ded9bcb709..96275e2af66f4 100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -76,6 +76,7 @@ def test_something(): import sys import time +from pathlib import Path from typing import Optional, Callable, Union, List import pytest @@ -93,6 +94,7 @@ def test_something(): SKIP_SLOW_TESTS = os.getenv("SKIP_SLOW_TESTS", "").lower() in {"true", "1", "yes"} +IS_IN_CI = os.getenv("CI", "") == "true" skip_if_wheel_test = pytest.mark.skipif( os.getenv("WHEEL_TEST") is not None, reason="Test not supported in wheel." 
@@ -1613,6 +1615,51 @@ def is_ampere_or_newer():
     return major >= 8
 
 
+def install_request_hook(depth: int) -> None:
+    """Add a wrapper around urllib.request for CI tests"""
+    if not IS_IN_CI:
+        return
+
+    # https://sphinx-gallery.github.io/stable/faq.html#why-is-file-not-defined-what-can-i-use
+    base = None
+    msg = ""
+    try:
+        base = __file__
+        msg += f"found file {__file__}\n"
+    except NameError:
+        msg += "no file\n"
+
+    if base is None:
+        hook_script_dir = Path.cwd().resolve()
+        msg += "used path.cwd()\n"
+    else:
+        hook_script_dir = Path(base).resolve().parent
+        msg += "used base()\n"
+
+    msg += f"using depth {depth}\n"
+    if depth <= 0:
+        raise ValueError(f"depth less than 1 not supported, found: {depth}")
+
+    # Go up the parent directories
+    while depth > 0:
+        msg += f"[depth={depth}] dir={hook_script_dir}\n"
+        hook_script_dir = hook_script_dir.parent
+        depth -= 1
+
+    # Ensure the specified dir is valid
+    hook_script_dir = hook_script_dir / "tests" / "scripts" / "request_hook"
+    if not hook_script_dir.exists():
+        raise RuntimeError(f"Directory {hook_script_dir} does not exist:\n{msg}")
+
+    # Import the hook and start it up (it's not included here directly to avoid
+    # keeping a database of URLs inside the tvm Python package)
+    sys.path.append(str(hook_script_dir))
+    # This import is intentionally delayed since it should only happen in CI
+    import request_hook  # pylint: disable=import-outside-toplevel
+
+    request_hook.init()
+
+
 def main():
     test_file = inspect.getsourcefile(sys._getframe(1))
     sys.exit(pytest.main([test_file] + sys.argv[1:]))
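The directory walk above is plain parent-climbing arithmetic. An illustration with a made-up checkout location (the /repo prefix is hypothetical): for a caller whose __file__ sits three directories below the repository root, depth=3 lands exactly on the root, under which tests/scripts/request_hook is expected:

from pathlib import Path

base = Path("/repo/python/tvm/testing/utils.py")  # assumed location of the calling file
hook_script_dir = base.resolve().parent           # /repo/python/tvm/testing
for _ in range(3):                                # depth=3
    hook_script_dir = hook_script_dir.parent      # ends at /repo
print(hook_script_dir / "tests" / "scripts" / "request_hook")
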
diff --git a/tests/lint/check_request_hook.py b/tests/lint/check_request_hook.py
new file mode 100644
index 0000000000000..6e5c523d1187d
--- /dev/null
+++ b/tests/lint/check_request_hook.py
@@ -0,0 +1,92 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import argparse
+import fnmatch
+import re
+from pathlib import Path
+
+
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent
+EXPECTED = """
+# sphinx_gallery_start_ignore
+from tvm import testing
+
+testing.utils.install_request_hook(depth=3)
+# sphinx_gallery_end_ignore
+""".rstrip()
+IGNORE_PATTERNS = ["*/micro_tvmc.py", "*/micro_train.py"]
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Check that all tutorials/docs override urllib.request.Request"
+    )
+    parser.add_argument(
+        "--fix", action="store_true", help="Insert expected code into erroring files"
+    )
+    args = parser.parse_args()
+
+    gallery_files = (REPO_ROOT / "gallery").glob("**/*.py")
+
+    errors = []
+    for file in gallery_files:
+        skip = False
+        for ignored_file in IGNORE_PATTERNS:
+            if fnmatch.fnmatch(str(file), ignored_file):
+                skip = True
+                break
+        if skip:
+            continue
+
+        with open(file) as f:
+            content = f.read()
+
+        if EXPECTED not in content:
+            errors.append(file)
+
+    if args.fix:
+        for error in errors:
+            with open(error) as f:
+                content = f.read()
+
+            if "from __future__" in content:
+                # Place after the last __future__ import
+                new_content = re.sub(
+                    r"((?:from __future__.*?\n)+)", r"\1\n" + EXPECTED, content, flags=re.MULTILINE
+                )
+            else:
+                # Place after the module doc comment
+                new_content = re.sub(
+                    r"(\"\"\"(?:.*\n)+\"\"\")", r"\1\n" + EXPECTED, content, flags=re.MULTILINE
+                )
+
+            with open(error, "w") as f:
+                f.write(new_content)
+    else:
+        # Don't fix, just check and print an error message
+        if len(errors) > 0:
+            print(
+                f"These {len(errors)} files did not contain the expected text to "
+                "override urllib.request.Request.\n"
+                "You can run 'python3 tests/lint/check_request_hook.py --fix' to "
+                "automatically fix these errors:\n"
+                f"{EXPECTED}\n\nFiles:\n" + "\n".join([str(error_path) for error_path in errors])
+            )
+            exit(1)
+        else:
+            print("All files successfully override urllib.request.Request")
+            exit(0)
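The lint above is a verbatim substring test, and running "python3 tests/lint/check_request_hook.py --fix" splices the preamble in after any __future__ imports or the module docstring. A toy illustration of the passing and failing cases (the file contents are invented for the example):

EXPECTED = """# sphinx_gallery_start_ignore
from tvm import testing

testing.utils.install_request_hook(depth=3)
# sphinx_gallery_end_ignore"""

good = '"""Tutorial docstring."""\n' + EXPECTED + "\nimport tvm\n"
bad = '"""Tutorial docstring."""\nimport tvm\n'
assert EXPECTED in good
assert EXPECTED not in bad
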
diff --git a/tests/scripts/request_hook/request_hook.py b/tests/scripts/request_hook/request_hook.py
new file mode 100644
index 0000000000000..f24f76869e7de
--- /dev/null
+++ b/tests/scripts/request_hook/request_hook.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import urllib.request
+import logging
+
+LOGGER = None
+
+
+# To update this list, run the workflow with the URL to download and the SHA512 of the file
+BASE = "https://tvm-ci-resources.s3.us-west-2.amazonaws.com"
+URL_MAP = {
+    "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip": f"{BASE}/oneflow/resnet18.zip",
+    "https://homes.cs.washington.edu/~cyulin/media/gnn_model/gcn_cora.torch": f"{BASE}/gcn_cora.torch",
+    "https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg": f"{BASE}/vta_cat.jpg",
+    "https://people.linaro.org/~tom.gall/sine_model.tflite": f"{BASE}/sine_model.tflite",
+    "https://pjreddie.com/media/files/yolov3-tiny.weights?raw=true": f"{BASE}/yolov3-tiny.weights",
+    "https://pjreddie.com/media/files/yolov3.weights": f"{BASE}/yolov3.weights",
+    "http://data.mxnet.io.s3-website-us-west-1.amazonaws.com/data/val_256_q90.rec": f"{BASE}/mxnet-val_256_q90.rec",
+    "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz": f"{BASE}/tf-mobilenet_v1_1.0_224.tgz",
+    "http://images.cocodataset.org/zips/val2017.zip": f"{BASE}/cocodataset-val2017.zip",
+    "https://bj.bcebos.com/x2paddle/models/paddle_resnet50.tar": f"{BASE}/bcebos-paddle_resnet50.tar",
+    "https://data.deepai.org/stanfordcars.zip": f"{BASE}/deepai-stanfordcars.zip",
+}
+
+
+class TvmRequestHook(urllib.request.Request):
+    def __init__(self, url, *args, **kwargs):
+        LOGGER.info(f"Caught access to {url}")
+        if url in URL_MAP:
+            new_url = URL_MAP[url]
+            LOGGER.info(f"Mapped URL {url} to {new_url}")
+        else:
+            new_url = url
+        super().__init__(new_url, *args, **kwargs)
+
+
+def init():
+    global LOGGER
+    urllib.request.Request = TvmRequestHook
+    LOGGER = logging.getLogger("tvm_request_hook")
+    LOGGER.setLevel(logging.DEBUG)
+    fh = logging.FileHandler("redirected_urls.log")
+    fh.setLevel(logging.DEBUG)
+    LOGGER.addHandler(fh)
diff --git a/tests/scripts/task_lint.sh b/tests/scripts/task_lint.sh
index a05f7ca36bcca..84f46523370e6 100755
--- a/tests/scripts/task_lint.sh
+++ b/tests/scripts/task_lint.sh
@@ -40,6 +40,9 @@ function shard1 {
   echo "Checking CMake <-> LibInfo options mirroring"
   python3 tests/lint/check_cmake_options.py
 
+  echo "Checking that all sphinx-gallery docs override urllib.request.Request"
+  python3 tests/lint/check_request_hook.py
+
   echo "black check..."
   tests/lint/git-black.sh
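Once init() has run, the hook above rebinds urllib.request.Request, so any request for a mapped URL is transparently retargeted at the S3 mirror; urlopen is covered too, since it constructs its Request through the module-level name. A usage sketch, assuming tests/scripts/request_hook is on sys.path (which install_request_hook arranges):

import urllib.request

import request_hook  # the module added above

request_hook.init()
req = urllib.request.Request("https://pjreddie.com/media/files/yolov3.weights")
print(req.full_url)  # https://tvm-ci-resources.s3.us-west-2.amazonaws.com/yolov3.weights
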
From 265030eea4cf0447b5744b759d763158373167a2 Mon Sep 17 00:00:00 2001
From: Luke Hutton
Date: Thu, 30 Jun 2022 18:11:21 +0100
Subject: [PATCH 024/111] [ETHOSN] Use partition_for_ function when running
 tests (#11945)

Keeps the tests in parity with the partition_for_ function so any
changes are reflected in the tests.

Change-Id: I580cc381d382c777484e8251c609867a69da8e67
---
 tests/python/contrib/test_ethosn/infrastructure.py | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/tests/python/contrib/test_ethosn/infrastructure.py b/tests/python/contrib/test_ethosn/infrastructure.py
index a4c20908151bc..e1bbcf8ad3a24 100644
--- a/tests/python/contrib/test_ethosn/infrastructure.py
+++ b/tests/python/contrib/test_ethosn/infrastructure.py
@@ -28,7 +28,7 @@
 import os
 from . import _infrastructure
-from tvm.relay.op.contrib import get_pattern_table
+from tvm.relay.op.contrib import partition_for_ethosn
 
 
 def get_real_image(im_height, im_width):
@@ -155,17 +155,7 @@ def build(mod, params, npu=True, expected_host_ops=0, npu_partitions=1):
     ):
         with tvm.target.Target("llvm"):
             if npu:
-                f = relay.build_module.bind_params_by_name(mod["main"], params)
-                mod = tvm.IRModule()
-                mod["main"] = f
-                pattern = get_pattern_table("ethos-n")
-                mod = relay.transform.InferType()(mod)
-                mod = relay.transform.MergeComposite(pattern)(mod)
-                mod = relay.transform.AnnotateTarget("ethos-n")(mod)
-                mod = relay.transform.InferType()(mod)
-                mod = relay.transform.MergeCompilerRegions()(mod)
-                mod = relay.transform.InferType()(mod)
-                mod = relay.transform.PartitionGraph()(mod)
+                mod = partition_for_ethosn(mod, params, variant="n78")
                 host_op_count = get_host_op_count(mod)
                 assert (
                     host_op_count == expected_host_ops

From 985680ee1ae77ebe51f373df64063f8372e6cb6e Mon Sep 17 00:00:00 2001
From: Mark Shields <87091372+mbs-octoml@users.noreply.github.com>
Date: Thu, 30 Jun 2022 10:27:21 -0700
Subject: [PATCH 025/111] [BYOC] Handle constants in IRModule-at-a-time
 external codegen (#11770)

I tried to do to the TensorRT integration what #11631 did to the
CUTLASS integration, viz:
 - Make sure all compilation options are passed in Target instances.
   This helps Collage.
 - Use a custom pass invoked via RelayToTIRTargetHooks instead of the
   relay.ext.$toolchain mechanism. This helps us decouple external
   codegen from lowering.

This PR collects the prep for that change:
 - TensorRT uses the JSONSerializer visitor to encode each partition
   function. Previously, when the visitor encountered a Constant it
   simply generated and recorded a name for the constant. Then,
   completely separately, and via a callback in TECompiler, the
   function is visited again in the same order and with the same name
   generation convention by a ConstantUpdater to actually collect the
   bindings, which are then encoded into a ConstLoaderModule to be made
   available at runtime. However, if all TensorRT compilation is to be
   done by a stand-alone pass there's no TECompiler callback hackery
   available. So I've added a "const_name_to_constant" attribute to the
   IRModule, of type Map<String, runtime::NDArray>, so that named
   constants can be accumulated throughout compilation by any pass
   which needs to do so. Then the Graph, AOT and VM executors are all
   updated to merge those constants into the final runtime artifact.
   (Compare with "constants", the equivalent attribute for extracted
   TIR AllocateConsts.)
 - The TensorRT tests use the create_executor interface, but it wasn't
   quite ready for the new, more general form of passing a list of
   targets.
 - I want TensorRT compilation to work out of the box, without the need
   for any special targets if all the default options should apply. The
   CUTLASS integration is reworked to follow the same convention.
 - To test this I also switched the 'demo' "ccompiler" external codegen
   target to the IRModule-at-a-time style. This means we can test most
   of the external codegen machinery in one place without depending on
   any target which may not be enabled in CI (e.g. TensorRT):
   - Target instances are plumbed through correctly so compile-time
     options are available.
   - External modules are conveyed to the final export library.
   - Constant bindings are conveyed to the metadata module. 
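A usage sketch of the more general create_executor interface described above, with a toy module and the stock llvm target assumed available; the target argument may now be any multi-target-like object, for example a list:

import numpy as np
import tvm
from tvm import relay

x = relay.var("x", shape=(2, 2), dtype="float32")
mod = tvm.IRModule.from_expr(relay.Function([x], x + x))
# A list is canonicalized via Target.canon_multi_target; the default device
# is derived from the first target's kind.
f = relay.create_executor("graph", mod=mod, target=["llvm"]).evaluate()
print(f(np.ones((2, 2), dtype="float32")))
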
---
 cmake/modules/contrib/CODEGENC.cmake | 2 +-
 include/tvm/ir/module.h | 30 +-
 include/tvm/tir/stmt.h | 6 +-
 python/tvm/relay/backend/interpreter.py | 2 +-
 python/tvm/relay/backend/vm.py | 5 +-
 python/tvm/relay/build_module.py | 41 ++-
 python/tvm/relay/transform/transform.py | 4 +-
 python/tvm/tir/stmt.py | 2 +-
 src/relay/backend/aot_executor_codegen.cc | 39 ++-
 src/relay/backend/build_module.cc | 12 +-
 .../contrib/arm_compute_lib/codegen.cc | 9 +-
 src/relay/backend/contrib/bnns/codegen.cc | 8 +-
 .../backend/contrib/codegen_c/codegen.cc | 281 ++++++++++++------
 .../backend/contrib/codegen_c/codegen_c.h | 13 +-
 src/relay/backend/contrib/codegen_c/target.cc | 43 +++
 .../contrib/codegen_json/codegen_json.h | 46 ++-
 src/relay/backend/contrib/cutlass/codegen.cc | 34 ++-
 src/relay/backend/contrib/dnnl/codegen.cc | 8 +-
 .../contrib/example_target_hooks/target.cc | 1 -
 src/relay/backend/contrib/tensorrt/codegen.cc | 9 +-
 .../backend/contrib/verilator/codegen.cc | 9 +-
 src/relay/backend/graph_executor_codegen.cc | 39 +--
 src/relay/backend/te_compiler.cc | 4 +-
 src/relay/backend/utils.h | 8 +-
 src/relay/backend/vm/compiler.cc | 28 +-
 .../transforms/compiler_function_utils.cc | 34 ++-
 .../transforms/compiler_function_utils.h | 13 +-
 src/relay/transforms/target_hooks.cc | 7 +-
 src/target/metadata_module.cc | 2 +
 src/tir/transforms/extract_constants.cc | 6 +-
 tests/python/relay/test_external_codegen.py | 40 +--
 .../transform/test_compiler_function_utils.py | 40 +++
 .../python/unittest/test_custom_datatypes.py | 3 +-
 .../test_tir_transform_extract_constants.py | 5 +-
 34 files changed, 571 insertions(+), 262 deletions(-)
 create mode 100644 src/relay/backend/contrib/codegen_c/target.cc

diff --git a/cmake/modules/contrib/CODEGENC.cmake b/cmake/modules/contrib/CODEGENC.cmake
index 275c32514ebaf..412fa3e8ffc51 100644
--- a/cmake/modules/contrib/CODEGENC.cmake
+++ b/cmake/modules/contrib/CODEGENC.cmake
@@ -15,6 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 
-tvm_file_glob(GLOB CSOURCE_RELAY_CONTRIB_SRC src/relay/backend/contrib/codegen_c/codegen.cc)
+tvm_file_glob(GLOB CSOURCE_RELAY_CONTRIB_SRC src/relay/backend/contrib/codegen_c/*.cc)
 
 list(APPEND COMPILER_SRCS ${CSOURCE_RELAY_CONTRIB_SRC})
 
diff --git a/include/tvm/ir/module.h b/include/tvm/ir/module.h
index b78f16a84f02c..f73f2230df4d7 100644
--- a/include/tvm/ir/module.h
+++ b/include/tvm/ir/module.h
@@ -479,8 +479,10 @@ TVM_DLL String AsText(const ObjectRef& node, bool show_meta_data = true,
 
 namespace attr {
 
+// Following are attributes for IRModule only.
+
 /*!
- * \brief Executor targetted by the module
+ * \brief Executor targeted by the module
  *
  * Type: Executor
  *
@@ -516,9 +518,31 @@ constexpr const char* kWorkspaceMemoryPools = "workspace_memory_pools";
 constexpr const char* kConstantMemoryPools = "constant_memory_pools";
 
 /*
- * \brief Module attribute for tir constants
+ * \brief All the runtime::NDArrays extracted from PrimFunc tir::AllocateConst nodes. The
+ * node will record the index into this array. See also kConstNameToConstant below, which is
+ * the analog for Relay Functions.
+ *
+ * Type: Array<runtime::NDArray>
+ */
+constexpr const char* kConstants = "constants";
+
+/*!
+ * \brief All the runtime::Modules accumulated during compilation by external codegen. These
+ * modules must be either directly linked or captured in the final compilation artifact.
+ *
+ * Type: Array<runtime::Module>
+ */
+constexpr const char* kExternalMods = "external_mods";
+
+/*! 
+ * \brief All the named runtime::NDArrays accumulated during compilation by external codegen. + * Generally the associated runtime::Module will indicate it requires bindings for these names, + * and during module initialization these bindings will be recovered from a ConstLoaderModule. + * See also kConstantsArray above, which is the analog for PrimFuncs. + * + * Type: Map */ -constexpr const char* kConstantsArray = "Constants"; +constexpr const char* kConstNameToConstant = "const_name_to_constant"; } // namespace attr } // namespace tvm diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h index ac35c0b41e0ec..ddc97549fc70c 100644 --- a/include/tvm/tir/stmt.h +++ b/include/tvm/tir/stmt.h @@ -599,9 +599,9 @@ class AllocateConstNode : public StmtNode { /*! \brief The optional data associated to the constant. */ Optional data; - /*! \brief If the PrimFunc containing the Stmt is added to IRModule, - this is an optional index to indicate the index within - "Constants" attribute, that is a Array of IRModule. + /*! + * \brief If the PrimFunc containing the Stmt is added to IRModule, this is an optional index + * to indicate the index within "constants" attribute, that is a Array of IRModule. */ Optional irmod_storage_idx; /*! \brief The type of the buffer. */ diff --git a/python/tvm/relay/backend/interpreter.py b/python/tvm/relay/backend/interpreter.py index 819e5eda41f58..020736beb5c43 100644 --- a/python/tvm/relay/backend/interpreter.py +++ b/python/tvm/relay/backend/interpreter.py @@ -195,7 +195,7 @@ class Interpreter(Executor): The runtime device to run the code on. target : tvm.Target - The target option to build the function. + The target option to build the function. Only homogeneous execution is supported. CAUTION: Despite the API the module is prepared upon each call to evaluate rather than once in create_executor. diff --git a/python/tvm/relay/backend/vm.py b/python/tvm/relay/backend/vm.py index d4a82cd8d4279..bc11d43cb0ca5 100644 --- a/python/tvm/relay/backend/vm.py +++ b/python/tvm/relay/backend/vm.py @@ -198,8 +198,9 @@ class VMExecutor(Executor): device : :py:class:`~tvm.runtime.Device` The runtime device to run the code on. - target : :py:class:`Target` - The target option to build the function. + target : any multi-target like object, see Target.canon_multi_target + For homogeneous compilation, the unique build target. + For heterogeneous compilation, a dictionary or list of possible build targets. """ def __init__(self, mod, device, target): diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 1353d8c5f595c..32ad6c70794c7 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -570,8 +570,9 @@ class GraphExecutor(_interpreter.Executor): device : :py:class:`Device` The runtime device to run the code on. - target : :py:class:`Target` - The target option to build the function. + target : any multi-target like object, see Target.canon_multi_target + For homogeneous compilation, the unique build target. + For heterogeneous compilation, a dictionary or list of possible build targets. """ def __init__(self, mod, device, target): @@ -630,8 +631,9 @@ class AotExecutor(_interpreter.Executor): device : :py:class:`Device` The runtime device to run the code on. - target : :py:class:`Target` - The target option to build the function. + target : any multi-target like object, see Target.canon_multi_target + For homogeneous compilation, the unique build target. 
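The constant handling this patch threads through the executors reduces to one rule: external codegen records name-to-NDArray bindings on the IRModule under "const_name_to_constant", lowering records its own, and the executor codegen folds both into a single params table, treating a name collision as an internal error (the ICHECKs above). A hedged Python model of that merge, with hypothetical names and plain strings standing in for the real NDArrays:

def merge_params(const_name_to_constant, lowered_params):
    """Toy model of the LoweredOutput.params merge performed in the C++ above."""
    params = {}
    for name, data in const_name_to_constant.items():
        assert name not in params, f"duplicate constant name: {name}"
        params[name] = data
    for name, data in lowered_params.items():
        assert name not in params, f"duplicate constant name: {name}"
        params[name] = data
    return params

print(merge_params({"tvmgen_ext_const_0": "ndarray-from-codegen"}, {"p0": "ndarray-from-lowering"}))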
+ For heterogeneous compilation, a dictionary or list of possible build targets. """ def __init__(self, mod, device, target): @@ -639,7 +641,6 @@ def __init__(self, mod, device, target): self.mod = mod self.device = device self.target = target - assert target.attrs.get("executor", "graph") == "aot" def _make_executor(self, expr=None): if expr: @@ -719,8 +720,11 @@ def create_executor(kind="debug", mod=None, device=None, target="llvm", params=N device : :py:class:`Device` The device to execute the code. - target : :py:class:`tvm.Target` - The corresponding context + target : any multi-target like object, see Target.canon_multi_target + For homogeneous compilation, the unique build target. + For heterogeneous compilation, a dictionary or list of possible build targets. + CAUTION: Though this API allows multiple targets, it does not allow multiple devices, so + heterogenous compilation is not yet supported. params : dict of str to NDArray Input parameters to the graph that do not change @@ -730,24 +734,31 @@ def create_executor(kind="debug", mod=None, device=None, target="llvm", params=N ------- executor : :py:class:`~tvm.relay.backend.interpreter.Executor` """ + raw_targets = Target.canon_multi_target(target) if mod is None: mod = IRModule() if device is not None: - assert device.device_type == _nd.device(str(target), 0).device_type + assert device.device_type == raw_targets[0].kind.device_type else: - device = _nd.device(str(target), 0) + # Derive the default device from the first target. + device = _nd.device(raw_targets[0].kind.device_type, 0) if params is not None: mod = IRModule.from_expr(bind_params_by_name(mod["main"], params)) - if isinstance(target, str): - target = Target(target) + assert "executor" not in raw_targets[0].attrs or raw_targets[0].attrs["executor"] == kind + if kind == "debug": - return _interpreter.Interpreter(mod, device, target) + assert len(raw_targets) == 1, "The interpreter currently only supports a single target" + return _interpreter.Interpreter(mod, device, raw_targets[0]) if kind == "graph": - return GraphExecutor(mod, device, target) + return GraphExecutor(mod, device, raw_targets) if kind == "vm": - return VMExecutor(mod, device, target) + return VMExecutor(mod, device, raw_targets) if kind == "aot": - return AotExecutor(mod, device, target) + # The AOT requires the executor as a target attribute. + # (The compilation paths for the other executors currently do not always provide this + # attribute, hence the above generic assert is more forgiving). + assert "executor" in raw_targets[0].attrs + return AotExecutor(mod, device, raw_targets) raise RuntimeError("unknown execution strategy: {0}".format(kind)) diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py index c931289d40c60..d7979a757171b 100644 --- a/python/tvm/relay/transform/transform.py +++ b/python/tvm/relay/transform/transform.py @@ -1386,7 +1386,7 @@ def OutlineCompilerFunctionsWithExistingGlobalSymbols(compiler_filter=""): Parameters ---------- compiler_filter : String - If non-empty, the 'compiler' attribute to filter on. + If non-empty, the "Compiler" attribute to filter on. Returns ------- @@ -1412,7 +1412,7 @@ def MarkCompilerFunctionsAsExtern(compiler_filter=""): Parameters ---------- compiler_filter : String - If non-empty, the 'compiler' attribute to filter on. + If non-empty, the "Compiler" attribute to filter on. 
Returns ------- diff --git a/python/tvm/tir/stmt.py b/python/tvm/tir/stmt.py index 301bfa73c8182..063439e068a49 100644 --- a/python/tvm/tir/stmt.py +++ b/python/tvm/tir/stmt.py @@ -358,7 +358,7 @@ class AllocateConst(Stmt): data_or_idx : Union[NDArray, int] If an NDArray, this is the const data associated with the constant. If an integer, this is the index into the - "Constants" attribute of the `IRModule` that contains the + "constants" attribute of the `IRModule` that contains the `AllocateConst`. body : Stmt diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 5020e79714b28..ae60970b78af3 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -1167,11 +1167,19 @@ class AOTExecutorCodegen : public MixedModeVisitor { // because the packed calls arguments are not wrapped in TVMValues. To make this happen we need // to run the LegalizePackedCalls pass. LoweredOutput ret; - ret.params = std::unordered_map>(); - for (auto param : params_) { - ret.params.emplace(std::make_pair( - param.first, - std::make_pair(static_cast(param_storage_ids_[param.first]), param.second))); + + // Collect any constants extracted by external codegen. + ret.params = std::unordered_map(); + Map const_name_to_constant = + lowered_mod->GetAttr>(tvm::attr::kConstNameToConstant) + .value_or({}); + for (const auto& kv : const_name_to_constant) { + ICHECK(ret.params.emplace(kv.first, kv.second).second); + } + + // Collect any constants extracted during lowering. + for (const auto& kv : params_) { + ICHECK(ret.params.emplace(kv.first, kv.second).second); } // AoT Executor codegen works completely on TIR beyond this point, hence removing relay main @@ -1212,9 +1220,9 @@ class AOTExecutorCodegen : public MixedModeVisitor { lowered_mod = pack_calls(lowered_mod); } - Optional> external_modules = - lowered_mod->GetAttr>("external_mods"); - ICHECK(external_modules) << "Attribute \"external_mods\" should be set at this point."; + // Collect any runtime modules generated by external codegen. 
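+  // (Note: both the function-at-a-time and IRModule-at-a-time external codegen
+  // paths accumulate their runtime::Modules under this attribute, so a missing
+  // attribute simply means no external codegen ran.)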
+ ret.external_mods = + lowered_mod->GetAttr>(tvm::attr::kExternalMods).value_or({}); // This is the point where we separate the functions in the module by target VLOG(1) << "lowered module:" << std::endl << PrettyPrint(lowered_mod); @@ -1227,8 +1235,6 @@ class AOTExecutorCodegen : public MixedModeVisitor { << PrettyPrint(kv.second); } - ret.external_mods = external_modules.value(); - // Extract USMP metadata to pass onto metadata sources Map pool_var_info; std::vector pool_vars; @@ -1316,11 +1322,6 @@ class AOTExecutorCodegenModule : public runtime::ModuleNode { String key = args[0]; *rv = get_param_by_name(key); }); - } else if (name == "get_param_id") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - String key = args[0]; - *rv = get_param_id(key); - }); } else if (name == "get_irmodule") { return PackedFunc( [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = get_irmodule(); }); @@ -1362,17 +1363,11 @@ class AOTExecutorCodegenModule : public runtime::ModuleNode { runtime::NDArray get_param_by_name(String key) { auto it = this->output_.params.find(key); CHECK(it != this->output_.params.end()) << "no such parameter " << key; - return (*it).second.second; + return (*it).second; } Array get_external_modules() { return output_.external_mods; } - int get_param_id(String key) { - auto it = this->output_.params.find(key); - CHECK(it != this->output_.params.end()) << "no such parameter " << key; - return (*it).second.first; - } - Map get_irmodule() { return this->output_.lowered_funcs; } std::shared_ptr codegen_; diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index 628dee0844ecb..9a68b567305d1 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -86,17 +86,6 @@ struct ExecutorCodegen { return ret; } - std::unordered_map GetParamIds() { - std::unordered_map ret; - auto names = CallFunc>("list_params_name", nullptr); - for (const auto& expr : names) { - // Implicit cast from runtime::String to std::string - std::string key = expr; - ret[key] = CallFunc("get_param_id", key); - } - return ret; - } - Array GetExternalModules() { return CallFunc>("get_external_modules", nullptr); } @@ -478,6 +467,7 @@ class RelayBuildModule : public runtime::ModuleNode { for (size_t i = 0; i < variables.size(); i++) { auto it = ret_.params.find(variables[i].operator std::string()); if (it != ret_.params.end()) { + VLOG(1) << "constant '" << variables[i] << "' has been captured in external module"; ret_.params.erase(it); } } diff --git a/src/relay/backend/contrib/arm_compute_lib/codegen.cc b/src/relay/backend/contrib/arm_compute_lib/codegen.cc index 842ede3bf20b8..81a5b5bbd9d8c 100644 --- a/src/relay/backend/contrib/arm_compute_lib/codegen.cc +++ b/src/relay/backend/contrib/arm_compute_lib/codegen.cc @@ -392,10 +392,15 @@ runtime::Module ACLCompiler(const ObjectRef& ref) { ACLJSONSerializer serializer(func_name, func); serializer.serialize(); std::string graph_json = serializer.GetJSON(); - auto param_names = serializer.GetParams(); + + // Note that serializer.const_name_to_constant() is ignored. Instead the TECompiler invokes + // a callback which calls backend::UpdateConstants to capture the map before the function + // 'disappears' into lowered form, on the assumption the visit order and thus constant + // names match those generated by the JSONSerializer. 
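+  // (Illustrative example of that convention: for a hypothetical function with
+  // global symbol "tvmgen_default_arm_compute_lib_main_0" the serializer would
+  // name its constants "tvmgen_default_arm_compute_lib_main_0_const_0",
+  // "..._const_1", and so on, i.e. <global symbol> + "_const_" + <visit index>.)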
+ const auto* pf = runtime::Registry::Get("runtime.arm_compute_lib_runtime_create"); ICHECK(pf != nullptr) << "Cannot find JSON runtime module to create"; - runtime::Module lib = (*pf)(func_name, graph_json, param_names); + runtime::Module lib = (*pf)(func_name, graph_json, serializer.const_names()); return lib; } diff --git a/src/relay/backend/contrib/bnns/codegen.cc b/src/relay/backend/contrib/bnns/codegen.cc index 72c32fb5b19ee..3791773ad67d6 100644 --- a/src/relay/backend/contrib/bnns/codegen.cc +++ b/src/relay/backend/contrib/bnns/codegen.cc @@ -136,11 +136,15 @@ runtime::Module BNNSCompiler(const ObjectRef& ref) { BNNSJSONSerializer serializer(func_name, func); serializer.serialize(); std::string graph_json = serializer.GetJSON(); - auto params = serializer.GetParams(); + + // Note that serializer.const_name_to_constant() is ignored. Instead the TECompiler invokes + // a callback which calls backend::UpdateConstants to capture the map before the function + // 'disappears' into lowered form, on the assumption the visit order and thus constant + // names match those generated by the JSONSerializer. const auto* pf = runtime::Registry::Get("runtime.BNNSJSONRuntimeCreate"); ICHECK(pf != nullptr) << "Cannot find JSON runtime module to create"; - auto mod = (*pf)(func_name, graph_json, params); + auto mod = (*pf)(func_name, graph_json, serializer.const_names()); return mod; } diff --git a/src/relay/backend/contrib/codegen_c/codegen.cc b/src/relay/backend/contrib/codegen_c/codegen.cc index fd1c39bb92830..ee8724fe92fe3 100644 --- a/src/relay/backend/contrib/codegen_c/codegen.cc +++ b/src/relay/backend/contrib/codegen_c/codegen.cc @@ -16,17 +16,17 @@ * specific language governing permissions and limitations * under the License. */ -#include + #include #include #include #include #include -#include #include #include +#include "../../../transforms/compiler_function_utils.h" #include "../../utils.h" #include "codegen_c.h" @@ -34,30 +34,62 @@ namespace tvm { namespace relay { namespace contrib { -using namespace backend; +/*! \brief Return the "ccompiler" Target instance to use to guide compilation. */ +Target GetCCompilerTarget() { + Target target = Target::Current(/*allow_not_defined=*/true); + if (!target.defined() || target->kind->name != "ccompiler") { + // Use the default compilation options if no specific "ccompiler" target was given + // in the overall targets list. In that case target_hooks.cc will invoke the custom pass + // without pushing any target instance onto the implicit target stack. + target = Target("ccompiler"); + } + return target; +} /*! - * \brief An example codegen that is only used for quick prototyping and testing - * purpose. Only several binary options are covered. Users - * may need to extend them to cover more operators. + * \brief Emits C/C++ code for a single function. + * + * For testing and demonstration only, only a few binary operators are supported. */ -class CodegenC : public MemoizedExprTranslator>, public CodegenCBase { +class CodegenC : public backend::MemoizedExprTranslator>, public CodegenCBase { public: - explicit CodegenC(const std::string& id) { this->ext_func_id_ = id; } + CodegenC(std::unordered_map* const_name_to_constant, + Array* const_names, bool* needs_extra_headers, std::string ext_func_id) + : const_name_to_constant_(const_name_to_constant), + const_names_(const_names), + needs_extra_headers_(needs_extra_headers), + ext_func_id_(std::move(ext_func_id)) {} - std::vector VisitExprDefault_(const Object* op) final { + /*! 
+ * \brief Emit the source code that invokes C compiler compatible wrappers. + * + * \return The emitted code. + */ + std::string JIT(const std::vector& out) override { + if (!ext_func_args_.empty()) { + *needs_extra_headers_ = true; + } + // Write function macros + for (auto decl : func_decl_) { + code_stream_ << decl << "\n"; + } + return JitImpl(ext_func_id_, ext_func_args_, buf_decl_, ext_func_body_, const_array_name_, out); + } + + private: + std::vector VisitExprDefault_(const Object* op) override { LOG(FATAL) << "C codegen doesn't support: " << op->GetTypeKey(); return {}; } - std::vector VisitExpr_(const VarNode* node) final { + std::vector VisitExpr_(const VarNode* node) override { ext_func_args_.push_back(GetRef(node)); Output output; output.name = node->name_hint(); return {output}; } - std::vector VisitExpr_(const TupleNode* node) final { + std::vector VisitExpr_(const TupleNode* node) override { std::vector outs; for (auto field : node->fields) { auto res = VisitExpr(field); @@ -67,7 +99,7 @@ class CodegenC : public MemoizedExprTranslator>, public Code return outs; } - std::vector VisitExpr_(const TupleGetItemNode* op) final { + std::vector VisitExpr_(const TupleGetItemNode* op) override { auto res = VisitExpr(op->tuple); ICHECK_GT(res.size(), static_cast(op->index)); @@ -76,19 +108,21 @@ class CodegenC : public MemoizedExprTranslator>, public Code return {res[op->index]}; } - std::vector VisitExpr_(const ConstantNode* cn) final { + std::vector VisitExpr_(const ConstantNode* cn) override { std::ostringstream decl_stream; std::ostringstream buf_stream; Output output; // Get const: static_cast(gcc_0_consts[0]->data) - output.name = CreateDataReference(ext_func_id_, const_idx_); + size_t const_id = const_name_to_constant_->size(); + output.name = CreateDataReference(ext_func_id_, const_id); const auto* type_node = cn->checked_type().as(); ICHECK(type_node); const auto& dtype = GetDtypeString(type_node); // Generate the global variable for needed ndarrays if (const_array_name_.empty()) { + *needs_extra_headers_ = true; const_array_name_ = CreateNDArrayPool(ext_func_id_); std::string checker = CreateInitChecker(ext_func_id_); ext_func_body_.insert(ext_func_body_.begin(), checker); @@ -97,14 +131,14 @@ class CodegenC : public MemoizedExprTranslator>, public Code ICHECK(dtype == "float" || dtype == "int") << "Only float and int are supported for now."; output.dtype = dtype; - std::string const_var_name = CreateConstVar(ext_func_id_, const_idx_); - const_vars_.push_back(const_var_name); - const_idx_++; + std::string const_var_name = CreateConstVar(ext_func_id_, const_id); + const_name_to_constant_->emplace(const_var_name, cn->data); + const_names_->push_back(const_var_name); return {output}; } - std::vector VisitExpr_(const CallNode* call) final { + std::vector VisitExpr_(const CallNode* call) override { std::ostringstream macro_stream; std::ostringstream decl_stream; std::ostringstream buf_stream; @@ -114,17 +148,17 @@ class CodegenC : public MemoizedExprTranslator>, public Code // Make function declaration macro_stream << "CSOURCE_BINARY_OP_" << call->args.size() << "D(" << func_name << ", "; - if (IsOp(call, "add")) { + if (backend::IsOp(call, "add")) { macro_stream << "+"; - } else if (IsOp(call, "subtract")) { + } else if (backend::IsOp(call, "subtract")) { macro_stream << "-"; - } else if (IsOp(call, "multiply")) { + } else if (backend::IsOp(call, "multiply")) { macro_stream << "*"; } else { LOG(FATAL) << "Unrecognized op"; } - auto in_shape = 
GetShape(call->args[0]->checked_type()); + auto in_shape = backend::GetShape(call->args[0]->checked_type()); for (size_t i = 0; i < in_shape.size(); ++i) { macro_stream << ", " << in_shape[i]; } @@ -152,7 +186,7 @@ class CodegenC : public MemoizedExprTranslator>, public Code } std::string out = "buf_" + std::to_string(buf_idx_++); - auto out_shape = GetShape(call->checked_type()); + auto out_shape = backend::GetShape(call->checked_type()); int out_size = 1; for (size_t i = 0; i < out_shape.size(); ++i) { out_size *= out_shape[i]; @@ -175,27 +209,21 @@ class CodegenC : public MemoizedExprTranslator>, public Code } /*! - * \brief Emit the source code that invokes C compiler compatible wrappers. - * - * \return The emitted code. + * \brief The accumulated constant name to constant mapping. Shared between all generated + * functions. */ - std::string JIT(const std::vector& out) { - // Write function macros - for (auto decl : func_decl_) { - code_stream_ << decl << "\n"; - } - return JitImpl(ext_func_id_, ext_func_args_, buf_decl_, ext_func_body_, const_array_name_, out); - } - - private: - /*! \brief The function id that represents a C source function. */ - std::string ext_func_id_ = ""; - /*! \brief The index of a wrapped C function. */ + std::unordered_map* const_name_to_constant_; + /*! \brief The accumulated constant names, in the order they were generated. */ + Array* const_names_; + /*! \brief Set to true if the ndarray and packed function headers are required. */ + bool* needs_extra_headers_; + /*! \brief Name of the global function currently being compiled. */ + std::string ext_func_id_; + + /*! \brief The index of the next available wrapped C function. */ int func_idx = 0; - /*! \brief The index of allocated buffers. */ + /*! \brief The index of the next available allocated buffers. */ int buf_idx_ = 0; - /*! \brief The index of global constants. */ - int const_idx_ = 0; /*! \brief The arguments of a C compiler compatible function. */ Array ext_func_args_; /*! \brief The statements of a C compiler compatible function. */ @@ -206,53 +234,55 @@ class CodegenC : public MemoizedExprTranslator>, public Code std::vector func_decl_; /*! \brief The declaration statements of buffers. */ std::vector buf_decl_; - /*! \brief The variable name to constant mapping. */ - Array const_vars_; - - friend class CSourceCodegen; }; -class CSourceCodegen : public CSourceModuleCodegenBase { +/*! \brief Emits C/C++ code for a module. 
*/ +class CodegenCModule { public: - std::tuple, String, String> GenCFunc(const Function& func) { - ICHECK(func.defined()) << "Input error: expect a Relay function."; - CodegenC builder(GetExtSymbol(func)); - auto out = builder.VisitExpr(func->body); - return std::make_tuple(builder.const_vars_, builder.ext_func_id_, builder.JIT(out)); - } + CodegenCModule(Target target, IRModule mod) : target_(std::move(target)), mod_(std::move(mod)) {} - runtime::Module CreateCSourceModule(const ObjectRef& ref) override { - ICHECK(ref->IsInstance()); - auto res = GenCFunc(Downcast(ref)); - Array variables = std::get<0>(res); - String func_name = std::get<1>(res); - - Optional opt_target = Target::Current(); - if (opt_target.defined() && opt_target.value()->kind->name == "ccompiler") { - Optional header = opt_target.value()->GetAttr("header"); - if (header.defined() && !header.value().empty()) { - code_stream_ << header.value().c_str() << "\n"; + runtime::Module CreateCSourceModule() { + for (const auto& kv : mod_->functions) { + if (const auto* function_node = GetCCompilerFunctionNode(kv.second)) { + GenCFunc(GetRef(function_node)); } } + return Finalize(); + } + + /*! \brief Returns the accumulated constant name to constant mapping. */ + const std::unordered_map& const_name_to_constant() const { + return const_name_to_constant_; + } + + private: + /*! \brief Emits the standard C/C++ header into \p os. */ + void EmitPreamble(std::ostringstream& os) { + // Custom header, if any. + Optional header = target_->GetAttr("header"); + if (header.defined() && !header.value().empty()) { + os << header.value().c_str() << "\n"; + } + + // Standard includes. + os << "#include \n"; + os << "#include \n"; + os << "#include \n"; + os << "#include \n"; + os << "#include \n"; - // Create headers - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - if (!variables.empty()) { + if (needs_extra_headers_) { // This segment would be generated in C++ because of the usage // of tvm::runtime::Array. This is not ideal, but this to demonstrate // constant copying process used packed imports in other external // codegen. Moreover, in microTVM we dont expect this part to be generated. - code_stream_ << "#ifdef __cplusplus\n"; - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - code_stream_ << "#endif\n"; + os << "#ifdef __cplusplus\n"; + os << "#include \n"; + os << "#include \n"; + os << "#endif\n"; } - // Append some common macro for operator definition. + // Define some macros to help operator implementations. const char* operator_macro = R"op_macro( #define CSOURCE_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_, p_DTYPE) \ void p_ID_(p_DTYPE* a, p_DTYPE* b, p_DTYPE* out) { \ @@ -272,38 +302,97 @@ class CSourceCodegen : public CSourceModuleCodegenBase { } )op_macro"; - code_stream_ << operator_macro << "\n\n"; - code_stream_ << std::get<2>(res); - std::string code = code_stream_.str(); + os << operator_macro << "\n\n"; + } + + void GenCFunc(const Function& function) { + ICHECK(function.defined()) << "Input error: expect a Relay function."; + std::string ext_func_id = backend::GetExtSymbol(function); + CodegenC builder(&const_name_to_constant_, &const_names_, &needs_extra_headers_, ext_func_id); + std::vector out = builder.VisitExpr(function->body); + code_stream_ << builder.JIT(out); + func_names_.push_back(ext_func_id); + } + + /*! \brief Returns function if it is tagged with "Compiler=ccompiler". 
*/ + static const FunctionNode* GetCCompilerFunctionNode(const Expr& expr) { + if (const auto* function_node = expr.as()) { + Optional opt_compiler = function_node->GetAttr(attr::kCompiler); + if (opt_compiler.defined() && opt_compiler.value() == "ccompiler") { + return function_node; + } + } + return nullptr; + } + + runtime::Module Finalize() { + std::ostringstream os; + EmitPreamble(os); + os << code_stream_.str(); + std::string code = os.str(); + + VLOG(1) << "CodegenCModule generated:" << std::endl << code; // Create a CSource module const auto* pf = runtime::Registry::Get("runtime.CSourceModuleCreate"); ICHECK(pf != nullptr) << "Cannot find csource module to create the external runtime module"; - return (*pf)(code, "c", Array{func_name}, variables); + return (*pf)(code, "c", func_names_, const_names_); } - private: + /*! \brief "ccompiler" Target with compilation options to use. */ + Target target_; + /*! \brief Module we are compiling. */ + IRModule mod_; + + /*! \brief True if we need to include the ndarray and packed function headers. */ + bool needs_extra_headers_ = false; + /*! \brief The accumulated constant name to constant mapping. */ + std::unordered_map const_name_to_constant_; + /*! \brief The accumulated constant names, in the order they were generated. */ + Array const_names_; + /*! \brief The accumulated function names. */ + Array func_names_; + /*! + * \brief The accumulated code stream containing all function definitions. + * (Does not include the preamble.) + */ std::ostringstream code_stream_; }; -/*! - * \brief The external compiler/codegen tool. It takes a Relay expression/module and - * compile it into a runtime module. - * - * The external codegen tool should have been registered similiarly to LLVM, - * CUDA, etc, under TVM, so the generated code could be packed in a runtime - * module. This module simplifies code serialization and invocation. - */ -runtime::Module CCompiler(const ObjectRef& ref) { - CSourceCodegen csource; - return csource.CreateCSourceModule(ref); -} +/*! \brief The actual translation pass. */ +transform::Pass CCompilerImpl() { + auto pass_func = [=](IRModule mod, const transform::PassContext& pass_ctx) { + VLOG(1) << "CCompilerImpl input:" << std::endl << PrettyPrint(mod); + Target target = GetCCompilerTarget(); + + // Emit the C/C++ code and package it as a CSourceModule. + CodegenCModule codegen(target, mod); + runtime::Module runtime_mod = codegen.CreateCSourceModule(); + + // Capture the new runtime module. + Array external_mods = + mod->GetAttr>(tvm::attr::kExternalMods).value_or({}); + external_mods.push_back(runtime_mod); + + // Capture the new constants. 
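+    // (Constant names are assumed to be globally unique across all external
+    // modules; the ICHECK below guards that invariant when merging into the
+    // module-level attribute.)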
+ Map const_name_to_constant = + mod->GetAttr>(tvm::attr::kConstNameToConstant).value_or({}); + for (const auto& kv : codegen.const_name_to_constant()) { + ICHECK_EQ(const_name_to_constant.count(kv.first), 0); + const_name_to_constant.Set(kv.first, kv.second); + } -TVM_REGISTER_GLOBAL("relay.ext.ccompiler").set_body_typed(CCompiler); + return WithAttrs(mod, {{tvm::attr::kExternalMods, external_mods}, + {tvm::attr::kConstNameToConstant, const_name_to_constant}}); + }; + return tvm::transform::CreateModulePass(pass_func, 0, "CCompilerImpl", {}); +} -TVM_REGISTER_TARGET_KIND("ccompiler", kDLCPU) - .set_attr(tvm::attr::kIsExternalCodegen, Bool(true)) - .add_attr_option("header", String("")); // value is prepended to every output CModule +transform::Pass CCompilerPass() { + return transform::Sequential( + {transforms::OutlineCompilerFunctionsWithExistingGlobalSymbols("ccompiler"), CCompilerImpl(), + transforms::MarkCompilerFunctionsAsExtern("ccompiler")}); +} } // namespace contrib } // namespace relay diff --git a/src/relay/backend/contrib/codegen_c/codegen_c.h b/src/relay/backend/contrib/codegen_c/codegen_c.h index 49a5bca068d1b..1ee72c149f1a4 100644 --- a/src/relay/backend/contrib/codegen_c/codegen_c.h +++ b/src/relay/backend/contrib/codegen_c/codegen_c.h @@ -409,7 +409,7 @@ class CodegenCBase { * * \return The created reference */ - std::string CreateDataReference(const std::string& symbol, int const_id) const { + std::string CreateDataReference(const std::string& symbol, size_t const_id) const { return "(float*)(" + symbol + "_consts[" + std::to_string(const_id) + "]->data)"; } @@ -421,8 +421,8 @@ class CodegenCBase { * * \return The created variable name */ - std::string CreateConstVar(const std::string& symbol, int const_id) const { - return symbol + "_const_" + std::to_string(const_id++); + std::string CreateConstVar(const std::string& symbol, size_t const_id) const { + return symbol + "_const_" + std::to_string(const_id); } /*! \brief The external function source code stream. */ @@ -433,7 +433,14 @@ class CodegenCBase { int indent_{0}; }; +/*! + * \brief A pass to translate all "Primitive" Relay functions with "Compiler=ccompiler" to + * a \p CSourceModule. + */ +transform::Pass CCompilerPass(); + } // namespace contrib } // namespace relay } // namespace tvm + #endif // TVM_RELAY_BACKEND_CONTRIB_CODEGEN_C_CODEGEN_C_H_ diff --git a/src/relay/backend/contrib/codegen_c/target.cc b/src/relay/backend/contrib/codegen_c/target.cc new file mode 100644 index 0000000000000..623057ac1762f --- /dev/null +++ b/src/relay/backend/contrib/codegen_c/target.cc @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include + +#include "./codegen_c.h" + +namespace tvm { +namespace relay { +namespace contrib { + +/*! 
+ * \brief This demonstration external codegen target emits C/C++ for compilation by the native c + * compiler on CPU. + * - Patterns: None, functions must be explicitly marked as "Primitive" and "Compiler=ccompiler". + * - Custom compiler: relay/backend/contrib/codegen_c/codegen.cc + */ +TVM_REGISTER_TARGET_KIND("ccompiler", kDLCPU) + .set_attr(tvm::attr::kIsExternalCodegen, Bool(true)) + .set_attr(tvm::attr::kRelayToTIR, CCompilerPass()) + // Value is prepended to every output CModule. + .add_attr_option("header", String("")); + +} // namespace contrib +} // namespace relay +} // namespace tvm diff --git a/src/relay/backend/contrib/codegen_json/codegen_json.h b/src/relay/backend/contrib/codegen_json/codegen_json.h index 4966f3f01c7d2..de6d0f74061b8 100644 --- a/src/relay/backend/contrib/codegen_json/codegen_json.h +++ b/src/relay/backend/contrib/codegen_json/codegen_json.h @@ -33,6 +33,8 @@ #include #include #include +#include +#include #include #include "../../../../runtime/contrib/json/json_node.h" @@ -150,7 +152,8 @@ class JSONSerializer : public MemoizedExprTranslator(func_); @@ -162,8 +165,18 @@ class JSONSerializer : public MemoizedExprTranslatorbody); } - /*!\brief Return the required params. */ - Array GetParams() const { return params_; } + /*! + * \brief Returns the accumulated map from constant names to the NDArray they must be bound to + * at runtime. Also referred to a 'params' elsewhere in the code. + */ + const std::unordered_map& const_name_to_constant() const { + return const_name_to_constant_; + } + + /*! + * \brief Return the constant names in order they were encountered during translation. + */ + const Array& const_names() const { return const_names_; } /*!\brief Return the generated json. */ std::string GetJSON() { @@ -245,11 +258,15 @@ class JSONSerializer : public MemoizedExprTranslator(vn)]; } - std::vector VisitExpr_(const ConstantNode* cn) { - std::string name = symbol_ + "_const_" + std::to_string(params_.size()); - params_.push_back(name); - auto node = std::make_shared(name, "const" /* op_type_ */); - return AddNode(node, GetRef(cn)); + std::vector VisitExpr_(const ConstantNode* constant_node) { + std::string name = symbol_ + "_const_" + std::to_string(const_names_.size()); + VLOG(1) << "Will require parameter '" << name + << "' to be supplied by the ConstLoaderModule at runtime"; + ICHECK_EQ(const_name_to_constant_.count(name), 0); + const_name_to_constant_.emplace(name, constant_node->data); + const_names_.push_back(name); + auto node = std::make_shared(name, /*op_type=*/"const"); + return AddNode(node, GetRef(constant_node)); } std::vector VisitExpr_(const TupleNode* tn) { @@ -340,8 +357,17 @@ class JSONSerializer : public MemoizedExprTranslator nodes_; /*! \brief Output of the JSON graph. */ std::vector heads_; - /*! \brief The list of required constants. */ - Array params_; + /*! + * \brief A map from constant names to NDArrays for each Constant encountered during + * translation to JSON. The JSON will record only the constant name. The actual NDArray must + * be made available at runtime from a ConstLoaderModule. + */ + std::unordered_map const_name_to_constant_; + /*! + * \brief The domain of the above map, but in order the constants were encountered during + * translation. 
+ */ + Array const_names_; }; } // namespace contrib diff --git a/src/relay/backend/contrib/cutlass/codegen.cc b/src/relay/backend/contrib/cutlass/codegen.cc index 772007792ae62..de2934173b5ff 100644 --- a/src/relay/backend/contrib/cutlass/codegen.cc +++ b/src/relay/backend/contrib/cutlass/codegen.cc @@ -43,6 +43,18 @@ namespace cutlass { namespace { +/*! \brief Return the "cutlass" Target instance to use to guide compilation. */ +Target GetCutlassTarget() { + Target target = Target::Current(/*allow_not_defined=*/true); + if (!target.defined() || target->kind->name != "cutlass") { + // Use the default CUTLASS compilation options if no specific "cutlass" target was given + // in the overall targets list. In that case target_hooks.cc will invoke the custom pass + // without pushing any target instance onto the implicit target stack. + target = Target("cutlass"); + } + return target; +} + using Str2StrMap = std::unordered_map; static Str2StrMap dtype_map = {{"float16", "cutlass::half_t"}, @@ -563,7 +575,7 @@ class CodegenCutlass : public backend::MemoizedExprTranslatorExitScope(); code_stream_ << "}\n"; - this->GenerateBackendCFunc(ext_func_id_, ext_func_args_, const_array_name_, out, true); + this->GenerateBackendCFunc(ext_func_id_, ext_func_args_, /*const_arr_name=*/"", out, true); return code_stream_.str(); } @@ -769,7 +781,7 @@ class CodegenCutlass : public backend::MemoizedExprTranslator attrs_; /*! @@ -781,8 +793,6 @@ class CodegenCutlass : public backend::MemoizedExprTranslator ext_func_args_; /*! \brief Statement of the function that will be compiled using CUTLASS kernels. */ std::vector ext_func_body_; - /*! \brief The array declared to store the constant values. */ - std::string const_array_name_; /*! \brief The declaration of intermediate buffers. */ std::vector buf_decl_; }; // class CodegenCutlass @@ -863,14 +873,14 @@ class CutlassModuleCodegen { const auto* pf = runtime::Registry::Get("runtime.CSourceModuleCreate"); ICHECK(pf != nullptr) << "Cannot find CSource module to create the external runtime module"; VLOG(1) << "Generated CUTLASS code:" << std::endl << code_stream_.str(); - return (*pf)(code_stream_.str(), "cu", func_names_, const_vars_); + return (*pf)(code_stream_.str(), "cu", func_names_, /*const_vars=*/Array()); } /*! * \brief Returns \p expr as function if it is a \p Function with "Compiler" attribute * value "cutlass". */ - const FunctionNode* GetCutlassFunctionNode(const Expr& expr) { + static const FunctionNode* GetCutlassFunctionNode(const Expr& expr) { if (const auto* function_node = expr.as()) { Optional opt_compiler = function_node->GetAttr(attr::kCompiler); if (opt_compiler.defined() && opt_compiler.value() == "cutlass") { @@ -886,8 +896,6 @@ class CutlassModuleCodegen { std::ostringstream code_stream_; /*! \brief The accumulated function names. */ Array func_names_; - /*! \brief The accumulated constant names. */ - Array const_vars_; }; // CutlassModuleCodegen /*! 
@@ -899,14 +907,12 @@ transform::Pass CompileForCutlassImpl() { VLOG(1) << "CompileForCutlass input:" << std::endl << PrettyPrint(mod); const auto* pf = runtime::Registry::Get("relay.ext.cutlass.compile_for_cutlass"); ICHECK(pf != nullptr) << "Cannot find compile_for_cutlass function"; - Optional opt_cutlass_target = Target::Current(); - ICHECK(opt_cutlass_target.defined()) << "Expecting Target::Current to be available"; - ICHECK_EQ(opt_cutlass_target.value()->kind->name, "cutlass"); - runtime::Module runtime_mod = (*pf)(mod, opt_cutlass_target.value()); + Target target = GetCutlassTarget(); + runtime::Module runtime_mod = (*pf)(mod, target); Array external_mods = - mod->GetAttr>("external_mods", Array()).value(); + mod->GetAttr>(tvm::attr::kExternalMods).value_or({}); external_mods.push_back(runtime_mod); - return WithAttr(mod, "external_mods", external_mods); + return WithAttr(mod, tvm::attr::kExternalMods, external_mods); }; return tvm::transform::CreateModulePass(pass_func, 0, "CompileForCutlass", {}); } diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc index f17cdafa76a5f..2f47c23a7cf9b 100644 --- a/src/relay/backend/contrib/dnnl/codegen.cc +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -585,11 +585,15 @@ runtime::Module DNNLCompiler(const ObjectRef& ref) { DNNLJSONSerializer serializer(func_name, func); serializer.serialize(); std::string graph_json = serializer.GetJSON(); - auto params = serializer.GetParams(); + + // Note that serializer.const_name_to_constant() is ignored. Instead the TECompiler invokes + // a callback which calls backend::UpdateConstants to capture the map before the function + // 'disappears' into lowered form, on the assumption the visit order and thus constant + // names match those generated by the JSONSerializer. const auto* pf = runtime::Registry::Get("runtime.DNNLJSONRuntimeCreate"); ICHECK(pf != nullptr) << "Cannot find JSON runtime module to create"; - auto mod = (*pf)(func_name, graph_json, params); + auto mod = (*pf)(func_name, graph_json, serializer.const_names()); return mod; #else DNNLModuleCodegen dnnl; diff --git a/src/relay/backend/contrib/example_target_hooks/target.cc b/src/relay/backend/contrib/example_target_hooks/target.cc index 19bfa8c682986..b01c23ed806a0 100644 --- a/src/relay/backend/contrib/example_target_hooks/target.cc +++ b/src/relay/backend/contrib/example_target_hooks/target.cc @@ -1,4 +1,3 @@ - /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file diff --git a/src/relay/backend/contrib/tensorrt/codegen.cc b/src/relay/backend/contrib/tensorrt/codegen.cc index 149cc485c7528..e08cd240d4d1e 100644 --- a/src/relay/backend/contrib/tensorrt/codegen.cc +++ b/src/relay/backend/contrib/tensorrt/codegen.cc @@ -318,11 +318,16 @@ runtime::Module TensorRTCompiler(const ObjectRef& ref) { serializer.serialize(); std::string graph_json = serializer.GetJSON(); VLOG(1) << "TensorRT JSON:" << std::endl << graph_json; - auto param_names = serializer.GetParams(); + + // Note that serializer.const_name_to_constant() is ignored. Instead the TECompiler invokes + // a callback which calls backend::UpdateConstants to capture the map before the function + // 'disappears' into lowered form, on the assumption the visit order and thus constant + // names match those generated by the JSONSerializer. 
+ const auto* pf = runtime::Registry::Get("runtime.tensorrt_runtime_create"); ICHECK(pf != nullptr) << "Cannot find TensorRT runtime module create function."; VLOG(1) << "Creating tensorrt runtime::Module for '" << func_name << "'"; - runtime::Module lib = (*pf)(func_name, graph_json, param_names); + runtime::Module lib = (*pf)(func_name, graph_json, serializer.const_names()); return lib; } diff --git a/src/relay/backend/contrib/verilator/codegen.cc b/src/relay/backend/contrib/verilator/codegen.cc index 2c29896d1b0e7..2e6fb13263144 100644 --- a/src/relay/backend/contrib/verilator/codegen.cc +++ b/src/relay/backend/contrib/verilator/codegen.cc @@ -111,10 +111,15 @@ runtime::Module VerilatorBackend(const ObjectRef& ref) { VerilatorJSONSerializer serializer(func_name, func); serializer.serialize(); std::string graph_json = serializer.GetJSON(); - auto params = serializer.GetParams(); + + // Note that serializer.const_name_to_constant() is ignored. Instead the TECompiler invokes + // a callback which calls backend::UpdateConstants to capture the map before the function + // 'disappears' into lowered form, on the assumption the visit order and thus constant + // names match those generated by the JSONSerializer. // Create runtime object - auto n = make_object(func_name, graph_json, params); + auto n = make_object(func_name, graph_json, + serializer.const_names()); // Get Verilator compiler options auto ctx = transform::PassContext::Current(); diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc index af426e5c71cbf..faf9d2899fc3a 100644 --- a/src/relay/backend/graph_executor_codegen.cc +++ b/src/relay/backend/graph_executor_codegen.cc @@ -259,21 +259,31 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator>(); - for (auto param : params_) { - ret.params.emplace(std::make_pair( - param.first, - std::make_pair(static_cast(param_storage_ids_[param.first]), param.second))); + + // Collect any runtime modules generated by external codegen. + ret.external_mods = + lowered_mod->GetAttr>(tvm::attr::kExternalMods).value_or({}); + + // Collect any constants extracted by external codegen. + ret.params = std::unordered_map(); + Map const_name_to_constant = + lowered_mod->GetAttr>(tvm::attr::kConstNameToConstant) + .value_or({}); + for (const auto& kv : const_name_to_constant) { + VLOG(1) << "constant '" << kv.first << "' contributed by external codegen"; + ICHECK(ret.params.emplace(kv.first, kv.second).second); } - ret.function_metadata = std::move(function_metadata_); - Optional> external_modules = - lowered_mod->GetAttr>("external_mods"); - ICHECK(external_modules) << "Attribute \"external_mods\" should be set at this point."; + // Collect any constants extracted during lowering. 
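+    // (These were accumulated into params_ via the TECompiler's constant-update
+    // callback as "Primitive" functions were lowered.)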
+ for (const auto& kv : params_) { + VLOG(1) << "constant '" << kv.first << "' contributed by TECompiler"; + ICHECK(ret.params.emplace(kv.first, kv.second).second); + } + + ret.function_metadata = std::move(function_metadata_); // This is the point where we separate the functions in the module by target ret.lowered_funcs = tec::GetPerTargetModules(lowered_mod); - ret.external_mods = external_modules.value(); ret.metadata = ExecutorCodegenMetadata({} /* inputs */, {} /* input_tensor_types */, {} /* outputs */, {} /* output_tensor_types */, {} /* pools */, {} /* devices */, @@ -650,14 +660,7 @@ class GraphExecutorCodegenModule : public runtime::ModuleNode { String key = args[0]; auto it = this->output_.params.find(key); CHECK(it != this->output_.params.end()) << "no such parameter " << key; - *rv = (*it).second.second; - }); - } else if (name == "get_param_id") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - String key = args[0]; - auto it = this->output_.params.find(key); - CHECK(it != this->output_.params.end()) << "no such parameter " << key; - *rv = (*it).second.first; + *rv = (*it).second; }); } else if (name == "get_irmodule") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc index 08fa18b61e164..210f77330afda 100644 --- a/src/relay/backend/te_compiler.cc +++ b/src/relay/backend/te_compiler.cc @@ -1224,7 +1224,7 @@ IRModule LowerTE(const IRModule& module, const String& module_name, ProcessFn pr // annotate the module with the resulting runtime modules. // TODO(mbs): runtime modules should be first class rather than attributes. Array external_mods = - module->GetAttr>("external_mods", Array()).value(); + module->GetAttr>(tvm::attr::kExternalMods).value_or({}); Array new_external_mods = compiler->LowerExternalFunctions(); VLOG(1) << "capturing " << external_mods.size() << " existing and " << new_external_mods.size() << " new external modules"; @@ -1246,7 +1246,7 @@ IRModule LowerTE(const IRModule& module, const String& module_name, ProcessFn pr device_contexts.Set(kv.first, kv.second); // copy-on-write. } - updated_module = WithAttrs(updated_module, {{"external_mods", std::move(external_mods)}, + updated_module = WithAttrs(updated_module, {{tvm::attr::kExternalMods, std::move(external_mods)}, {"device_contexts", std::move(device_contexts)}}); if (backend::IsAutoSchedulerEnabled()) { diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h index 67924a7835fb8..d6fae8c72b5e6 100644 --- a/src/relay/backend/utils.h +++ b/src/relay/backend/utils.h @@ -223,7 +223,11 @@ struct LoweredOutput { Map lowered_funcs; Array external_mods; Map function_metadata; - std::unordered_map> params; + /*! + * \brief Map from constant names (allocated by the codegen as constants are encountered) + * to the constant's value. 
+ */ + std::unordered_map params; ExecutorCodegenMetadata metadata; }; @@ -249,7 +253,7 @@ struct ConstantUpdater : public ExprVisitor { void VisitExpr_(const ConstantNode* cn) final { std::string name = symbol_ + "_const_" + std::to_string(const_idx_++); - VLOG(1) << "Binding " << name << " to constant of type " << PrettyPrint(cn->checked_type()); + VLOG(1) << "binding '" << name << "' to constant of type " << PrettyPrint(cn->checked_type()); (*params_)[name] = cn->data; } diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index 7371fd1f80838..a8bd3df32a90f 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -1166,11 +1166,27 @@ void VMCompiler::Codegen() { for (const auto& kv : per_tvm_target_modules) { ICHECK(kv.first->kind->device_type != kDLExtDev); } - Array ext_mods = - context_.module->GetAttr>("external_mods", Array()) - .value(); - VLOG(0) << "have " << per_tvm_target_modules.size() << " targets to build and " << ext_mods.size() - << " external runtime modules"; + + // Retrieve all external runtime modules accumulated by external codegen (both function-at-a-time + // and IRModule-at-a-time). + Array external_mods = + context_.module->GetAttr>(tvm::attr::kExternalMods).value_or({}); + + // Retrieve any constant bindings accumulated by external codegen (by IRModule-at-a-time passes). + Map const_name_to_constant = + context_.module->GetAttr>(tvm::attr::kConstNameToConstant) + .value_or({}); + + VLOG(0) << "have " << per_tvm_target_modules.size() << " targets to build, " + << external_mods.size() << " external runtime modules, " << const_name_to_constant.size() + << " external constants, and " << params_.size() << " local constants"; + + // Any constant bindings must be merged into the overall 'params' map we've directly accumulated + // via the TECompiler callback. + for (const auto& kv : const_name_to_constant) { + ICHECK_EQ(params_.count(kv.first), 0); + params_.emplace(kv.first, kv.second); + } runtime::Module lib; if (per_tvm_target_modules.empty()) { @@ -1183,7 +1199,7 @@ void VMCompiler::Codegen() { } lib = - codegen::CreateMetadataModule(params_, lib, ext_mods, config_->host_target, + codegen::CreateMetadataModule(params_, lib, external_mods, config_->host_target, Runtime::Create("cpp"), Executor::Create("graph"), // DNS HACK relay::backend::ExecutorCodegenMetadata()); exec_->SetLib(lib); diff --git a/src/relay/transforms/compiler_function_utils.cc b/src/relay/transforms/compiler_function_utils.cc index 1b0f002f1def0..0df9f5ee294c0 100644 --- a/src/relay/transforms/compiler_function_utils.cc +++ b/src/relay/transforms/compiler_function_utils.cc @@ -50,7 +50,7 @@ const FunctionNode* AsFunctionNode(const Expr& expr, const std::string& compiler } /*! - * \brief Rewrite calls to inlined "Compiler" functions to global functions. The given + * \brief Rewrite calls to inlined and let-bound "Compiler" functions to global functions. The given * module will be extended with the newly outlined functions. 
*/ class Outliner : public MixedModeMutator { @@ -58,6 +58,38 @@ class Outliner : public MixedModeMutator { Outliner(GlobalSymbolCache* cache, std::string compiler_filter, IRModule mod) : cache_(cache), compiler_filter_(std::move(compiler_filter)), mod_(std::move(mod)) {} + Expr VisitExpr_(const LetNode* op) final { + auto pre_visit = [this](const LetNode* op) { + Expr var = this->VisitExpr(op->var); + Expr value = this->VisitExpr(op->value); + + if (AsFunctionNode(value, compiler_filter_)) { + // Inline on-the-fly if the let-bound value is a function of interest. + this->memo_[var] = value; + } + }; + auto post_visit = [this](const LetNode* op) { + // Rely on the Memoizer to cache pre-visit values + Expr value = this->VisitExpr(op->value); + Expr body = this->VisitExpr(op->body); + auto expr = GetRef(op); + + if (AsFunctionNode(value, compiler_filter_)) { + // The let binding is no longer needed since inlined on-the-fly above. + this->memo_[expr] = this->VisitExpr(op->body); + } else { + Var var = Downcast(this->VisitExpr(op->var)); + if (var.same_as(op->var) && value.same_as(op->value) && body.same_as(op->body)) { + this->memo_[expr] = expr; + } else { + this->memo_[expr] = Let(var, value, body); + } + } + }; + ExpandANormalForm(op, pre_visit, post_visit); + return memo_[GetRef(op)]; + } + Expr Rewrite_(const CallNode* pre, const Expr& post) final { Call new_call = Downcast(post); if (const auto* function_node = AsFunctionNode(new_call->op, compiler_filter_)) { diff --git a/src/relay/transforms/compiler_function_utils.h b/src/relay/transforms/compiler_function_utils.h index 6664594fc0a0e..aa98430318a69 100644 --- a/src/relay/transforms/compiler_function_utils.h +++ b/src/relay/transforms/compiler_function_utils.h @@ -95,9 +95,10 @@ class ExistingGlobalSymbolCache : public GlobalSymbolCache { }; /*! - * \brief A pass to outline all literal functions in direct call positions which have a "Compiler" - * attribute. The given \p GlobalSymbolCache is used to determine a unique global symbol for each - * function, which is also assigned to the "global_symbol" attribute of the new global function. + * \brief A pass to outline all let-bound and literal functions in direct call positions which have + * a "Compiler" attribute. The given \p GlobalSymbolCache is used to determine a unique global + * symbol for each function, which is also assigned to the "global_symbol" attribute of the new + * global function. * * At most one function with the same global symbol is outlined. * @@ -108,9 +109,9 @@ transform::Pass OutlineCompilerFunctions(std::shared_ptr cach std::string compiler_filter = ""); /*! - * \brief A pass to outline all literal functions in direct call positions which have a "Compiler" - * attribute. The functions are bound to unique global vars according to their existing - * "global_symbol" attribute. At most one function with the same global symbol is outlined. + * \brief A pass to outline all let-bound and literal functions in direct call positions which have + * a "Compiler" attribute. The functions are bound to unique global vars according to their + * existing "global_symbol" attribute. At most one function with the same global symbol is outlined. * * If \p compiler_filter is non-empty only functions with that as their attribute value are * outlined. 
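 *
 * A minimal usage sketch from Python (illustrative only; it mirrors the calls
 * exercised in test_compiler_function_utils.py below):
 *
 *   mod = tvm.relay.transform.OutlineCompilerFunctionsWithExistingGlobalSymbols("cutlass")(mod)
 *   mod = tvm.relay.transform.MarkCompilerFunctionsAsExtern("cutlass")(mod)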
diff --git a/src/relay/transforms/target_hooks.cc b/src/relay/transforms/target_hooks.cc index 00953a1907e13..f52e95b2adbfc 100644 --- a/src/relay/transforms/target_hooks.cc +++ b/src/relay/transforms/target_hooks.cc @@ -148,7 +148,7 @@ class TargetHookVisitor : public MixedModeVisitor { Pass RelayToTIRTargetHook(CompilationConfig config) { auto pass_func = [config = std::move(config)](IRModule mod, const PassContext& pass_ctx) { - VLOG(1) << "Before:" << std::endl << PrettyPrint(mod); + VLOG(1) << "RelayToTIRTargetHook before:" << std::endl << PrettyPrint(mod); TargetHookVisitor target_hook_visitor(mod, config); std::vector custom_passes = target_hook_visitor.Visit(); for (const auto& custom_pass : custom_passes) { @@ -161,11 +161,14 @@ Pass RelayToTIRTargetHook(CompilationConfig config) { mod = custom_pass.pass(mod); } else { // Invoke the pass. + // Note that there may be a non-external codegen target in scope. Each custom pass + // must be prepared to handle this, eg by creating a default target instance if the + // current target is either null or of a generic kind such as 'cuda' or 'llvm'. VLOG(0) << "Invoking custom pass for target kind '" << custom_pass.target_kind_name << "'"; mod = custom_pass.pass(mod); } } - VLOG(1) << "After:" << std::endl << PrettyPrint(mod); + VLOG(1) << "RelayToTIRTargetHook after:" << std::endl << PrettyPrint(mod); return mod; }; return tvm::transform::CreateModulePass(pass_func, 0, "RelayToTIRTargetHook", {}); diff --git a/src/target/metadata_module.cc b/src/target/metadata_module.cc index e5ca82d5c0996..ec301d10812fc 100644 --- a/src/target/metadata_module.cc +++ b/src/target/metadata_module.cc @@ -215,6 +215,8 @@ runtime::Module CreateMetadataModule( String symbol = pf_sym(); Array variables = pf_var(); for (size_t i = 0; i < variables.size(); i++) { + VLOG(1) << "From module of type '" << mod->type_key() << "' found const var '" + << variables[i] << "' for symbol '" << symbol << "'"; symbol_const_vars.push_back(variables[i].operator std::string()); } ICHECK_EQ(const_vars_by_symbol.count(symbol), 0U) << "Found duplicated symbol: " << symbol; diff --git a/src/tir/transforms/extract_constants.cc b/src/tir/transforms/extract_constants.cc index 237f923516dab..f9e620ba3322b 100644 --- a/src/tir/transforms/extract_constants.cc +++ b/src/tir/transforms/extract_constants.cc @@ -80,14 +80,14 @@ tvm::transform::Pass ExtractPrimFuncConstants() { } auto* attrs = m->attrs.CopyOnWrite(); ConstArrayType constant_array_ = - (attrs->dict.count(tvm::attr::kConstantsArray)) - ? Downcast(attrs->dict[tvm::attr::kConstantsArray]) + (attrs->dict.count(tvm::attr::kConstants)) + ? 
Downcast(attrs->dict[tvm::attr::kConstants]) : ConstArrayType(); Applicator a = Applicator(); func->body = a.Apply(func->body, constant_array_); const ConstArrayType constant_list = a.constant_array_; if (constant_list.size()) { - attrs->dict.Set(tvm::attr::kConstantsArray, constant_list); + attrs->dict.Set(tvm::attr::kConstants, constant_list); } return GetRef(func); }; diff --git a/tests/python/relay/test_external_codegen.py b/tests/python/relay/test_external_codegen.py index 4f451a125184d..873475ac1ce77 100644 --- a/tests/python/relay/test_external_codegen.py +++ b/tests/python/relay/test_external_codegen.py @@ -235,37 +235,29 @@ def make_mod(): @pytest.mark.skipif(sys.platform == "win32", reason="Skip test on Windows for now") -def test_extern_gcc_consts(): - @tvm._ffi.register_func("relay.ext.ccompiler.constant_updater") - def constant_updater(expr, symbol): - """A dummy constant updater just to test that a custom one works.""" - return {"ccompiler_0_p0": tvm.nd.array(y0_data)} - - x = relay.var("x", shape=(8, 8)) - y0_data = np.random.uniform(0, 1, (8, 8)).astype("float32") +@pytest.mark.parametrize("check_result", [check_graph_executor_result, check_vm_result]) +def test_extern_gcc_consts(check_result): + shape = (8, 8) + dtype = "float32" + x = relay.var("x", shape=shape) + y0_data = np.random.uniform(0, 1, shape).astype(dtype) - x0 = relay.var("x0", shape=(8, 8)) - y0_const = relay.const(y0_data, "float32") + x0 = relay.var("x0", shape=shape) + y0_const = relay.const(y0_data, dtype) z = x0 + y0_const f = relay.Function([x0], z) f = set_external_func_attr(f, "ccompiler", "ccompiler_0") call = relay.Call(f, [x]) mod = tvm.IRModule.from_expr(call) - with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]): - compiler = relay.backend.vm.VMCompiler() - compiler.lower(mod, "llvm") - compiler.codegen() - params = compiler.get_params() - assert len(params) == 1 - assert "ccompiler_0_p0" in params.keys() - - with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]): - _, _, params = relay.build(mod, target="llvm") - assert len(params) == 1 - assert "ccompiler_0_p0" in params.keys() - - tvm._ffi.registry.remove_global_func("relay.ext.ccompiler.constant_updater") + # Note that while the VMCompiler get_params() will return all 'parameters' from both + # TVM and external codegen compiled code, the GraphExecutor.get_params() will return only + # those from non-external modules. So in the following we'll test by execution rather than + # test by inspection. 
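+    # The external "ccompiler" function computes x + y0_data, with y0_data
+    # recovered at runtime from the compilation artifact rather than passed in.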
+ x_data = np.random.rand(*shape).astype(dtype) + inputs = {"x": x_data} + expected_result = x_data + y0_data + check_result(mod, inputs, shape, expected_result, target="llvm") @pytest.mark.skipif( diff --git a/tests/python/relay/transform/test_compiler_function_utils.py b/tests/python/relay/transform/test_compiler_function_utils.py index 66abeff8ab29d..b1056f60b82ba 100644 --- a/tests/python/relay/transform/test_compiler_function_utils.py +++ b/tests/python/relay/transform/test_compiler_function_utils.py @@ -75,6 +75,39 @@ def @main(%x0 : Tensor[(1600, 768), float16], %x3 : Tensor[(600, 32, 64), float1 ) +def original_mod_let_bound(): + return tvm.parser.parse( + """ + #[version = "0.0.5"] + def @main(%x0 : Tensor[(1600, 768), float16], %x3 : Tensor[(600, 32, 64), float16]) -> (Tensor[(1600, 2304), float16], Tensor[(600, 32, 32), float16]) { + let %f = fn(%y_0_i0: Tensor[(1600, 768), float16], %y_0_i1: Tensor[(2304, 768), float16], %y_0_i2: Tensor[(2304), float16], + Inline=1, Compiler="cutlass", global_symbol="tvmgen_default_cutlass_main_0", Primitive=1) -> Tensor[(1600, 2304), float16] { + %4 = fn (%FunctionVar_0_0: Tensor[(1600, 768), float16], %FunctionVar_0_1: Tensor[(2304, 768), float16], %FunctionVar_0_2: Tensor[(2304), float16], + PartitionedFromPattern="nn.dense_add_", Composite="cutlass.dense_bias") -> Tensor[(1600, 2304), float16] { + %5 = nn.dense(%FunctionVar_0_0, %FunctionVar_0_1, units=2304); + add(%5, %FunctionVar_0_2) + }; + %4(%y_0_i0, %y_0_i1, %y_0_i2) + }; + %1 = %f(%x0, meta[relay.Constant][0], meta[relay.Constant][1]); + %2 = fn(%y_3_i0: Tensor[(600, 32, 64), float16], %y_3_i1: Tensor[(600, 32, 64), float16], + Inline=1, Compiler="cublas", global_symbol="tvmgen_default_cublas_main_3", Primitive=1) -> Tensor[(600, 32, 32), float16] { + %6 = fn (%FunctionVar_0_01: Tensor[(600, 32, 64), float16], %FunctionVar_0_11: Tensor[(600, 32, 64), float16], + PartitionedFromPattern="nn.batch_matmul_", Composite="cublas.batch_matmul") -> Tensor[(600, 32, 32), float16] { + nn.batch_matmul(%FunctionVar_0_01, %FunctionVar_0_11, out_dtype="float16", transpose_b=True) + }; + %6(%y_3_i0, %y_3_i1) + }; + %3 = %2(%x3, meta[relay.Constant][2]); + (%1, %3) + } + """, + "from_string", + None, + metatable, + ) + + def expected_outlined_mod(): return tvm.parser.parse( """ @@ -175,6 +208,13 @@ def test_outline_compiler_functions_with_existing_global_symbols(): tvm.ir.assert_structural_equal(actual_outlined_mod, expected_outlined_mod(), map_free_vars=True) +def test_outline_let_bound_compiler_functions_with_existing_global_symbols(): + actual_outlined_mod = tvm.relay.transform.OutlineCompilerFunctionsWithExistingGlobalSymbols( + "cutlass" + )(original_mod_let_bound()) + tvm.ir.assert_structural_equal(actual_outlined_mod, expected_outlined_mod(), map_free_vars=True) + + def test_mark_compiler_functions_as_extern(): actual_extern_mod = tvm.relay.transform.MarkCompilerFunctionsAsExtern("cutlass")( expected_outlined_mod() diff --git a/tests/python/unittest/test_custom_datatypes.py b/tests/python/unittest/test_custom_datatypes.py index b135973718bca..e3cff18c51f87 100644 --- a/tests/python/unittest/test_custom_datatypes.py +++ b/tests/python/unittest/test_custom_datatypes.py @@ -21,6 +21,7 @@ import pytest import tvm import tvm.topi.testing +import tvm.testing from tvm import relay from tvm.relay.testing.layers import batch_norm_infer from tvm.target.datatype import ( @@ -560,4 +561,4 @@ def test_posites2(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git 
a/tests/python/unittest/test_tir_transform_extract_constants.py b/tests/python/unittest/test_tir_transform_extract_constants.py index cb49e7286fbbf..82f4f6515c09d 100644 --- a/tests/python/unittest/test_tir_transform_extract_constants.py +++ b/tests/python/unittest/test_tir_transform_extract_constants.py @@ -18,6 +18,7 @@ import tvm from tvm import tir from tvm.script import tir as T +import tvm.testing @tvm.script.ir_module @@ -49,7 +50,7 @@ def constant3(a: T.handle) -> None: def test_const_extraction(): mod = tvm.tir.transform.ExtractPrimFuncConstants()(Module4) - constants = mod.attrs["Constants"] + constants = mod.attrs["constants"] assert len(constants) == 2 def _visit(stmt): @@ -63,4 +64,4 @@ def _visit(stmt): if __name__ == "__main__": - test_const_extraction() + tvm.testing.main() From 6a86c97889a81e856e610a81d2bb20852cda8932 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Thu, 30 Jun 2022 11:51:34 -0700 Subject: [PATCH 026/111] [skip ci] Disable flaky test `test_empty_like` (#11968) See #11967 Co-authored-by: driazati --- tests/python/frontend/pytorch/test_forward.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index f039a00f5d91d..d411d9c874d4e 100644 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -3820,6 +3820,7 @@ def test_func(): verify_model_with_input(test_func, []) +@pytest.mark.skip(reason="See https://github.com/apache/tvm/issues/11967") def test_empty_like(): def test_func(data): return torch.empty_like(data) From 288b983b827d12539e23eda4fd9ec1ddd2c8cd1a Mon Sep 17 00:00:00 2001 From: Florin Blanaru Date: Thu, 30 Jun 2022 20:50:21 +0100 Subject: [PATCH 027/111] [CI] Skip some additional tests that are failing in the wheel (#11969) This PR skips some additional tests that are failing in the nightly wheel. --- python/tvm/testing/utils.py | 3 ++- tests/python/ci/test_ci.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index 96275e2af66f4..054257e07aa29 100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -97,7 +97,8 @@ def test_something(): IS_IN_CI = os.getenv("CI", "") == "true" skip_if_wheel_test = pytest.mark.skipif( - os.getenv("WHEEL_TEST") is not None, reason="Test not supported in wheel." + os.getenv("WHEEL_TEST", "").lower() in {"true", "1", "yes"}, + reason="Test not supported in wheel.", ) diff --git a/tests/python/ci/test_ci.py b/tests/python/ci/test_ci.py index 27297e165fd64..0ed3f17015064 100644 --- a/tests/python/ci/test_ci.py +++ b/tests/python/ci/test_ci.py @@ -33,6 +33,7 @@ def parameterize_named(*values): return pytest.mark.parametrize(",".join(keys), [tuple(d.values()) for d in values]) +@tvm.testing.skip_if_wheel_test @pytest.mark.parametrize( "target_url,base_url,commit_sha,expected_url,expected_body", [ @@ -826,6 +827,7 @@ def run(source_type, data, check): ) +@tvm.testing.skip_if_wheel_test @parameterize_named( dict( tlcpackstaging_body={ From 2625878abef4bc78da65918a8a8c1db441638e8b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 30 Jun 2022 12:57:10 -0700 Subject: [PATCH 028/111] [ci][docker] Nightly Docker image update (#11857) This bumps the Docker images to the latest versions from Docker Hub. 
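As a quick local sanity check after a bump like this, any of the new tags can be pulled directly from Docker Hub before relying on them (illustrative command; substitute any tag from the diff below):

    docker pull tlcpack/ci-cpu:20220630-060117-558ba99c7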
Co-authored-by: tvm-bot <95660001+tvm-bot@users.noreply.github.com> --- Jenkinsfile | 16 ++++++++-------- ci/jenkins/Jenkinsfile.j2 | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 07c7f0c44aa19..513f11eaaf535 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -49,14 +49,14 @@ import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. --> -ci_lint = 'tlcpack/ci-lint:20220513-055910-fa834f67e' -ci_gpu = 'tlcpack/ci-gpu:20220619-055908-9bba7580b' -ci_cpu = 'tlcpack/ci-cpu:20220519-055908-ddfa1da69' -ci_wasm = 'tlcpack/ci-wasm:20220513-055910-fa834f67e' -ci_i386 = 'tlcpack/ci-i386:20220513-055910-fa834f67e' -ci_qemu = 'tlcpack/ci-qemu:20220517-094028-de21c8f2e' -ci_arm = 'tlcpack/ci-arm:20220513-055910-fa834f67e' -ci_hexagon = 'tlcpack/ci-hexagon:20220603-203325-cee74c9f8' +ci_lint = 'tlcpack/ci-lint:20220630-060117-558ba99c7' +ci_gpu = 'tlcpack/ci-gpu:20220630-060117-558ba99c7' +ci_cpu = 'tlcpack/ci-cpu:20220630-060117-558ba99c7' +ci_wasm = 'tlcpack/ci-wasm:20220630-060117-558ba99c7' +ci_i386 = 'tlcpack/ci-i386:20220630-060117-558ba99c7' +ci_qemu = 'tlcpack/ci-qemu:20220630-060117-558ba99c7' +ci_arm = 'tlcpack/ci-arm:20220630-060117-558ba99c7' +ci_hexagon = 'tlcpack/ci-hexagon:20220630-060117-558ba99c7' // <--- End of regex-scanned config. // Parameters to allow overriding (in Jenkins UI), the images diff --git a/ci/jenkins/Jenkinsfile.j2 b/ci/jenkins/Jenkinsfile.j2 index 6f2f6a437044d..22cd5b6bfc845 100644 --- a/ci/jenkins/Jenkinsfile.j2 +++ b/ci/jenkins/Jenkinsfile.j2 @@ -51,14 +51,14 @@ import org.jenkinsci.plugins.pipeline.modeldefinition.Utils {% import 'ci/jenkins/macros.j2' as m with context -%} // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. --> -ci_lint = 'tlcpack/ci-lint:20220513-055910-fa834f67e' -ci_gpu = 'tlcpack/ci-gpu:20220619-055908-9bba7580b' -ci_cpu = 'tlcpack/ci-cpu:20220519-055908-ddfa1da69' -ci_wasm = 'tlcpack/ci-wasm:20220513-055910-fa834f67e' -ci_i386 = 'tlcpack/ci-i386:20220513-055910-fa834f67e' -ci_qemu = 'tlcpack/ci-qemu:20220517-094028-de21c8f2e' -ci_arm = 'tlcpack/ci-arm:20220513-055910-fa834f67e' -ci_hexagon = 'tlcpack/ci-hexagon:20220603-203325-cee74c9f8' +ci_lint = 'tlcpack/ci-lint:20220630-060117-558ba99c7' +ci_gpu = 'tlcpack/ci-gpu:20220630-060117-558ba99c7' +ci_cpu = 'tlcpack/ci-cpu:20220630-060117-558ba99c7' +ci_wasm = 'tlcpack/ci-wasm:20220630-060117-558ba99c7' +ci_i386 = 'tlcpack/ci-i386:20220630-060117-558ba99c7' +ci_qemu = 'tlcpack/ci-qemu:20220630-060117-558ba99c7' +ci_arm = 'tlcpack/ci-arm:20220630-060117-558ba99c7' +ci_hexagon = 'tlcpack/ci-hexagon:20220630-060117-558ba99c7' // <--- End of regex-scanned config. 
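The scanned lines above follow a rigid `ci_<name> = '<image>:<tag>'` shape so that tooling can rewrite them mechanically. A minimal Python sketch of such a scanner (an illustration of the idea only, not the actual `docker/dev_common.sh` logic):

```python
import re

# Matches lines like: ci_lint = 'tlcpack/ci-lint:20220630-060117-558ba99c7'
TAG_RE = re.compile(r"^(ci_[a-z0-9]+) = '([^:']+):([^']+)'$")

def scan_image_tags(path="Jenkinsfile"):
    """Return {config_name: (image, tag)} for the regex-scanned block."""
    tags = {}
    with open(path) as f:
        for line in f:
            match = TAG_RE.match(line.strip())
            if match:
                tags[match.group(1)] = (match.group(2), match.group(3))
    return tags
```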
// Parameters to allow overriding (in Jenkins UI), the images From 6424f1fec174557510bf94edb4882e952f3e9541 Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Thu, 30 Jun 2022 15:14:55 -0700 Subject: [PATCH 029/111] [Hexagon] Disable broken test on physical device (#11960) --- .../contrib/test_hexagon/test_thread_pool.py | 5 ++++- .../topi/test_add_subtract_multiply.py | 10 +++++----- .../test_hexagon/topi/test_argmax_slice.py | 4 ++++ .../test_hexagon/topi/test_avg_pool2d_slice.py | 5 ++--- .../test_hexagon/topi/test_batch_matmul.py | 1 - .../contrib/test_hexagon/topi/test_clip.py | 1 - .../test_hexagon/topi/test_conv2d_nchw.py | 2 -- .../test_hexagon/topi/test_conv2d_nhwc.py | 2 -- .../test_hexagon/topi/test_conv2d_transpose.py | 4 ++++ .../contrib/test_hexagon/topi/test_dense.py | 1 - .../test_hexagon/topi/test_depthwise_conv2d.py | 4 +++- .../contrib/test_hexagon/topi/test_pooling.py | 2 -- .../contrib/test_hexagon/topi/test_reduce.py | 4 ---- .../contrib/test_hexagon/topi/test_resize2d.py | 17 +++++++---------- .../contrib/test_hexagon/topi/test_softmax.py | 1 - .../{ => topi}/test_softmax_slice.py | 18 ++++++++---------- 16 files changed, 37 insertions(+), 44 deletions(-) rename tests/python/contrib/test_hexagon/{ => topi}/test_softmax_slice.py (91%) diff --git a/tests/python/contrib/test_hexagon/test_thread_pool.py b/tests/python/contrib/test_hexagon/test_thread_pool.py index d95c4120b7758..fa53cdc068c3c 100644 --- a/tests/python/contrib/test_hexagon/test_thread_pool.py +++ b/tests/python/contrib/test_hexagon/test_thread_pool.py @@ -16,7 +16,6 @@ # under the License. import numpy as np -import pytest import tvm import tvm.contrib.hexagon @@ -92,3 +91,7 @@ def test_elemwise_sum_parallel(hexagon_session: Session): (a, b, c, n) = generate_add_test_data(hexagon_session) mod["elemwise_sum_parallel"](a, b, c, n) tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py b/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py index 4d595f7e82e0a..0d8126072955a 100755 --- a/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py +++ b/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py @@ -19,11 +19,8 @@ import pytest import numpy as np -from tvm import te, topi - -import tvm.testing -from tvm.topi import testing -from tvm.contrib.hexagon.build import HexagonLauncher +import tvm +from tvm import te import tvm.topi.hexagon.slice_ops as sl from ..infrastructure import allocate_hexagon_array, transform_numpy @@ -161,6 +158,9 @@ def test_transform( input_B_layout, op_name, ): + if hexagon_session._launcher._serial_number != "simulator": + pytest.skip(msg="Due to https://github.com/apache/tvm/issues/11957") + target_hexagon = tvm.target.hexagon("v69") A = te.placeholder(input_shape_A, name="A", dtype=dtype) B = te.placeholder(input_shape_B, name="B", dtype=dtype) diff --git a/tests/python/contrib/test_hexagon/topi/test_argmax_slice.py b/tests/python/contrib/test_hexagon/topi/test_argmax_slice.py index 4cbd524f4abfc..5431054d2ca22 100644 --- a/tests/python/contrib/test_hexagon/topi/test_argmax_slice.py +++ b/tests/python/contrib/test_hexagon/topi/test_argmax_slice.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
""" Tests for Hexagon slice argmax op """ +import pytest import numpy as np import tvm @@ -76,6 +77,9 @@ def test_argmax_slice( working_scope, ): """Top level testing function for argmax""" + if hexagon_session._launcher._serial_number != "simulator": + pytest.skip(msg="Due to https://github.com/apache/tvm/issues/11957") + target_hexagon = tvm.target.hexagon("v69") target = tvm.target.Target(target_hexagon, host=target_hexagon) argmax_input = te.placeholder(input_shape, name="A", dtype=dtype) diff --git a/tests/python/contrib/test_hexagon/topi/test_avg_pool2d_slice.py b/tests/python/contrib/test_hexagon/topi/test_avg_pool2d_slice.py index 3154f7d7e729a..5b1f59c897d37 100644 --- a/tests/python/contrib/test_hexagon/topi/test_avg_pool2d_slice.py +++ b/tests/python/contrib/test_hexagon/topi/test_avg_pool2d_slice.py @@ -18,8 +18,7 @@ import pytest import numpy as np -from tvm import te, topi - +from tvm import te import tvm.testing from tvm.topi import testing from tvm.contrib.hexagon.build import HexagonLauncher @@ -369,4 +368,4 @@ def test_avg_pool2d_slice( if __name__ == "__main__": - sys.exit(pytest.main(sys.argv)) + tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/topi/test_batch_matmul.py b/tests/python/contrib/test_hexagon/topi/test_batch_matmul.py index 467ebd06b9cbb..c644773439432 100644 --- a/tests/python/contrib/test_hexagon/topi/test_batch_matmul.py +++ b/tests/python/contrib/test_hexagon/topi/test_batch_matmul.py @@ -17,7 +17,6 @@ """Test code for matmul""" import numpy as np import pytest -import sys import tvm import tvm.testing diff --git a/tests/python/contrib/test_hexagon/topi/test_clip.py b/tests/python/contrib/test_hexagon/topi/test_clip.py index 37146b55dc1ee..ac6890171dba9 100755 --- a/tests/python/contrib/test_hexagon/topi/test_clip.py +++ b/tests/python/contrib/test_hexagon/topi/test_clip.py @@ -17,7 +17,6 @@ # pylint: disable=invalid-name -import pytest import numpy as np from tvm import te, topi diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_nchw.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_nchw.py index c755a4d018f3b..01c20601b6854 100644 --- a/tests/python/contrib/test_hexagon/topi/test_conv2d_nchw.py +++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_nchw.py @@ -16,8 +16,6 @@ # under the License. """Test code for convolution.""" import numpy as np -import pytest -import sys import tvm import tvm.testing diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_nhwc.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_nhwc.py index 96062aa1b493e..9acffff358e86 100644 --- a/tests/python/contrib/test_hexagon/topi/test_conv2d_nhwc.py +++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_nhwc.py @@ -16,8 +16,6 @@ # under the License. 
"""Test code for convolution.""" import numpy as np -import pytest -import sys import tvm import tvm.testing diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_transpose.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_transpose.py index 629403965eae8..8536603a3c209 100644 --- a/tests/python/contrib/test_hexagon/topi/test_conv2d_transpose.py +++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_transpose.py @@ -154,3 +154,7 @@ class TestConv2DTranspose(BaseConv2DTransposeTests): padding = tvm.testing.parameter((0, 0, 0, 0)) output_padding = tvm.testing.parameter((0, 0)) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/topi/test_dense.py b/tests/python/contrib/test_hexagon/topi/test_dense.py index 967278251cfc3..929108bb1492c 100644 --- a/tests/python/contrib/test_hexagon/topi/test_dense.py +++ b/tests/python/contrib/test_hexagon/topi/test_dense.py @@ -17,7 +17,6 @@ """Test code for dense""" import numpy as np import pytest -import sys import tvm import tvm.testing diff --git a/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d.py b/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d.py index 63ae0e7b3253b..5e09e691f743c 100644 --- a/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d.py +++ b/tests/python/contrib/test_hexagon/topi/test_depthwise_conv2d.py @@ -18,7 +18,6 @@ import sys import numpy as np -import pytest import tvm from tvm.contrib.hexagon.session import Session @@ -296,3 +295,6 @@ class TestDepthwiseConv2D(BaseDepthwiseConv2D): # TODO(hexagon-team): add TestDepthwiseConv2D_NCHWc test. + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/topi/test_pooling.py b/tests/python/contrib/test_hexagon/topi/test_pooling.py index ededdad2673be..45e558e1b6ddc 100644 --- a/tests/python/contrib/test_hexagon/topi/test_pooling.py +++ b/tests/python/contrib/test_hexagon/topi/test_pooling.py @@ -16,8 +16,6 @@ # under the License. """Test code for pooling""" import numpy as np -import pytest -import sys import tvm import tvm.testing diff --git a/tests/python/contrib/test_hexagon/topi/test_reduce.py b/tests/python/contrib/test_hexagon/topi/test_reduce.py index c806964545cac..a844e1d51206e 100644 --- a/tests/python/contrib/test_hexagon/topi/test_reduce.py +++ b/tests/python/contrib/test_hexagon/topi/test_reduce.py @@ -16,15 +16,11 @@ # under the License. """Test code for reduce""" import numpy as np -import pytest -import sys import tvm -import tvm.testing from tvm import topi from tvm import te from tvm.contrib.hexagon.session import Session -import tvm.topi.testing in_shape, axis, keepdims, reduce_type, dtype = tvm.testing.parameters( diff --git a/tests/python/contrib/test_hexagon/topi/test_resize2d.py b/tests/python/contrib/test_hexagon/topi/test_resize2d.py index caedc7b7b381d..109eb5c4365d6 100755 --- a/tests/python/contrib/test_hexagon/topi/test_resize2d.py +++ b/tests/python/contrib/test_hexagon/topi/test_resize2d.py @@ -14,16 +14,12 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- - import pytest import numpy as np -from tvm import te, topi - -import tvm.testing -from tvm.topi import testing -from tvm.contrib.hexagon.build import HexagonLauncher +import tvm +from tvm import te +from tvm.topi.testing import resize2d_python import tvm.topi.hexagon as s1 from ..infrastructure import allocate_hexagon_array, transform_numpy @@ -34,9 +30,7 @@ def expected_output_np( ): scale_h = out_height / in_height scale_w = out_width / in_width - return tvm.topi.testing.resize2d_python( - input_np, (scale_h, scale_w), layout, method, coord_trans - ) + return resize2d_python(input_np, (scale_h, scale_w), layout, method, coord_trans) @tvm.testing.fixture @@ -108,6 +102,9 @@ def test_resize2d( method, hexagon_session, ): + if hexagon_session._launcher._serial_number != "simulator": + pytest.skip(msg="Due to https://github.com/apache/tvm/issues/11957") + target_hexagon = tvm.target.hexagon("v69") A = te.placeholder(input_shape, name="A", dtype=dtype) diff --git a/tests/python/contrib/test_hexagon/topi/test_softmax.py b/tests/python/contrib/test_hexagon/topi/test_softmax.py index 7a2435e8dcca8..d1c78842b5ff6 100644 --- a/tests/python/contrib/test_hexagon/topi/test_softmax.py +++ b/tests/python/contrib/test_hexagon/topi/test_softmax.py @@ -17,7 +17,6 @@ """Test code for softmax""" import numpy as np import pytest -import sys import tvm import tvm.testing diff --git a/tests/python/contrib/test_hexagon/test_softmax_slice.py b/tests/python/contrib/test_hexagon/topi/test_softmax_slice.py similarity index 91% rename from tests/python/contrib/test_hexagon/test_softmax_slice.py rename to tests/python/contrib/test_hexagon/topi/test_softmax_slice.py index a4745d62a7ab0..a39c6cd5163bc 100644 --- a/tests/python/contrib/test_hexagon/test_softmax_slice.py +++ b/tests/python/contrib/test_hexagon/topi/test_softmax_slice.py @@ -14,17 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
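The `_serial_number != "simulator"` guard introduced by this patch is repeated verbatim in every affected test. Purely as a sketch (the helper below is hypothetical and not part of the patch), the repetition could be factored into the shared `infrastructure` module:

```python
import pytest

def skip_if_physical_device(hexagon_session):
    """Hypothetical helper wrapping the guard used throughout this patch."""
    if hexagon_session._launcher._serial_number != "simulator":
        pytest.skip(msg="Due to https://github.com/apache/tvm/issues/11957")
```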
- import pytest import numpy as np -from tvm import te, topi - -import tvm.testing -from tvm.topi import testing -from tvm.contrib.hexagon.build import HexagonLauncher +import tvm +from tvm import te +from tvm.topi.testing import softmax_python import tvm.topi.hexagon.slice_ops as sl -from .infrastructure import allocate_hexagon_array +from ..infrastructure import allocate_hexagon_array def transform_numpy(arr_np, layout): @@ -63,7 +60,7 @@ class TestSoftmax2d(Basesoftmax2d): @tvm.testing.fixture def expected_output_np(self, input_np): if len(input_np.shape) == 2: - ref_np_2d = tvm.topi.testing.softmax_python(input_np) + ref_np_2d = softmax_python(input_np) return ref_np_2d raise RuntimeError(f"Unexpected input shape '{input_np.shape}'") @@ -82,6 +79,8 @@ def test_softmax_f32( axis_sep, hexagon_session, ): + if hexagon_session._launcher._serial_number != "simulator": + pytest.skip(msg="Due to https://github.com/apache/tvm/issues/11957") target_hexagon = tvm.target.hexagon( "v69", @@ -136,5 +135,4 @@ def test_softmax_f32( if __name__ == "__main__": - - sys.exit(pytest.main(sys.argv)) + tvm.testing.main() From 26ad70333875c55eec438b840d004a5fb9255572 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 30 Jun 2022 16:16:58 -0700 Subject: [PATCH 030/111] [MetaSchedule] Handle 'warp_execution' in RewriteCooperativeFetch (#11955) Updated `RewriteCooperativeFetch` to handle 'warp_execution' annotation when the extent of `threadIdx.x` is not specified --- .../postproc/rewrite_cooperative_fetch.cc | 33 +++- ...dule_postproc_rewrite_cooperative_fetch.py | 151 +++++++++++++++++- 2 files changed, 182 insertions(+), 2 deletions(-) diff --git a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc index 798f00423f7bf..d111bdb42abb6 100644 --- a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc +++ b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc @@ -65,6 +65,23 @@ Optional ParseAnnotate(const Schedule& sch, const Instruction& inst, return Downcast(inst->inputs[0]); } +/*!
+ * \brief Parse instruction: sch.annotate(..., attr::warp_execution) + * \param sch The schedule + * \param inst The instruction to be parsed + * \return Whether this parsing is successful + */ +bool ParseWarpExecutionAnn(const Schedule& sch, const Instruction& inst) { + static InstructionKind inst_kind_annotate = InstructionKind::Get("Annotate"); + if (!inst->kind.same_as(inst_kind_annotate)) { + return false; + } + ICHECK_EQ(inst->inputs.size(), 2); + ICHECK_EQ(inst->attrs.size(), 1); + String ann_key = Downcast(inst->attrs[0]); + return ann_key == attr::warp_execution; +} + } // namespace tir namespace meta_schedule { @@ -76,7 +93,14 @@ namespace meta_schedule { class RewriteCooperativeFetchNode : public PostprocNode { public: // Inherited from PostprocNode - void InitializeWithTuneContext(const TuneContext& context) final {} + void InitializeWithTuneContext(const TuneContext& context) final { + if (Optional v = context->target.value()->GetAttr("thread_warp_size")) { + this->thread_warp_size_ = v.value()->value; + } else { + TVM_PY_LOG(INFO, context->logging_func) << "'thread_warp_size' is not defined in the target"; + } + } + + // Inherited from PostprocNode bool Apply(const tir::Schedule& sch) final; @@ -84,6 +108,9 @@ class RewriteCooperativeFetchNode : public PostprocNode { static constexpr const char* _type_key = "meta_schedule.RewriteCooperativeFetch"; TVM_DECLARE_FINAL_OBJECT_INFO(RewriteCooperativeFetchNode, PostprocNode); + + private: + int thread_warp_size_ = -1; }; bool RewriteCooperativeFetchNode::Apply(const tir::Schedule& sch) { @@ -101,6 +128,10 @@ bool RewriteCooperativeFetchNode::Apply(const tir::Schedule& sch) { thread_extent_y = new_thread_extent.value()->value; continue; } + if (tir::ParseWarpExecutionAnn(sch, inst)) { + thread_extent_x = thread_warp_size_; + continue; + } Optional opt_block_rv = tir::ParseAnnotate(sch, inst, &vector_lane); if (!opt_block_rv.defined()) { continue; diff --git a/tests/python/unittest/test_meta_schedule_postproc_rewrite_cooperative_fetch.py b/tests/python/unittest/test_meta_schedule_postproc_rewrite_cooperative_fetch.py index 5460c5900946b..e55f693e72d3f 100644 --- a/tests/python/unittest/test_meta_schedule_postproc_rewrite_cooperative_fetch.py +++ b/tests/python/unittest/test_meta_schedule_postproc_rewrite_cooperative_fetch.py @@ -17,6 +17,7 @@ # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring import tvm +import tvm.testing from tvm import tir from tvm.meta_schedule import TuneContext from tvm.meta_schedule.postproc import RewriteCooperativeFetch @@ -99,6 +100,108 @@ def main(var_A: T.handle, var_B: T.handle, var_C: T.handle) -> None: C[v0, v1] = C_local[v0, v1] + +@tvm.script.ir_module +class WarpExecutionAfterRewrite: + @T.prim_func + def main( + A: T.Buffer[(512, 512), "float32"], + B: T.Buffer[(512, 512), "float32"], + C: T.Buffer[(512, 512), "float32"], + ) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + C_local = T.alloc_buffer([512, 512], dtype="float32", scope="local") + A_shared = T.alloc_buffer([512, 512], dtype="float32", scope="shared") + B_shared = T.alloc_buffer([512, 512], dtype="float32", scope="shared") + for i0_0_i1_0_fused in T.thread_binding(0, 16, thread="blockIdx.x"): + for i0_1_i1_1_fused in T.thread_binding(0, 16, thread="vthread.x"): + for i0_2_i1_2_fused in T.thread_binding(0, 8, thread="threadIdx.y"): + for i2_0 in T.serial(0, 1): + for ax0_ax1_fused_0 in T.serial(0, 1024): + for
ax0_ax1_fused_1 in T.thread_binding(0, 8, thread="threadIdx.y"): + for ax0_ax1_fused_2 in T.thread_binding( + 0, 32, thread="threadIdx.x" + ): + with T.block("A_shared"): + v0 = T.axis.spatial( + 512, + ( + ax0_ax1_fused_0 * 256 + + ax0_ax1_fused_1 * 32 + + ax0_ax1_fused_2 + ) + // 512, + ) + v1 = T.axis.spatial( + 512, + ( + ax0_ax1_fused_0 * 256 + + ax0_ax1_fused_1 * 32 + + ax0_ax1_fused_2 + ) + % 512, + ) + T.reads([A[v0, v1]]) + T.writes([A_shared[v0, v1]]) + A_shared[v0, v1] = A[v0, v1] + for ax0_ax1_fused_0 in T.serial(0, 32): + for ax0_ax1_fused_1 in T.thread_binding(0, 8, thread="threadIdx.y"): + for ax0_ax1_fused_2 in T.thread_binding( + 0, 32, thread="threadIdx.x" + ): + for ax0_ax1_fused_3 in T.vectorized(0, 2): + with T.block("B_shared"): + v0 = T.axis.spatial( + 512, + ( + ax0_ax1_fused_0 * 512 + + ax0_ax1_fused_1 * 64 + + ax0_ax1_fused_2 * 2 + + ax0_ax1_fused_3 + ) + // 32, + ) + v1 = T.axis.spatial( + 512, + i0_0_i1_0_fused * 32 + + ( + ax0_ax1_fused_0 * 512 + + ax0_ax1_fused_1 * 64 + + ax0_ax1_fused_2 * 2 + + ax0_ax1_fused_3 + ) + % 32, + ) + T.reads([B[v0, v1]]) + T.writes([B_shared[v0, v1]]) + B_shared[v0, v1] = B[v0, v1] + for i2_1, i0_3, i1_3, i2_2, i0_4, i1_4 in T.grid(16, 2, 2, 32, 16, 2): + with T.block("C"): + i = T.axis.spatial(512, i0_1_i1_1_fused * 32 + i0_3 * 16 + i0_4) + j = T.axis.spatial( + 512, + i0_0_i1_0_fused * 32 + i0_2_i1_2_fused * 4 + i1_3 * 2 + i1_4, + ) + k = T.axis.reduce(512, i2_0 * 512 + i2_1 * 32 + i2_2) + T.reads([A_shared[i, k], B_shared[k, j]]) + T.writes([C_local[i, j]]) + T.block_attr({"warp_execution": 1}) + with T.init(): + C_local[i, j] = T.float32(0) + C_local[i, j] = C_local[i, j] + A_shared[i, k] * B_shared[k, j] + for ax0, ax1 in T.grid(32, 4): + with T.block("C_local"): + v0 = T.axis.spatial(512, i0_1_i1_1_fused * 32 + ax0) + v1 = T.axis.spatial( + 512, i0_0_i1_0_fused * 32 + i0_2_i1_2_fused * 4 + ax1 + ) + T.reads([C_local[v0, v1]]) + T.writes([C[v0, v1]]) + C[v0, v1] = C_local[v0, v1] + + # pylint: enable=no-member,invalid-name,unused-variable,no-self-argument,line-too-long,chained-comparison,not-callable,too-many-nested-blocks # fmt: on @@ -147,5 +250,51 @@ def test_rewrite_cooperative_fetch(): tvm.ir.assert_structural_equal(sch.mod, AfterRewrite0) +def test_rewrite_warp_execution(): + mod = create_prim_func(te_workload.matmul(n=512, m=512, k=512)) + target = _target() + ctx = _create_context(mod, target) + + sch = tir.Schedule(mod, debug_mask="all") + # fmt: off + # pylint: disable=line-too-long,invalid-name + b0 = sch.get_block(name="C", func_name="main") + b1 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="local") + l2, l3, l4 = sch.get_loops(block=b0) + sch.annotate(b0, "warp_execution", 1) + v5, v6, v7, v8, v9 = sch.sample_perfect_tile(loop=l2, n=5, max_innermost_factor=64, decision=[1, 16, 1, 2, 16]) + l10, l11, l12, l13, l14 = sch.split(loop=l2, factors=[v5, v6, v7, v8, v9]) + v15, v16, v17, v18, v19 = sch.sample_perfect_tile(loop=l3, n=5, max_innermost_factor=64, decision=[16, 1, 8, 2, 2]) + l20, l21, l22, l23, l24 = sch.split(loop=l3, factors=[v15, v16, v17, v18, v19]) + v25, v26, v27 = sch.sample_perfect_tile(loop=l4, n=3, max_innermost_factor=64, decision=[1, 16, 32]) + l28, l29, l30 = sch.split(loop=l4, factors=[v25, v26, v27]) + sch.reorder(l10, l20, l11, l21, l12, l22, l28, l29, l13, l23, l30, l14, l24) + l31 = sch.fuse(l10, l20) + sch.bind(loop=l31, thread_axis="blockIdx.x") + l32 = sch.fuse(l11, l21) + sch.bind(loop=l32, thread_axis="vthread.x") + l33 = sch.fuse(l12, l22) + sch.bind(loop=l33, 
thread_axis="threadIdx.y") + b34 = sch.cache_read(block=b0, read_buffer_index=0, storage_scope="shared") + sch.compute_at(block=b34, loop=l28, preserve_unit_loops=True) + _, _, _, _, l39, l40 = sch.get_loops(block=b34) + l41 = sch.fuse(l39, l40) + _, v43 = sch.sample_perfect_tile(loop=l41, n=2, max_innermost_factor=4, decision=[262144, 1]) + sch.annotate(block_or_loop=b34, ann_key="meta_schedule.cooperative_fetch", ann_val=v43) + b44 = sch.cache_read(block=b0, read_buffer_index=1, storage_scope="shared") + sch.compute_at(block=b44, loop=l28, preserve_unit_loops=True) + _, _, _, _, l49, l50 = sch.get_loops(block=b44) + l51 = sch.fuse(l49, l50) + _, v53 = sch.sample_perfect_tile(loop=l51, n=2, max_innermost_factor=4, decision=[8192, 2]) + sch.annotate(block_or_loop=b44, ann_key="meta_schedule.cooperative_fetch", ann_val=v53) + sch.reverse_compute_at(block=b1, loop=l33, preserve_unit_loops=True) + # pylint: enable=line-too-long,invalid-name + # fmt: on + sch.enter_postproc() + assert ctx.postprocs[0].apply(sch) + print(sch.mod["main"].script()) + tvm.ir.assert_structural_equal(sch.mod, WarpExecutionAfterRewrite) + + if __name__ == "__main__": - test_rewrite_cooperative_fetch() + tvm.testing.main() From beea0d2d6add545bd27130309aa12b8e7a38100f Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Thu, 30 Jun 2022 16:24:42 -0700 Subject: [PATCH 031/111] [MetaSchedule] Fix Task Extraction (#11954) --- python/tvm/meta_schedule/__init__.py | 6 ++++- python/tvm/meta_schedule/relay_integration.py | 24 ++++++++++++++++++- python/tvm/meta_schedule/tune.py | 5 +++- python/tvm/relay/backend/te_compiler.py | 5 ++-- python/tvm/relay/op/strategy/cuda.py | 8 +++---- src/meta_schedule/database/json_database.cc | 2 +- src/relay/backend/te_compiler.cc | 1 + 7 files changed, 40 insertions(+), 11 deletions(-) diff --git a/python/tvm/meta_schedule/__init__.py b/python/tvm/meta_schedule/__init__.py index eb40b32e7c29a..f60d0a5490f5d 100644 --- a/python/tvm/meta_schedule/__init__.py +++ b/python/tvm/meta_schedule/__init__.py @@ -33,7 +33,11 @@ from .apply_history_best import ApplyHistoryBest from .extracted_task import ExtractedTask from .profiler import Profiler -from .relay_integration import extract_task_from_relay, is_meta_schedule_enabled +from .relay_integration import ( + extract_task_from_relay, + is_meta_schedule_dispatch_enabled, + is_meta_schedule_enabled, +) from .search_strategy import MeasureCandidate from .tune import TuneConfig, tune_extracted_tasks, tune_relay, tune_te, tune_tir from .tune_context import TuneContext diff --git a/python/tvm/meta_schedule/relay_integration.py b/python/tvm/meta_schedule/relay_integration.py index 707b469aa4568..bd12ac350a61f 100644 --- a/python/tvm/meta_schedule/relay_integration.py +++ b/python/tvm/meta_schedule/relay_integration.py @@ -70,6 +70,7 @@ def extract_task_from_relay( The tasks extracted from this network """ # pylint: disable=import-outside-toplevel + from tvm import autotvm from tvm.relay import Function as RelayFunc # pylint: enable=import-outside-toplevel @@ -102,7 +103,14 @@ def extract_task_from_relay( config=pass_config, disabled_pass=disabled_pass, ): - return list(extract_task_func(mod, target, relay_params, te_filter_func)) + if target.kind.name != "cuda" and isinstance( + autotvm.DispatchContext.current, autotvm.FallbackContext + ): + tophub_context = autotvm.tophub.context(target) + else: + tophub_context = autotvm.utils.EmptyContext() + with tophub_context: + return list(extract_task_func(mod, target, relay_params, te_filter_func)) def 
is_meta_schedule_enabled() -> bool: @@ -117,3 +125,17 @@ def is_meta_schedule_enabled() -> bool: "relay.backend.use_meta_schedule", False, ) + + +def is_meta_schedule_dispatch_enabled() -> bool: + """Return whether the meta-schedule dispatch is enabled. + + Returns + ------- + enabled: bool + Whether the meta schedule is enabled + """ + return transform.PassContext.current().config.get( + "relay.backend.use_meta_schedule_dispatch", + False, + ) diff --git a/python/tvm/meta_schedule/tune.py b/python/tvm/meta_schedule/tune.py index cd40429d16840..bc2e7096c6ef8 100644 --- a/python/tvm/meta_schedule/tune.py +++ b/python/tvm/meta_schedule/tune.py @@ -592,6 +592,9 @@ def tune_relay( with target, autotvm_silencer(), ApplyHistoryBest(database): with PassContext( opt_level=3, - config={"relay.backend.use_meta_schedule": True}, + config={ + "relay.backend.use_meta_schedule": True, + "relay.backend.use_meta_schedule_dispatch": target.kind.name != "cuda", + }, ): return relay_build(mod, target=target, params=params) diff --git a/python/tvm/relay/backend/te_compiler.py b/python/tvm/relay/backend/te_compiler.py index 3c87f45b8f7db..a2fbf555e12b0 100644 --- a/python/tvm/relay/backend/te_compiler.py +++ b/python/tvm/relay/backend/te_compiler.py @@ -23,7 +23,8 @@ import numpy as np import tvm from tvm import autotvm, te -from tvm.ir.transform import PassContext +from tvm.auto_scheduler import is_auto_scheduler_enabled +from tvm.meta_schedule import is_meta_schedule_dispatch_enabled from tvm.runtime import Object from tvm.support import libinfo from tvm.target import Target @@ -180,7 +181,7 @@ def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True) # Disable autotvm if auto_scheduler is enabled. # (i.e., always return the implementation with the highest priority for auto-scheduler). 
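For context on the check rewritten just below: `relay.backend.use_meta_schedule_dispatch` is an ordinary `PassContext` option (its registration in `te_compiler.cc` appears later in this patch), so callers opt in the same way `tune_relay` does above. A minimal sketch, assuming `mod` and `params` are already defined:

```python
import tvm
from tvm import relay

# Enable meta-schedule dispatch for this build, mirroring the config key
# this patch registers and uses in tune.py.
with tvm.transform.PassContext(
    opt_level=3,
    config={"relay.backend.use_meta_schedule_dispatch": True},
):
    lib = relay.build(mod, target="llvm", params=params)
```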
- if PassContext.current().config.get("relay.backend.use_auto_scheduler", False): + if is_auto_scheduler_enabled() or is_meta_schedule_dispatch_enabled(): use_autotvm = False # If not use autotvm, always return the implementation with the highest priority diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 072b958da213d..9c4a896d572db 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -252,9 +252,7 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): ) # register auto-scheduler implementations - if ( - is_auto_scheduler_enabled() or is_meta_schedule_enabled() - ) and judge_winograd_auto_scheduler: + if is_auto_scheduler_enabled() and judge_winograd_auto_scheduler: strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc), naive_schedule, # this implementation should never be picked by autotvm @@ -545,7 +543,7 @@ def conv2d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_ty name="conv2d_nhwc_winograd_direct_without_weight_transform.cuda", ) - if is_auto_scheduler_enabled() or is_meta_schedule_enabled(): + if is_auto_scheduler_enabled(): strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc_without_weight_transform), naive_schedule, # this implementation should never be picked by autotvm @@ -823,7 +821,7 @@ def matmul_strategy_cuda(attrs, inputs, out_type, target): """Matmul cuda strategy.""" strategy = _op.OpStrategy() - if is_auto_scheduler_enabled() or is_meta_schedule_enabled(): + if is_auto_scheduler_enabled(): strategy.add_implementation( wrap_compute_matmul(topi.nn.matmul), naive_schedule, diff --git a/src/meta_schedule/database/json_database.cc b/src/meta_schedule/database/json_database.cc index 23ecb121f4999..5e7c9119c95ac 100644 --- a/src/meta_schedule/database/json_database.cc +++ b/src/meta_schedule/database/json_database.cc @@ -204,7 +204,7 @@ Database Database::JSONDatabase(String path_workload, String path_tuning_record, LOG(FATAL) << "ValueError: Unable to parse TuningRecord, on line " << (task_id + 1) << " of file " << path_tuning_record << ". The workload is:\n" << (workload.defined() ? 
tir::AsTVMScript(workload) : "(null)") - << "\nThe JSONObject of TuningRecrod is:\n" + << "\nThe JSONObject of TuningRecord is:\n" << json_obj << "\nThe error message is:\n" << e.what(); } diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc index 210f77330afda..8ca5a32b7fb98 100644 --- a/src/relay/backend/te_compiler.cc +++ b/src/relay/backend/te_compiler.cc @@ -552,6 +552,7 @@ TECompiler& TECompiler::Global() { } TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.use_auto_scheduler", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.use_meta_schedule", Bool); +TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.use_meta_schedule_dispatch", Bool); TVM_REGISTER_GLOBAL("relay.backend._TECompilerGlobal").set_body_typed([]() { return TECompiler::Global(); From ec39199edb72dfe93747249d6a060c1832a8e38f Mon Sep 17 00:00:00 2001 From: Yuanjing Shi Date: Thu, 30 Jun 2022 17:07:43 -0700 Subject: [PATCH 032/111] [PyTorch] [Relay] Add l1 and mse loss function for pytorch frontend (#11978) * add l1 and mse loss function for pytorch frontend * fix CI --- python/tvm/relay/frontend/pytorch.py | 33 ++++++++++++++++- python/tvm/topi/nn/softmax.py | 4 +-- tests/python/frontend/pytorch/test_forward.py | 36 +++++++++++++++++++ 3 files changed, 70 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index 6fe8c89e3c2df..123b0299839e0 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -932,6 +932,35 @@ def cross_entropy_loss_with_logits(self, inputs, input_types): assert weights is None, "weight not supported in cross_entropy_loss" return _op.nn.cross_entropy_with_logits(_op.nn.log_softmax(input), target) + def l1_loss(self, inputs, input_types): + assert len(inputs) == 3 + [predictions, targets, reduction] = inputs + delta = _op.abs(_op.subtract(predictions, targets)) + if reduction == 0: + # reduction = "none" + return delta + elif reduction == 1: + # reduction = "mean" + return _op.mean(delta) + else: + # reduction = "sum" + return _op.sum(delta) + + def mse_loss(self, inputs, input_types): + assert len(inputs) == 3 + [predictions, targets, reduction] = inputs + delta = _op.subtract(predictions, targets) + delta = _op.power(delta, _expr.const(2, input_types[0])) + if reduction == 0: + # reduction = "none" + return delta + elif reduction == 1: + # reduction = "mean" + return _op.mean(delta) + else: + # reduction = "sum" + return _op.sum(delta) + def hard_sigmoid(self, inputs, input_types): def _relu6(x): return _op.tensor.clip(x, 0.0, 6.0) @@ -3200,7 +3229,6 @@ def create_convert_map(self): "aten::silu": self.silu, "aten::glu": self.glu, "aten::log_sigmoid": self.log_sigmoid, - "aten::cross_entropy_loss": self.cross_entropy_loss_with_logits, "aten::adaptive_avg_pool1d": functools.partial( self.adaptive_avg_pool, _op.nn.adaptive_avg_pool1d ), @@ -3374,6 +3402,9 @@ def create_convert_map(self): "aten::nll_loss": self.nll_loss, "aten::nll_loss2d": self.nll_loss, "aten::nll_loss_nd": self.nll_loss, + "aten::cross_entropy_loss": self.cross_entropy_loss_with_logits, + "aten::l1_loss": self.l1_loss, + "aten::mse_loss": self.mse_loss, "aten::flip": self.flip, "aten::gru": self.gru, "aten::lstm": self.lstm, diff --git a/python/tvm/topi/nn/softmax.py b/python/tvm/topi/nn/softmax.py index 2d6921b26dfad..83a4995744c70 100644 --- a/python/tvm/topi/nn/softmax.py +++ b/python/tvm/topi/nn/softmax.py @@ -129,12 +129,12 @@ def log_softmax(x, axis=-1): Parameters ---------- data : tvm.te.Tensor - 
2-D input data + N-D input data Returns ------- output : tvm.te.Tensor - 2-D output with same shape + N-D output with same shape """ shape = x.shape if axis < 0: diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index d411d9c874d4e..4f42c183b66a8 100644 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -4177,6 +4177,42 @@ def test_cross_entropy_loss(): verify_model(torch.nn.CrossEntropyLoss().eval(), input_data=[predictions, targets]) +def test_forward_l1_loss(): + torch.set_grad_enabled(False) + N, C = 10, 3 + predictions = torch.rand((N, C)).float() + targets = torch.rand((N, C)).float() + verify_model(torch.nn.L1Loss().eval(), input_data=[predictions, targets]) + verify_model(torch.nn.L1Loss(reduction="sum").eval(), input_data=[predictions, targets]) + verify_model(torch.nn.L1Loss(reduction="none").eval(), input_data=[predictions, targets]) + + # multidimension l1 loss + d1, d2 = 2, 3 + predictions = torch.rand((N, C, d1, d2)).float() + targets = torch.rand((N, C, d1, d2)).float() + verify_model(torch.nn.L1Loss().eval(), input_data=[predictions, targets]) + verify_model(torch.nn.L1Loss(reduction="sum").eval(), input_data=[predictions, targets]) + verify_model(torch.nn.L1Loss(reduction="none").eval(), input_data=[predictions, targets]) + + +def test_forward_mse_loss(): + torch.set_grad_enabled(False) + N, C = 10, 3 + predictions = torch.rand((N, C)).float() + targets = torch.rand((N, C)).float() + verify_model(torch.nn.MSELoss().eval(), input_data=[predictions, targets]) + verify_model(torch.nn.MSELoss(reduction="sum").eval(), input_data=[predictions, targets]) + verify_model(torch.nn.MSELoss(reduction="none").eval(), input_data=[predictions, targets]) + + # multidimension mse loss + d1, d2 = 2, 3 + predictions = torch.rand((N, C, d1, d2)).float() + targets = torch.rand((N, C, d1, d2)).float() + verify_model(torch.nn.MSELoss().eval(), input_data=[predictions, targets]) + verify_model(torch.nn.MSELoss(reduction="sum").eval(), input_data=[predictions, targets]) + verify_model(torch.nn.MSELoss(reduction="none").eval(), input_data=[predictions, targets]) + + @tvm.testing.uses_gpu def test_forward_flip(): torch.set_grad_enabled(False) From 395e91ff54543864a90240d18c8efd8c277c758b Mon Sep 17 00:00:00 2001 From: "Kathryn (Jinqi) Chen" <65606304+Kathryn-cat@users.noreply.github.com> Date: Thu, 30 Jun 2022 19:36:13 -0700 Subject: [PATCH 033/111] [MetaSchedule] Extract workload embedding (#11975) This PR enables extracting the embeddings of the workload in a tuning context, which further strengthens the feature extracting process. Workload embeddings are extracted based on names of each block in the IR module. If `extract_workload` is enabled, the extracted feature vectors will have length 164 + 8 = 172. --- include/tvm/meta_schedule/feature_extractor.h | 4 +- .../feature_extractor/per_store_feature.py | 6 ++ .../feature_extractor/per_store_feature.cc | 79 ++++++++++++++++++- 3 files changed, 85 insertions(+), 4 deletions(-) diff --git a/include/tvm/meta_schedule/feature_extractor.h b/include/tvm/meta_schedule/feature_extractor.h index 02e9f26b2a600..4165e5efe0fdd 100644 --- a/include/tvm/meta_schedule/feature_extractor.h +++ b/include/tvm/meta_schedule/feature_extractor.h @@ -101,11 +101,13 @@ class FeatureExtractor : public runtime::ObjectRef { * \param arith_intensity_curve_num_samples The number of samples used in the arithmetic intensity * curve. 
* \param cache_line_bytes The number of bytes in a cache line. + * \param extract_workload Whether to extract features in the workload in tuning context or not. * \return The feature extractor created. */ TVM_DLL static FeatureExtractor PerStoreFeature(int buffers_per_store = 5, int arith_intensity_curve_num_samples = 10, - int cache_line_bytes = 64); + int cache_line_bytes = 64, + bool extract_workload = false); /*! * \brief Create a feature extractor with customized methods on the python-side. * \param f_extract_from The packed function of `ExtractFrom`. diff --git a/python/tvm/meta_schedule/feature_extractor/per_store_feature.py b/python/tvm/meta_schedule/feature_extractor/per_store_feature.py index 306934d5f96a3..078a4af0e37f1 100644 --- a/python/tvm/meta_schedule/feature_extractor/per_store_feature.py +++ b/python/tvm/meta_schedule/feature_extractor/per_store_feature.py @@ -35,6 +35,8 @@ class PerStoreFeature(FeatureExtractor): The number of samples used in the arithmetic intensity curve. cache_line_bytes : int The number of bytes in a cache line. + extract_workload : bool + Whether to extract features in the workload in tuning context or not. """ buffers_per_store: int @@ -43,6 +45,8 @@ class PerStoreFeature(FeatureExtractor): """The number of samples used in the arithmetic intensity curve.""" cache_line_bytes: int """The number of bytes in a cache line.""" + extract_workload: bool + """Whether to extract features in the workload in tuning context or not.""" feature_vector_length: int """Length of the feature vector.""" @@ -51,10 +55,12 @@ def __init__( buffers_per_store: int = 5, arith_intensity_curve_num_samples: int = 10, cache_line_bytes: int = 64, + extract_workload: bool = False, ): self.__init_handle_by_constructor__( _ffi_api.FeatureExtractorPerStoreFeature, # type: ignore # pylint: disable=no-member buffers_per_store, arith_intensity_curve_num_samples, cache_line_bytes, + extract_workload, ) diff --git a/src/meta_schedule/feature_extractor/per_store_feature.cc b/src/meta_schedule/feature_extractor/per_store_feature.cc index 93f6767b11bb4..c29e5d61f0bbf 100644 --- a/src/meta_schedule/feature_extractor/per_store_feature.cc +++ b/src/meta_schedule/feature_extractor/per_store_feature.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -1169,6 +1170,64 @@ struct Feature { } // namespace group5 +namespace group6 { + +/*! 
\brief The auxiliary feature extractor for workloads */ +class WorkloadEmbeddingExtractor : private StmtVisitor { + public: + static std::vector Extract(const IRModule& mod) { + WorkloadEmbeddingExtractor self; + for (const auto& kv : mod->functions) { + if (const PrimFuncNode* func = kv.second.as()) { + self(func->body); + } + } + return self.embedding; + } + + private: + void VisitStmt_(const BlockNode* block) final { + StmtVisitor::VisitStmt_(block); + std::string name = block->name_hint; + std::for_each(name.begin(), name.end(), [](char& c) { c = ::tolower(c); }); + if (name.find("softmax") != std::string::npos) { + embedding[0] = 1.0; + } else if ((name.find("max") != std::string::npos) || (name.find("min") != std::string::npos)) { + embedding[1] = 1.0; + } else if (name.find("add") != std::string::npos) { + embedding[2] = 1.0; + } else if (name.find("batch_matmul") != std::string::npos) { + embedding[3] = 1.0; + } else if (name.find("matmul") != std::string::npos) { + embedding[4] = 1.0; + } else if (name.find("depthwiseconv2d") != std::string::npos) { + embedding[5] = 1.0; + } else if (name.find("conv2d_winograd") != std::string::npos) { + embedding[6] = 1.0; + } else if (name.find("conv2d") != std::string::npos) { + embedding[7] = 1.0; + } + } + + std::vector embedding = std::vector(8, 0.0); +}; + +/*! \brief Group 6 feature */ +struct Feature { + explicit Feature(const IRModule& mod) { + this->feature = WorkloadEmbeddingExtractor::Extract(mod); + } + + void Export(std::vector* v) const { + v->insert(v->end(), std::begin(feature), std::end(feature)); + } + + std::vector feature; // The workload embedding + static constexpr int64_t kCount = 8; +}; + +} // namespace group6 + /*! \brief The feature extracted */ struct Feature { const BufferNode* buffer = nullptr; @@ -1178,6 +1237,7 @@ struct Feature { std::unique_ptr group3 = nullptr; std::unique_ptr group4 = nullptr; std::unique_ptr group5 = nullptr; + std::shared_ptr group6 = nullptr; bool operator<(const Feature& other) const { return buffer_order < other.buffer_order; } }; @@ -1283,6 +1343,7 @@ class PerStoreFeatureNode : public FeatureExtractorNode { int buffers_per_store; int arith_intensity_curve_num_samples; int cache_line_bytes; + bool extract_workload; int feature_vector_length; void VisitAttrs(tvm::AttrVisitor* v) { @@ -1308,7 +1369,6 @@ class PerStoreFeatureNode : public FeatureExtractorNode { feature.group3->Export(&result); feature.group4->Export(&result, feature.group5->outer_prod); feature.group5->Export(&result); - ICHECK_EQ(static_cast(result.size()), feature_vector_length); } } @@ -1317,10 +1377,19 @@ class PerStoreFeatureNode : public FeatureExtractorNode { bool is_gpu = tune_context->target.value()->kind->name == "cuda"; std::vector results; results.resize(candidates.size()); - auto f = [this, is_gpu, &candidates, &results](int, int task_id) -> void { + std::unique_ptr feature_group6 = nullptr; + if (extract_workload) { + feature_group6 = std::make_unique(tune_context->mod.value()); + } + auto f = [this, is_gpu, &feature_group6, &candidates, &results](int, int task_id) -> void { const auto& candidate = candidates[task_id]; std::vector> features; ExtractSingle(DeepCopyIRModule(candidate->sch->mod()), is_gpu, &features); + if (extract_workload) { + for (auto& feature : features) { + feature_group6->Export(&feature); + } + } results[task_id] = tir::utils::AsNDArray(features); }; support::parallel_for_dynamic(0, candidates.size(), tune_context->num_threads, f); @@ -1333,16 +1402,20 @@ class PerStoreFeatureNode : 
public FeatureExtractorNode { FeatureExtractor FeatureExtractor::PerStoreFeature(int buffers_per_store, int arith_intensity_curve_num_samples, - int cache_line_bytes) { + int cache_line_bytes, bool extract_workload) { ObjectPtr n = make_object(); n->buffers_per_store = buffers_per_store; n->arith_intensity_curve_num_samples = arith_intensity_curve_num_samples; n->cache_line_bytes = cache_line_bytes; + n->extract_workload = extract_workload; n->feature_vector_length = tir::group1::Feature::kCount + // tir::group2::Feature::SubFeature::kCount * buffers_per_store + // arith_intensity_curve_num_samples + // tir::group4::Feature::kCount + // tir::group5::Feature::kCount; + if (extract_workload) { + n->feature_vector_length += tir::group6::Feature::kCount; + } return FeatureExtractor(n); } From 0ae3f5d6ce6f150dd038f59429ab1da4fadea177 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Fri, 1 Jul 2022 09:02:12 -0700 Subject: [PATCH 034/111] Further clarify CI docs (#11980) --- ci/README.md | 2 +- ci/jenkins/README.md | 104 ----------------------------------------- docs/contribute/ci.rst | 26 ++++++++++- 3 files changed, 26 insertions(+), 106 deletions(-) diff --git a/ci/README.md b/ci/README.md index a5cb39016b135..38995549236c5 100644 --- a/ci/README.md +++ b/ci/README.md @@ -48,7 +48,7 @@ documentation is split like so: ## Jenkins -Jenkins runs all of the linux-based TVM CI-enabled regression tests. This includes tests against accelerated hardware such as GPUs. It excludes those regression tests that run against hardware not available in the cloud (those tests aren't currently exercised in TVM CI). The tests run by Jenkins represent most of the merge-blocking tests (and passing Jenkins should mostly correlate with passing the remaining Windows/Mac builds). +Jenkins runs all of the Linux-based TVM CI-enabled regression tests. This includes tests against accelerated hardware such as GPUs. It excludes those regression tests that run against hardware not available in the cloud (those tests aren't currently exercised in TVM CI). The tests run by Jenkins represent most of the merge-blocking tests (and passing Jenkins should mostly correlate with passing the remaining Windows/Mac builds). ## GitHub Actions diff --git a/ci/jenkins/README.md b/ci/jenkins/README.md index d2a29838b6d5c..6d42770b80963 100644 --- a/ci/jenkins/README.md +++ b/ci/jenkins/README.md @@ -34,110 +34,6 @@ GitHub Actions is used to run Windows jobs, MacOS jobs, and various on-GitHub au https://github.com/apache/tvm/actions has the logs for each of these workflows. Note that when debugging these workflows changes from PRs from forked repositories won't be reflected in the PR. These should be tested in the forked repository first and linked in the PR body. - -## Keeping CI Green - -Developers rely on the TVM CI to get signal on their PRs before merging. Occasionally breakages -slip through and break `main`, which in turn causes the same error to show up on an unrelated PR -that is based on the broken commit(s). Broken commits can be identified [through -GitHub](https://github.com/apache/tvm/commits/main>) via the commit status icon or via -[Jenkins](https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/activity?branch=main>). In these -situations it is possible to either revert the offending commit or submit a forward fix to address -the issue. It is up to the committer and commit author which option to choose. 
A broken CI affects -all TVM developers and should be fixed as soon as possible, while a revert may be especially painful -for the author of the offending PR when that PR is large. - -Some tests are also flaky and occasionally fail for reasons unrelated to the PR. The [CI monitoring -rotation](https://github.com/apache/tvm/wiki/CI-Monitoring-Runbook) watches for these failures and -disables tests as necessary. It is the responsibility of those who wrote the test to ultimately fix -and re-enable the test. - - -## Dealing with Flakiness - -If you notice a failure on your PR that seems unrelated to your change, you should -search [recent GitHub issues related to flaky tests](https://github.com/apache/tvm/issues?q=is%3Aissue+%5BCI+Problem%5D+Flaky+>) and -[file a new issue](https://github.com/apache/tvm/issues/new?assignees=&labels=&template=ci-problem.md&title=%5BCI+Problem%5D+>) -if you don't see any reports of the failure. If a certain test or class of tests affects -several PRs or commits on `main` with flaky failures, the test should be disabled via -[pytest's @xfail decorator](https://docs.pytest.org/en/6.2.x/skipping.html#xfail-mark-test-functions-as-expected-to-fail) with [`strict=False`](https://docs.pytest.org/en/6.2.x/skipping.html#strict-parameter) and the relevant issue linked in the -disabling PR. - -```python -@pytest.mark.xfail(strict=False, reason="Flaky test: https://github.com/apache/tvm/issues/1234") - def test_something_flaky(): - pass -``` - -Then submit a PR as usual - -```bash -git add -git commit -m'[skip ci][ci] Disable flaky test: `` - -See # -' -gh pr create -``` - -## Network Resources - -Downloading files from the Internet in CI is a big source of flaky failures -(e.g. remote server goes down or is slow), so try to avoid using the network at -all during tests. In some cases this isn't a reasonable proposition (e.g. the -docs tutorials which need to download models). In these cases you can re-host -files in S3 for fast access in CI. A committer can upload a file, specified by -a name, hash, and path in S3, using the `workflow_dispatch` event on -[the `upload_ci_resource.yml` GitHub Actions workflow](https://github.com/apache/tvm/actions/workflows/upload_ci_resource.yml). -The sha256 must match the file or it will not be uploaded. The upload path is -user-defined so it can be any path (no trailing or leading slashes allowed) but -be careful not to collide with existing resources on accident. - -## Skipping CI - -For reverts and trivial forward fixes, adding `[skip ci]` to the revert's -PR title will cause CI to shortcut and only run lint. Committers should -take care that they only merge CI-skipped PRs to fix a failure on `main` and -not in cases where the submitter wants to shortcut CI to merge a change faster. -The PR title is checked when the build is first run (specifically during the lint -step, so changes after that has run do not affect CI and will require the job to -be re-triggered by another `git push`). - -```bash -# Revert HEAD commit, make sure to insert '[skip ci]' at the beginning of -# the commit subject -git revert HEAD -git checkout -b my_fix -# After you have pushed your branch, create a PR as usual. 
-git push my_repo -# Example: Skip CI on a branch with an existing PR -# Adding this commit to an existing branch will cause a new CI run where -# Jenkins is skipped -git commit --allow-empty --message "[skip ci] Trigger skipped CI" -git push my_repo -``` - -## Docker Images - -Each CI job runs most of its work inside a Docker container, built from files -in the [`docker/`](../docker) folder. These -files are built nightly in Jenkins via the [docker-images-ci](https://ci.tlcpack.ai/job/docker-images-ci/>) job. -The images for these containers are hosted in the [tlcpack Docker Hub](https://hub.docker.com/u/tlcpack>) -and referenced in the [`Jenkinsfile.j2`](Jenkinsfile.j2). These can be inspected and run -locally via standard Docker commands. - -### `ci-docker-staging` - -The [ci-docker-staging](https://github.com/apache/tvm/tree/ci-docker-staging>) -branch is used to test updates to Docker images and `Jenkinsfile` changes. When -running a build for a normal PR from a forked repository, Jenkins uses the code -from the PR except for the `Jenkinsfile` itself, which comes from the base branch. -When branches are built, the `Jenkinsfile` in the branch is used, so a committer -with write access must push PRs to a branch in apache/tvm to properly test -`Jenkinsfile` changes. If your PR makes changes to the `Jenkinsfile`, make sure -to @ a [committer](../CONTRIBUTORS.md>) -and ask them to push your PR as a branch to test the changes. - # Jenkins CI TVM uses Jenkins for running Linux continuous integration (CI) tests on diff --git a/docs/contribute/ci.rst b/docs/contribute/ci.rst index 9a2876220fc7e..a421103ab4577 100644 --- a/docs/contribute/ci.rst +++ b/docs/contribute/ci.rst @@ -31,7 +31,7 @@ Jenkins is the only CI step that is codified to block merging. TVM is also teste against Windows and MacOS using GitHub Actions. This page describes how contributors and committers can use TVM's CI to verify their code. You can -read more about the design of TVM CI in the +read more about the design of TVM CI in the `tlc-pack/ci `_ repo. For Contributors ---------------- @@ -164,6 +164,30 @@ be re-triggered by another ``git push``). git push my_repo +Docker Images +^^^^^^^^^^^^^ + +Each CI job runs most of its work inside a Docker container, built from files +in the `docker/ `_ folder. These +files are built nightly in Jenkins via the `docker-images-ci `_ job. +The images for these containers are hosted in the `tlcpack Docker Hub `_ +and referenced in the `Jenkinsfile.j2 `_. These can be inspected and run +locally via standard Docker commands. + + +``ci-docker-staging`` +^^^^^^^^^^^^^^^^^^^^^ + +The `ci-docker-staging `_ +branch is typically used to test updates to Docker images and ``Jenkinsfile`` changes. When +running a build for a normal PR from a forked repository, Jenkins uses the code +from the PR except for the ``Jenkinsfile`` itself, which comes from the base branch. +When branches are built, the ``Jenkinsfile`` in the branch is used, so a committer +with write access must push PRs to a branch in apache/tvm to properly test +``Jenkinsfile`` changes. If your PR makes changes to the ``Jenkinsfile``, make sure +to @ a `committer `_ +and ask them to push your PR as a branch to test the changes. 
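+For example, a committer might fetch the contributor's changes and push them as
+a branch of apache/tvm (the PR number and branch name below are only
+placeholders):
+
+.. code-block:: bash
+
+    git fetch origin pull/1234/head:jenkinsfile-test
+    git push origin jenkinsfile-test
+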
+ CI Monitoring Rotation ^^^^^^^^^^^^^^^^^^^^^^ From 9e14509cabf9e6ba674d819d36b5d29f97f3dc2f Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Fri, 1 Jul 2022 09:03:14 -0700 Subject: [PATCH 035/111] [docker] Fall back to tlcpackstaging in bash.sh (#11976) This uses #11775 to make local builds work if they're run in the meantime before CI tags over a new image to tlcpack Co-authored-by: driazati --- docker/bash.sh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/docker/bash.sh b/docker/bash.sh index 18c655d2ddc5e..56efa1d045510 100755 --- a/docker/bash.sh +++ b/docker/bash.sh @@ -290,7 +290,15 @@ DOCKER_DEVICES=( ) # If the user gave a shortcut defined in the Jenkinsfile, use it. EXPANDED_SHORTCUT=$(lookup_image_spec "${DOCKER_IMAGE_NAME}") if [ -n "${EXPANDED_SHORTCUT}" ]; then - DOCKER_IMAGE_NAME="${EXPANDED_SHORTCUT}" + if [ "${CI+x}" == "true" ]; then + DOCKER_IMAGE_NAME="${EXPANDED_SHORTCUT}" + else + python tests/scripts/determine_docker_images.py "$DOCKER_IMAGE_NAME=$EXPANDED_SHORTCUT" 2> /dev/null + DOCKER_IMAGE_NAME=$(cat ".docker-image-names/$DOCKER_IMAGE_NAME") + if [[ "$DOCKER_IMAGE_NAME" == *"tlcpackstaging"* ]]; then + echo "WARNING: resolved docker image to fallback tag in tlcpackstaging" >&2 + fi + fi fi # Set up working directories From 2f8b008730e5fd97eccc14e58a5a259ae30bac38 Mon Sep 17 00:00:00 2001 From: xndcn Date: Sat, 2 Jul 2022 00:06:51 +0800 Subject: [PATCH 036/111] [tests] Fix changed var name from 'target_str' to 'target_names', NFC (#11982) --- python/tvm/testing/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index 054257e07aa29..d7c2adaa8606c 100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -434,14 +434,14 @@ def _get_targets(target_names=None): logging.warning( "None of the following targets are supported by this build of TVM: %s." " Try setting TVM_TEST_TARGETS to a supported target. Defaulting to llvm.", - target_str, + target_names, ) return _get_targets(["llvm"]) raise TVMError( "None of the following targets are supported by this build of TVM: %s." " Try setting TVM_TEST_TARGETS to a supported target." - " Cannot default to llvm, as it is not enabled." % target_str + " Cannot default to llvm, as it is not enabled." % target_names ) return targets From c97895e0ffb512e73c89de7cdee9846f052244fc Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Fri, 1 Jul 2022 14:00:43 -0500 Subject: [PATCH 037/111] [Hexagon] Fix use of subprocess.run in _check_call_verbose (#11985) It uses parameters that are not present in Python 3.6, plus it catches generic exception, which may not have `stdout` or `stderr` members. --- python/tvm/contrib/hexagon/build.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/python/tvm/contrib/hexagon/build.py b/python/tvm/contrib/hexagon/build.py index 080b9828777ac..fe7434f7386d3 100644 --- a/python/tvm/contrib/hexagon/build.py +++ b/python/tvm/contrib/hexagon/build.py @@ -47,8 +47,15 @@ def _check_call_verbose(cmd, **kwargs) -> None: the stdout/stderr provided by the subprocess. 
""" try: - subprocess.run(cmd, capture_output=True, check=True, text=True, **kwargs) - except Exception as err: + subprocess.run( + cmd, + check=True, + encoding="UTF-8", + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + **kwargs, + ) + except subprocess.CalledProcessError as err: error_msg = f"{err}\nstdout:\n{err.stdout}\nstderr:\n{err.stderr}" raise Exception(error_msg) From 50cd4d635cb0947e90d5d8ecdd94baeabf57ab31 Mon Sep 17 00:00:00 2001 From: joshherr-quic <95375797+joshherr-quic@users.noreply.github.com> Date: Fri, 1 Jul 2022 15:50:35 -0500 Subject: [PATCH 038/111] [Hexagon] Enable int8 vlut codegen for Relay take (LUT) operator (#11693) * Working 8 bit vlut for relay take operator * Formatting * More formatting * clang-format on codegen_hexagon.cc * Update for llvm api * Add return to VisitExpr(BufferLoadNode) function * different llvm api --- python/tvm/topi/hexagon/injective.py | 6 ++ src/target/llvm/codegen_hexagon.cc | 147 +++++++++++++++++++++++++++ 2 files changed, 153 insertions(+) diff --git a/python/tvm/topi/hexagon/injective.py b/python/tvm/topi/hexagon/injective.py index 34a9fb9a05e54..9ced0ac7d3992 100644 --- a/python/tvm/topi/hexagon/injective.py +++ b/python/tvm/topi/hexagon/injective.py @@ -37,6 +37,12 @@ def schedule_injective(outs): outs = [outs] if isinstance(outs, tvm.te.tensor.Tensor) else outs s = tvm.te.create_schedule([x.op for x in outs]) tvm.te.schedule.AutoInlineInjective(s) + + # Fuse axes and vectorize inner 128 elements + for x in outs: + fused = s[x].fuse(*x.op.axis) + _, inner = s[x].split(fused, factor=128) + s[x].vectorize(inner) return s diff --git a/src/target/llvm/codegen_hexagon.cc b/src/target/llvm/codegen_hexagon.cc index a195c9f05453b..7b0081869a27f 100644 --- a/src/target/llvm/codegen_hexagon.cc +++ b/src/target/llvm/codegen_hexagon.cc @@ -74,8 +74,19 @@ class CodeGenHexagon final : public CodeGenCPU { bool system_lib, bool dynamic_lookup, bool target_c_runtime) override; void InitTarget(llvm::TargetMachine* tm) final; + using CodeGenCPU::VisitStmt_; + llvm::Value* VisitExpr_(const BufferLoadNode* op) override; + llvm::Module* GetModulePtr() const { return module_.get(); } + uint64_t GetTypeSizeInBits(llvm::Type* type) const { +#if TVM_LLVM_VERSION >= 100 + return data_layout_->getTypeSizeInBits(type).getFixedSize(); +#else + return data_layout_->getTypeSizeInBits(type); +#endif + } + protected: void CreatePrintf(const std::string& format, llvm::ArrayRef format_args) final; @@ -86,6 +97,9 @@ class CodeGenHexagon final : public CodeGenCPU { llvm::GlobalVariable* InitContextPtr(llvm::Type* type, std::string name); llvm::Value* GetContextPtr(llvm::GlobalVariable* gv); + + llvm::Value* VectorLookupLoad(Buffer buffer, DataType buffer_type, Array index); + llvm::Value* Intrinsic(llvm::Intrinsic::ID, llvm::ArrayRef args); }; void CodeGenHexagon::Init(const std::string& module_name, llvm::TargetMachine* tm, @@ -281,6 +295,139 @@ CodeGenLLVM::TypedPointer CodeGenHexagon::CreateStructRefPtr(DataType t, llvm::V return TypedPointer(); } +llvm::Value* CodeGenHexagon::Intrinsic(llvm::Intrinsic::ID IntID, + llvm::ArrayRef args) { + llvm::Function* intf = llvm::Intrinsic::getDeclaration(module_.get(), IntID); +#if TVM_LLVM_VERSION >= 90 + auto intf_callee = llvm::FunctionCallee(intf); +#else + auto intf_callee = intf; +#endif + std::vector conv_args; + llvm::FunctionType* intf_type = intf->getFunctionType(); + ICHECK(args.size() == intf_type->getNumParams()); + + for (int i = 0, e = args.size(); i != e; ++i) { + llvm::Value* arg = args[i]; + auto* 
need_type = llvm::dyn_cast(intf_type->getParamType(i)); + auto* have_type = llvm::dyn_cast(arg->getType()); + if (need_type != nullptr && have_type != nullptr && need_type != have_type) { + int need_width = GetTypeSizeInBits(need_type); + int have_width = GetTypeSizeInBits(have_type); + if (need_width == have_width) { + if (need_width == native_vector_bits_ || need_width == 2 * native_vector_bits_) { + arg = builder_->CreateBitCast(arg, need_type); + } + } // TODO(joshherr-quic): add handling of v128i1 <-> v1024i1 + } + conv_args.push_back(arg); + } + return builder_->CreateCall(intf_callee, conv_args); +} + +llvm::Value* CodeGenHexagon::VisitExpr_(const BufferLoadNode* op) { + if (!op->buffer.same_as(op->buffer->data)) { + // Check if we can generate a vector lookup. + if (!op->indices[0].as()) { + if (auto* vlut = VectorLookupLoad(op->buffer, op->dtype, op->indices)) { + return vlut; + } + } + } + return CodeGenLLVM::VisitExpr_(op); +} + +llvm::Value* CodeGenHexagon::VectorLookupLoad(Buffer buffer, DataType buffer_type, + Array indices) { + PrimExpr index = indices[0]; + if (!index.dtype().is_vector()) { + return nullptr; + } + + if (buffer_type.bits() != 8) return nullptr; + + int table_elem_count = arith::Analyzer().Simplify(buffer->shape[0]).as()->value; + if (table_elem_count <= 0 || table_elem_count > 256) return nullptr; + + auto int32 = DataType::Int(32); + auto native_vector_bytes = native_vector_bits_ / 8; + + // Indexes + llvm::Value* trunc = MakeValue(Cast(index.dtype().with_bits(8), index)); + llvm::Value* index_pad = CreateVecPad(trunc, native_vector_bytes); + + // Values + std::vector vloads; + DataType table_type = buffer_type.with_lanes(table_elem_count); + + auto table_all = + MakeValue(BufferLoad(buffer, { + Ramp(IntImm(int32, 0), IntImm(int32, 1), table_elem_count), + })); + + // The number of value vectors should be a power of 2. + int table_vec_count = llvm::PowerOf2Ceil(GetVectorBytes(table_type) / native_vector_bytes); + int table_vec_length = native_vector_bytes / buffer_type.bytes(); + for (int i = 0; i != table_vec_count; ++i) { + // CreateVecSlice will generate undefs for elements outside the source vector. 
+ vloads.push_back(CreateVecSlice(table_all, i * table_vec_length, table_vec_length)); + } + +#define VLO(x) Intrinsic(llvm::Intrinsic::hexagon_V6_lo_128B, {x}) +#define VHI(x) Intrinsic(llvm::Intrinsic::hexagon_V6_hi_128B, {x}) +#define VXOR(x, y) Intrinsic(llvm::Intrinsic::hexagon_V6_vxor_128B, {x, y}) +#define VSHUFF(x) Intrinsic(llvm::Intrinsic::hexagon_V6_vshuffb_128B, {x}) +#define VSPLATB(x) Intrinsic(llvm::Intrinsic::hexagon_V6_lvsplatb_128B, {x}) +#define VLUT32(x, y, z) Intrinsic(llvm::Intrinsic::hexagon_V6_vlutvvbi_128B, {x, y, z}) +#define VLUT32_OR(v, x, y, z) \ + Intrinsic(llvm::Intrinsic::hexagon_V6_vlutvvb_oracci_128B, {v, x, y, z}) + + // Shuffle table bytes: + // 127, 63, 126, 62,........68, 4, 67, 3, 66, 2, 65, 1, 64, 0 + std::vector table; + for (int i = 0; i != table_vec_count; ++i) table.push_back(VSHUFF(vloads[i])); + + // Get each 32 byte sub-table's output + std::vector results; + int table_iters = table_elem_count / 32; + for (int i = 0; i < table_iters; ++i) + results.push_back(VLUT32(index_pad, table[i / 4], ConstInt32(i % 8))); + + // Combine outputs + llvm::Value* result = results[0]; + for (int i = 1; i < table_iters; ++i) result = VXOR(result, results[i]); + + llvm::Type* res_type = result->getType(); + llvm::Type* ret_type = DTypeToLLVMType(buffer_type); + if (res_type == ret_type) { + return result; + } + + int res_bits = GetTypeSizeInBits(res_type); + int ret_bits = GetTypeSizeInBits(ret_type); + ICHECK_GE(res_bits, ret_bits); + if (ret_bits < res_bits) { +#if TVM_LLVM_VERSION >= 110 + llvm::Type* res_byte_type = llvm::VectorType::get(t_int8_, res_bits / 8, /*Scalable*/ false); +#else + llvm::Type* res_byte_type = llvm::VectorType::get(t_int8_, res_bits / 8); +#endif + result = CreateVecSlice(builder_->CreateBitCast(result, res_byte_type), 0, ret_bits / 8); + } + if (result->getType() != ret_type) { + return builder_->CreateBitCast(result, ret_type); + } + return result; + +#undef VLUT32_OR +#undef VLUT32 +#undef VSPLATB +#undef VSHUFF +#undef VXOR +#undef VHI +#undef VLO +} + namespace { DMLC_ATTRIBUTE_UNUSED std::ostream& operator<<(std::ostream& os, const llvm::Module& m) { std::string ms; From eeb5fce7a147bd7097b7089c600166ae1bfa7ef7 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Fri, 1 Jul 2022 14:41:44 -0700 Subject: [PATCH 039/111] Couple patches to docker/bash.sh after #11976. (#11988) * Use python3 to run determine_docker_images.py * Properly detect presence of CI env var with + expansion. --- docker/bash.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docker/bash.sh b/docker/bash.sh index 56efa1d045510..7d649bff1a620 100755 --- a/docker/bash.sh +++ b/docker/bash.sh @@ -35,7 +35,6 @@ set -euo pipefail - function show_usage() { cat < /dev/null + python3 tests/scripts/determine_docker_images.py "$DOCKER_IMAGE_NAME=$EXPANDED_SHORTCUT" 2> /dev/null DOCKER_IMAGE_NAME=$(cat ".docker-image-names/$DOCKER_IMAGE_NAME") if [[ "$DOCKER_IMAGE_NAME" == *"tlcpackstaging"* ]]; then echo "WARNING: resolved docker image to fallback tag in tlcpackstaging" >&2 From 55dcd5f195b15401b79f280d4146ddbf75806b7c Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Fri, 1 Jul 2022 14:45:02 -0700 Subject: [PATCH 040/111] [ci] Don't skip index-triggered builds (#11915) This code was there to stop Jenkins restarts from doing a repository scan and scheduling a ton of builds. 
However, I haven't noticed this happening during restarts lately, and
repository scans are useful to patch up PRs that didn't get CI run properly
(e.g. while Jenkins was down or something). For example, in #11914, since this
code was there, all the messed-up PRs needed their CI to be manually
re-triggered even though they were detected during the scan.
---
 Jenkinsfile               | 10 +---------
 ci/jenkins/Jenkinsfile.j2 |  8 --------
 2 files changed, 1 insertion(+), 17 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 513f11eaaf535..8b59fe219248d 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,7 +45,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-06-27T17:30:37.779354
+// Generated at 2022-07-01T12:43:52.727636
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
@@ -86,14 +86,6 @@ docker_build = 'docker/build.sh'
 max_time = 180
 rebuild_docker_images = false
 
-// skips builds from branch indexing; sourced from https://www.jvt.me/posts/2020/02/23/jenkins-multibranch-skip-branch-index/
-// execute this before anything else, including requesting any time on an agent
-if (currentBuild.getBuildCauses().toString().contains('BranchIndexingCause')) {
-  print "INFO: Build skipped due to trigger being Branch Indexing"
-  currentBuild.result = 'ABORTED' // optional, gives a better hint to the user that it's been skipped, rather than the default which shows it's successful
-  return
-}
-
 // Filenames for stashing between build and test steps
 s3_prefix = "tvm-jenkins-artifacts-prod/tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}"
 
diff --git a/ci/jenkins/Jenkinsfile.j2 b/ci/jenkins/Jenkinsfile.j2
index 22cd5b6bfc845..2c38bf32c6c1d 100644
--- a/ci/jenkins/Jenkinsfile.j2
+++ b/ci/jenkins/Jenkinsfile.j2
@@ -83,14 +83,6 @@ docker_build = 'docker/build.sh'
 max_time = 180
 rebuild_docker_images = false
 
-// skips builds from branch indexing; sourced from https://www.jvt.me/posts/2020/02/23/jenkins-multibranch-skip-branch-index/
-// execute this before anything else, including requesting any time on an agent
-if (currentBuild.getBuildCauses().toString().contains('BranchIndexingCause')) {
-  print "INFO: Build skipped due to trigger being Branch Indexing"
-  currentBuild.result = 'ABORTED' // optional, gives a better hint to the user that it's been skipped, rather than the default which shows it's successful
-  return
-}
-
 // Filenames for stashing between build and test steps
 {% set tvm_runtime = ['build/libtvm_runtime.so', 'build/config.cmake'] %}
 {% set tvm_lib = ['build/libtvm.so'] + tvm_runtime %}

From d2a14a6880ee2e4520f3e9a55accb258e8725e65 Mon Sep 17 00:00:00 2001
From: Mark Shields <87091372+mbs-octoml@users.noreply.github.com>
Date: Fri, 1 Jul 2022 15:09:06 -0700
Subject: [PATCH 041/111] [BYOC] Switch TensorRT BYOC integration to
 IRModule-at-a-time using RelayToTIR hook (#11979)

* [BYOC] Switch TensorRT BYOC integration to IRModule-at-a-time using RelayToTIR hook

This does for the TensorRT integration what #11631 did for the CUTLASS integration.

- All compilation options are captured within the attributes of a Target of
kind "tensorrt" (instead of the "relay.ext.tensorrt.options" attribute in
PassContext). This means all BYOC configuration options needed by Collage can be
captured uniformly by a list-of-Targets.
It also means RPC boundaries (as used internally at OctoML) only need to worry
about maintaining the fidelity of the Target instance(s) rather than reaching
into the PassContext.
- Compilation is switched from function-at-a-time (relying on the TECompiler)
to IRModule-at-a-time (using the RelayToTIR target-specific hook mechanism).
Though not strictly necessary for Collage, I want to check the path is now clear
to deprecate the support for BYOC in TECompiler.
- Get all the TensorRT tests going again, except for a few I've disabled with
an x-link to a new issue, #11765. CAUTION: The TensorRT runtime is not supported
in CI, so many of these tests are cosmetic.
- While trying to track down a 'free(): invalid pointer' error in
test_tensorrt_int8_exp.py, made the TensorRT allocs/frees more robust, but it
turns out it's also broken in main. No harm leaving these changes in though.

* - Lints
* - Woops, fix test
* - lints
* - Use default tensorrt target if none given in targets list
* - fix free error
* - accidentally introduced 'transforms' namespace - can't use default Target("tensorrt") arg
* - D'oh! Include ended up #if protected
* - restore mark for test_dynamic_offload
  - handle missing runtime in versioning
  - turn test_maskrcnn_resnet50 back on now that we have the import-torch-first workaround.
* - wibble
---
 include/tvm/runtime/module.h                  |   2 +-
 .../testing/custom_builder_runner.py          |   7 +-
 python/tvm/relay/op/contrib/tensorrt.py       | 191 +++++++------
 .../backend/contrib/codegen_c/codegen.cc      |  10 +-
 src/relay/backend/contrib/cutlass/codegen.cc  |  10 +-
 src/relay/backend/contrib/tensorrt/codegen.cc | 265 +++++++++++-------
 src/relay/backend/contrib/tensorrt/codegen.h  |  47 ++++
 src/relay/backend/contrib/tensorrt/target.cc  |  31 +-
 .../transforms/compiler_function_utils.cc     |  16 +-
 .../transforms/compiler_function_utils.h      |  15 +-
 src/runtime/const_loader_module.cc            |  24 +-
 src/runtime/contrib/json/json_runtime.h       |   2 +
 .../contrib/tensorrt/tensorrt_builder.cc      |  27 +-
 .../contrib/tensorrt/tensorrt_builder.h       |  10 +-
 src/runtime/contrib/tensorrt/tensorrt_ops.cc  |   4 +-
 .../contrib/tensorrt/tensorrt_runtime.cc      |  14 +-
 src/target/metadata_module.cc                 |   2 -
 tests/python/contrib/test_tensorrt.py         | 172 ++++++------
 .../python/contrib/test_tensorrt_int8_exp.py  |  23 +-
 19 files changed, 524 insertions(+), 348 deletions(-)
 create mode 100644 src/relay/backend/contrib/tensorrt/codegen.h

diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h
index 31d05571eefd2..9d139c9feff3d 100644
--- a/include/tvm/runtime/module.h
+++ b/include/tvm/runtime/module.h
@@ -113,7 +113,7 @@ class Module : public ObjectRef {
 class TVM_DLL ModuleNode : public Object {
  public:
   /*! \brief virtual destructor */
-  virtual ~ModuleNode() {}
+  virtual ~ModuleNode() = default;
   /*!
    * \return The per module type key.
    * \note This key is used to for serializing custom modules.
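To make the Target-centric option handling in this commit message concrete, the attributes registered on the "tensorrt" kind (see target.cc below) can be read straight off a Target instance. A small sketch, assuming a TVM build in which the "tensorrt" target kind is registered; the printed defaults follow the registration below:

```python
import tvm

# The "tensorrt" kind carries the old PassContext options as target attributes.
trt_target = tvm.target.Target("tensorrt")
print(trt_target.attrs["use_implicit_batch"])      # True by default
print(trt_target.attrs["max_workspace_size"])      # 1 << 30 by default
print(list(trt_target.attrs["tensorrt_version"]))  # [] -> auto-detect, else 6.0.1
```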
diff --git a/python/tvm/meta_schedule/testing/custom_builder_runner.py b/python/tvm/meta_schedule/testing/custom_builder_runner.py index e203848c2cbb8..1cfd4ab833be5 100644 --- a/python/tvm/meta_schedule/testing/custom_builder_runner.py +++ b/python/tvm/meta_schedule/testing/custom_builder_runner.py @@ -85,11 +85,8 @@ def build_relay_with_tensorrt( from tvm.relay.op.contrib import tensorrt from tvm.runtime import Module - mod, config = tensorrt.partition_for_tensorrt(mod, params) - with PassContext( - opt_level=3, - config={"relay.ext.tensorrt.options": config}, - ): + mod = tensorrt.partition_for_tensorrt(mod, params) + with PassContext(opt_level=3): result = relay_build(mod, target=target, target_host=None, params=params) assert isinstance(result, Module) return result diff --git a/python/tvm/relay/op/contrib/tensorrt.py b/python/tvm/relay/op/contrib/tensorrt.py index a69e2d4105290..4008b0eb3f78f 100644 --- a/python/tvm/relay/op/contrib/tensorrt.py +++ b/python/tvm/relay/op/contrib/tensorrt.py @@ -33,6 +33,10 @@ logger = logging.getLogger("TensorRT") +def is_tensorrt_compiler_enabled() -> bool: + return tvm.get_global_func("relay.ext.tensorrt.is_runtime_enabled", True) is not None + + def is_tensorrt_runtime_enabled() -> bool: """Check if the TensorRT graph executor is present. Returns @@ -40,118 +44,105 @@ def is_tensorrt_runtime_enabled() -> bool: ret: bool True if present, False if not. """ - check_enabled = tvm.get_global_func("relay.op.is_tensorrt_runtime_enabled", True) + check_enabled = tvm.get_global_func("relay.ext.tensorrt.is_runtime_enabled", True) if check_enabled: return check_enabled() return False +def get_tensorrt_target() -> tvm.target.Target: + """Returns the current Target, which must be of kind "tensorrt".""" + target = tvm.target.Target.current() + if target is None or target.kind.name != "tensorrt": + # Create the default target. + return tvm.target.Target("tensorrt") + return target + + def get_tensorrt_version() -> Tuple[int, int, int]: - """Gets the version of TensorRT that TVM is built against or is targeting. + """Returns the version of TensorRT to assume during compilation. + In order of preference this is taken from: + - The current "tensorrt" target's "tensorrt_version" attribute string. + - The version linked to the TVM runtime. + - (6, 0, 1) Returns ------- ret: Tuple[int, int, int] - TensorRT version as a tuple of major, minor, and patch number. If TVM - is not built with TensorRT, the value set by set_tensorrt_version() is returned instead. + TensorRT version as a tuple of (major, minor, patch). """ - pass_ctx = tvm.transform.PassContext.current() - if "relay.ext.tensorrt.options" in pass_ctx.config: - return tuple(pass_ctx.config["relay.ext.tensorrt.options"].tensorrt_version) # type: ignore - return tuple(tvm.get_global_func("relay.op.get_tensorrt_version")()) # type: ignore + # cf logic in tensorrt/codegen.cc::SaveGlobalAttributes + # First check for version in target. + target = get_tensorrt_target() + version = target.attrs["tensorrt_version"] + if len(version) == 3: + return int(version[0]), int(version[1]), int(version[2]) + assert len(version) == 0 + + # Next, ask runtime for its version. + if is_tensorrt_runtime_enabled(): + get_version = tvm.get_global_func("relay.ext.tensorrt.get_version") + version = get_version() + assert len(version) == 3 + return int(version[0]), int(version[1]), int(version[2]) + + # Finally, use default. + logger.warning( + "TVM was not built against TensorRT and no version was provided in the 'tensorrt' target." 
+ "Defaulting to 6.0.1." + ) + return (6, 0, 1) def get_tensorrt_use_implicit_batch_mode() -> bool: - pass_ctx = tvm.transform.PassContext.current() - if "relay.ext.tensorrt.options" in pass_ctx.config: - return pass_ctx.config["relay.ext.tensorrt.options"].use_implicit_batch - logger.warning( - "PassContext has no relay.ext.tensorrt.options config, using default value " - "use_implicit_batch=True." - ) - return True + """Returns the "use_implicit_batch" attribute of the current "tensorrt" target.""" + target = get_tensorrt_target() + return target.attrs["use_implicit_batch"] def get_tensorrt_remove_no_mac_subgraphs() -> bool: - pass_ctx = tvm.transform.PassContext.current() - if "relay.ext.tensorrt.options" in pass_ctx.config: - return pass_ctx.config["relay.ext.tensorrt.options"].remove_no_mac_subgraphs - logger.warning( - "PassContext has no relay.ext.tensorrt.options config, using default value " - "remove_no_mac_subgraphs=False." - ) - return False + """Returns the "remove_no_mac_subgraphs" attribute of the current "tensorrt" target.""" + target = get_tensorrt_target() + return target.attrs["remove_no_mac_subgraphs"] + + +def get_tensorrt_use_fp16() -> bool: + """Returns the "use_fp16" attribute of the current "tensorrt" target.""" + target = get_tensorrt_target() + return target.attrs["use_fp16"] def partition_for_tensorrt( mod: tvm.IRModule, params: Optional[Dict[str, tvm.nd.NDArray]] = None, - version: Optional[Tuple[int, int, int]] = None, - use_implicit_batch: bool = True, - remove_no_mac_subgraphs: bool = False, - max_workspace_size: int = 1 << 30, - use_fp16: bool = False, - use_uint8: bool = False, -) -> Tuple[tvm.IRModule, Dict[str, Any]]: - """Partition the graph greedily offloading supported operators to TensorRT. + # CAUTION: Can't use default Target("tensorrt") here since the target kind is only available + # if is_tensorrt_compiler_enabled() == True. + target: Optional[tvm.target.Target] = None, +) -> tvm.IRModule: + """Partition all functions in mod to greedily offload supported operators to TensorRT. Parameters ---------- mod : tvm.IRModule - The module to run passes on. + The module to partition. + target : tvm.target.Target + A target of kind "tensorrt" describing additional partitioning and compilation options. params : Optional[Dict[str, tvm.nd.NDArray]] Constant input parameters. - version : Optional[Tuple[int, int, int]] - TensorRT version to target as tuple of (major, minor, patch). If TVM is compiled with - USE_TENSORRT_RUNTIME=ON, the linked TensorRT version will be used instead. - use_implicit_batch : bool - Use TensorRT implicit batch mode (default true). Setting to false will enable explicit batch - mode which will widen supported operators to include those which modify the batch dimension, - but may reduce performance for some models. - remove_no_mac_subgraphs : bool - Removes subgraphs which have been partitioned for TensorRT if they do not have any - multiply-accumulate operations. The removed subgraphs will go through TVM's standard - compilation instead. Can improve performance. - max_workspace_size : int - How many bytes of workspace size to allow each subgraph to use for TensorRT engine creation. - See TensorRT documentation for more info. - use_fp16: bool - Allows, TRT to automatically convert FP32 inputs to FP16. Also, it is required to be enabled - if FP16 inputs tensors and weights are used. - Note that TensorRT will still choose a higher-precision kernel if it results in overall - lower runtime, or if no low-precision implementation exists. 
- use_uint8: bool - Allows, TRT to automatically convert FP32 inputs to UINT8. Returns ------- - mod_and_config : Tuple[tvm.IRModule, Dict[str, Any]] - A tuple of 1) annotated and partitioned module and 2) "relay.ext.tensorrt.options" - configuration which should be given to PassContext when building. + partitioned_mod : tvm.IRModule + The partitioned module. """ - config: Dict[str, Any] = { - "use_implicit_batch": use_implicit_batch, - "max_workspace_size": max_workspace_size, - "remove_no_mac_subgraphs": remove_no_mac_subgraphs, - "use_fp16": use_fp16, - "use_uint8": use_uint8, - } - if version: - assert isinstance(version, tuple) and len(version) == 3 - config["tensorrt_version"] = version - else: - linked_version = tuple(tvm.get_global_func("relay.op.get_tensorrt_version")()) - if not linked_version: - logger.warning( - "TVM was not built against TensorRT and no version was provided to " - "partition_for_tensorrt. Defaulting to 6.0.1" - ) - linked_version = (6, 0, 1) - config["tensorrt_version"] = linked_version - + assert is_tensorrt_compiler_enabled(), "Can only partition for TensorRT if it is enabled" if params: mod["main"] = bind_params_by_name(mod["main"], params) + if target is None: + # Use a default target. The get_tensorrt_target() function will similarly create an + # equivalent default target when compilation continues after partitioning. + target = tvm.target.Target("tensorrt") seq = tvm.transform.Sequential( [ @@ -174,24 +165,27 @@ def partition_for_tensorrt( transform.InferType(), ] ) - with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): + with target: mod = seq(mod) - # TODO(mbs): Revisit - # mod = prune_tensorrt_subgraphs(mod) - return mod, config + mod = prune_tensorrt_subgraphs(mod) + return mod def is_supported_trt_type(typ: Union[tvm.ir.TensorType, tvm.ir.TupleType], op_name: str) -> bool: """Check whether a type is supported by TensorRT.""" - supported_dtypes = ["float32", "float16"] + supported_dtypes = ["float32"] + if get_tensorrt_use_fp16(): + supported_dtypes.append("float16") if isinstance(typ, tvm.ir.TensorType): if typ.dtype not in supported_dtypes: - logger.info(f"{op_name}: Only float32 and float16 tensor dtypes are supported.") + logger.info(f"{op_name}: Only {supported_dtypes} tensor dtypes are supported.") return False - # assumes dim 0 is for batch and can be dynamic - # TODO(mbs): But does this depend use_implicit_batch flag? - for dim_shape in typ.shape[1:]: - if isinstance(dim_shape, tvm.tir.expr.Any): + dims = typ.shape + if get_tensorrt_use_implicit_batch_mode(): + # The first dimension can be Any. 
+ dims = dims[1:] + for dim in dims: + if isinstance(dim, tvm.tir.expr.Any): logger.info(f"{op_name}: Only statically known tensor shapes are supported.") return False elif isinstance(typ, tvm.ir.TupleType): @@ -241,13 +235,19 @@ def get_attrs(expr: relay.expr.Expr) -> Any: def make_predicate(checker: CheckFunc) -> Callable[[relay.expr.Expr], bool]: + """Returns the pattern predicate which performs the standard checks, then invokes the + more primitive checker.""" + def predicate(expr: relay.expr.Expr) -> bool: op_name = get_op_name(expr) attrs = get_attrs(expr) args = get_args(expr) if not all([is_supported_trt_type(arg.checked_type, op_name) for arg in args]): return False - return checker(attrs, args, op_name) + if not checker(attrs, args, op_name): + return False + logger.info(f"{op_name}: Predicate passes") + return True return predicate @@ -535,11 +535,16 @@ def concatenate_checker( if int(attrs.axis) == 0: logger.info(f"{op_name}: can't modify batch dimension.") return False - if isinstance(args[0], relay.Tuple): - for tuple_input in args[0].fields: - if isinstance(tuple_input, Constant): - logger.info(f"{op_name}: can't concatenate tensors with constants.") - return False + + if not isinstance(args[0], relay.Tuple): + logger.info("f{op_name}: concatenate must be applied to a literal tuple") + return False + + for tuple_input in args[0].fields: + if isinstance(tuple_input, Constant): + logger.info(f"{op_name}: can't concatenate tensors with constants.") + return False + return True diff --git a/src/relay/backend/contrib/codegen_c/codegen.cc b/src/relay/backend/contrib/codegen_c/codegen.cc index ee8724fe92fe3..41f0a0a06408d 100644 --- a/src/relay/backend/contrib/codegen_c/codegen.cc +++ b/src/relay/backend/contrib/codegen_c/codegen.cc @@ -360,8 +360,8 @@ class CodegenCModule { }; /*! \brief The actual translation pass. */ -transform::Pass CCompilerImpl() { - auto pass_func = [=](IRModule mod, const transform::PassContext& pass_ctx) { +tvm::transform::Pass CCompilerImpl() { + auto pass_func = [=](IRModule mod, const tvm::transform::PassContext& pass_ctx) { VLOG(1) << "CCompilerImpl input:" << std::endl << PrettyPrint(mod); Target target = GetCCompilerTarget(); @@ -388,10 +388,10 @@ transform::Pass CCompilerImpl() { return tvm::transform::CreateModulePass(pass_func, 0, "CCompilerImpl", {}); } -transform::Pass CCompilerPass() { +tvm::transform::Pass CCompilerPass() { return transform::Sequential( - {transforms::OutlineCompilerFunctionsWithExistingGlobalSymbols("ccompiler"), CCompilerImpl(), - transforms::MarkCompilerFunctionsAsExtern("ccompiler")}); + {transform::OutlineCompilerFunctionsWithExistingGlobalSymbols("ccompiler"), CCompilerImpl(), + transform::MarkCompilerFunctionsAsExtern("ccompiler")}); } } // namespace contrib diff --git a/src/relay/backend/contrib/cutlass/codegen.cc b/src/relay/backend/contrib/cutlass/codegen.cc index de2934173b5ff..2e76ab1cbbf64 100644 --- a/src/relay/backend/contrib/cutlass/codegen.cc +++ b/src/relay/backend/contrib/cutlass/codegen.cc @@ -902,8 +902,8 @@ class CutlassModuleCodegen { * \brief A small shim to redirect to the 'relay.ext.cutlass.compile_for_cutlass' Python * function which does the main CUTLASS training, c-code generation and compilation steps. 
*/ -transform::Pass CompileForCutlassImpl() { - auto pass_func = [=](IRModule mod, const transform::PassContext& pass_ctx) { +tvm::transform::Pass CompileForCutlassImpl() { + auto pass_func = [=](IRModule mod, const tvm::transform::PassContext& pass_ctx) { VLOG(1) << "CompileForCutlass input:" << std::endl << PrettyPrint(mod); const auto* pf = runtime::Registry::Get("relay.ext.cutlass.compile_for_cutlass"); ICHECK(pf != nullptr) << "Cannot find compile_for_cutlass function"; @@ -926,10 +926,10 @@ runtime::Module CreateCSourceModule(const IRModule& mod) { TVM_REGISTER_GLOBAL("relay.ext.cutlass.create_c_source_module").set_body_typed(CreateCSourceModule); -transform::Pass CompileForCutlass() { +tvm::transform::Pass CompileForCutlass() { return transform::Sequential( - {transforms::OutlineCompilerFunctionsWithExistingGlobalSymbols("cutlass"), - CompileForCutlassImpl(), transforms::MarkCompilerFunctionsAsExtern("cutlass")}); + {transform::OutlineCompilerFunctionsWithExistingGlobalSymbols("cutlass"), + CompileForCutlassImpl(), transform::MarkCompilerFunctionsAsExtern("cutlass")}); } } // namespace cutlass diff --git a/src/relay/backend/contrib/tensorrt/codegen.cc b/src/relay/backend/contrib/tensorrt/codegen.cc index e08cd240d4d1e..1c4a8d78062e7 100644 --- a/src/relay/backend/contrib/tensorrt/codegen.cc +++ b/src/relay/backend/contrib/tensorrt/codegen.cc @@ -29,6 +29,7 @@ #include #include +#include "../../../transforms/compiler_function_utils.h" #include "../../utils.h" #include "../codegen_json/codegen_json.h" @@ -39,36 +40,49 @@ namespace tvm { namespace relay { namespace contrib { +namespace tensorrt { -/*! \brief Attributes to store the compiler options for TensorRT. */ -struct TensorRTCompilerConfigNode : public tvm::AttrsNode { - Array tensorrt_version; - bool use_implicit_batch; - size_t max_workspace_size; - bool remove_no_mac_subgraphs; - bool use_fp16; - bool use_uint8; - - TVM_DECLARE_ATTRS(TensorRTCompilerConfigNode, "ext.attrs.TensorRTCompilerConfigNode") { - TVM_ATTR_FIELD(tensorrt_version) - .describe("TensorRT version as (major, minor, patch).") - .set_default(Array({6, 0, 1})); - TVM_ATTR_FIELD(use_implicit_batch).set_default(true); - TVM_ATTR_FIELD(max_workspace_size).set_default(size_t(1) << 30); - TVM_ATTR_FIELD(remove_no_mac_subgraphs).set_default(false); - TVM_ATTR_FIELD(use_fp16).set_default(false); - TVM_ATTR_FIELD(use_uint8).set_default(false); - } -}; +/*! + * \brief Check whether TensorRT graph executor is enabled. + * \return True if enabled, False if not. + */ +inline constexpr bool IsRuntimeEnabled() { +#if TVM_GRAPH_EXECUTOR_TENSORRT + return true; +#else + return false; +#endif // TVM_GRAPH_EXECUTOR_TENSORRT +} -class TensorRTCompilerConfig : public Attrs { - public: - TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(TensorRTCompilerConfig, Attrs, - TensorRTCompilerConfigNode); -}; +TVM_REGISTER_GLOBAL("relay.ext.tensorrt.is_runtime_enabled").set_body_typed(IsRuntimeEnabled); -TVM_REGISTER_NODE_TYPE(TensorRTCompilerConfigNode); -TVM_REGISTER_PASS_CONFIG_OPTION("relay.ext.tensorrt.options", TensorRTCompilerConfig); +/*! + * \brief Get TensorRT version that TVM is built against. + * \return Array of three integers for major, minor, and patch, or empty array if TensorRT graph + * runtime is not enabled. 
+ */ +Array GetVersion() { +#if TVM_GRAPH_EXECUTOR_TENSORRT + return {Integer(NV_TENSORRT_MAJOR), Integer(NV_TENSORRT_MINOR), Integer(NV_TENSORRT_PATCH)}; +#else + return {}; +#endif // TVM_GRAPH_EXECUTOR_TENSORRT +} + +TVM_REGISTER_GLOBAL("relay.ext.tensorrt.get_version").set_body_typed(GetVersion); + +/*! + * \brief Returns the "tensorrt" Target instance to use for compilation. + */ +Target GetTensorRTTarget() { + Target target = Target::Current(/*allow_not_defined=*/true); + if (!target.defined() || target->kind->name != "tensorrt") { + // Since we allow partition_for_tensorrt to use the default "tensorrt" target, we should + // similarly allow the custom pass to execute without a specific "tensorrt" target in scope. + target = Target("tensorrt"); + } + return target; +} using JSONGraphNode = tvm::runtime::json::JSONGraphNode; using JSONGraphNodeEntry = tvm::runtime::json::JSONGraphNodeEntry; @@ -87,6 +101,7 @@ class CollectFromCompositeFunctionBody : public ExprVisitor { explicit CollectFromCompositeFunctionBody(TensorRTJSONSerializer* serializer) : serializer_(serializer), node_(std::make_shared()) {} + // We'll need to implement these out-of-band since they use the serializer. void VisitExpr_(const ConstantNode* constant_node) final; void VisitExpr_(const CallNode* call_node) final; @@ -190,6 +205,7 @@ class CollectFromCompositeFunctionBody : public ExprVisitor { extractor.Extract(const_cast(attr_obj)); } + /*! \brief The parent serializer for the overall TensorRT partition. */ TensorRTJSONSerializer* serializer_; /*! \brief Accumulated translated arguments. */ std::vector args_; @@ -207,9 +223,10 @@ class CollectFromCompositeFunctionBody : public ExprVisitor { */ class TensorRTJSONSerializer : public JSONSerializer { public: - TensorRTJSONSerializer(const std::string& symbol, const Expr& expr) - : JSONSerializer(symbol, expr) {} + TensorRTJSONSerializer(Target target, const std::string& symbol, const Expr& expr) + : JSONSerializer(symbol, expr), target_(std::move(target)) {} + private: using JSONSerializer::VisitExpr_; std::vector VisitExpr_(const CallNode* call_node) final { @@ -245,40 +262,62 @@ class TensorRTJSONSerializer : public JSONSerializer { node->CaptureAttrs(*collector.node_); // Capture global settings on the JSON node. - SaveGlobalAttributes(node); + // TODO(mbs): Why on every call? + SaveGlobalAttributes(node.get()); VLOG(1) << name << " has " << node->GetInputs().size() << " inputs"; return AddNode(node, GetRef(call_node)); } - static void SaveGlobalAttributes(std::shared_ptr node) { - auto ctx = transform::PassContext::Current(); - auto cfg = ctx->GetConfig("relay.ext.tensorrt.options"); - if (!cfg.defined()) { - cfg = AttrsWithDefaultValues(); + static void SetAttr(JSONGraphNode* node, const std::string& key, + std::vector values) { + node->SetAttr(key, std::vector({std::move(values)})); + } + + /*! \brief Capture the compilation options as attributes on \p node. */ + void SaveGlobalAttributes(JSONGraphNode* node) { + { + // cf logic in tensorrt.py::get_tensorrt_version. + // First check for version in target. + Array target_attr = target_->GetAttr>("tensorrt_version").value(); + if (target_attr.empty()) { + // Next, ask runtime for its version. + target_attr = GetVersion(); + } + if (target_attr.empty()) { + // Finally, use default. 
+ target_attr = {6, 0, 1}; + } + ICHECK_EQ(target_attr.size(), 3); + SetAttr(node, "tensorrt_version", + {std::to_string(target_attr[0]), std::to_string(target_attr[1]), + std::to_string(target_attr[2])}); + } + + { + Bool target_attr = target_->GetAttr("use_implicit_batch").value(); + SetAttr(node, "use_implicit_batch", {std::to_string(target_attr->value)}); + } + + { + Integer target_attr = target_->GetAttr("max_workspace_size").value(); + SetAttr(node, "max_workspace_size", {std::to_string(target_attr->value)}); + } + + { + Bool target_attr = target_->GetAttr("use_fp16").value(); + SetAttr(node, "use_fp16", {std::to_string(target_attr->value)}); + } + + { + Bool target_attr = target_->GetAttr("use_uint8").value(); + SetAttr(node, "use_uint8", {std::to_string(target_attr->value)}); } - ICHECK_EQ(cfg.value()->tensorrt_version.size(), 3); - std::vector tensorrt_version = {std::to_string(cfg.value()->tensorrt_version[0]), - std::to_string(cfg.value()->tensorrt_version[1]), - std::to_string(cfg.value()->tensorrt_version[2])}; - std::vector use_implicit_batch = {std::to_string(cfg.value()->use_implicit_batch)}; - std::vector max_workspace_size = {std::to_string(cfg.value()->max_workspace_size)}; - std::vector use_fp16 = {std::to_string(cfg.value()->use_fp16)}; - std::vector use_uint8 = {std::to_string(cfg.value()->use_uint8)}; - std::vector tensorrt_version_attr, use_implicit_batch_attr, max_workspace_size_attr, - use_fp16_attr, use_uint8_attr; - tensorrt_version_attr.emplace_back(tensorrt_version); - use_implicit_batch_attr.emplace_back(use_implicit_batch); - max_workspace_size_attr.emplace_back(max_workspace_size); - use_fp16_attr.emplace_back(use_fp16); - use_uint8_attr.emplace_back(use_uint8); - node->SetAttr("tensorrt_version", tensorrt_version_attr); - node->SetAttr("use_implicit_batch", use_implicit_batch_attr); - node->SetAttr("max_workspace_size", max_workspace_size_attr); - node->SetAttr("use_fp16", use_fp16_attr); - node->SetAttr("use_uint8", use_uint8_attr); } + + /*! \brief The "tensorrt" Target guiding compilation. */ + Target target_; }; void CollectFromCompositeFunctionBody::VisitExpr_(const ConstantNode* constant_node) { @@ -304,64 +343,74 @@ void CollectFromCompositeFunctionBody::VisitExpr_(const CallNode* call_node) { } /*! - * \brief Create a runtime module for TensorRT. - * \param ref The ext_func Relay expression/module to be executed using extern ops. - * \return A runtime module. - */ -runtime::Module TensorRTCompiler(const ObjectRef& ref) { - ICHECK(ref->IsInstance()) << "The input ref is expected to be a Relay function."; - Function func = Downcast(ref); - std::string func_name = backend::GetExtSymbol(func); - - VLOG(1) << "TensorRT partition:" << std::endl << PrettyPrint(func); - TensorRTJSONSerializer serializer(func_name, func); - serializer.serialize(); - std::string graph_json = serializer.GetJSON(); - VLOG(1) << "TensorRT JSON:" << std::endl << graph_json; - - // Note that serializer.const_name_to_constant() is ignored. Instead the TECompiler invokes - // a callback which calls backend::UpdateConstants to capture the map before the function - // 'disappears' into lowered form, on the assumption the visit order and thus constant - // names match those generated by the JSONSerializer. 
- - const auto* pf = runtime::Registry::Get("runtime.tensorrt_runtime_create"); - ICHECK(pf != nullptr) << "Cannot find TensorRT runtime module create function."; - VLOG(1) << "Creating tensorrt runtime::Module for '" << func_name << "'"; - runtime::Module lib = (*pf)(func_name, graph_json, serializer.const_names()); - return lib; -} - -TVM_REGISTER_GLOBAL("relay.ext.tensorrt").set_body_typed(TensorRTCompiler); - -/*! - * \brief Check whether TensorRT graph executor is enabled. - * \return True if enabled, False if not. + * \brief The main TensorRT compiler. + * + * TODO(mbs): Currently we create a \p TensorRTRuntimeModule for every function with + * Compiler="tensorrt" (ie for each partition). Since the TensorRT engine is only designed to + * handle a single entry point this is mostly sensible, however there are probably opportunities + * for more sharing between functions. However, note this means each call to a TensorRT-compiled + * function will require a linear scan of imported runtime modules to find the matching + * TensorRTRuntimeModule implementing it. */ -inline constexpr bool IsTensorRTRuntimeEnabled() { -#if TVM_GRAPH_EXECUTOR_TENSORRT - return true; -#else - return false; -#endif // TVM_GRAPH_EXECUTOR_TENSORRT +tvm::transform::Pass CompileForTensorRTImpl() { + auto pass_func = [](IRModule mod, const tvm::transform::PassContext& pass_ctx) { + VLOG(1) << "CompileForTensorRT input:" << std::endl << PrettyPrint(mod); + Target target = GetTensorRTTarget(); + + const auto* pf = runtime::Registry::Get("runtime.tensorrt_runtime_create"); + ICHECK(pf != nullptr) << "Cannot find TensorRT runtime module create function."; + + // The accumulated external runtime modules. + Array external_mods = + mod->GetAttr>(tvm::attr::kExternalMods).value_or({}); + // The accumulated constant bindings. + Map const_name_to_constant = + mod->GetAttr>(tvm::attr::kConstNameToConstant).value_or({}); + + for (const auto& kv : mod->functions) { + if (const auto* function_node = kv.second.as()) { + if (function_node->HasNonzeroAttr(attr::kPrimitive)) { + Optional opt_compiler = function_node->GetAttr(attr::kCompiler); + if (opt_compiler && opt_compiler.value() == "tensorrt") { + // Serialize the function to JSON. + TensorRTJSONSerializer serializer(target, kv.first->name_hint, + GetRef(function_node)); + serializer.serialize(); + std::string graph_json = serializer.GetJSON(); + VLOG(1) << "TensorRT JSON for '" << kv.first->name_hint << "':" << std::endl + << graph_json; + + // Remember all the constant bindings. + for (const auto& kv2 : serializer.const_name_to_constant()) { + ICHECK_EQ(const_name_to_constant.count(kv2.first), 0); + VLOG(1) << "binding constant '" << kv2.first << "' for function '" + << kv.first->name_hint << "'"; + const_name_to_constant.Set(kv2.first, kv2.second); + } + + // Create the actual runtime module. + runtime::Module runtime_mod = + (*pf)(kv.first->name_hint, graph_json, serializer.const_names()); + + // Remember the runtime module. + external_mods.push_back(runtime_mod); + } + } + } + } + return WithAttrs(mod, {{tvm::attr::kExternalMods, external_mods}, + {tvm::attr::kConstNameToConstant, const_name_to_constant}}); + }; + return tvm::transform::CreateModulePass(pass_func, 0, "CompileForTensorRT", {}); } -/*! - * \brief Get TensorRT version that TVM is built against. - * \return Array of three integers for major, minor, and patch, or empty array if TensorRT graph - * runtime is not enabled. 
- */ -Array GetTensorRTVersion() { -#if TVM_GRAPH_EXECUTOR_TENSORRT - return {Integer(NV_TENSORRT_MAJOR), Integer(NV_TENSORRT_MINOR), Integer(NV_TENSORRT_PATCH)}; -#else - return {}; -#endif // TVM_GRAPH_EXECUTOR_TENSORRT +tvm::transform::Pass CompileForTensorRT() { + return transform::Sequential( + {transform::OutlineCompilerFunctionsWithExistingGlobalSymbols("tensorrt"), + CompileForTensorRTImpl(), transform::MarkCompilerFunctionsAsExtern("tensorrt")}); } -TVM_REGISTER_GLOBAL("relay.op.is_tensorrt_runtime_enabled") - .set_body_typed(IsTensorRTRuntimeEnabled); -TVM_REGISTER_GLOBAL("relay.op.get_tensorrt_version").set_body_typed(GetTensorRTVersion); - +} // namespace tensorrt } // namespace contrib } // namespace relay } // namespace tvm diff --git a/src/relay/backend/contrib/tensorrt/codegen.h b/src/relay/backend/contrib/tensorrt/codegen.h new file mode 100644 index 0000000000000..813a8663756dd --- /dev/null +++ b/src/relay/backend/contrib/tensorrt/codegen.h @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/backend/contrib/tensorrt/codegen.h + * \brief The 'custom' compilation pass for TensorRT (invoked by the RelayToTIRTargetHook pass). + */ + +#ifndef TVM_RELAY_BACKEND_CONTRIB_TENSORRT_CODEGEN_H_ +#define TVM_RELAY_BACKEND_CONTRIB_TENSORRT_CODEGEN_H_ + +#include + +namespace tvm { +namespace relay { +namespace contrib { +namespace tensorrt { + +/*! + * \brief Returns the pass which replaces all calls to "Primitive" functions with a "Compiler" + * attribute of "tensorrt" with calls to an extern which is implemented by a \p TensorRTRuntime + * runtime module added to the IRModule's "external_mods" attribute. + */ +transform::Pass CompileForTensorRT(); + +} // namespace tensorrt +} // namespace contrib +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_BACKEND_CONTRIB_TENSORRT_CODEGEN_H_ diff --git a/src/relay/backend/contrib/tensorrt/target.cc b/src/relay/backend/contrib/tensorrt/target.cc index 85d127ab71152..2e4581d30a3c6 100644 --- a/src/relay/backend/contrib/tensorrt/target.cc +++ b/src/relay/backend/contrib/tensorrt/target.cc @@ -24,19 +24,46 @@ #include +#include "./codegen.h" + namespace tvm { namespace relay { namespace contrib { +namespace tensorrt { /*! * \brief This external codegen target can offload compilation to the TensorRT compiler. * - Patterns: python/tvm/relay/op/contrib/tensorrt.py * - Custom compiler: src/relay/backend/contrib/tensorrt/codegen.cc - * - Runtime: src/runtime/contrib/tensorrt/ *.cc + * - Runtime: src/runtime/contrib/tensorrt/... 
*/
TVM_REGISTER_TARGET_KIND("tensorrt", kDLCUDA)
-    .set_attr<Bool>(tvm::attr::kIsExternalCodegen, Bool(true));
+    .set_attr<Bool>(tvm::attr::kIsExternalCodegen, Bool(true))
+    .set_attr<tvm::transform::Pass>("RelayToTIR", CompileForTensorRT())
+    // An array of three integers giving the major, minor, and patch numbers of the supported
+    // TensorRT compiler version. If empty, will be auto-detected from the linked library.
+    // Default empty.
+    .add_attr_option<Array<Integer>>("tensorrt_version", Array<Integer>())
+    // If true, the first tensor dimension for most operators is allowed to be Any and
+    // TensorRT will assume it represents a batch dimension only known at inference time.
+    // Fewer Relay operators are supported in implicit batch mode. Default true.
+    .add_attr_option<Bool>("use_implicit_batch", Bool(true))
+    // If true, excludes sub-graphs which do not have multiply-accumulate operations, even though
+    // TensorRT supports them. This is a simple heuristic to optimize the partitioning between
+    // TensorRT and TVM. Not required if using Collage for partitioning. Default false.
+    .add_attr_option<Bool>("remove_no_mac_subgraphs", Bool(false))
+    // How many bytes of workspace size to allow each subgraph to use for TensorRT engine creation.
+    // Default 1G.
+    .add_attr_option<Integer>("max_workspace_size", Integer(1 << 30))
+    // If true, allows TensorRT to automatically convert float32 operations to float16. Must also be
+    // enabled if any float16 operations are in the model. Note that TensorRT may still choose a
+    // higher-precision kernel if it results in overall lower runtime, or if no low-precision
+    // implementation exists. Default false.
+    .add_attr_option<Bool>("use_fp16", Bool(false))
+    // If true, allows TensorRT to automatically convert float32 operations to uint8
+    // (aka quantized). Default false.
+    .add_attr_option<Bool>("use_uint8", Bool(false));

+}  // namespace tensorrt
 }  // namespace contrib
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/transforms/compiler_function_utils.cc b/src/relay/transforms/compiler_function_utils.cc
index 0df9f5ee294c0..1dafcd10a361f 100644
--- a/src/relay/transforms/compiler_function_utils.cc
+++ b/src/relay/transforms/compiler_function_utils.cc
@@ -24,14 +24,13 @@
 
 #include "./compiler_function_utils.h"
 
-#include "../op/call/call.h"
 #include "tvm/relay/analysis.h"
 #include "tvm/relay/expr_functor.h"
 #include "tvm/relay/transform.h"
 
 namespace tvm {
 namespace relay {
-namespace transforms {
+namespace transform {
 namespace {
 
 /*!
@@ -211,8 +210,8 @@ GlobalVar ExistingGlobalSymbolCache::GetGlobalSymbol(const Function& function) {
   return global_var;
 }
 
-transform::Pass OutlineCompilerFunctions(std::shared_ptr<GlobalSymbolCache> cache,
-                                         std::string compiler_filter) {
+tvm::transform::Pass OutlineCompilerFunctions(std::shared_ptr<GlobalSymbolCache> cache,
+                                              std::string compiler_filter) {
   runtime::TypedPackedFunc<IRModule(IRModule, transform::PassContext)> pass_func =
       [cache = std::move(cache), compiler_filter = std::move(compiler_filter)](
           IRModule mod, transform::PassContext ctx) {
@@ -235,12 +234,13 @@ }
 
 // Any Java programmers in the house?
-transform::Pass OutlineCompilerFunctionsWithExistingGlobalSymbols(std::string compiler_filter) { +tvm::transform::Pass OutlineCompilerFunctionsWithExistingGlobalSymbols( + std::string compiler_filter) { return OutlineCompilerFunctions(std::make_shared(), std::move(compiler_filter)); } -transform::Pass MarkCompilerFunctionsAsExtern(std::string compiler_filter) { +tvm::transform::Pass MarkCompilerFunctionsAsExtern(std::string compiler_filter) { runtime::TypedPackedFunc pass_func = [compiler_filter = std::move(compiler_filter)](IRModule mod, transform::PassContext ctx) { VLOG(1) << "MarkCompilerFunctionsAsExtern input:" << std::endl << PrettyPrint(mod); @@ -262,7 +262,7 @@ transform::Pass MarkCompilerFunctionsAsExtern(std::string compiler_filter) { return tvm::transform::CreateModulePass(pass_func, 0, "MarkCompilerFunctionsAsExtern", {}); } -transform::Pass InlineCompilerFunctionsBoundTo(Array global_vars) { +tvm::transform::Pass InlineCompilerFunctionsBoundTo(Array global_vars) { runtime::TypedPackedFunc pass_func = [global_vars = std::move(global_vars)](IRModule mod, transform::PassContext ctx) { VLOG(1) << "InlineCompilerFunctionsBoundTo with global_vars: " << PrettyPrint(global_vars); @@ -295,6 +295,6 @@ TVM_REGISTER_GLOBAL("relay._transform.MarkCompilerFunctionsAsExtern") TVM_REGISTER_GLOBAL("relay._transform.InlineCompilerFunctionsBoundTo") .set_body_typed(InlineCompilerFunctionsBoundTo); -} // namespace transforms +} // namespace transform } // namespace relay } // namespace tvm diff --git a/src/relay/transforms/compiler_function_utils.h b/src/relay/transforms/compiler_function_utils.h index aa98430318a69..f3499faec2628 100644 --- a/src/relay/transforms/compiler_function_utils.h +++ b/src/relay/transforms/compiler_function_utils.h @@ -66,7 +66,7 @@ namespace tvm { namespace relay { -namespace transforms { +namespace transform { /*! * \brief Abstract class representing a cache of unique global vars keyed by functions. This can @@ -105,8 +105,8 @@ class ExistingGlobalSymbolCache : public GlobalSymbolCache { * If \p compiler_filter is non-empty only functions with that as their attribute value are * outlined. */ -transform::Pass OutlineCompilerFunctions(std::shared_ptr cache, - std::string compiler_filter = ""); +tvm::transform::Pass OutlineCompilerFunctions(std::shared_ptr cache, + std::string compiler_filter = ""); /*! * \brief A pass to outline all let-bound and literal functions in direct call positions which have @@ -119,7 +119,8 @@ transform::Pass OutlineCompilerFunctions(std::shared_ptr cach * This pass may be useful for external codegen using the "RelayToTIR" custom pass mechanism * to prepare the IRModule before custom lowering. */ -transform::Pass OutlineCompilerFunctionsWithExistingGlobalSymbols(std::string compiler_filter = ""); +tvm::transform::Pass OutlineCompilerFunctionsWithExistingGlobalSymbols( + std::string compiler_filter = ""); /*! * \brief A pass to mark all global functions which have a "Compiler" attribute matching @@ -132,7 +133,7 @@ transform::Pass OutlineCompilerFunctionsWithExistingGlobalSymbols(std::string co * This pass may be useful for external codegen using the "RelayToTIR" custom pass mechanism to * cleanup the IRModule after custom lowering. */ -transform::Pass MarkCompilerFunctionsAsExtern(std::string compiler_filter = ""); +tvm::transform::Pass MarkCompilerFunctionsAsExtern(std::string compiler_filter = ""); /*! 
* \brief A pass to inline all global "Compiler" functions which are bound to a global var @@ -142,9 +143,9 @@ transform::Pass MarkCompilerFunctionsAsExtern(std::string compiler_filter = ""); * This pass may be useful for external codegen which needs to undo partitioning based on * properties of the entire partition. */ -transform::Pass InlineCompilerFunctionsBoundTo(Array global_vars); +tvm::transform::Pass InlineCompilerFunctionsBoundTo(Array global_vars); -} // namespace transforms +} // namespace transform } // namespace relay } // namespace tvm diff --git a/src/runtime/const_loader_module.cc b/src/runtime/const_loader_module.cc index 2e91d26d5f965..a8028e616c5ba 100644 --- a/src/runtime/const_loader_module.cc +++ b/src/runtime/const_loader_module.cc @@ -51,15 +51,24 @@ class ConstLoaderModuleNode : public ModuleNode { const std::unordered_map& const_var_ndarray, const std::unordered_map>& const_vars_by_symbol) : const_var_ndarray_(const_var_ndarray), const_vars_by_symbol_(const_vars_by_symbol) { + VLOG(1) << "Creating ConstLoaderModule"; // Only the related submodules are cached to reduce the number of runtime // symbol lookup for initialization. Otherwise, symbols/primitives in the // DSO module will also be cached but they never need to be initialized. - for (const auto& it : const_vars_by_symbol_) { - initialized_[it.first] = false; + for (const auto& kv : const_vars_by_symbol_) { + for (const auto& var : kv.second) { + VLOG(1) << "ConstLoaderModuleNode has constant '" << var << "' for function '" << kv.first + << "'"; + ICHECK_GT(const_var_ndarray_.count(var), 0) + << "ConstLoaderModuleNode is missing entry for constant '" << var << "' for function '" + << kv.first << "'"; + } + initialized_[kv.first] = false; } } PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) final { + VLOG(1) << "ConstLoaderModuleNode::GetFunction(" << name << ")"; // Initialize and memoize the module. // Usually, we have some warmup runs. The module initialization should be // done at this stage. Therefore, runtime overhead is not a concern. 
@@ -88,11 +97,13 @@ class ConstLoaderModuleNode : public ModuleNode { */ Array GetRequiredConstants(const std::string& symbol) { Array ret; - ICHECK_GT(const_vars_by_symbol_.count(symbol), 0U) << "No symbol is recorded for " << symbol; + ICHECK_GT(const_vars_by_symbol_.count(symbol), 0U) + << "No constants known for function '" << symbol << "'"; std::vector vars = const_vars_by_symbol_[symbol]; - for (const auto& it : vars) { - ICHECK_GT(const_var_ndarray_.count(it), 0U) << "Found not recorded constant variable: " << it; - ret.push_back(const_var_ndarray_[it]); + for (const auto& var : vars) { + ICHECK_GT(const_var_ndarray_.count(var), 0U) + << "No such constant variable '" << var << "' for function '" << symbol << "'"; + ret.push_back(const_var_ndarray_[var]); } return ret; } @@ -229,5 +240,6 @@ TVM_REGISTER_GLOBAL("runtime.module.loadbinary_metadata") .set_body_typed(ConstLoaderModuleNode::LoadFromBinary); TVM_REGISTER_GLOBAL("runtime.module.loadbinary_const_loader") .set_body_typed(ConstLoaderModuleNode::LoadFromBinary); + } // namespace runtime } // namespace tvm diff --git a/src/runtime/contrib/json/json_runtime.h b/src/runtime/contrib/json/json_runtime.h index 355390765de70..3a02202b87f28 100644 --- a/src/runtime/contrib/json/json_runtime.h +++ b/src/runtime/contrib/json/json_runtime.h @@ -54,6 +54,8 @@ class JSONRuntimeBase : public ModuleNode { LoadGraph(graph_json_); } + ~JSONRuntimeBase() override = default; + const char* type_key() const override { return "json"; } // May be overridden /*! \brief Initialize a specific json runtime. */ diff --git a/src/runtime/contrib/tensorrt/tensorrt_builder.cc b/src/runtime/contrib/tensorrt/tensorrt_builder.cc index 5f923667d0c20..436a6db4c8d4b 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_builder.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_builder.cc @@ -45,10 +45,11 @@ TensorRTBuilder::TensorRTBuilder(TensorRTLogger* logger, max_workspace_size_(max_workspace_size), use_implicit_batch_(use_implicit_batch), use_fp16_(use_fp16), - batch_size_(batch_size) { + use_int8_(false), + batch_size_(batch_size), + calibrator_(calibrator) { // Create TRT builder and network. builder_ = nvinfer1::createInferBuilder(*logger); - use_int8_ = false; #if TRT_VERSION_GE(6, 0, 1) // Use INetworkV2. @@ -58,8 +59,7 @@ TensorRTBuilder::TensorRTBuilder(TensorRTLogger* logger, flags = 0U; builder_->setMaxBatchSize(batch_size_); } - this->calibrator_ = calibrator; - if (calibrator != nullptr) { + if (calibrator_ != nullptr) { use_int8_ = true; } network_ = builder_->createNetworkV2(flags); @@ -177,6 +177,7 @@ TensorRTEngineAndContext TensorRTBuilder::BuildEngine() { if (use_int8_) { config_->setFlag(nvinfer1::BuilderFlag::kINT8); + ICHECK(calibrator_); config_->setInt8Calibrator(calibrator_); LOG(INFO) << "config finishes setting up calibrator as INT8 mode ... 
"; } @@ -210,6 +211,9 @@ TensorRTEngineAndContext TensorRTBuilder::BuildEngine() { nvinfer1::IExecutionContext* context = engine->createExecutionContext(); CleanUp(); + ICHECK(engine); + ICHECK(context); + return {engine, context, network_input_names_, network_output_names_}; } @@ -254,18 +258,33 @@ nvinfer1::ITensor* TensorRTBuilder::GetInputAsTensor(const TensorRTOpInput& inpu } void TensorRTBuilder::CleanUp() { + VLOG(1) << "Destroying TensorRT network"; + ICHECK(network_); network_->destroy(); + network_ = nullptr; + #if TRT_VERSION_GE(6, 0, 1) + VLOG(1) << "Destroying TensorRT config"; + ICHECK(config_); config_->destroy(); + config_ = nullptr; #endif + + VLOG(1) << "Destroying TensorRT builder"; + ICHECK(builder_); builder_->destroy(); + builder_ = nullptr; + + VLOG(1) << "Destroying TensorRT weights"; for (auto weight : trt_weights_) { + ICHECK(weight.values); if (weight.type == nvinfer1::DataType::kFLOAT || weight.type == nvinfer1::DataType::kHALF) { delete[] static_cast(weight.values); } else { delete[] static_cast(weight.values); } } + trt_weights_.clear(); } } // namespace contrib diff --git a/src/runtime/contrib/tensorrt/tensorrt_builder.h b/src/runtime/contrib/tensorrt/tensorrt_builder.h index 13a118340e111..9bccc1ea48483 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_builder.h +++ b/src/runtime/contrib/tensorrt/tensorrt_builder.h @@ -48,8 +48,8 @@ using JSONGraphNodeEntry = tvm::runtime::json::JSONGraphNodeEntry; * perform inference. */ struct TensorRTEngineAndContext { - nvinfer1::ICudaEngine* engine; - nvinfer1::IExecutionContext* context; + nvinfer1::ICudaEngine* engine = nullptr; + nvinfer1::IExecutionContext* context = nullptr; std::vector inputs; std::vector outputs; }; @@ -125,15 +125,15 @@ class TensorRTBuilder { std::unordered_map> node_output_map_; /*! \brief TensorRT builder. */ - nvinfer1::IBuilder* builder_; + nvinfer1::IBuilder* builder_ = nullptr; #if TRT_VERSION_GE(6, 0, 1) /*! \brief TensorRT builder config. */ - nvinfer1::IBuilderConfig* config_; + nvinfer1::IBuilderConfig* config_ = nullptr; #endif /*! \brief TensorRT network definition. */ - nvinfer1::INetworkDefinition* network_; + nvinfer1::INetworkDefinition* network_ = nullptr; /*! \brief List of all weights held in memory. */ std::vector trt_weights_; diff --git a/src/runtime/contrib/tensorrt/tensorrt_ops.cc b/src/runtime/contrib/tensorrt/tensorrt_ops.cc index 3971081bf8f8a..cd46967e532b7 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_ops.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_ops.cc @@ -67,7 +67,7 @@ nvinfer1::ITensor* TensorRTOpConverter::Transpose(TensorRTOpConverterParams* par // Batch dimension cannot be modified. 
ICHECK_EQ(input->getDimensions().nbDims, order.size() - 1); ICHECK_EQ(order[0], 0); - for (size_t i = 0; i < order.size(); ++i) { + for (size_t i = 0; i + 1 < order.size(); ++i) { perm.order[i] = order[i + 1] - 1; } } else { @@ -880,7 +880,7 @@ class ConcatOpConverter : public TensorRTOpConverter { const int input_rank = params->inputs[0].tensor->getDimensions().nbDims; std::vector input_tensors; for (auto input : params->inputs) { - ICHECK(input.type == kTensor); + ICHECK_EQ(input.type, kTensor); ICHECK_EQ(input_rank, input.tensor->getDimensions().nbDims); input_tensors.push_back(input.tensor); } diff --git a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc index 18ffdbbbba858..b51684b95eb86 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc @@ -138,13 +138,21 @@ class TensorRTRuntime : public JSONRuntimeBase { /*! \brief Destroy engines and contexts. */ void DestroyEngines() { for (auto& it : trt_engine_cache_) { + VLOG(1) << "Destroying TensorRT context for function '" << it.first.first << "' (batch size " + << it.first.second << ")"; it.second.context->destroy(); + VLOG(1) << "Destroying TensorRT engine for function '" << it.first.first << "' (batch size " + << it.first.second << ")"; it.second.engine->destroy(); } trt_engine_cache_.clear(); } - ~TensorRTRuntime() { DestroyEngines(); } + ~TensorRTRuntime() override { + VLOG(1) << "Destroying TensorRT runtime"; + DestroyEngines(); + VLOG(1) << "Destroyed TensorRT runtime"; + } /*! \brief Run inference using built engine. */ void Run() override { @@ -467,7 +475,7 @@ class TensorRTRuntime : public JSONRuntimeBase { /*! \brief TensorRT logger. */ TensorRTLogger logger_; -#else +#else // TVM_GRAPH_EXECUTOR_TENSORRT void Run() override { LOG(FATAL) << "TensorRT runtime is not enabled. " << "Please build with USE_TENSORRT_RUNTIME."; @@ -481,7 +489,7 @@ class TensorRTRuntime : public JSONRuntimeBase { bool GetCachedEnginesFromDisk() { return false; } void CacheEngineToDisk() {} -#endif +#endif // TVM_GRAPH_EXECUTOR_TENSORRT bool use_implicit_batch_; diff --git a/src/target/metadata_module.cc b/src/target/metadata_module.cc index ec301d10812fc..e5ca82d5c0996 100644 --- a/src/target/metadata_module.cc +++ b/src/target/metadata_module.cc @@ -215,8 +215,6 @@ runtime::Module CreateMetadataModule( String symbol = pf_sym(); Array variables = pf_var(); for (size_t i = 0; i < variables.size(); i++) { - VLOG(1) << "From module of type '" << mod->type_key() << "' found const var '" - << variables[i] << "' for symbol '" << symbol << "'"; symbol_const_vars.push_back(variables[i].operator std::string()); } ICHECK_EQ(const_vars_by_symbol.count(symbol), 0U) << "Found duplicated symbol: " << symbol; diff --git a/tests/python/contrib/test_tensorrt.py b/tests/python/contrib/test_tensorrt.py index cecb64785a49a..9e39821fd3173 100644 --- a/tests/python/contrib/test_tensorrt.py +++ b/tests/python/contrib/test_tensorrt.py @@ -14,31 +14,37 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -import tvm.testing + import numpy as np import pytest import itertools +import logging +from typing import Tuple +try: + # See issue #9362. 
+ import torch +except: + pass import tvm +import tvm.testing import tvm.relay.testing from tvm import relay -from tvm.relay.op.contrib import tensorrt - from tvm.relay import Any, GlobalVar - from tvm.relay.expr_functor import ExprVisitor -from typing import Tuple from tvm.contrib.download import download from tvm.relay.op.contrib import tensorrt - SUPPORTED_DTYPES = ["float16", "float32"] has_tensorrt_codegen = pytest.mark.skipif( - not tvm.get_global_func("relay.ext.tensorrt", True), reason="TensorRT codegen not available" + not tensorrt.is_tensorrt_compiler_enabled(), reason="TensorRT codegen not available" ) + +# CAUTION: Currently always false in CI since it adds tens of minutes to test time and depends +# on TensorRT installation. See https://github.com/apache/tvm/issues/11765 has_tensorrt_runtime = pytest.mark.skipif( not tensorrt.is_tensorrt_runtime_enabled(), reason="TensorRT runtime not available" ) @@ -72,7 +78,7 @@ def assert_result_dict_holds(result_dict, dtype="float16"): tvm.testing.assert_allclose(r1, r2, rtol=1e-3, atol=5e-3) -def set_func_attr(func, compile_name, symbol_name): +def set_outer_func_attr(func, compile_name, symbol_name): func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1)) func = func.with_attr("Inline", tvm.tir.IntImm("int32", 1)) func = func.with_attr("Compiler", compile_name) @@ -80,6 +86,12 @@ def set_func_attr(func, compile_name, symbol_name): return func +def set_inner_func_attr(func, pattern_name, composite_name): + func = func.with_attr("PartitionedFromPattern", pattern_name) + func = func.with_attr("Composite", composite_name) + return func + + def run_and_verify_func(config, target="cuda", run_module=True, data_type="float32"): """Test a Relay func by compiling, running, and comparing TVM and TRT outputs. 
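The hunks that follow migrate the tests from the old two-value API, where partition_for_tensorrt returned (mod, config) and the config was threaded through the PassContext, to the new target-based API. A minimal sketch of the new-style flow, assuming a TensorRT-enabled build (the workload below is illustrative, not taken from the tests):

    import tvm
    import tvm.relay.testing
    from tvm import relay
    from tvm.relay.op.contrib import tensorrt

    # Options now ride on the "tensorrt" target itself instead of a config
    # dict in the PassContext. Requires CUDA plus TensorRT codegen at build
    # time and the TensorRT runtime to actually execute.
    mod, params = tvm.relay.testing.mobilenet.get_workload(batch_size=1)
    trt_target = tvm.target.Target("tensorrt -use_fp16=True")
    mod = tensorrt.partition_for_tensorrt(mod, params=params, target=trt_target)
    with tvm.transform.PassContext(opt_level=3):
        func = relay.create_executor(
            "vm", mod=mod, device=tvm.cuda(0), target=["cuda", trt_target]
        ).evaluate()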
@@ -110,34 +122,31 @@ def run_and_verify_func(config, target="cuda", run_module=True, data_type="float result_dict = dict() for mode in ["vm", "graph"]: - for mode in ["graph"]: - for use_trt in [True, False]: - mod = tvm.IRModule() - mod["main"] = f - result_key = mode + ("_trt" if use_trt else "") - if use_trt: - mod = relay.transform.InferType()(mod) - mod, config = tensorrt.partition_for_tensorrt( - mod, params, use_fp16=data_type == "float16" - ) - with tvm.transform.PassContext( - opt_level=3, config={"relay.ext.tensorrt.options": config} - ): - func = relay.create_executor( - mode, mod=mod, device=dev, target=target - ).evaluate() - else: - mod = relay.transform.InferType()(mod) - with tvm.transform.PassContext(opt_level=3): - func = relay.create_executor( - mode, mod=mod, device=dev, target=target - ).evaluate() + for use_trt in [True, False]: + mod = tvm.IRModule() + mod["main"] = f + result_key = mode + ("_trt" if use_trt else "") + if use_trt: + use_fp16 = data_type == "float16" + trt_target = tvm.target.Target(f"tensorrt -use_fp16={use_fp16}") + mod = relay.transform.InferType()(mod) + mod = tensorrt.partition_for_tensorrt(mod, params=params, target=trt_target) + with tvm.transform.PassContext(opt_level=3): + func = relay.create_executor( + mode, mod=mod, device=dev, target=[target, trt_target] + ).evaluate() + else: + mod = relay.transform.InferType()(mod) + with tvm.transform.PassContext(opt_level=3): + func = relay.create_executor( + mode, mod=mod, device=dev, target=target + ).evaluate() - if run_module: - result_dict[result_key] = func(**input_dict, **params) + if run_module: + result_dict[result_key] = func(**input_dict, **params) - if run_module: - assert_result_dict_holds(result_dict, data_type) + if run_module: + assert_result_dict_holds(result_dict, data_type) def test_tensorrt_simple(run_module): @@ -163,10 +172,8 @@ def test_tensorrt_simple(run_module): result_key = mode + ("_trt" if use_trt else "") if use_trt: mod = relay.transform.InferType()(mod) - mod, config = tensorrt.partition_for_tensorrt(mod) - with tvm.transform.PassContext( - opt_level=3, config={"relay.ext.tensorrt.options": config} - ): + mod = tensorrt.partition_for_tensorrt(mod) + with tvm.transform.PassContext(opt_level=3): func = relay.create_executor( mode, mod=mod, device=tvm.cuda(0), target="cuda" ).evaluate() @@ -212,9 +219,9 @@ def test_tensorrt_not_compatible(run_module): f = relay.Function([x], out) mod = tvm.IRModule() mod["main"] = f - mod, config = tensorrt.partition_for_tensorrt(mod) + mod = tensorrt.partition_for_tensorrt(mod) for mode in ["graph", "vm"]: - with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): + with tvm.transform.PassContext(opt_level=3): func = relay.create_executor( mode, mod=mod, device=tvm.cuda(0), target="cuda" ).evaluate() @@ -622,26 +629,18 @@ def are_ops_on_graph(self, subgraph) -> bool: def are_ops_on_trt(mod, op_list): + op_on_trt = False + op_on_tvm = False for subgraph in mod.get_global_vars(): name = subgraph.name_hint - op_on_trt = False - op_on_tvm = True - if name == "main": - op_on_tvm = AreOpsOnGraph(op_list).are_ops_on_graph(mod[name].body) - elif mod[name].attrs and mod[name].attrs["Compiler"] == "tensorrt": - op_on_trt = AreOpsOnGraph(op_list).are_ops_on_graph(mod[name].body) + if mod[name].attrs and mod[name].attrs["Compiler"] == "tensorrt": + op_on_trt |= AreOpsOnGraph(op_list).are_ops_on_graph(mod[name].body) else: - op_on_tvm &= AreOpsOnGraph(op_list).are_ops_on_graph(mod[name].body) - - if not op_on_trt or 
op_on_tvm: - return False + op_on_tvm |= AreOpsOnGraph(op_list).are_ops_on_graph(mod[name].body) - return True + return op_on_trt and not op_on_tvm -@pytest.mark.xfail( - reason=("Currently failing test. See tracking issue https://github.com/apache/tvm/issues/8901") -) def test_dynamic_reshape(run_module): def test_run(x_data_list, x_shape, new_shape, should_offload_to_trt): result_arr = [{} for _ in range(len(x_data_list))] @@ -652,9 +651,9 @@ def test_run(x_data_list, x_shape, new_shape, should_offload_to_trt): mod = tvm.IRModule() mod["main"] = f if use_trt: - mod, _ = tensorrt.partition_for_tensorrt( - mod, params={}, remove_no_mac_subgraphs=False - ) + logging.info("Before partitioning:\n%s", mod) + mod = tensorrt.partition_for_tensorrt(mod) + logging.info("After partitioning:\n%s", mod) assert are_ops_on_trt(mod, op_list=["reshape"]) == should_offload_to_trt if run_module: with relay.build_config(opt_level=3): @@ -1051,6 +1050,7 @@ def get_graph(d_type="float16"): run_and_verify_func(get_graph(d_type=type), run_module=run_module, data_type=type) +@pytest.mark.skip(reason=("Fails assert_allclose. See https://github.com/apache/tvm/issues/11765")) def test_conv3d(run_module): def get_graph( x_shape=(1, 24, 8, 8, 8), @@ -1143,11 +1143,7 @@ def get_graph( ) -@pytest.mark.xfail( - reason=("Currently failing test. See tracking issue https://github.com/apache/tvm/issues/8901") -) @has_tensorrt_codegen -@tvm.testing.requires_cuda def test_dynamic_offload(): """ This test checks for proper dynamic offloading of relay graphs. An addition between @@ -1161,24 +1157,29 @@ def test_dynamic_offload(): x = relay.var("x", shape=(data_shape[0], data_shape[1], Any(), Any()), dtype="float32") y = relay.var("y", shape=(data_shape), dtype="float32") - kernel = relay.var("kernel", shape=(k_shape), dtype="float32") + kernel = relay.const(np.random.rand(*k_shape).astype("float32")) def get_expected(): # Create a nested TRT function that matches the expected output mod = tvm.IRModule() - var1 = relay.var("tensorrt_0_i0", shape=(data_shape), dtype="float32") - kernel_trt = relay.var("tensorrt_0_i1", shape=(k_shape), dtype="float32") - out1 = relay.nn.conv2d(var1, kernel_trt, channels=k_shape[0], kernel_size=k_shape[2:4]) - f1 = GlobalVar("tvmgen_default_tensorrt_0") - func = relay.Function([var1, kernel_trt], out1) - func = set_func_attr(func, "tensorrt", "tvmgen_default_tensorrt_0") - mod[f1] = func + outer_var = relay.var("tensorrt_0_i0", shape=(data_shape), dtype="float32") + inner_var = relay.var("FunctionVar_0_0", shape=(data_shape), dtype="float32") + inner_body = relay.nn.conv2d( + inner_var, kernel, channels=k_shape[0], kernel_size=k_shape[2:4] + ) + inner_func = relay.Function([inner_var], inner_body) + inner_func = set_inner_func_attr(inner_func, "nn.conv2d_", "tensorrt.nn.conv2d") + outer_body = inner_func(outer_var) + outer_func = relay.Function([outer_var], outer_body) + outer_func = set_outer_func_attr(outer_func, "tensorrt", "tvmgen_default_tensorrt_main_0") + gv = GlobalVar("tvmgen_default_tensorrt_main_0") + mod[gv] = outer_func mod = relay.transform.InferType()(mod) # Create the main function out1 = relay.nn.conv2d(x, kernel, channels=k_shape[0], kernel_size=k_shape[2:4]) - out = relay.add(out1, f1(y, kernel)) - f = relay.Function([x, y, kernel], out) + out = relay.add(out1, gv(y)) + f = relay.Function([x, y], out) mod["main"] = f mod = relay.transform.InferType()(mod) return mod @@ -1187,13 +1188,13 @@ def get_expected(): out1 = relay.nn.conv2d(x, kernel, channels=k_shape[0], 
kernel_size=k_shape[2:4]) out2 = relay.nn.conv2d(y, kernel, channels=k_shape[0], kernel_size=k_shape[2:4]) out = relay.add(out1, out2) - f = relay.Function([x, y, kernel], out) + f = relay.Function([x, y], out) # Pass the function to TRT compilation mod = tvm.IRModule() mod["main"] = f mod = relay.transform.InferType()(mod) - mod_trt, config = tensorrt.partition_for_tensorrt(mod, params={}) + mod_trt = tensorrt.partition_for_tensorrt(mod) # Get the expected relay graph and compare mod_exp = get_expected() @@ -1212,7 +1213,7 @@ def test_tensorrt_dynamic_batch(run_module): mod = tvm.IRModule() mod["main"] = f if use_trt: - mod, _ = tensorrt.partition_for_tensorrt(mod) + mod = tensorrt.partition_for_tensorrt(mod) if run_module: with relay.build_config(opt_level=3): @@ -1242,17 +1243,17 @@ def test_tensorrt_dynamic_batch_conv(run_module): f = relay.Function([x, kernel], out) mod = tvm.IRModule() mod["main"] = f + trt_target = tvm.target.Target(f"tensorrt -use_implicit_batch={use_implicit_batch}") if use_trt: - mod, config = tensorrt.partition_for_tensorrt( - mod, params, use_implicit_batch=use_implicit_batch - ) + mod = tensorrt.partition_for_tensorrt(mod, params=params, target=trt_target) if run_module: for target in ["llvm", "cuda"]: - with tvm.transform.PassContext( - opt_level=3, config={"relay.ext.tensorrt.options": config} - ): + targets = [target] + if use_trt: + targets.append(trt_target) + with tvm.transform.PassContext(opt_level=3): func = relay.create_executor( - "vm", mod=mod, device=tvm.device(target), target=target + "vm", mod=mod, device=tvm.device(target), target=targets ).evaluate() for i, batch_size in enumerate(batches_to_test): result_arr[i][target][use_trt] = func(x_data[:batch_size, ...], **params) @@ -1281,9 +1282,11 @@ def convert_traced_model_to_vm_trt( input_name = "input0" shape_list = [(input_name, input_shape)] mod, params = relay.frontend.from_pytorch(traced_module, shape_list) - mod, config = tensorrt.partition_for_tensorrt(mod, params, remove_no_mac_subgraphs=True) + trt_target = tvm.target.Target("tensorrt -remove_no_mac_subgraphs=True") + mod = tensorrt.partition_for_tensorrt(mod, params=params, target=trt_target) + targets = [target, trt_target] with tvm.transform.PassContext(opt_level=3, disabled_pass=["FoldScaleAxis"]): - vm_trt_exec = relay.vm.compile(mod, target=target, params=params) + vm_trt_exec = relay.vm.compile(mod, target=targets, params=params) return vm_trt_exec @@ -1381,7 +1384,7 @@ def test_empty_subgraph(run_module): var1 = relay.var("tensorrt_0_i0", shape=(x_shape), dtype="float32") f1 = GlobalVar("tensorrt_0") func = relay.Function([var1], var1) - func = set_func_attr(func, "tensorrt", "tvmgen_default_tensorrt_0") + func = set_outer_func_attr(func, "tensorrt", "tvmgen_default_tensorrt_0") mod[f1] = func mod = relay.transform.InferType()(mod) @@ -1402,4 +1405,5 @@ def test_empty_subgraph(run_module): if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) tvm.testing.main() diff --git a/tests/python/contrib/test_tensorrt_int8_exp.py b/tests/python/contrib/test_tensorrt_int8_exp.py index 84360e92d33b9..304d9a095e84a 100644 --- a/tests/python/contrib/test_tensorrt_int8_exp.py +++ b/tests/python/contrib/test_tensorrt_int8_exp.py @@ -18,8 +18,14 @@ import os import numpy as np +try: + # See issue #9362. 
+ import torch +except: + pass + import tvm -import tvm.relay.testing +import tvm.testing from tvm import relay from tvm.contrib.download import download_testdata from tvm.relay.op.contrib.tensorrt import partition_for_tensorrt @@ -31,9 +37,10 @@ def skip_codegen_test(): if not tvm.runtime.enabled("cuda") or not tvm.cuda(0).exist: print("Skip because CUDA is not enabled.") return True - if not tvm.get_global_func("relay.ext.tensorrt", True): - print("Skip because TensorRT codegen is not available.") + if not tensorrt.is_tensorrt_compiler_enabled(): + print("Skip because TensorRT compiler is not available.") return True + print("TensorRT compiler is available!") return False @@ -44,6 +51,7 @@ def skip_runtime_test(): if not tensorrt.is_tensorrt_runtime_enabled(): print("Skip because TensorRT runtime is not available.") return True + print("TensorRT runtime is available!") return False @@ -102,12 +110,11 @@ def test_trt_int8(): # compile the model target = "cuda" - dev = tvm.cuda(1) - mod, config = partition_for_tensorrt(mod, params) - with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): + dev = tvm.cuda() + mod = partition_for_tensorrt(mod, params) + with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target=target, params=params) - dtype = "float32" gen_module = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) num_cali_int8 = int(os.environ["TENSORRT_NUM_CALI_INT8"]) @@ -146,4 +153,4 @@ def test_trt_int8(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() From 286a51921d9f7ab376c09a6c2d0882768da14767 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Fri, 1 Jul 2022 17:18:10 -0500 Subject: [PATCH 042/111] [LLVM] Remove use of deprecated PointerType::getPointerElementType() (#11984) With LLVM switching to opaque (typeless) pointer types, some functions related to handling typed pointers are being deprecated (and will be removed). The DWARF debug information does express pointee type. When constructing this information from opaque pointers in LLVM IR, the pointee type needs to be obtained from somewhere else (not the pointer). Change the debug info generation to use the original PrimFunc to obtain the necessary type information. This will work with older versions of LLVM as well. --- src/target/llvm/codegen_cpu.cc | 87 ++++++++++++++++++---------------- src/target/llvm/codegen_cpu.h | 5 +- 2 files changed, 48 insertions(+), 44 deletions(-) diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc index bf0fe1502b9b9..e8647545e5f86 100644 --- a/src/target/llvm/codegen_cpu.cc +++ b/src/target/llvm/codegen_cpu.cc @@ -191,64 +191,68 @@ void CodeGenCPU::AddFunction(const PrimFunc& f) { export_system_symbols_.emplace_back( std::make_pair(global_symbol.value().operator std::string(), function_)); } - AddDebugInformation(function_); + AddDebugInformation(f, function_); } // Following Glow |DebugInfo::generateFunctionDebugInfo|, https://git.io/fjadv -void CodeGenCPU::AddDebugInformation(llvm::Function* function) { +void CodeGenCPU::AddDebugInformation(PrimFunc f_tir, llvm::Function* f_llvm) { #if TVM_LLVM_VERSION >= 50 && TVM_LLVM_VERSION < 70 - ICHECK(!function->getSubprogram()); + ICHECK(!f_llvm->getSubprogram()); llvm::SmallVector paramTys; - llvm::DIType* returnTy = - getDebugType(builder_.get(), dbg_info_->di_builder_.get(), function->getReturnType()); + // Functions in TIR can only return void or an int. 
+ ICHECK(f_llvm->getReturnType() == t_void_ || f_llvm->getReturnType() == t_int_) + << "Unexpected return type"; + auto ret_type_tir = f_llvm->getReturnType() == t_int_ ? DataType::Int(32) : DataType::Void(); + llvm::DIType* returnTy = GetDebugType(ret_type_tir, f_llvm->getReturnType()); paramTys.push_back(returnTy); - for (size_t i = 0; i < function->arg_size(); ++i) { - paramTys.push_back(getDebugType(builder_.get(), dbg_info_->di_builder_.get(), - function->getFunctionType()->getParamType(i))); + for (size_t i = 0; i < f_llvm->arg_size(); ++i) { + paramTys.push_back( + GetDebugType(GetType(f_tir->args[i]), f_llvm->getFunctionType()->getParamType(i))); } auto* DIFunctionTy = dbg_info_->di_builder_->createSubroutineType( dbg_info_->di_builder_->getOrCreateTypeArray(paramTys)); #if TVM_LLVM_VERSION >= 80 auto* DIFunction = dbg_info_->di_builder_->createFunction( - dbg_info_->file_, function->getName(), "", dbg_info_->file_, 0 /* line number */, - DIFunctionTy, false /* internal linkage */); + /*Scope=*/dbg_info_->file_, /*Name=*/f_llvm->getName(), /*LinkageName=*/"", + /*File=*/dbg_info_->file_, /*LineNo=*/0, /*Ty=*/DIFunctionTy, + /*ScopeLine=*/0); #else auto* DIFunction = dbg_info_->di_builder_->createFunction( - dbg_info_->file_, function->getName(), "", dbg_info_->file_, 0 /* line number */, - DIFunctionTy, false, /* internal linkage */ - true, 0 /* line number */, llvm::DINode::FlagPrototyped, true /* isOptimized */); + /*Scope=*/dbg_info_->file_, /*Name=*/f_llvm->getName(), /*LinkageName=*/"", + /*File=*/dbg_info_->file_, /*LineNo=*/0, /*Ty=*/DIFunctionTy, + /*isLocalToUnit=*/false, /*isDefinition=*/true, /*ScopeLine=*/0, + /*Flags=*/llvm::DINode::FlagPrototyped, /*isOptimized=*/true); #endif ICHECK(DIFunction); - function->setSubprogram(DIFunction); - ICHECK_EQ(function->getSubprogram(), DIFunction); + f_llvm->setSubprogram(DIFunction); + ICHECK_EQ(f_llvm->getSubprogram(), DIFunction); - IRBuilder builder(&function->getEntryBlock()); - if (!function->getEntryBlock().empty()) { - builder.SetInsertPoint(&function->getEntryBlock().front()); + IRBuilder builder(&f_llvm->getEntryBlock()); + if (!f_llvm->getEntryBlock().empty()) { + builder.SetInsertPoint(&f_llvm->getEntryBlock().front()); } llvm::DebugLoc DL; builder.SetCurrentDebugLocation(DL); - for (size_t i = 0; i < function->arg_size(); ++i) { - auto* paramAlloca = builder.CreateAlloca(function->getFunctionType()->getParamType(i)); + for (size_t i = 0; i < f_llvm->arg_size(); ++i) { + auto* paramAlloca = builder.CreateAlloca(f_llvm->getFunctionType()->getParamType(i)); std::string paramName = "arg" + std::to_string(i + 1); auto param = dbg_info_->di_builder_->createParameterVariable( DIFunction, paramName, i + 1, dbg_info_->file_, 0, - getDebugType(builder_.get(), dbg_info_->di_builder_.get(), - function->getFunctionType()->getParamType(i)), - /* alwaysPreserve */ true); - auto* store = builder.CreateStore(function->arg_begin() + i, paramAlloca); + GetDebugType(GetType(f_tir->args[i]), f_llvm->getFunctionType()->getParamType(i)), + /*alwaysPreserve=*/true); + auto* store = builder.CreateStore(f_llvm->arg_begin() + i, paramAlloca); dbg_info_->di_builder_->insertDeclare(paramAlloca, param, dbg_info_->di_builder_->createExpression(), llvm::DebugLoc::get(0, 0, DIFunction), store); } - dbg_info_->di_builder_->finalizeSubprogram(function->getSubprogram()); - auto* scope = function->getSubprogram(); + dbg_info_->di_builder_->finalizeSubprogram(f_llvm->getSubprogram()); + auto* scope = f_llvm->getSubprogram(); if (!scope) { return; } - for 
(auto& BB : *function) { + for (auto& BB : *f_llvm) { for (auto& I : BB) { if (I.getDebugLoc()) { continue; @@ -259,24 +263,25 @@ void CodeGenCPU::AddDebugInformation(llvm::Function* function) { #endif } -llvm::DIType* CodeGenCPU::getDebugType(IRBuilder* builder, llvm::DIBuilder* di_builder, - llvm::Type* ty) { - if (ty == builder->getVoidTy()) { +llvm::DIType* CodeGenCPU::GetDebugType(const Type& ty_tir, llvm::Type* ty_llvm) { + if (ty_llvm == t_void_) { return nullptr; - } else if (ty == builder->getFloatTy()) { - return di_builder->createBasicType("float", 32, llvm::dwarf::DW_ATE_float); - } else if (ty == builder->getInt8Ty()) { - return di_builder->createBasicType("int8", 8, llvm::dwarf::DW_ATE_signed); - } else if (ty == builder->getInt32Ty()) { - return di_builder->createBasicType("int32", 32, llvm::dwarf::DW_ATE_signed); - } else if (ty->isPointerTy()) { - return di_builder->createPointerType( - getDebugType(builder, di_builder, ty->getPointerElementType()), - ty->getPrimitiveSizeInBits()); + } else if (ty_llvm == llvm::Type::getFloatTy(*ctx_)) { + return dbg_info_->di_builder_->createBasicType("float", 32, llvm::dwarf::DW_ATE_float); + } else if (ty_llvm == t_int8_) { + return dbg_info_->di_builder_->createBasicType("int8", 8, llvm::dwarf::DW_ATE_signed); + } else if (ty_llvm == t_int32_) { + return dbg_info_->di_builder_->createBasicType("int32", 32, llvm::dwarf::DW_ATE_signed); + } else if (ty_llvm->isPointerTy()) { + auto* ptr_type = ty_tir.as(); + ICHECK(ptr_type != nullptr) << "Got LLVM pointer type from non-pointer IR type: " << ty_tir; + Type elem_type = ptr_type->element_type; + return dbg_info_->di_builder_->createPointerType( + GetDebugType(elem_type, GetLLVMType(elem_type)), ty_llvm->getPrimitiveSizeInBits()); } else { std::string type_str; llvm::raw_string_ostream rso(type_str); - ty->print(rso); + ty_llvm->print(rso); LOG(FATAL) << "Unknown LLVM type:" << rso.str(); } return nullptr; diff --git a/src/target/llvm/codegen_cpu.h b/src/target/llvm/codegen_cpu.h index e2c23f20117d2..eec38b122a0b8 100644 --- a/src/target/llvm/codegen_cpu.h +++ b/src/target/llvm/codegen_cpu.h @@ -190,10 +190,9 @@ class CodeGenCPU : public CodeGenLLVM { // Get the DWARF type corresponding to the LLVM type |ty|. The current API in practice only // generates |int32|, and |int8*|. - static llvm::DIType* getDebugType(IRBuilder* builder, llvm::DIBuilder* di_builder, - llvm::Type* ty); + llvm::DIType* GetDebugType(const Type& ty_tir, llvm::Type* ty_llvm); // Adds the DWARF debug information for |function| to |dbg_info_|. 
- void AddDebugInformation(llvm::Function* function); + void AddDebugInformation(PrimFunc f_tir, llvm::Function* f_llvm); }; } // namespace codegen From 1787cca3f90237dff001fba01ffdbaf9a510f886 Mon Sep 17 00:00:00 2001 From: Yuanjing Shi Date: Fri, 1 Jul 2022 15:24:52 -0700 Subject: [PATCH 043/111] [Relay] [PyTorch] Add aten::tril and aten::triu (#11890) * add trilu * update triu and tril; fix empty * fix lint --- python/tvm/relay/frontend/pytorch.py | 27 +++++++ tests/python/frontend/pytorch/test_forward.py | 81 +++++++++++++++++-- 2 files changed, 101 insertions(+), 7 deletions(-) diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index 123b0299839e0..cb5392fa16abe 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -319,6 +319,31 @@ def square(self, inputs, input_types): (dtype,) = input_types return _op.power(inputs[0], _expr.const(2, dtype)) + def tril(self, inputs, input_types): + data = inputs[0] + if len(inputs) == 2: + k_value = inputs[1] + else: + k_value = 0 + input_shape = self.infer_shape(data) + k1, k2 = input_shape[-2:] + k1 = k_value + 1 + diag_input = _op.zeros(input_shape, dtype=input_types[0]) + return _op.matrix_set_diag(data, diag_input, k=(k1, k2)) + + def triu(self, inputs, input_types): + data = inputs[0] + if len(inputs) == 2: + k_value = inputs[1] + else: + k_value = 0 + input_shape = self.infer_shape(data) + k1, k2 = input_shape[-2:] + k1 = (k1 * -1) - 1 + k2 = k_value - 1 + diag_input = _op.zeros(input_shape, dtype=input_types[0]) + return _op.matrix_set_diag(data, diag_input, k=(k1, k2)) + def arange(self, inputs, input_types): def _get_value(val, dtype): # dtype is a tvm dtype @@ -3328,6 +3353,8 @@ def create_convert_map(self): "aten::sqrt": self.make_unary("sqrt"), "aten::rsqrt": self.make_unary("rsqrt"), "aten::square": self.square, + "aten::tril": self.tril, + "aten::triu": self.triu, "aten::ceil": self.make_unary("ceil"), "aten::floor": self.make_unary("floor"), "aten::round": self.make_unary("round"), diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index 4f42c183b66a8..80a5cd07f7b61 100644 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -199,12 +199,21 @@ def visit(op): torch.cuda.empty_cache() -def verify_model_with_input(test_func, input_data, input_dict={}): +def verify_model_with_input( + test_func, + input_data, + *, + input_dict={}, + custom_convert_map={}, + rtol=1e-5, + atol=1e-5, + assert_shape_only=False, +): baseline_outputs = test_func(*input_data) trace = torch.jit.trace(test_func, [input.clone() for input in input_data]) input_names = ["input{}".format(idx) for idx, inp in enumerate(input_data)] input_shapes = list(zip(input_names, [inp.shape for inp in input_data])) - mod, params = relay.frontend.from_pytorch(trace, input_shapes, {}) + mod, params = relay.frontend.from_pytorch(trace, input_shapes, custom_convert_map) with tvm.transform.PassContext(opt_level=3): for target in ["llvm", "cuda"]: if not tvm.runtime.enabled(target): @@ -218,7 +227,8 @@ def verify_model_with_input(test_func, input_data, input_dict={}): compiled_output = relay_model.get_output(0).numpy() assert_shapes_match(baseline_outputs, compiled_output) - tvm.testing.assert_allclose(baseline_outputs, compiled_output, rtol=1e-5, atol=1e-5) + if assert_shape_only == False: + tvm.testing.assert_allclose(baseline_outputs, compiled_output, rtol=rtol, atol=atol) # Single operator tests 
@@ -1304,7 +1314,7 @@ def test_func(input_tensor, other_tensor): input_data = [torch.rand([2, 1, 10, 1, 10]), torch.rand([2, 1, 10, 10])] - verify_model_with_input(test_func, input_data, {"input0": input_data[0]}) + verify_model_with_input(test_func, input_data, input_dict={"input0": input_data[0]}) @tvm.testing.uses_gpu @@ -3423,6 +3433,64 @@ def forward(self, *args): verify_model(Neg1().float().eval(), input_data=input_data) +@tvm.testing.uses_gpu +def test_forward_tril(): + torch.set_grad_enabled(False) + + def test_func(input_data): + return torch.tril(input_data) + + input_data = torch.rand([3, 3]).float() + verify_model(test_func, input_data=input_data) + input_data = torch.rand([1, 3, 10, 10]).float() + verify_model(test_func, input_data=input_data) + + def test_func1(input_data): + return torch.tril(input_data, 1) + + input_data = torch.rand([3, 3]).float() + verify_model(test_func1, input_data=input_data) + input_data = torch.rand([1, 3, 10, 10]).float() + verify_model(test_func1, input_data=input_data) + + def test_func2(input_data): + return torch.tril(input_data, -1) + + input_data = torch.rand([3, 3]).float() + verify_model(test_func2, input_data=input_data) + input_data = torch.rand([1, 3, 10, 10]).float() + verify_model(test_func2, input_data=input_data) + + +@tvm.testing.uses_gpu +def test_forward_triu(): + torch.set_grad_enabled(False) + + def test_func(input_data): + return torch.triu(input_data) + + input_data = torch.rand([3, 3]).float() + verify_model(test_func, input_data=input_data) + input_data = torch.rand([1, 3, 10, 10]).float() + verify_model(test_func, input_data=input_data) + + def test_func1(input_data): + return torch.triu(input_data, 1) + + input_data = torch.rand([3, 3]).float() + verify_model(test_func1, input_data=input_data) + input_data = torch.rand([1, 3, 10, 10]).float() + verify_model(test_func1, input_data=input_data) + + def test_func2(input_data): + return torch.triu(input_data, -1) + + input_data = torch.rand([3, 3]).float() + verify_model(test_func2, input_data=input_data) + input_data = torch.rand([1, 3, 10, 10]).float() + verify_model(test_func2, input_data=input_data) + + @tvm.testing.uses_gpu def test_forward_where(): torch.set_grad_enabled(False) @@ -3817,15 +3885,14 @@ def test_empty(): def test_func(): return torch.empty([1, 3, 10, 10]) - verify_model_with_input(test_func, []) + verify_model_with_input(test_func, [], assert_shape_only=True) -@pytest.mark.skip(reason="See https://github.com/apache/tvm/issues/11967") def test_empty_like(): def test_func(data): return torch.empty_like(data) - verify_model_with_input(test_func, [torch.rand([1, 3, 10, 10]).float()]) + verify_model_with_input(test_func, [torch.rand([1, 3, 10, 10]).float()], assert_shape_only=True) def test_forward_pretrained_bert_base_uncased(): From ce8f6d176decf5c5d1c4dba21d4b371529cb7f9f Mon Sep 17 00:00:00 2001 From: wrongtest Date: Sat, 2 Jul 2022 13:00:35 +0800 Subject: [PATCH 044/111] add missing narrow down of index within conditions (#11942) --- src/tir/transforms/narrow_datatype.cc | 58 +++++++++++++++---- .../test_tir_transform_narrow_datatype.py | 29 ++++++++++ 2 files changed, 76 insertions(+), 11 deletions(-) diff --git a/src/tir/transforms/narrow_datatype.cc b/src/tir/transforms/narrow_datatype.cc index 16ec86d01826f..047295180712a 100644 --- a/src/tir/transforms/narrow_datatype.cc +++ b/src/tir/transforms/narrow_datatype.cc @@ -264,6 +264,17 @@ class DataTypeRewriter : public StmtExprMutator { op->thread_binding, op->annotations); } + Stmt VisitStmt_(const 
IfThenElseNode* op) final { + IfThenElse updated = Downcast(StmtExprMutator::VisitStmt_(op)); + is_condition_ = true; + PrimExpr cond = VisitExpr(op->condition); + is_condition_ = false; + if (!cond.same_as(op->condition)) { + return std::move(IfThenElse(cond, updated->then_case, updated->else_case)); + } + return std::move(updated); + } + Stmt VisitStmt_(const AttrStmtNode* op) final { if (op->attr_key == attr::thread_extent || op->attr_key == attr::virtual_thread) { Stmt s = StmtExprMutator::VisitStmt_(op); @@ -393,8 +404,10 @@ class DataTypeRewriter : public StmtExprMutator { // a map from IterVar before rewrite to that after rewrite, // ensures one old IterVar maps to exactly one new IterVar std::unordered_map ivmap_; - // indicator of LoadNode::index and StoreNode::index + // indicator of index expr to rewrite bool is_index_{false}; + // indicator of condition + bool is_condition_{false}; // cached ops const Op& builtin_pow_ = Op::Get("tir.pow"); }; @@ -410,6 +423,23 @@ class DataTypeRewriter : public StmtExprMutator { } \ } +#define DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(OP, FUNC) \ + PrimExpr DataTypeRewriter::VisitExpr_(const OP* op) { \ + bool is_index = is_index_; \ + bool rewrite = is_condition_ && op->a->dtype.is_int() && op->b->dtype.is_int(); \ + if (rewrite) { \ + is_index_ = true; \ + } \ + PrimExpr a = this->VisitExpr(op->a); \ + PrimExpr b = this->VisitExpr(op->b); \ + is_index_ = is_index; \ + if (a.same_as(op->a) && b.same_as(op->b)) { \ + return GetRef(op); \ + } else { \ + return FUNC(a, b); \ + } \ + } + DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(AddNode, operator+); DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(SubNode, operator-); DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(MulNode, operator*); @@ -419,22 +449,28 @@ DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(FloorDivNode, floordiv); DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(FloorModNode, floormod); DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(MinNode, min); DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(MaxNode, max); -DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(EQNode, operator==); -DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(NENode, operator!=); -DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(LENode, operator<=); -DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(LTNode, operator<); // NOLINT(*) -DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(GTNode, operator>); // NOLINT(*) -DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(GENode, operator>=); +DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(EQNode, operator==); +DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(NENode, operator!=); +DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(LENode, operator<=); +DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(LTNode, operator<); // NOLINT(*) +DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(GTNode, operator>); // NOLINT(*) +DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(GENode, operator>=); PrimExpr DataTypeRewriter::VisitExpr_(const CallNode* op) { + // handle if_then_else condition + if (op->op.same_as(builtin::if_then_else())) { + bool is_condition = is_condition_; + is_condition_ = true; + PrimExpr cond = VisitExpr(op->args[0]); + is_condition_ = is_condition; + return if_then_else(cond, VisitExpr(op->args[1]), VisitExpr(op->args[2])); + } + PrimExpr e = StmtExprMutator::VisitExpr_(op); op = e.as(); ICHECK(op != nullptr) << "Expected type to be CallNode" << ", but get " << e->GetTypeKey(); - - if (op->op.same_as(builtin::if_then_else())) { - return if_then_else(op->args[0], op->args[1], op->args[2]); - } else if (op->op.same_as(builtin::shift_right())) { + if (op->op.same_as(builtin::shift_right())) { return op->args[0] >> 
op->args[1]; } else if (op->op.same_as(builtin::shift_left())) { return op->args[0] << op->args[1]; diff --git a/tests/python/unittest/test_tir_transform_narrow_datatype.py b/tests/python/unittest/test_tir_transform_narrow_datatype.py index 5c69ddc412d90..d66b4ef5dd5b3 100644 --- a/tests/python/unittest/test_tir_transform_narrow_datatype.py +++ b/tests/python/unittest/test_tir_transform_narrow_datatype.py @@ -305,6 +305,34 @@ def test_ramp_dtype_consistency(): lower_sch(s, [A], 32, extra_passes=[tvm.tir.transform.VectorizeLoop()]) +def test_condition(): + @T.prim_func + def before(A: T.Buffer[(128,), "float32"], B: T.Buffer[(130,), "float32"]): + for i, j in T.grid(T.int64(2), T.int64(65)): + if i * T.int64(65) + j >= T.int64(0) and i * T.int64(65) + j < T.int64(128): + A[i * T.int64(65) + j] = 0.0 + for i, j in T.grid(T.int64(2), T.int64(65)): + B[i * T.int64(65) + j] = T.if_then_else( + i * T.int64(65) + j >= T.int64(0) and i * T.int64(65) + j < T.int64(128), + A[i * T.int64(65) + j], + 0.0, + dtype="float32", + ) + + @T.prim_func + def expected_after(A: T.Buffer[128, "float32"], B: T.Buffer[130, "float32"]): + for i, j in T.grid(2, 65): + if i * 65 + j >= 0 and i * 65 + j < 128: + A[i * 65 + j] = T.float32(0) + for i, j in T.grid(2, 65): + B[i * 65 + j] = T.if_then_else( + i * 65 + j >= 0 and i * 65 + j < 128, A[i * 65 + j], T.float32(0), dtype="float32" + ) + + after = tvm.tir.transform.NarrowDataType(32)(tvm.IRModule.from_expr(before))["main"] + tvm.ir.assert_structural_equal(after, expected_after) + + if __name__ == "__main__": test_basic() test_thread_axis() @@ -315,3 +343,4 @@ def test_ramp_dtype_consistency(): test_relay_basic() test_relay_take() test_ramp_dtype_consistency() + test_condition() From 0e971869575df7e5b12381e4566a1a8fd98a4a77 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Sat, 2 Jul 2022 04:16:25 -0700 Subject: [PATCH 045/111] [MetaSchedule] Enhance AutoInline for Spatial Task (#11996) Previously, Auto-Inline on CPU would only inline according to strict conditions, for example, ordered index mapping. This is generally good practice, but on the other hand, there is not much benefit in stopping inlining only due to some restrictive conditions for pure spatial subgraphs. This way, we also save some search trials on pure spatial subgraphs so that more can be allocated to more important ones. --- .../schedule_rule/auto_inline.cc | 16 +++- ...meta_schedule_schedule_rule_auto_inline.py | 93 +++++++++++++++++++ 2 files changed, 106 insertions(+), 3 deletions(-) diff --git a/src/meta_schedule/schedule_rule/auto_inline.cc b/src/meta_schedule/schedule_rule/auto_inline.cc index 0cfe35298dd69..309f0a60aca02 100644 --- a/src/meta_schedule/schedule_rule/auto_inline.cc +++ b/src/meta_schedule/schedule_rule/auto_inline.cc @@ -31,6 +31,15 @@ enum class InlineType : int32_t { kInlineIntoProducer = 2, }; +bool IsInSpatialPrimFunc(const tir::Schedule& sch, const tir::StmtSRef& block_sref) { + using namespace tvm::tir; + const StmtSRefNode* sref = block_sref.get(); + for (; sref->parent != nullptr; sref = sref->parent) { + } + ICHECK(sref->stmt != nullptr && sref->stmt->IsInstance()); + return IsSpatialPrimFunc(GetRef(GetRootPrimFunc(sch->mod(), sref->stmt, nullptr))); +} + /*! \brief The rule that inlines spatial blocks if it satisfies some conditions. 
*/ class AutoInlineNode : public ScheduleRuleNode { public: @@ -85,6 +94,7 @@ inline InlineType AutoInlineNode::CheckInline(const tir::Schedule& sch, const tir::BlockRV& block_rv) { using namespace tvm::tir; StmtSRef block_sref = sch->GetSRef(block_rv); + bool is_pure_spatial = IsInSpatialPrimFunc(sch, block_sref); ScheduleState state = sch->state(); const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); BlockRealize realize = GetBlockRealize(state, block_sref); @@ -97,15 +107,15 @@ inline InlineType AutoInlineNode::CheckInline(const tir::Schedule& sch, return InlineType::kInlineIntoConsumer; } // Cond 3. The block doesn't contain any disallowed operators - if (!disallow_op.empty() && HasOp(realize, disallow_op)) { + if (!is_pure_spatial && !disallow_op.empty() && HasOp(realize, disallow_op)) { return InlineType::kNoInline; } // Cond 4. The block doesn't have any if-then-else-like constructs - if (disallow_if_then_else && HasIfThenElse(realize)) { + if (!is_pure_spatial && disallow_if_then_else && HasIfThenElse(realize)) { return InlineType::kNoInline; } // Cond 5. The mapping from read indices to write indices are injective and ordered - if (require_injective || require_ordered) { + if (!is_pure_spatial && (require_injective || require_ordered)) { const BufferRegion& write_region = block->writes[0]; for (const BufferRegion& read_region : block->reads) { bool injective, ordered; diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py index 2a8a1e5fe12aa..a8ffa6ff9d3fa 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py @@ -240,6 +240,86 @@ def main(A: T.Buffer[(256, 256), "float32"], T_softmax_norm: T.Buffer[(256, 256) T_softmax_norm[i0_4, i1_1] = T.exp(A[i0_4, i1_1] - T_softmax_maxelem[i0_4], dtype="float32") / T_softmax_expsum[i0_4] +@tvm.script.ir_module +class BeforePureSpatial: + @T.prim_func + def main( + placeholder: T.Buffer[(1, 384), "int64"], + placeholder_1: T.Buffer[(30522, 768), "float32"], + placeholder_2: T.Buffer[(1, 384, 768), "float32"], + T_add: T.Buffer[(1, 384, 768), "float32"], + ) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + compile_engine_const = T.alloc_buffer([], dtype="int64") + T_less = T.alloc_buffer([1, 384], dtype="bool") + compile_engine_const_1 = T.alloc_buffer([], dtype="int64") + T_add_1 = T.alloc_buffer([1, 384], dtype="int64") + T_where = T.alloc_buffer([1, 384], dtype="int64") + T_take = T.alloc_buffer([1, 384, 768], dtype="float32") + with T.block("compile_engine_const"): + vi = T.axis.spatial(1, 0) + T.reads() + T.writes(compile_engine_const[()]) + compile_engine_const[()] = T.int64(0) + for i0, i1 in T.grid(1, 384): + with T.block("T_less"): + ax0, ax1 = T.axis.remap("SS", [i0, i1]) + T.reads(placeholder[ax0, ax1], compile_engine_const[()]) + T.writes(T_less[ax0, ax1]) + T_less[ax0, ax1] = placeholder[ax0, ax1] < compile_engine_const[()] + with T.block("compile_engine_const_1"): + vi = T.axis.spatial(1, 0) + T.reads() + T.writes(compile_engine_const_1[()]) + compile_engine_const_1[()] = T.int64(30522) + for i0, i1 in T.grid(1, 384): + with T.block("T_add"): + ax0, ax1 = T.axis.remap("SS", [i0, i1]) + T.reads(placeholder[ax0, ax1], compile_engine_const_1[()]) + T.writes(T_add_1[ax0, ax1]) + T_add_1[ax0, ax1] = placeholder[ax0, ax1] + compile_engine_const_1[()] + for i0, i1 in T.grid(1, 384): + with 
T.block("T_where"): + ax0, ax1 = T.axis.remap("SS", [i0, i1]) + T.reads(T_less[ax0, ax1], T_add_1[ax0, ax1], placeholder[ax0, ax1]) + T.writes(T_where[ax0, ax1]) + T_where[ax0, ax1] = T.Select( + T.cast(T_less[ax0, ax1], "int32") != 0, T_add_1[ax0, ax1], placeholder[ax0, ax1] + ) + for i0, i1, i2 in T.grid(1, 384, 768): + with T.block("T_take"): + ax0, ax1, ax2 = T.axis.remap("SSS", [i0, i1, i2]) + T.reads( + placeholder_1[T.min(T.max(T.int64(0), T_where[ax0, ax1]), T.int64(30521)), ax2], + T_where[ax0, ax1], + ) + T.writes(T_take[ax0, ax1, ax2]) + T_take[ax0, ax1, ax2] = placeholder_1[ + T.min(T.max(T.int64(0), T_where[ax0, ax1]), T.int64(30521)), ax2 + ] + for i0, i1, i2 in T.grid(1, 384, 768): + with T.block("T_add_1"): + ax0, ax1, ax2 = T.axis.remap("SSS", [i0, i1, i2]) + T.reads(T_take[ax0, ax1, ax2], placeholder_2[ax0, ax1, ax2]) + T.writes(T_add[ax0, ax1, ax2]) + T_add[ax0, ax1, ax2] = T_take[ax0, ax1, ax2] + placeholder_2[ax0, ax1, ax2] + + +@tvm.script.ir_module +class AfterPureSpatial: + @T.prim_func + def main(placeholder: T.Buffer[(1, 384), "int64"], placeholder_1: T.Buffer[(30522, 768), "float32"], placeholder_2: T.Buffer[(1, 384, 768), "float32"], T_add: T.Buffer[(1, 384, 768), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + for i0, i1, i2 in T.grid(1, 384, 768): + with T.block("T_add_1"): + ax0, ax1, ax2 = T.axis.remap("SSS", [i0, i1, i2]) + T.reads(placeholder[ax0, ax1], placeholder_1[T.min(T.max(T.int64(0), placeholder[ax0, ax1]), T.int64(30521)) : T.min(T.max(T.int64(0), placeholder[ax0, ax1] + T.int64(30522)), T.int64(30521)) + T.int64(1), ax2], placeholder_2[ax0, ax1, ax2]) + T.writes(T_add[ax0, ax1, ax2]) + T_add[ax0, ax1, ax2] = placeholder_1[T.min(T.max(T.int64(0), T.Select(T.cast(placeholder[ax0, ax1] < T.int64(0), "int32") != 0, placeholder[ax0, ax1] + T.int64(30522), placeholder[ax0, ax1])), T.int64(30521)), ax2] + placeholder_2[ax0, ax1, ax2] + # pylint: enable=no-member,invalid-name,unused-variable,no-self-argument,line-too-long,chained-comparison,not-callable,too-many-nested-blocks # fmt: on @@ -291,7 +371,20 @@ def test_inline_into_multiple_consumers(): tvm.ir.assert_structural_equal(lhs=space.mod, rhs=SoftmaxAfterInline) +def test_inline_pure_spatial(): + mod = BeforePureSpatial + target = Target("llvm") + ctx = _create_context( + mod=mod, + target=target, + rule=auto_inline(target=target), + ) + (space,) = ctx.space_generator.generate_design_space(mod=mod) + tvm.ir.assert_structural_equal(lhs=space.mod, rhs=AfterPureSpatial) + + if __name__ == "__main__": test_inline_consumer_chain() test_inline_into_cache() test_inline_into_multiple_consumers() + test_inline_pure_spatial() From 6642c6e8b05534d3c08f8fa2969c31d553aa8476 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Sat, 2 Jul 2022 16:01:58 -0700 Subject: [PATCH 046/111] [COMMUNITY] Hongyi Jin -> Reviewer (#11998) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 95e006513db99..e3b3082040393 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -115,6 +115,7 @@ We do encourage everyone to work anything they are interested in. 
- [Chenfan Jia](https://github.com/jcf94): @jcf94 - [Hua Jiang](https://github.com/huajsj): @huajsj - [Ziheng Jiang](https://github.com/ZihengJiang): @ZihengJiang +- [Hongyi Jin](https://github.com/jinhongyii): @jinhongyii - [Manupa Karunaratne](https://github.com/manupa-arm): @manupa-arm - [Elen Kalda](https://github.com/ekalda): @ekalda - [Marisa Kirisame](https://github.com/MarisaKirisame): @MarisaKirisame From f8186d8c7d3e4679a6dfd83d17521f20bfb3ca42 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Sun, 3 Jul 2022 13:16:18 -0700 Subject: [PATCH 047/111] [TIR] Add sugar method `Schedule.work_on` (#11999) This PR introduces `Schedule.work_on`, which instructs `Schedule.get_block` to find the correct PrimFunc to retrieve from without having to specify `func_name` every time if the PrimFunc's name is not `main`. --- include/tvm/tir/schedule/schedule.h | 24 ++++++++++- python/tvm/tir/schedule/schedule.py | 25 ++++++++++- src/meta_schedule/arg_info.cc | 41 ++++++++++++++++++ src/meta_schedule/mutator/mutate_parallel.cc | 3 +- src/meta_schedule/utils.h | 42 ------------------- src/tir/schedule/analysis.h | 9 ++++ src/tir/schedule/analysis/analysis.cc | 41 ++++++++++++++++++ src/tir/schedule/concrete_schedule.cc | 25 ++++++++++- src/tir/schedule/concrete_schedule.h | 8 +++- src/tir/schedule/primitive.h | 4 +- src/tir/schedule/primitive/get_block_loop.cc | 4 +- src/tir/schedule/schedule.cc | 2 + src/tir/schedule/traced_schedule.cc | 21 +++++++++- src/tir/schedule/traced_schedule.h | 2 +- .../unittest/test_tir_schedule_utilities.py | 32 +++++++++++++- 15 files changed, 225 insertions(+), 58 deletions(-) diff --git a/include/tvm/tir/schedule/schedule.h b/include/tvm/tir/schedule/schedule.h index d95a9d4e7e5eb..8e160c61328c3 100644 --- a/include/tvm/tir/schedule/schedule.h +++ b/include/tvm/tir/schedule/schedule.h @@ -115,6 +115,21 @@ class ScheduleNode : public runtime::Object { virtual ScheduleState state() const = 0; /*! \return The internally maintained trace of scheduling program execution */ virtual Optional trace() const = 0; + /*! + * \brief Instruct the schedule to work on a function in the IRModule. + * + * By default, the schedule works on the function with the name "main", or the only function in + * the IRModule if there is only one. If there are multiple functions in the IRModule, and none of + * their names are "main", users will have to call this method to explicitly specify which + * function to work on. + * + * This sugar function will guide the `GetBlock` method if its `func_name` is not specified. + * + * \param func_name The name of the function to work on + * + * \sa GetBlock + */ + virtual void WorkOn(const String& func_name) = 0; /*! * \brief Returns a copy of the schedule, including both its state and its symbol table, * guaranteeing that @@ -231,12 +246,19 @@ class ScheduleNode : public runtime::Object { /******** Schedule: Get blocks & loops ********/ /*! * \brief Retrieve a block in a specific function with its name + * + * By default, if `func_name` is not specified, the schedule will search for the block in the + * function that is currently being "worked on". To switch the function to be worked on, use + * `WorkOn` before calling this method. 
+ * * \param name The name of the block to be retrieved * \param func_name The name of the function * \return The block retrieved * \note Indexing error is raised if 0 or multiple blocks exist with the specific name + * + * \sa WorkOn */ - virtual BlockRV GetBlock(const String& name, const String& func_name = "main") = 0; + virtual BlockRV GetBlock(const String& name, const Optional& func_name = NullOpt) = 0; /*! * \brief Get the parent loops of the block in its scope, from outer to inner * \param block_rv The query block diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py index 7a1e244604b7d..28bdf63872d98 100644 --- a/python/tvm/tir/schedule/schedule.py +++ b/python/tvm/tir/schedule/schedule.py @@ -186,6 +186,23 @@ def trace(self) -> Optional[Trace]: """Returns the internally maintained trace of scheduling program execution""" return _ffi_api.ScheduleGetTrace(self) # type: ignore # pylint: disable=no-member + def work_on(self, func_name: str) -> None: + """Instruct the schedule to work on a function in the IRModule. + + By default, the schedule works on the function with the name "main", or the only function in + the IRModule if there is only one. If there are multiple functions in the IRModule, and none + of their names are "main", users will have to call this method to explicitly specify which + function to work on. + + This sugar function will guide the `GetBlock` method if its `func_name` is not specified. + + Parameters + ---------- + func_name : str + The name of the function to work on. + """ + _ffi_api.ScheduleWorkOn(self, func_name) # type: ignore # pylint: disable=no-member + def copy(self) -> "Schedule": """Returns a copy of the schedule, including both the state and the symbol table, * guaranteeing that @@ -403,15 +420,19 @@ def sample_compute_location( def get_block( self, name: str, - func_name: str = "main", + func_name: Optional[str] = None, ) -> BlockRV: """Retrieve a block in a specific function with its name + By default, if `func_name` is not specified, the schedule will search for the block in the + function that is currently being "worked on". To switch the function to be worked on, use + `work_on` before calling this method. + Parameters ---------- name : str The name of the block - func_name : str = "main" + func_name : Optional[str] = None The name of the function Returns diff --git a/src/meta_schedule/arg_info.cc b/src/meta_schedule/arg_info.cc index 672df86deb9d5..21de9d719d00d 100644 --- a/src/meta_schedule/arg_info.cc +++ b/src/meta_schedule/arg_info.cc @@ -21,6 +21,47 @@ namespace tvm { namespace meta_schedule { +/*! + * \brief Find the entry function of the given IRModule, i.e., functions marked by + * `tir::attr::kIsEntryFunc`, whose name is `main`, or is the only PrimFunc. + * \param mod The IRModule to find the entry function. + * \return The entry function. 
+ */ +inline tir::PrimFunc FindEntryFunc(const IRModule& mod) { + // Priority 1: PrimFunc marked as `tir::attr::kIsEntryFunc` + int num_prim_func = 0; + const tir::PrimFuncNode* main_func = nullptr; + const tir::PrimFuncNode* last_func = nullptr; + for (const auto& kv : mod->functions) { + GlobalVar gv = kv.first; + BaseFunc base_func = kv.second; + if (const auto* func = base_func.as()) { + last_func = func; + if (func->HasNonzeroAttr(tir::attr::kIsEntryFunc)) { + return GetRef(func); + } + if (gv->name_hint == "main") { + main_func = func; + } + ++num_prim_func; + } + } + // Priority 2: PrimFunc whose name is `main` + if (main_func != nullptr) { + return GetRef(main_func); + } + // Priority 3: The only PrimFunc in the IRModule + if (num_prim_func == 0) { + LOG(FATAL) << "ValueError: Cannot find any PrimFunc in the given IRModule: " + << tir::AsTVMScript(mod); + } + if (num_prim_func > 1) { + LOG(FATAL) << "ValueError: Multiple PrimFuncs exist in the IRModule, but none of them are " + "annotated with `kIsEntryFunc`, i.e. `tir.is_entry_func`" + << tir::AsTVMScript(mod); + } + return GetRef(last_func); +} /******** ArgInfo ********/ ArgInfo ArgInfo::FromJSON(const ObjectRef& json_obj) { diff --git a/src/meta_schedule/mutator/mutate_parallel.cc b/src/meta_schedule/mutator/mutate_parallel.cc index 7c973879f2cc2..5b7fe7f5148db 100644 --- a/src/meta_schedule/mutator/mutate_parallel.cc +++ b/src/meta_schedule/mutator/mutate_parallel.cc @@ -79,7 +79,8 @@ const BlockRVNode* GetInstGetBlockOutput(const Instruction& inst) { std::vector> AnalyzeParallel(const ScheduleState& self, const String& block_name, const String& func_name, int64_t limit) { - Array block_srefs = tir::GetBlocks(self, block_name, func_name); + Array block_srefs = + tir::GetBlocks(self, block_name, self->mod->GetGlobalVar(func_name)); ICHECK_EQ(block_srefs.size(), 1); const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_srefs[0]); ScopeBlockLoopInfo info = GetScopeBlockLoopInfo(GetRef(block)); diff --git a/src/meta_schedule/utils.h b/src/meta_schedule/utils.h index ca696da71e007..b5cb73c26e001 100644 --- a/src/meta_schedule/utils.h +++ b/src/meta_schedule/utils.h @@ -174,48 +174,6 @@ inline String SHash2Hex(const ObjectRef& obj) { return os.str(); } -/*! - * \brief Find the entry function of the given IRModule, i.e, functions marked by - * `tir::attr::kIsEntryFunc`, whose name is `main` or being the only PrimeFunc. - * \param mod The IRModule to find the entry function. - * \return The entry function. - */ -inline tir::PrimFunc FindEntryFunc(const IRModule& mod) { - // Priority 1: PrimFunc marked as `tir::attr::kIsEntryFunc` - int num_prim_func = 0; - const tir::PrimFuncNode* main_func = nullptr; - const tir::PrimFuncNode* last_func = nullptr; - for (const auto& kv : mod->functions) { - GlobalVar gv = kv.first; - BaseFunc base_func = kv.second; - if (const auto* func = base_func.as()) { - last_func = func; - if (func->HasNonzeroAttr(tir::attr::kIsEntryFunc)) { - return GetRef(func); - } - if (gv->name_hint == "main") { - main_func = func; - } - ++num_prim_func; - } - } - // Priority 2: PrimFunc whose name is `main` - if (main_func != nullptr) { - return GetRef(main_func); - } - // Priority 3: The only PrimFunc in the IRModule - if (num_prim_func == 0) { - LOG(FATAL) << "ValueError: Cannot find any PrimFunc in the given IRModule: " - << tir::AsTVMScript(mod); - } - if (num_prim_func > 1) { - LOG(FATAL) << "ValueError: Multiple PrimFuncs exist in the IRModule, but none of them are " - "annotated with `kIsEntryFunc`, i.e. 
`tir.is_entry_func`" - << tir::AsTVMScript(mod); - } - return GetRef(last_func); -} - /*! * \brief Fork a random state into another, i.e. PRNG splitting. * The given random state is also mutated. diff --git a/src/tir/schedule/analysis.h b/src/tir/schedule/analysis.h index b30cef829f1ea..317b3625f0b66 100644 --- a/src/tir/schedule/analysis.h +++ b/src/tir/schedule/analysis.h @@ -71,6 +71,15 @@ const PrimFuncNode* GetRootPrimFunc(const IRModule& mod, const StmtNode* root_bl */ StmtSRef GetSRefTreeRoot(const StmtSRef& sref); +/*! + * \brief Find the entry function of the given IRModule, i.e, functions marked by + * `tir::attr::kIsEntryFunc`, whose name is `main` or being the only PrimeFunc. + * \param mod The IRModule to find the entry function. + * \param result_g_var The result GlobalVar of the entry function. + * \return The entry function. + */ +const PrimFuncNode* FindEntryFunc(const IRModule& mod, GlobalVar* result_g_var); + /******** Scope ********/ /*! * \brief Checks if scope the specified sref is in is a stage-pipeline and return it diff --git a/src/tir/schedule/analysis/analysis.cc b/src/tir/schedule/analysis/analysis.cc index 3ee1ed28b8571..ac73ac3ce2c19 100644 --- a/src/tir/schedule/analysis/analysis.cc +++ b/src/tir/schedule/analysis/analysis.cc @@ -49,6 +49,47 @@ const PrimFuncNode* GetRootPrimFunc(const IRModule& mod, const StmtNode* root_bl throw; } +const PrimFuncNode* FindEntryFunc(const IRModule& mod, GlobalVar* result_g_var) { + GlobalVar result = NullValue(); + // Priority 1: PrimFunc marked as `tir::attr::kIsEntryFunc` + int num_prim_func = 0; + const tir::PrimFuncNode* main_func = nullptr; + const tir::PrimFuncNode* last_func = nullptr; + for (const auto& kv : mod->functions) { + GlobalVar gv = kv.first; + BaseFunc base_func = kv.second; + if (const auto* func = base_func.as()) { + last_func = func; + if (func->HasNonzeroAttr(tir::attr::kIsEntryFunc)) { + if (result_g_var != nullptr) { + *result_g_var = gv; + } + return func; + } + if (gv->name_hint == "main") { + main_func = func; + result = gv; + } + ++num_prim_func; + } + } + // Priority 2: PrimFunc whose name is `main` + if (main_func != nullptr) { + if (result_g_var != nullptr) { + *result_g_var = result; + } + return main_func; + } + // Priority 3: The only PrimFunc in the IRModule + if (num_prim_func == 1) { + if (result_g_var != nullptr) { + *result_g_var = result; + } + return last_func; + } + return nullptr; +} + /******** Scope ********/ StmtSRef GetScopeRoot(const ScheduleState& self, const StmtSRef& sref, diff --git a/src/tir/schedule/concrete_schedule.cc b/src/tir/schedule/concrete_schedule.cc index b2f48753b5554..c19735025ddc4 100644 --- a/src/tir/schedule/concrete_schedule.cc +++ b/src/tir/schedule/concrete_schedule.cc @@ -31,6 +31,12 @@ Schedule Schedule::Concrete(IRModule mod, support::LinearCongruentialEngine::TRa n->symbol_table_ = {}; n->analyzer_ = std::make_unique(); n->Seed(seed); + GlobalVar gv = NullValue(); + if (FindEntryFunc(mod, &gv) != nullptr) { + n->func_working_on_ = gv; + } else { + n->func_working_on_ = NullOpt; + } return Schedule(std::move(n)); } @@ -177,6 +183,10 @@ class ScheduleCopier { std::unordered_map old2new_; }; +void ConcreteScheduleNode::WorkOn(const String& func_name) { + this->func_working_on_ = this->state_->mod->GetGlobalVar(func_name); +} + void ConcreteScheduleNode::Copy(ScheduleState* new_state, TSymbolTable* new_symbol_table) const { ScheduleCopier::Copy(this, new_state, new_symbol_table); new_state->get()->DebugVerify(); @@ -184,6 +194,7 @@ void 
ConcreteScheduleNode::Copy(ScheduleState* new_state, TSymbolTable* new_symb Schedule ConcreteScheduleNode::Copy() { ObjectPtr n = make_object(); + n->func_working_on_ = this->func_working_on_; n->error_render_level_ = this->error_render_level_; ConcreteScheduleNode::Copy(&n->state_, &n->symbol_table_); n->analyzer_ = std::make_unique(); // new analyzer needed because it is stateful @@ -251,7 +262,7 @@ LoopRV ConcreteScheduleNode::SampleComputeLocation(const BlockRV& block_rv, /******** Schedule: Get blocks & loops ********/ -BlockRV ConcreteScheduleNode::GetBlock(const String& name, const String& func_name) { +BlockRV ConcreteScheduleNode::GetBlock(const String& name, const Optional& func_name) { class NotSingleResult : public ScheduleError { public: explicit NotSingleResult(String name, IRModule mod, const Array& blocks) @@ -286,7 +297,17 @@ BlockRV ConcreteScheduleNode::GetBlock(const String& name, const String& func_na IRModule mod_; Array blocks_; }; - Array blocks = tir::GetBlocks(this->state_, name, func_name); + GlobalVar gv = NullValue(); + if (func_name.defined()) { + gv = state_->mod->GetGlobalVar(func_name.value()); + } else if (func_working_on_.defined()) { + gv = this->func_working_on_.value(); + } else { + LOG(FATAL) << "ValueError: `get_block` does not know which function to be working on. Please " + "specify the function name explicitly, or call `work_on` to specify the function " + "before using `get_block`."; + } + Array blocks = tir::GetBlocks(this->state_, name, gv); if (blocks.size() != 1) { TVM_TIR_SCHEDULE_BEGIN(); throw NotSingleResult(name, this->state_->mod, blocks); diff --git a/src/tir/schedule/concrete_schedule.h b/src/tir/schedule/concrete_schedule.h index dfbacb530a36b..feea310bd7af5 100644 --- a/src/tir/schedule/concrete_schedule.h +++ b/src/tir/schedule/concrete_schedule.h @@ -38,6 +38,8 @@ class ConcreteScheduleNode : public ScheduleNode { protected: /*! \brief The internal state of scheduling */ ScheduleState state_; + /*! \brief The function to be worked on. */ + Optional func_working_on_; /*! \brief The level of error rendering */ ScheduleErrorRenderLevel error_render_level_; /*! 
\brief A symbol table that maps random variables to concrete StmtSRef/Integers */
@@ -50,10 +52,11 @@ class ConcreteScheduleNode : public ScheduleNode {
 public:
  void VisitAttrs(tvm::AttrVisitor* v) {
    // `state_` is not visited
+   // `func_working_on_` is not visited
    // `error_render_level_` is not visited
    // `symbol_table_` is not visited
    // `analyzer_` is not visited
    // `rand_state_` is not visited
  }

  virtual ~ConcreteScheduleNode() = default;

@@ -61,6 +64,7 @@ class ConcreteScheduleNode : public ScheduleNode {
 public:
  ScheduleState state() const final { return state_; }
  Optional<Trace> trace() const override { return NullOpt; }
+  void WorkOn(const String& func_name) final;
  Schedule Copy() override;
  void Seed(support::LinearCongruentialEngine::TRandState seed) final;
  support::LinearCongruentialEngine::TRandState ForkSeed() final;
@@ -89,7 +93,7 @@ class ConcreteScheduleNode : public ScheduleNode {
  LoopRV SampleComputeLocation(const BlockRV& block_rv,
                               Optional<Integer> decision = NullOpt) override;
  /******** Schedule: Get blocks & loops ********/
-  BlockRV GetBlock(const String& name, const String& func_name = "main") override;
+  BlockRV GetBlock(const String& name, const Optional<String>& func_name) override;
  Array<LoopRV> GetLoops(const BlockRV& block_rv) override;
  Array<BlockRV> GetChildBlocks(const BlockRV& block_rv) override;
  Array<BlockRV> GetChildBlocks(const LoopRV& loop_rv) override;

diff --git a/src/tir/schedule/primitive.h b/src/tir/schedule/primitive.h
index 212571df10275..608368fbb31fb 100644
--- a/src/tir/schedule/primitive.h
+++ b/src/tir/schedule/primitive.h
@@ -116,10 +116,10 @@ TVM_DLL tir::StmtSRef SampleComputeLocation(
 * \brief Retrieves blocks in a specific function with its name
 * \param self The schedule state
 * \param name The name of the blocks to be retrieved
- * \param func_name The name of the function
+ * \param gv The function from which to retrieve the blocks
 * \return A list of blocks with the specific name
 */
-Array<StmtSRef> GetBlocks(const ScheduleState& self, const String& name, const String& func_name);
+Array<StmtSRef> GetBlocks(const ScheduleState& self, const String& name, const GlobalVar& gv);

/*!
* \brief Gets the parent loops of the block in its scope, from outer to inner * \param self The schedule state diff --git a/src/tir/schedule/primitive/get_block_loop.cc b/src/tir/schedule/primitive/get_block_loop.cc index a13e525157086..746918ac4e34b 100644 --- a/src/tir/schedule/primitive/get_block_loop.cc +++ b/src/tir/schedule/primitive/get_block_loop.cc @@ -21,7 +21,7 @@ namespace tvm { namespace tir { -Array GetBlocks(const ScheduleState& self, const String& name, const String& func_name) { +Array GetBlocks(const ScheduleState& self, const String& name, const GlobalVar& gv) { struct Finder : public StmtVisitor { explicit Finder(const ScheduleState& self, const String& name) : self_(self), name_(name) {} @@ -39,7 +39,7 @@ Array GetBlocks(const ScheduleState& self, const String& name, const S Array results_; }; - BaseFunc func = self->mod->Lookup(func_name); + BaseFunc func = self->mod->Lookup(gv); const auto* prim_func = TVM_TYPE_AS(prim_func, func, PrimFuncNode); Finder finder(self, name); finder(prim_func->body); diff --git a/src/tir/schedule/schedule.cc b/src/tir/schedule/schedule.cc index 372d94a15025b..e386061ebfbd0 100644 --- a/src/tir/schedule/schedule.cc +++ b/src/tir/schedule/schedule.cc @@ -56,6 +56,8 @@ TVM_REGISTER_GLOBAL("tir.schedule.ScheduleSeed") // .set_body_method(&ScheduleNode::Seed); TVM_REGISTER_GLOBAL("tir.schedule.ScheduleForkSeed") // .set_body_method(&ScheduleNode::ForkSeed); +TVM_REGISTER_GLOBAL("tir.schedule.ScheduleWorkOn") // + .set_body_method(&ScheduleNode::WorkOn); /**************** (FFI) Constructor ****************/ diff --git a/src/tir/schedule/traced_schedule.cc b/src/tir/schedule/traced_schedule.cc index 733b5d872f937..93e4c984a41b7 100644 --- a/src/tir/schedule/traced_schedule.cc +++ b/src/tir/schedule/traced_schedule.cc @@ -30,6 +30,12 @@ Schedule Schedule::Traced(IRModule mod, support::LinearCongruentialEngine::TRand n->analyzer_ = std::make_unique(); n->trace_ = Trace(); n->Seed(seed); + GlobalVar gv = NullValue(); + if (FindEntryFunc(mod, &gv) != nullptr) { + n->func_working_on_ = gv; + } else { + n->func_working_on_ = NullOpt; + } return Schedule(std::move(n)); } @@ -37,6 +43,7 @@ Schedule TracedScheduleNode::Copy() { ObjectPtr n = make_object(); n->error_render_level_ = this->error_render_level_; ConcreteScheduleNode::Copy(&n->state_, &n->symbol_table_); + n->func_working_on_ = this->func_working_on_; n->analyzer_ = std::make_unique(); // new analyzer needed because it is stateful n->rand_state_ = ForkSeed(); n->trace_ = Trace(this->trace_->insts, this->trace_->decisions); @@ -90,13 +97,23 @@ LoopRV TracedScheduleNode::SampleComputeLocation(const BlockRV& block_rv, /******** Schedule: Get blocks & loops ********/ -BlockRV TracedScheduleNode::GetBlock(const String& name, const String& func_name) { +BlockRV TracedScheduleNode::GetBlock(const String& name, const Optional& func_name) { + GlobalVar gv = NullValue(); + if (func_name.defined()) { + gv = state_->mod->GetGlobalVar(func_name.value()); + } else if (func_working_on_.defined()) { + gv = this->func_working_on_.value(); + } else { + LOG(FATAL) << "ValueError: `get_block` does not know which function to be working on. 
Please " + "specify the function name explicitly, or call `work_on` to specify the function " + "before using `get_block`."; + } BlockRV result = ConcreteScheduleNode::GetBlock(name, func_name); static const InstructionKind& kind = InstructionKind::Get("GetBlock"); trace_->Append(/*inst=*/Instruction(/*kind=*/kind, // /*inputs=*/{}, - /*attrs=*/{name, func_name}, + /*attrs=*/{name, gv->name_hint}, /*outputs=*/{result})); return result; } diff --git a/src/tir/schedule/traced_schedule.h b/src/tir/schedule/traced_schedule.h index 178026d9eaf85..f6405d77a195f 100644 --- a/src/tir/schedule/traced_schedule.h +++ b/src/tir/schedule/traced_schedule.h @@ -53,7 +53,7 @@ class TracedScheduleNode : public ConcreteScheduleNode { Optional> decision = NullOpt) final; LoopRV SampleComputeLocation(const BlockRV& block_rv, Optional decision = NullOpt) final; /******** Schedule: Get blocks & loops ********/ - BlockRV GetBlock(const String& name, const String& func_name = "main") final; + BlockRV GetBlock(const String& name, const Optional& func_name) final; Array GetLoops(const BlockRV& block_rv) final; Array GetChildBlocks(const BlockRV& block_rv) final; Array GetChildBlocks(const LoopRV& loop_rv) final; diff --git a/tests/python/unittest/test_tir_schedule_utilities.py b/tests/python/unittest/test_tir_schedule_utilities.py index b7517aab7cd37..c479555590d2c 100644 --- a/tests/python/unittest/test_tir_schedule_utilities.py +++ b/tests/python/unittest/test_tir_schedule_utilities.py @@ -20,7 +20,6 @@ import pytest import tvm import tvm.testing - from tvm import tir from tvm.ir import IRModule from tvm.script import tir as T @@ -102,6 +101,29 @@ def matmul_relu_ann2(a: T.handle, b: T.handle, d: T.handle) -> None: D[vi, vj] = T.max(C[vi, vj], 0.0) +@tvm.script.ir_module +class ModuleWithMultipleFuncs: + @T.prim_func + def vector_add( + A: T.Buffer[128, "float32"], + B: T.Buffer[128, "float32"], + ) -> None: + for i in range(128): + with T.block("init"): + vi = T.axis.remap("S", [i]) + B[vi] = A[vi] + + @T.prim_func + def vector_add_2( + A: T.Buffer[128, "float32"], + B: T.Buffer[128, "float32"], + ) -> None: + for i in range(128): + with T.block("init"): + vi = T.axis.remap("S", [i]) + B[vi] = A[vi] + + # pylint: enable=no-member,invalid-name,unused-variable use_block_name = tvm.testing.parameter(by_dict={"block_obj": False, "block_name": True}) @@ -133,6 +155,14 @@ def test_tir_schedule_get_block(): assert block.same_as(matmul.body.block.body.body.body[1].body.block) +def test_tir_schedule_work_on(): + sch = tir.Schedule(ModuleWithMultipleFuncs, debug_mask="all") + with pytest.raises(ValueError, match="does not know which function to be working on"): + sch.get_block(name="init") + sch.work_on(func_name="vector_add") + sch.get_block(name="init") + + def test_tir_schedule_get_loops(use_block_name): # Tests: # - Schedule.get_loops From 5efe8b0bfdff4c9939185a7581dc77e23cbcb6d5 Mon Sep 17 00:00:00 2001 From: Ivy Zhang Date: Mon, 4 Jul 2022 14:06:00 +0800 Subject: [PATCH 048/111] Enhancement for fold_scale_axis and dnnl_json_runtime (#11815) * enhance WA in dnnl_convolution, support crop for tensor with mismatched groups and OC * add missing param checks for conv2d, conv3d * fix lint --- src/relay/transforms/fold_scale_axis.cc | 8 ++++++++ src/runtime/contrib/dnnl/dnnl_json_runtime.cc | 10 ++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/relay/transforms/fold_scale_axis.cc b/src/relay/transforms/fold_scale_axis.cc index f4f05badec374..7cc15a8f93edb 100644 --- 
a/src/relay/transforms/fold_scale_axis.cc +++ b/src/relay/transforms/fold_scale_axis.cc @@ -588,9 +588,11 @@ Expr ConvForwardRewrite(const Call& ref_call, const ATTRS* param, const Array PreConvForwardPrep(const Call& call, const Message& out_message) { if (backend::IsOp(call.as(), "nn.conv2d")) { const auto* param = call->attrs.as(); + ICHECK(param != nullptr); return ConvForwardPrep(call, param, out_message); } const auto* param = call->attrs.as(); + ICHECK(param != nullptr); return ConvForwardPrep(call, param, out_message); } @@ -598,9 +600,11 @@ Expr PreConvForwardRewrite(const Call& ref_call, const Array& new_args, const Message& message) { if (backend::IsOp(ref_call.as(), "nn.conv2d")) { const auto* param = ref_call->attrs.as(); + ICHECK(param != nullptr); return ConvForwardRewrite(ref_call, param, new_args, message); } const auto* param = ref_call->attrs.as(); + ICHECK(param != nullptr); return ConvForwardRewrite(ref_call, param, new_args, message); } @@ -1040,9 +1044,11 @@ Expr ConvBackwardTransform(const Call& call, const ATTRS* param, const Message& Message PreConvBackwardPrep(const Call& call, const Array& in_messages) { if (backend::IsOp(call.as(), "nn.conv2d")) { const auto* param = call->attrs.as(); + ICHECK(param != nullptr); return ConvBackwardPrep(call, param, in_messages); } const auto* param = call->attrs.as(); + ICHECK(param != nullptr); return ConvBackwardPrep(call, param, in_messages); } @@ -1050,9 +1056,11 @@ Expr PreConvBackwardTransform(const Call& call, const Message& message, const Ex const BackwardTransformer& transformer) { if (backend::IsOp(call.as(), "nn.conv2d")) { const auto* param = call->attrs.as(); + ICHECK(param != nullptr); return ConvBackwardTransform(call, param, message, scale, transformer); } const auto* param = call->attrs.as(); + ICHECK(param != nullptr); return ConvBackwardTransform(call, param, message, scale, transformer); } diff --git a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc index a4239186b4b33..a46f170fea949 100644 --- a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc +++ b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc @@ -318,9 +318,15 @@ class DNNLJSONRuntime : public JSONRuntimeBase { // Let's try to compensate it for weight tensor. Weight IC should match with source IC. // Example src: [1, 3, 224, 224] with layout NCHW // wgh: [16, 3, 3, 3] with layout OIHW2i8o -> [2, 2, 3, 3, 2, 8] - if (wgh_tr.dims()[2] != src_tr.dims()[1] / groups) { + // Similarly, Weight OC should match with destination OC. 
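+    // In all such cases the crop below resets the leading logical dims to
+    // [groups, dst_OC / groups, src_IC / groups]; e.g. with assumed shapes
+    // groups=2, src IC=32, dst OC=64, the cropped dims start with [2, 32, 16].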
+ // Example dst: [1, 1000, 7, 7] with layout NCHW + // wgh: [1000, 1024, 1, 1] with layout OIHW48o -> [21, 1024, 1, 1, 48] + if (wgh_tr.dims()[0] != groups || wgh_tr.dims()[1] != dst_tr.dims()[1] / groups || + wgh_tr.dims()[2] != src_tr.dims()[1] / groups) { auto wgh_croped_dims = wgh_tr.dims(); - wgh_croped_dims[2] = src_tr.dims()[1]; + wgh_croped_dims[0] = groups; + wgh_croped_dims[1] = dst_tr.dims()[1] / groups; // wgh_OC = dst_OC / groups + wgh_croped_dims[2] = src_tr.dims()[1] / groups; // wgh_IC = src_IC / groups auto zero_offset = dnnl::memory::dims(wgh_tr.dims().size(), 0); wgh_tr = wgh_tr.Crop(wgh_croped_dims, zero_offset); } From ef08c36294dc7c90f9d4536948507eca515012bd Mon Sep 17 00:00:00 2001 From: Andrey Malyshev Date: Tue, 5 Jul 2022 07:04:02 +0300 Subject: [PATCH 049/111] [Adreno] Modify default AutoTVM params for conv2d (#12005) --- python/tvm/topi/adreno/conv2d_nchw.py | 3 ++ python/tvm/topi/adreno/conv2d_nhwc.py | 5 +++ .../tvm/topi/adreno/depthwise_conv2d_nchw.py | 16 ++++--- .../tvm/topi/adreno/depthwise_conv2d_nhwc.py | 15 ++++--- python/tvm/topi/adreno/utils.py | 44 +++++++++++++++++++ 5 files changed, 73 insertions(+), 10 deletions(-) diff --git a/python/tvm/topi/adreno/conv2d_nchw.py b/python/tvm/topi/adreno/conv2d_nchw.py index 96368b3e57c2d..2a8f6028b755c 100644 --- a/python/tvm/topi/adreno/conv2d_nchw.py +++ b/python/tvm/topi/adreno/conv2d_nchw.py @@ -28,6 +28,7 @@ expand_spatial_dimensions, add_pad, bind_data_copy, + get_default_conv2d_config, ) @@ -264,6 +265,8 @@ def schedule_conv2d_NCHWc_KCRSk(cfg, s, output): cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) cfg.define_knob("unroll_explicit", [0, 1]) + if cfg.is_fallback: + get_default_conv2d_config(cfg, conv.shape[1], conv.shape[2], conv.shape[3]) ##### space definition end ##### pad_data, kernel = s[conv].op.input_tensors diff --git a/python/tvm/topi/adreno/conv2d_nhwc.py b/python/tvm/topi/adreno/conv2d_nhwc.py index d40f813fdb0f0..388f606ecb545 100644 --- a/python/tvm/topi/adreno/conv2d_nhwc.py +++ b/python/tvm/topi/adreno/conv2d_nhwc.py @@ -29,6 +29,7 @@ add_pad, bind_data_copy, get_texture_storage, + get_default_conv2d_config, ) @@ -261,6 +262,10 @@ def schedule_conv2d_NHWC(cfg, s, output): cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) cfg.define_knob("unroll_explicit", [0, 1]) + if cfg.is_fallback: + get_default_conv2d_config(cfg, conv.shape[3], conv.shape[1], conv.shape[2]) + ##### space definition end ##### + pad_data, kernel = s[conv].op.input_tensors if ( isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag diff --git a/python/tvm/topi/adreno/depthwise_conv2d_nchw.py b/python/tvm/topi/adreno/depthwise_conv2d_nchw.py index 298bd11e00a7a..a11c3f3d36b81 100644 --- a/python/tvm/topi/adreno/depthwise_conv2d_nchw.py +++ b/python/tvm/topi/adreno/depthwise_conv2d_nchw.py @@ -28,6 +28,8 @@ expand_spatial_dimensions, add_pad, bind_data_copy, + get_texture_storage, + get_default_conv2d_config, ) @@ -240,6 +242,9 @@ def schedule_depthwise_conv2d_NCHWc_KCRSk(cfg, s, output): cfg.define_split("tile_rx", rx, num_outputs=2) cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) cfg.define_knob("unroll_explicit", [0, 1]) + + if cfg.is_fallback: + get_default_conv2d_config(cfg, conv.shape[1], conv.shape[2], conv.shape[3]) ##### space definition end ##### pad_data, kernel = s[conv].op.input_tensors @@ -260,11 +265,12 @@ def schedule_depthwise_conv2d_NCHWc_KCRSk(cfg, s, output): if latest_blocked == latest and output != latest: s[output].compute_inline() - # create cache stage - 
AT = s.cache_read(pad_data, "global.texture", [conv]) - WT = s.cache_read(kernel, "global.texture-weight", [conv]) - bind_data_copy(s[AT]) - bind_data_copy(s[WT]) + if autotvm.GLOBAL_SCOPE.in_tuning or len(latest.op.axis) == 4: + # create cache stage for tuning only or in case of 4d case + AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv]) + bind_data_copy(s[AT]) + WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv]) + bind_data_copy(s[WT]) # tile and bind spatial axes n, fc, y, x, fb = s[latest_blocked].op.axis diff --git a/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py b/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py index b8a978d3c2042..117daf825d06b 100644 --- a/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py +++ b/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py @@ -29,6 +29,7 @@ add_pad, bind_data_copy, get_texture_storage, + get_default_conv2d_config, ) @@ -235,6 +236,9 @@ def schedule_depthwise_conv2d_NHWC_HWOI(cfg, s, output): cfg.define_split("tile_rx", rx, num_outputs=2) cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) cfg.define_knob("unroll_explicit", [0, 1]) + + if cfg.is_fallback: + get_default_conv2d_config(cfg, conv.shape[3], conv.shape[1], conv.shape[2]) ##### space definition end ##### pad_data, kernel = s[conv].op.input_tensors @@ -255,11 +259,12 @@ def schedule_depthwise_conv2d_NHWC_HWOI(cfg, s, output): if latest_blocked == latest and output != latest: s[output].compute_inline() - # create cache stage - AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv]) - WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv]) - bind_data_copy(s[AT]) - bind_data_copy(s[WT]) + if autotvm.GLOBAL_SCOPE.in_tuning or len(latest.op.axis) == 4: + # create cache stage for tuning only or in case of 4d case + AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv]) + bind_data_copy(s[AT]) + WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv]) + bind_data_copy(s[WT]) # tile and bind spatial axes n, y, x, fc, fb = s[latest_blocked].op.axis diff --git a/python/tvm/topi/adreno/utils.py b/python/tvm/topi/adreno/utils.py index 78a992e56a0f9..ea19e7d77dad2 100644 --- a/python/tvm/topi/adreno/utils.py +++ b/python/tvm/topi/adreno/utils.py @@ -22,6 +22,7 @@ from tvm import te from tvm.topi.utils import simplify from tvm.topi import nn +from tvm.autotvm.task.space import SplitEntity from ..utils import get_const_tuple @@ -575,3 +576,46 @@ def infer_tile_size(data, layout): if H % 8 == 0: return 4 return 2 + + +def get_default_conv2d_config(cfg, fc, y, x): + """Defines conv2d default parameters for split axis for Adreno conv2d and depthwise conv2d""" + # look for vthread params: + vy = 1 + for n in range(5, 0, -1): + if y % n == 0: + vy = n + break + + vx = 1 + for n in range(5, 0, -1): + if x % n == 0 and vy * n < 9: + vx = n + break + + y = y // vy + x = x // vx + + tfc = 1 + for n in range(64, 0, -1): + if fc % n == 0: + tfc = n + break + ty = 1 + for n in range(16, 0, -1): + if y % n == 0 and tfc * n <= 512: + ty = n + break + tx = 1 + for n in range(16, 0, -1): + if x % n == 0 and tfc * ty * n <= 512: + tx = n + break + + fc = fc // tfc + y = y // ty + x = x // tx + + cfg["tile_fc"] = SplitEntity([fc, 1, tfc]) + cfg["tile_y"] = SplitEntity([y, vy, ty]) + cfg["tile_x"] = SplitEntity([x, vx, tx]) From 83b310d5a41b92a857c17d25a0a9b0546441586a Mon Sep 17 00:00:00 2001 From: Black <823036806@qq.com> Date: Tue, 5 Jul 2022 14:17:16 +0800 Subject: [PATCH 050/111] [Frontend][TFLite] Add support for 
NonMaxSuppressionV5 op (#12003) * add nms_v5 op for TFLite * add a test for the TFLite nms_v5 op --- python/tvm/relay/frontend/tflite.py | 64 ++++++++++++++++++++ tests/python/frontend/tflite/test_forward.py | 40 ++++++++++++ 2 files changed, 104 insertions(+) diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index 2a9d66acff07a..d7ec441e0eb40 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -176,6 +176,7 @@ def __init__(self, model, subgraph, exp_tab): "UNIDIRECTIONAL_SEQUENCE_LSTM": self.convert_unidirectional_sequence_lstm, "WHERE": self.convert_select, "ZEROS_LIKE": self.convert_zeros_like, + "NON_MAX_SUPPRESSION_V5": self.convert_nms_v5, } def check_unsupported_ops(self): @@ -3347,6 +3348,69 @@ def convert_detection_postprocess(self, op): ret = _expr.TupleWrapper(_expr.Tuple([boxes, cls_ids, scores, valid_count]), size=4) return ret + def convert_nms_v5(self, op): + """Convert TFLite NonMaxSuppressionV5""" + # https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/non-max-suppression-v5 + + input_tensors = self.get_input_tensors(op) + assert len(input_tensors) == 6, "input tensor length should be 6" + boxes = self.get_expr(input_tensors[0].tensor_idx) + scores = self.get_expr(input_tensors[1].tensor_idx) + max_output_size = self.get_tensor_value(input_tensors[2]) + iou_threshold = self.get_tensor_value(input_tensors[3]) + score_threshold = self.get_tensor_value(input_tensors[4]) + soft_nms_sigma = self.get_tensor_value(input_tensors[5]) + + if isinstance(max_output_size, np.ndarray): + assert max_output_size.size == 1, "only one value is expected." + max_output_size = int(max_output_size) + + if isinstance(iou_threshold, np.ndarray): + assert iou_threshold.size == 1, "only one value is expected." + iou_threshold = float(iou_threshold) + + if isinstance(score_threshold, np.ndarray): + assert score_threshold.size == 1, "only one value is expected." + score_threshold = float(score_threshold) + + if isinstance(soft_nms_sigma, np.ndarray): + assert soft_nms_sigma.size == 1, "only one value is expected." + soft_nms_sigma = float(soft_nms_sigma) + if soft_nms_sigma != 0.0: + raise tvm.error.OpNotImplemented( + "It is soft_nms when soft_nms_sigma != 0, which is not supported!" 
+ ) + + scores_expand = _op.expand_dims(scores, axis=-1, num_newaxis=1) + data = _op.concatenate([scores_expand, boxes], -1) + data = _op.expand_dims(data, axis=0, num_newaxis=1) + + count, data, indices = _op.vision.get_valid_counts( + data, score_threshold=score_threshold, id_index=-1, score_index=0 + ) + + nms_ret = _op.vision.non_max_suppression( + data=data, + valid_count=count, + indices=indices, + max_output_size=max_output_size, + iou_threshold=iou_threshold, + force_suppress=True, + top_k=-1, + coord_start=1, + score_index=0, + id_index=-1, + return_indices=True, + invalid_to_bottom=False, + ) + + selected_indices = _op.squeeze(nms_ret[0], axis=[0]) + selected_indices = _op.strided_slice(selected_indices, [0], [max_output_size]) + valide_num = _op.squeeze(nms_ret[1], axis=[1]) + selected_scores = _op.take(scores, selected_indices, axis=0) + out = _expr.TupleWrapper(_expr.Tuple([selected_indices, selected_scores, valide_num]), 3) + return out + def convert_expand_dims(self, op): """Convert TFLite EXPAND_DIMS""" input_tensors = self.get_input_tensors(op) diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index 23b5a03ffb5f4..c271a669e95cc 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -44,6 +44,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops from tensorflow.python.ops import array_ops +from tensorflow.python.ops import image_ops from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import nn_impl from tensorflow.python.ops import variables @@ -4937,6 +4938,42 @@ def test_prevent_tensorflow_dynamic_range(): tvm_output = run_tvm_graph(tflite_model, data_array, data_in.name.replace(":0", "")) +def _test_nms_v5( + bx_shape, score_shape, iou_threshold, score_threshold, max_output_size, dtype="float32" +): + """One iteration of nms_v5 with given attributes""" + boxes = np.random.uniform(0, 10, size=bx_shape).astype(dtype) + scores = np.random.uniform(size=score_shape).astype(dtype) + + tf.reset_default_graph() + tf.compat.v1.disable_eager_execution() + in_data_1 = array_ops.placeholder(dtype, boxes.shape, name="in_data_1") + in_data_2 = array_ops.placeholder(dtype, scores.shape, name="in_data_2") + out = image_ops.non_max_suppression_with_scores( + boxes=in_data_1, + scores=in_data_2, + max_output_size=max_output_size, + iou_threshold=iou_threshold, + score_threshold=score_threshold, + name="nms", + ) + + compare_tflite_with_tvm( + [boxes, scores], + ["in_data_1:0", "in_data_2:0"], + [in_data_1, in_data_2], + [out[0], out[1]], + out_names=[out[0].name, out[1].name], + experimental_new_converter=True, + ) + + +def test_forward_nms_v5(): + """test nms_v5""" + _test_nms_v5((10000, 4), (10000,), 0.5, 0.4, 100) + _test_nms_v5((1000, 4), (1000,), 0.7, 0.3, 50) + + ####################################################################### # Main # ---- @@ -5031,6 +5068,9 @@ def test_prevent_tensorflow_dynamic_range(): # Detection_PostProcess test_detection_postprocess() + # NonMaxSuppressionV5 + test_forward_nms_v5() + # Overwrite Converter test_custom_op_converter() From b7e299f4a4f9a90b2538d77bc3ae9da9bbff4ef1 Mon Sep 17 00:00:00 2001 From: Ivy Zhang Date: Tue, 5 Jul 2022 15:41:25 +0800 Subject: [PATCH 051/111] [BYOC-DNNL]rewrite downsize blocks for rensetv1 to get better performance (#11822) * rewrite downsize blocks for rensetv1 to get better performance * fix lint --- python/tvm/relay/op/contrib/dnnl.py | 179 
++++++++++++++++++++++++++++ tests/python/contrib/test_dnnl.py | 100 ++++++++++++++++ 2 files changed, 279 insertions(+) diff --git a/python/tvm/relay/op/contrib/dnnl.py b/python/tvm/relay/op/contrib/dnnl.py index c251b66bfbc77..b3ef478f201db 100644 --- a/python/tvm/relay/op/contrib/dnnl.py +++ b/python/tvm/relay/op/contrib/dnnl.py @@ -782,6 +782,185 @@ def rewrite_dense_bias_gelu_reshape_last(mod): return mod +class ResNetV1Rewrite(DFPatternCallback): + """ + A callback to advance downsize operation when the patterns are as pattern1, + and the result is written in pattern2: + Pattern #1: + %26 = nn.conv2d(%25, ty=Tensor[(64, 256, 1, 1)); + %27 = add(%26, ty=Tensor[(64, 1, 1)); + %28 = nn.relu(%27); + + %29 = nn.conv2d(%28, ty=Tensor[(64, 64, 3, 3)); + %30 = add(%29, ty=Tensor[(64, 1, 1)); + %31 = nn.relu(%30); + + %32 = nn.conv2d(%31, ty=Tensor[(256, 64, 1, 1)); + %33 = add(%32, ty=Tensor[(256, 1, 1)); + %34 = add(%33, %25); + %35 = nn.relu(%34); + + %36 = nn.conv2d(%35, ty=Tensor[(128, 256, 1, 1), strides=[2, 2]); + %37 = add(%36, ty=Tensor[(128, 1, 1)); + %38 = nn.relu(%37); + + %39 = nn.conv2d(%38, ty=Tensor[(128, 128, 3, 3)); + %40 = add(%39, ty=Tensor[(128, 1, 1)]); + %41 = nn.relu(%40); + + %42 = nn.conv2d(%41, ty=Tensor[(512, 128, 1, 1)); + %43 = nn.conv2d(%35, ty=Tensor[(512, 256, 1, 1), strides=[2, 2]); + %44 = add(%42, ty=Tensor[(512, 1, 1)); + %45 = add(%43, ty=Tensor[(512, 1, 1)); + + %46 = add(%44, %45); + %47 = nn.relu(%46); + Pattern #2: + %26 = nn.conv2d(%25, ty=Tensor[(64, 256, 1, 1)); + %27 = add(%26, ty=Tensor[(64, 1, 1)); + %28 = nn.relu(%27); + + %29 = nn.conv2d(%28, ty=Tensor[(64, 64, 3, 3), strides=[2, 2]); + %30 = add(%29, ty=Tensor[(64, 1, 1)); + %31 = nn.relu(%30); + + %32 = nn.conv2d(%31, ty=Tensor[(256, 64, 1, 1)); + %33 = add(%32, ty=Tensor[(256, 1, 1)); + %34 = nn.max_pool2d(%25, pool_size=[1, 1], strides=[2, 2], padding=[0, 0, 0, 0]); + %35 = add(%33, %34); + %36 = nn.relu(%35); + + %37 = nn.conv2d(%36, ty=Tensor[(128, 256, 1, 1)); + %38 = add(%37, ty=Tensor[(128, 1, 1)); + %39 = nn.relu(%38); + + %40 = nn.conv2d(%39, ty=Tensor[(128, 128, 3, 3)); + %41 = add(%40, ty=Tensor[(128, 1, 1)); + %42 = nn.relu(%41); + + %43 = nn.conv2d(%42, ty=Tensor[(512, 128, 1, 1)); + %44 = nn.conv2d(%36, ty=Tensor[(512, 256, 1, 1)); + %45 = add(%43, ty=Tensor[(512, 1, 1)); + %46 = add(%44, ty=Tensor[(512, 1, 1)); + %47 = add(%45, %46); + %48 = nn.relu(%47); + """ + + def __init__(self): + super(ResNetV1Rewrite, self).__init__() + self.attr_lst = [] + self.data = wildcard() + self.w1, self.b1 = wildcard(), wildcard() + self.w2, self.b2 = wildcard(), wildcard() + self.w3, self.b3 = wildcard(), wildcard() + self.w4, self.b4 = wildcard(), wildcard() + self.w5, self.b5 = wildcard(), wildcard() + self.w6, self.b6 = wildcard(), wildcard() + self.w7, self.b7 = wildcard(), wildcard() + + conv1 = is_op("nn.conv2d")(self.data, self.w1).has_attr({"kernel_size": [1, 1]}) + conv1 = is_op("add")(conv1, self.b1) + conv1 = is_op("nn.relu")(conv1) + + conv2 = is_op("nn.conv2d")(conv1, self.w2).has_attr({"kernel_size": [3, 3]}) + conv2 = is_op("add")(conv2, self.b2) + conv2 = is_op("nn.relu")(conv2) + + conv3 = is_op("nn.conv2d")(conv2, self.w3).has_attr({"kernel_size": [1, 1]}) + conv3 = is_op("add")(conv3, self.b3) + conv3 = is_op("add")(conv3, self.data) + conv3 = is_op("nn.relu")(conv3) + + left_conv4 = is_op("nn.conv2d")(conv3, self.w4).has_attr({"strides": [2, 2]}) + left_conv4 = is_op("add")(left_conv4, self.b4) + left_conv4 = is_op("nn.relu")(left_conv4) + + left_conv5 = 
is_op("nn.conv2d")(left_conv4, self.w5).has_attr({"kernel_size": [3, 3]}) + left_conv5 = is_op("add")(left_conv5, self.b5) + left_conv5 = is_op("nn.relu")(left_conv5) + + left_conv6 = is_op("nn.conv2d")(left_conv5, self.w6).has_attr({"kernel_size": [1, 1]}) + left_conv6 = is_op("add")(left_conv6, self.b6) + + right_conv7 = is_op("nn.conv2d")(conv3, self.w7).has_attr({"strides": [2, 2]}) + right_conv7 = is_op("add")(right_conv7, self.b7) + + out = is_op("add")(left_conv6, right_conv7) + out = is_op("nn.relu")(out) + self.pattern = out + + def get_attr(self, pre): + """Recursively retrieve attributes from reshape operator.""" + + def visit_func(expr): + if isinstance(expr, _expr.Call) and expr.op == relay.op.get("nn.conv2d"): + self.attr_lst.append(expr.attrs) + + _analysis.post_order_visit(pre, visit_func) + + def callback(self, pre, post, node_map): + self.get_attr(pre) + data = node_map[self.data][0] + w1, b1 = node_map[self.w1][0], node_map[self.b1][0] + w2, b2 = node_map[self.w2][0], node_map[self.b2][0] + w3, b3 = node_map[self.w3][0], node_map[self.b3][0] + w4, b4 = node_map[self.w4][0], node_map[self.b4][0] + w5, b5 = node_map[self.w5][0], node_map[self.b5][0] + w6, b6 = node_map[self.w6][0], node_map[self.b6][0] + w7, b7 = node_map[self.w7][0], node_map[self.b7][0] + + new_attrs = self.attr_lst[-7] + conv1 = relay.op.nn.conv2d(data, w1, **new_attrs) + conv1 = relay.op.add(conv1, b1) + conv1 = relay.op.nn.relu(conv1) + + new_attrs = dict(self.attr_lst[-6]) + new_attrs["strides"] = [2, 2] + conv2 = relay.op.nn.conv2d(conv1, w2, **new_attrs) + conv2 = relay.op.add(conv2, b2) + conv2 = relay.op.nn.relu(conv2) + + new_attrs = self.attr_lst[-5] + conv3 = relay.op.nn.conv2d(conv2, w3, **new_attrs) + conv3 = relay.op.add(conv3, b3) + max_pool = relay.op.nn.max_pool2d( + data, pool_size=(1, 1), strides=(2, 2), layout=new_attrs["data_layout"] + ) + conv3 = relay.op.add(conv3, max_pool) + conv3 = relay.op.nn.relu(conv3) + + new_attrs = dict(self.attr_lst[-4]) + new_attrs["strides"] = [1, 1] + left_conv4 = relay.op.nn.conv2d(conv3, w4, **new_attrs) + left_conv4 = relay.op.add(left_conv4, b4) + left_conv4 = relay.op.nn.relu(left_conv4) + + new_attrs = self.attr_lst[-3] + left_conv5 = relay.op.nn.conv2d(left_conv4, w5, **new_attrs) + left_conv5 = relay.op.add(left_conv5, b5) + left_conv5 = relay.op.nn.relu(left_conv5) + + new_attrs = self.attr_lst[-2] + left_conv6 = relay.op.nn.conv2d(left_conv5, w6, **new_attrs) + left_conv6 = relay.op.add(left_conv6, b6) + + new_attrs = dict(self.attr_lst[-1]) + new_attrs["strides"] = [1, 1] + right_conv7 = relay.op.nn.conv2d(conv3, w7, **new_attrs) + right_conv7 = relay.op.add(right_conv7, b7) + + out = relay.op.add(left_conv6, right_conv7) + out = relay.op.nn.relu(out) + self.attr_lst = [] + return out + + +def rewrite_resnetv1(mod): + """Rewrite the the ResNetV1 downsize block to reduce the computation complexity.""" + mod["main"] = rewrite(ResNetV1Rewrite(), mod["main"]) + return mod + + class LegalizeQnnOpForDnnl(DFPatternCallback): """Legalize QNN based patterns to match DNNL diff --git a/tests/python/contrib/test_dnnl.py b/tests/python/contrib/test_dnnl.py index 2138eda086978..078483798c6dd 100755 --- a/tests/python/contrib/test_dnnl.py +++ b/tests/python/contrib/test_dnnl.py @@ -1128,6 +1128,106 @@ def get_graph(act=None): ) +def test_resnetv1_rewrite(run_module, dtype="float32"): + def get_graph(): + data_shape = (1, 256, 56, 56) + w_shapes = [ + (64, 256, 1, 1), + (64, 64, 3, 3), + (256, 64, 1, 1), + (128, 256, 1, 1), + (128, 128, 3, 3), + (512, 128, 
1, 1), + (512, 256, 1, 1), + ] + x = relay.var("x", shape=data_shape, dtype=dtype) + wights = [relay.const(np.random.randint(0, 1, w).astype(dtype)) for w in w_shapes] + biases = [relay.const(np.random.randint(0, 1, w[0]).astype(dtype)) for w in w_shapes] + + conv1 = relay.nn.conv2d( + x, + wights[0], + channels=w_shapes[0][0], + kernel_size=w_shapes[0][2:4], + padding=(w_shapes[0][2] // 2, w_shapes[0][3] // 2), + ) + conv1 = relay.nn.bias_add(conv1, biases[0]) + conv1 = relay.nn.relu(conv1) + + conv2 = relay.nn.conv2d( + conv1, + wights[1], + channels=w_shapes[1][0], + kernel_size=w_shapes[1][2:4], + padding=(w_shapes[1][2] // 2, w_shapes[1][3] // 2), + ) + conv2 = relay.nn.bias_add(conv2, biases[1]) + conv2 = relay.nn.relu(conv2) + + conv3 = relay.nn.conv2d( + conv2, + wights[2], + channels=w_shapes[2][0], + kernel_size=w_shapes[2][2:4], + padding=(w_shapes[2][2] // 2, w_shapes[2][3] // 2), + ) + conv3 = relay.nn.bias_add(conv3, biases[2]) + conv3 = relay.add(conv3, x) + conv3 = relay.nn.relu(conv3) + + left_conv4 = relay.nn.conv2d( + conv3, + wights[3], + channels=w_shapes[3][0], + strides=(2, 2), + kernel_size=w_shapes[3][2:4], + padding=(w_shapes[3][2] // 2, w_shapes[3][3] // 2), + ) + left_conv4 = relay.nn.bias_add(left_conv4, biases[3]) + left_conv4 = relay.nn.relu(left_conv4) + + left_conv5 = relay.nn.conv2d( + left_conv4, + wights[4], + channels=w_shapes[4][0], + kernel_size=w_shapes[4][2:4], + padding=(w_shapes[4][2] // 2, w_shapes[4][3] // 2), + ) + left_conv5 = relay.nn.bias_add(left_conv5, biases[4]) + left_conv5 = relay.nn.relu(left_conv5) + + left_conv6 = relay.nn.conv2d( + left_conv5, + wights[5], + channels=w_shapes[5][0], + kernel_size=w_shapes[5][2:4], + padding=(w_shapes[5][2] // 2, w_shapes[5][3] // 2), + ) + left_conv6 = relay.nn.bias_add(left_conv6, biases[5]) + + right_conv7 = relay.nn.conv2d( + conv3, + wights[6], + channels=w_shapes[6][0], + strides=(2, 2), + kernel_size=w_shapes[6][2:4], + padding=(w_shapes[6][2] // 2, w_shapes[6][3] // 2), + ) + right_conv7 = relay.nn.bias_add(right_conv7, biases[6]) + + out = relay.add(left_conv6, right_conv7) + out = relay.nn.relu(out) + + dic = {"x": data_shape} + param_lst = [] + return out, dic, param_lst + + net, dic, param_lst = get_graph() + net = tvm.IRModule.from_expr(net) + config = net, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + def permute_shape(shape, l_from="", l_to=""): res_shape = [] for label in l_to: From 3cca6465ba685921a2f5dbe711d10f5b5ee33d33 Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Tue, 5 Jul 2022 10:36:26 -0700 Subject: [PATCH 052/111] [microTVM] Autotuning performance tests (#11782) * Common autotuning test * Autotuned model evaluation utilities * Bugfixes and more enablement * Working autotune profiling test * Refactoring based on PR comments Bugfixes to get tests passing Refactor to remove tflite model for consistency Black formatting Linting and bugfixes Add Apache license header Use larger chunk size to read files Explicitly specify LRU cache size for compatibility with Python 3.7 Pass platform to microTVM common tests Better comment for runtime bound Stop directory from being removed after session creation * Use the actual Zephyr timing library Use unsigned integer Additional logging Try negation Try 64 bit timer Use Zephyr's timing library Fix linting Enable timing utilities --- .../template_project/microtvm_api_server.py | 1 + .../template_project/src/host_driven/main.c | 52 ++---- python/tvm/micro/testing/__init__.py | 20 +++ 
python/tvm/micro/testing/aot_test_utils.py | 13 +- python/tvm/micro/testing/evaluation.py | 150 ++++++++++++++++++ python/tvm/micro/testing/utils.py | 19 ++- python/tvm/testing/utils.py | 45 +++++- tests/lint/check_file_type.py | 1 - tests/micro/arduino/test_utils.py | 20 +-- tests/micro/common/conftest.py | 13 +- tests/micro/common/test_autotune.py | 96 +++++++++++ tests/micro/common/test_tvmc.py | 27 +--- tests/micro/testdata/kws/yes_no.tflite | Bin 18712 -> 0 bytes tests/scripts/task_python_microtvm.sh | 4 +- 14 files changed, 373 insertions(+), 88 deletions(-) create mode 100644 python/tvm/micro/testing/__init__.py create mode 100644 python/tvm/micro/testing/evaluation.py create mode 100644 tests/micro/common/test_autotune.py delete mode 100644 tests/micro/testdata/kws/yes_no.tflite diff --git a/apps/microtvm/zephyr/template_project/microtvm_api_server.py b/apps/microtvm/zephyr/template_project/microtvm_api_server.py index d3559cc5f7fb1..7b9538f6ce03c 100644 --- a/apps/microtvm/zephyr/template_project/microtvm_api_server.py +++ b/apps/microtvm/zephyr/template_project/microtvm_api_server.py @@ -393,6 +393,7 @@ def _create_prj_conf(self, project_dir, options): if options["project_type"] == "host_driven": f.write( + "CONFIG_TIMING_FUNCTIONS=y\n" "# For RPC server C++ bindings.\n" "CONFIG_CPLUSPLUS=y\n" "CONFIG_LIB_CPLUSPLUS=y\n" diff --git a/apps/microtvm/zephyr/template_project/src/host_driven/main.c b/apps/microtvm/zephyr/template_project/src/host_driven/main.c index 623266c0cae05..ff02b3cb1d444 100644 --- a/apps/microtvm/zephyr/template_project/src/host_driven/main.c +++ b/apps/microtvm/zephyr/template_project/src/host_driven/main.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -144,11 +145,7 @@ tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) { return kTvmErrorNoError; } -#define MILLIS_TIL_EXPIRY 200 -#define TIME_TIL_EXPIRY (K_MSEC(MILLIS_TIL_EXPIRY)) -K_TIMER_DEFINE(g_microtvm_timer, /* expiry func */ NULL, /* stop func */ NULL); - -uint32_t g_microtvm_start_time; +volatile timing_t g_microtvm_start_time, g_microtvm_end_time; int g_microtvm_timer_running = 0; // Called to start system timer. @@ -161,8 +158,7 @@ tvm_crt_error_t TVMPlatformTimerStart() { #ifdef CONFIG_LED gpio_pin_set(led0_pin, LED0_PIN, 1); #endif - k_timer_start(&g_microtvm_timer, TIME_TIL_EXPIRY, TIME_TIL_EXPIRY); - g_microtvm_start_time = k_cycle_get_32(); + g_microtvm_start_time = timing_counter_get(); g_microtvm_timer_running = 1; return kTvmErrorNoError; } @@ -174,43 +170,14 @@ tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds) { return kTvmErrorSystemErrorMask | 2; } - uint32_t stop_time = k_cycle_get_32(); #ifdef CONFIG_LED gpio_pin_set(led0_pin, LED0_PIN, 0); #endif - // compute how long the work took - uint32_t cycles_spent = stop_time - g_microtvm_start_time; - if (stop_time < g_microtvm_start_time) { - // we rolled over *at least* once, so correct the rollover it was *only* - // once, because we might still use this result - cycles_spent = ~((uint32_t)0) - (g_microtvm_start_time - stop_time); - } - - uint32_t ns_spent = (uint32_t)k_cyc_to_ns_floor64(cycles_spent); - double hw_clock_res_us = ns_spent / 1000.0; - - // need to grab time remaining *before* stopping. when stopped, this function - // always returns 0. 
- int32_t time_remaining_ms = k_timer_remaining_get(&g_microtvm_timer); - k_timer_stop(&g_microtvm_timer); - // check *after* stopping to prevent extra expiries on the happy path - if (time_remaining_ms < 0) { - TVMLogf("negative time remaining"); - return kTvmErrorSystemErrorMask | 3; - } - uint32_t num_expiries = k_timer_status_get(&g_microtvm_timer); - uint32_t timer_res_ms = ((num_expiries * MILLIS_TIL_EXPIRY) + time_remaining_ms); - double approx_num_cycles = - (double)k_ticks_to_cyc_floor32(1) * (double)k_ms_to_ticks_ceil32(timer_res_ms); - // if we approach the limits of the HW clock datatype (uint32_t), use the - // coarse-grained timer result instead - if (approx_num_cycles > (0.5 * (~((uint32_t)0)))) { - *elapsed_time_seconds = timer_res_ms / 1000.0; - } else { - *elapsed_time_seconds = hw_clock_res_us / 1e6; - } - + g_microtvm_end_time = timing_counter_get(); + uint64_t cycles = timing_cycles_get(&g_microtvm_start_time, &g_microtvm_end_time); + uint64_t ns_spent = timing_cycles_to_ns(cycles); + *elapsed_time_seconds = ns_spent / (double)1e9; g_microtvm_timer_running = 0; return kTvmErrorNoError; } @@ -278,6 +245,11 @@ void main(void) { tvm_uart = device_get_binding(DT_LABEL(DT_CHOSEN(zephyr_console))); uart_rx_init(&uart_rx_rbuf, tvm_uart); + // Initialize system timing. We could stop and start it every time, but we'll + // be using it enough we should just keep it enabled. + timing_init(); + timing_start(); + // Initialize microTVM RPC server, which will receive commands from the UART and execute them. microtvm_rpc_server_t server = MicroTVMRpcServerInit(write_serial, NULL); TVMLogf("microTVM Zephyr runtime - running"); diff --git a/python/tvm/micro/testing/__init__.py b/python/tvm/micro/testing/__init__.py new file mode 100644 index 0000000000000..9062f061bda31 --- /dev/null +++ b/python/tvm/micro/testing/__init__.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Allows the tools specified below to be imported directly from tvm.micro.testing""" +from .evaluation import tune_model, create_aot_session, evaluate_model_accuracy +from .utils import get_supported_boards, get_target diff --git a/python/tvm/micro/testing/aot_test_utils.py b/python/tvm/micro/testing/aot_test_utils.py index 82ac1ac68e9da..89c08395deb79 100644 --- a/python/tvm/micro/testing/aot_test_utils.py +++ b/python/tvm/micro/testing/aot_test_utils.py @@ -15,17 +15,22 @@ # specific language governing permissions and limitations # under the License. +""" +This file provides utilities for running AOT tests, especially for Corstone. 
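With the `__init__.py` added above, the helpers resolve directly under `tvm.micro.testing`. A quick sketch (the board name is an assumption; requires a TVM build with microTVM enabled):

    import tvm.micro.testing

    # Board descriptions come from the template project's boards.json.
    boards = tvm.micro.testing.get_supported_boards("zephyr")
    print(sorted(boards))

    # get_target() looks up the board's SoC model and builds a micro target string.
    print(tvm.micro.testing.get_target("zephyr", "qemu_x86"))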
+ +""" + import logging import itertools import shutil import pytest -pytest.importorskip("tvm.micro") - import tvm from tvm.testing.aot import AOTTestRunner +pytest.importorskip("tvm.micro") + _LOG = logging.getLogger(__name__) @@ -97,9 +102,9 @@ def parametrize_aot_options(test): valid_combinations, ) - fn = pytest.mark.parametrize( + func = pytest.mark.parametrize( ["interface_api", "use_unpacked_api", "test_runner"], marked_combinations, )(test) - return tvm.testing.skip_if_32bit(reason="Reference system unavailable in i386 container")(fn) + return tvm.testing.skip_if_32bit(reason="Reference system unavailable in i386 container")(func) diff --git a/python/tvm/micro/testing/evaluation.py b/python/tvm/micro/testing/evaluation.py new file mode 100644 index 0000000000000..c60f0fc4828ec --- /dev/null +++ b/python/tvm/micro/testing/evaluation.py @@ -0,0 +1,150 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Provides high-level functions for instantiating and timing AOT models. Used +by autotuning tests in tests/micro, and may be used for more performance +tests in the future. 
+ +""" + +from io import StringIO +from pathlib import Path +from contextlib import ExitStack +import tempfile + +import tvm + + +def tune_model( + platform, board, target, mod, params, num_trials, tuner_cls=tvm.autotvm.tuner.GATuner +): + """Autotunes a model with microTVM and returns a StringIO with the tuning logs""" + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + tasks = tvm.autotvm.task.extract_from_program(mod["main"], {}, target) + assert len(tasks) > 0 + assert isinstance(params, dict) + + module_loader = tvm.micro.AutoTvmModuleLoader( + template_project_dir=tvm.micro.get_microtvm_template_projects(platform), + project_options={ + f"{platform}_board": board, + "project_type": "host_driven", + }, + ) + + builder = tvm.autotvm.LocalBuilder( + n_parallel=1, + build_kwargs={"build_option": {"tir.disable_vectorize": True}}, + do_fork=False, + build_func=tvm.micro.autotvm_build_func, + runtime=tvm.relay.backend.Runtime("crt", {"system-lib": True}), + ) + runner = tvm.autotvm.LocalRunner(number=1, repeat=1, timeout=100, module_loader=module_loader) + measure_option = tvm.autotvm.measure_option(builder=builder, runner=runner) + + results = StringIO() + for task in tasks: + tuner = tuner_cls(task) + + tuner.tune( + n_trial=num_trials, + measure_option=measure_option, + callbacks=[ + tvm.autotvm.callback.log_to_file(results), + tvm.autotvm.callback.progress_bar(num_trials, si_prefix="M"), + ], + si_prefix="M", + ) + assert tuner.best_flops > 1 + + return results + + +def create_aot_session( + platform, + board, + target, + mod, + params, + build_dir=Path(tempfile.mkdtemp()), + tune_logs=None, + use_cmsis_nn=False, +): + """AOT-compiles and uploads a model to a microcontroller, and returns the RPC session""" + + executor = tvm.relay.backend.Executor("aot") + crt_runtime = tvm.relay.backend.Runtime("crt", {"system-lib": True}) + + with ExitStack() as stack: + config = {"tir.disable_vectorize": True} + if use_cmsis_nn: + config["relay.ext.cmsisnn.options"] = {"mcpu": target.mcpu} + stack.enter_context(tvm.transform.PassContext(opt_level=3, config=config)) + if tune_logs is not None: + stack.enter_context(tvm.autotvm.apply_history_best(tune_logs)) + + lowered = tvm.relay.build( + mod, + target=target, + params=params, + runtime=crt_runtime, + executor=executor, + ) + parameter_size = len(tvm.runtime.save_param_dict(lowered.get_params())) + print(f"Model parameter size: {parameter_size}") + + # Once the project has been uploaded, we don't need to keep it + project = tvm.micro.generate_project( + str(tvm.micro.get_microtvm_template_projects(platform)), + lowered, + build_dir / "project", + { + f"{platform}_board": board, + "project_type": "host_driven", + }, + ) + project.build() + project.flash() + + return tvm.micro.Session(project.transport()) + + +# This utility functions was designed ONLY for one input / one output models +# where the outputs are confidences for different classes. +def evaluate_model_accuracy(session, aot_executor, input_data, true_labels, runs_per_sample=1): + """Evaluates an AOT-compiled model's accuracy and runtime over an RPC session. 
Works well + when used with create_aot_session.""" + + assert aot_executor.get_num_inputs() == 1 + assert aot_executor.get_num_outputs() == 1 + assert runs_per_sample > 0 + + predicted_labels = [] + aot_runtimes = [] + for sample in input_data: + aot_executor.get_input(0).copyfrom(sample) + result = aot_executor.module.time_evaluator("run", session.device, number=runs_per_sample)() + runtime = result.mean + output = aot_executor.get_output(0).numpy() + predicted_labels.append(output.argmax()) + aot_runtimes.append(runtime) + + num_correct = sum(u == v for u, v in zip(true_labels, predicted_labels)) + average_time = sum(aot_runtimes) / len(aot_runtimes) + accuracy = num_correct / len(predicted_labels) + return average_time, accuracy diff --git a/python/tvm/micro/testing/utils.py b/python/tvm/micro/testing/utils.py index a48c8dc3230fb..820b649c74ee0 100644 --- a/python/tvm/micro/testing/utils.py +++ b/python/tvm/micro/testing/utils.py @@ -17,9 +17,10 @@ """Defines the test methods used with microTVM.""" -import pathlib +from functools import lru_cache import json import logging +from pathlib import Path import tarfile import time from typing import Union @@ -32,7 +33,19 @@ TIMEOUT_SEC = 10 -def check_tune_log(log_path: Union[pathlib.Path, str]): +@lru_cache(maxsize=None) +def get_supported_boards(platform: str): + template = Path(tvm.micro.get_microtvm_template_projects(platform)) + with open(template / "boards.json") as f: + return json.load(f) + + +def get_target(platform: str, board: str): + model = get_supported_boards(platform)[board]["model"] + return str(tvm.target.target.micro(model)) + + +def check_tune_log(log_path: Union[Path, str]): """Read the tuning log and check each result.""" with open(log_path, "r") as f: lines = f.readlines() @@ -76,7 +89,7 @@ def _read_line(transport, timeout_sec: int) -> str: return data.decode(encoding="utf-8") -def mlf_extract_workspace_size_bytes(mlf_tar_path: Union[pathlib.Path, str]) -> int: +def mlf_extract_workspace_size_bytes(mlf_tar_path: Union[Path, str]) -> int: """Extract an MLF archive file and read workspace size from metadata file.""" workspace_size = 0 diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index d7c2adaa8606c..47bdab5828b9a 100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -67,6 +67,7 @@ def test_something(): import copyreg import ctypes import functools +import hashlib import itertools import logging import os @@ -77,7 +78,7 @@ def test_something(): import time from pathlib import Path -from typing import Optional, Callable, Union, List +from typing import Optional, Callable, Union, List, Tuple import pytest import numpy as np @@ -90,6 +91,7 @@ def test_something(): from tvm.contrib import nvcc, cudnn import tvm.contrib.hexagon._ci_env_check as hexagon +from tvm.driver.tvmc.frontends import load_model from tvm.error import TVMError @@ -1661,6 +1663,47 @@ def install_request_hook(depth: int) -> None: request_hook.init() +def fetch_model_from_url( + url: str, + model_format: str, + sha256: str, +) -> Tuple[tvm.ir.module.IRModule, dict]: + """Testing function to fetch a model from a URL and return it as a Relay + model. Downloaded files are cached for future re-use. + + Parameters + ---------- + url : str + The URL or list of URLs to try downloading the model from. + + model_format: str + The file extension of the model format used. + + sha256 : str + The sha256 hex hash to compare the downloaded model against. 
+
+    Returns
+    -------
+    (mod, params) : object
+        The Relay representation of the downloaded model.
+    """
+
+    rel_path = f"model_{sha256}.{model_format}"
+    file = tvm.contrib.download.download_testdata(url, rel_path, overwrite=False)
+
+    # Check SHA-256 hash
+    file_hash = hashlib.sha256()
+    with open(file, "rb") as f:
+        for block in iter(lambda: f.read(2**24), b""):
+            file_hash.update(block)
+
+    if file_hash.hexdigest() != sha256:
+        raise FileNotFoundError("SHA-256 hash for model does not match")
+
+    tvmc_model = load_model(file, model_format)
+    return tvmc_model.mod, tvmc_model.params
+
+
 def main():
     test_file = inspect.getsourcefile(sys._getframe(1))
     sys.exit(pytest.main([test_file] + sys.argv[1:]))
diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py
index d26b047e81210..37b64433b23ee 100644
--- a/tests/lint/check_file_type.py
+++ b/tests/lint/check_file_type.py
@@ -140,7 +140,6 @@
     "tests/micro/testdata/mnist/digit-2.jpg",
     "tests/micro/testdata/mnist/digit-9.jpg",
     "tests/micro/testdata/mnist/mnist-8.onnx",
-    "tests/micro/testdata/kws/yes_no.tflite",
     # microTVM Zephyr runtime
     "apps/microtvm/zephyr/template_project/CMakeLists.txt.template",
     "apps/microtvm/zephyr/template_project/qemu-hack/qemu-system-arm",
diff --git a/tests/micro/arduino/test_utils.py b/tests/micro/arduino/test_utils.py
index c107d5b1febfc..20e7d9e75001c 100644
--- a/tests/micro/arduino/test_utils.py
+++ b/tests/micro/arduino/test_utils.py
@@ -25,7 +25,7 @@
 from tvm.micro import project
 from tvm import relay
 from tvm.relay.backend import Executor, Runtime
-
+from tvm.testing.utils import fetch_model_from_url
 
 TEMPLATE_PROJECT_DIR = pathlib.Path(tvm.micro.get_microtvm_template_projects("arduino"))
 
@@ -66,20 +66,12 @@ def make_kws_project(board, arduino_cli_cmd, tvm_debug, workspace_dir):
     model = ARDUINO_BOARDS[board]
     build_config = {"debug": tvm_debug}
 
-    with open(this_dir.parent / "testdata" / "kws" / "yes_no.tflite", "rb") as f:
-        tflite_model_buf = f.read()
-
-    # TFLite.Model.Model has changed to TFLite.Model from 1.14 to 2.1
-    try:
-        import tflite.Model
-
-        tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
-    except AttributeError:
-        import tflite
-
-        tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
+    mod, params = fetch_model_from_url(
+        url="https://github.com/tensorflow/tflite-micro/raw/main/tensorflow/lite/micro/examples/micro_speech/micro_speech.tflite",
+        model_format="tflite",
+        sha256="09e5e2a9dfb2d8ed78802bf18ce297bff54281a66ca18e0c23d69ca14f822a83",
+    )
 
-    mod, params = relay.frontend.from_tflite(tflite_model)
     target = tvm.target.target.micro(model)
     runtime = Runtime("crt")
     executor = Executor("aot", {"unpacked-api": True})
diff --git a/tests/micro/common/conftest.py b/tests/micro/common/conftest.py
index 3fbfdbcbc81d2..10dda8774bca6 100644
--- a/tests/micro/common/conftest.py
+++ b/tests/micro/common/conftest.py
@@ -21,11 +21,17 @@
 
 
 def pytest_addoption(parser):
+    parser.addoption(
+        "--platform",
+        required=True,
+        choices=["arduino", "zephyr"],
+        help="Platform to run tests with",
+    )
     parser.addoption(
         "--board",
         required=True,
         choices=list(ARDUINO_BOARDS.keys()) + list(ZEPHYR_BOARDS.keys()),
-        help="microTVM boards for tests.",
+        help="microTVM boards for tests",
     )
     parser.addoption(
         "--test-build-only",
@@ -34,6 +40,11 @@
     )
 
 
+@pytest.fixture
+def platform(request):
+    return request.config.getoption("--platform")
+
+
 @pytest.fixture
 def board(request):
     return request.config.getoption("--board")
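Taken together, the helpers introduced above form a tune, compile, evaluate pipeline, which the new test below exercises end to end. As a minimal standalone sketch (not part of the patch; it assumes an attached or emulated board, a Relay `mod`/`params` pair such as the one returned by `fetch_model_from_url`, and example platform/board values):

    import numpy as np
    import tvm
    import tvm.micro.testing

    platform, board = "zephyr", "qemu_x86"  # example values; any supported pair works
    target = tvm.micro.testing.get_target(platform, board)
    tune_logs = tvm.micro.testing.tune_model(platform, board, target, mod, params, num_trials=2)
    with tvm.micro.testing.create_aot_session(
        platform, board, target, mod, params, tune_logs=tune_logs
    ) as session:
        aot_executor = tvm.runtime.executor.aot_executor.AotModule(session.create_aot_executor())
        samples = [np.zeros((1, 1960), dtype=np.int8)]  # input shape of the KWS model used below
        labels = [0]
        mean_time, accuracy = tvm.micro.testing.evaluate_model_accuracy(
            session, aot_executor, samples, labels
        )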
diff --git a/tests/micro/common/test_autotune.py b/tests/micro/common/test_autotune.py
new file mode 100644
index 0000000000000..37836563a069c
--- /dev/null
+++ b/tests/micro/common/test_autotune.py
@@ -0,0 +1,96 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from io import StringIO
+import json
+from pathlib import Path
+import sys
+import tempfile
+from typing import Union
+
+import numpy as np
+import pytest
+
+import tvm
+import tvm.testing
+import tvm.micro.testing
+from tvm.testing.utils import fetch_model_from_url
+
+TUNING_RUNS_PER_OPERATOR = 2
+
+
+@pytest.mark.requires_hardware
+@tvm.testing.requires_micro
+def test_kws_autotune_workflow(platform, board, tmp_path):
+    mod, params = fetch_model_from_url(
+        url="https://github.com/tensorflow/tflite-micro/raw/main/tensorflow/lite/micro/examples/micro_speech/micro_speech.tflite",
+        model_format="tflite",
+        sha256="09e5e2a9dfb2d8ed78802bf18ce297bff54281a66ca18e0c23d69ca14f822a83",
+    )
+    target = tvm.micro.testing.get_target(platform, board)
+
+    str_io_logs = tvm.micro.testing.tune_model(
+        platform, board, target, mod, params, TUNING_RUNS_PER_OPERATOR
+    )
+    assert isinstance(str_io_logs, StringIO)
+
+    str_logs = str_io_logs.getvalue().rstrip().split("\n")
+    logs = list(map(json.loads, str_logs))
+    assert len(logs) == 2 * TUNING_RUNS_PER_OPERATOR  # Two operators
+
+    # Check we tested both operators
+    op_names = list(map(lambda x: x["input"][1], logs))
+    assert op_names[0] == op_names[1] == "dense_nopack.x86"
+    assert op_names[2] == op_names[3] == "dense_pack.x86"
+
+    # Make sure we tested different code. != does deep comparison in Python 3
+    assert logs[0]["config"]["index"] != logs[1]["config"]["index"]
+    assert logs[0]["config"]["entity"] != logs[1]["config"]["entity"]
+    assert logs[2]["config"]["index"] != logs[3]["config"]["index"]
+    assert logs[2]["config"]["entity"] != logs[3]["config"]["entity"]
+
+    # Compile the best model with AOT and connect to it
+    with tvm.micro.testing.create_aot_session(
+        platform,
+        board,
+        target,
+        mod,
+        params,
+        build_dir=tmp_path,
+        tune_logs=str_io_logs,
+    ) as session:
+        aot_executor = tvm.runtime.executor.aot_executor.AotModule(session.create_aot_executor())
+
+        samples = (
+            np.random.randint(low=-127, high=128, size=(1, 1960), dtype=np.int8) for x in range(3)
+        )
+
+        labels = [0, 0, 0]
+
+        # Validate performance across random runs
+        time, acc = tvm.micro.testing.evaluate_model_accuracy(
+            session, aot_executor, samples, labels, runs_per_sample=20
+        )
+        # `time` is the average time taken to execute model inference on the
+        # device, measured in seconds. It does not include the time to upload
+        # the input data via RPC. On slow boards like the Arduino Due, time
+        # is around 0.12 (120 ms), so this gives us plenty of buffer.
+        assert time < 1
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
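The log-format assertions in the test above reach into AutoTVM's JSON records, whose layout is an implementation detail rather than a stable API. Roughly, the fields being checked look like this (a sketch; `tune_logs` is the StringIO returned by tune_model):

    import json

    first_line = tune_logs.getvalue().splitlines()[0]
    record = json.loads(first_line)
    op_name = record["input"][1]             # e.g. "dense_nopack.x86"
    cfg_index = record["config"]["index"]    # which point in the tuning space was tried
    cfg_entity = record["config"]["entity"]  # the concrete knob settings for that point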
diff --git a/tests/micro/common/test_tvmc.py b/tests/micro/common/test_tvmc.py
index 24d0213b7754a..096e12393d43f 100644
--- a/tests/micro/common/test_tvmc.py
+++ b/tests/micro/common/test_tvmc.py
@@ -29,9 +29,6 @@
 import tvm.testing
 from tvm.contrib.download import download_testdata
 
-from ..zephyr.test_utils import ZEPHYR_BOARDS
-from ..arduino.test_utils import ARDUINO_BOARDS
-
 TVMC_COMMAND = [sys.executable, "-m", "tvm.driver.tvmc"]
 
 MODEL_URL = "https://github.com/tensorflow/tflite-micro/raw/main/tensorflow/lite/micro/examples/micro_speech/micro_speech.tflite"
@@ -47,22 +44,8 @@ def _run_tvmc(cmd_args: list, *args, **kwargs):
     return subprocess.check_call(cmd_args_list, *args, **kwargs)
 
 
-def _get_target_and_platform(board: str):
-    if board in ZEPHYR_BOARDS.keys():
-        target_model = ZEPHYR_BOARDS[board]
-        platform = "zephyr"
-    elif board in ARDUINO_BOARDS.keys():
-        target_model = ARDUINO_BOARDS[board]
-        platform = "arduino"
-    else:
-        raise ValueError(f"Board {board} is not supported.")
-
-    target = tvm.target.target.micro(target_model)
-    return str(target), platform
-
-
 @tvm.testing.requires_micro
-def test_tvmc_exist(board):
+def test_tvmc_exist(platform, board):
     cmd_result = _run_tvmc(["micro", "-h"])
     assert cmd_result == 0
 
@@ -72,8 +55,8 @@
     "output_dir,",
     [pathlib.Path("./tvmc_relative_path_test"), pathlib.Path(tempfile.mkdtemp())],
 )
-def test_tvmc_model_build_only(board, output_dir):
-    target, platform = _get_target_and_platform(board)
+def test_tvmc_model_build_only(platform, board, output_dir):
+    target = tvm.micro.testing.get_target(platform, board)
 
     if not os.path.isabs(output_dir):
         out_dir_temp = os.path.abspath(output_dir)
@@ -138,8 +121,8 @@
     "output_dir,",
     [pathlib.Path("./tvmc_relative_path_test"), pathlib.Path(tempfile.mkdtemp())],
 )
-def test_tvmc_model_run(board, output_dir):
-    target, platform = _get_target_and_platform(board)
+def test_tvmc_model_run(platform, board, output_dir):
+    target = tvm.micro.testing.get_target(platform, board)
 
     if not os.path.isabs(output_dir):
         out_dir_temp = os.path.abspath(output_dir)
diff --git a/tests/micro/testdata/kws/yes_no.tflite b/tests/micro/testdata/kws/yes_no.tflite
deleted file mode 100644
index 4f533dac840504c65ac16ee7f6db8001d3b0d76a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 18712
[base85 payload of the deleted 18712-byte yes_no.tflite binary omitted]
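With the binary test model removed from the tree, tests now pin models by URL and digest through the new fetch_model_from_url helper. A sketch of adopting it for another model (the path and URL below are placeholders, not real artifacts):

    import hashlib
    from tvm.testing.utils import fetch_model_from_url

    # Compute the digest once, offline, for the model being pinned (placeholder path).
    with open("my_model.tflite", "rb") as f:
        digest = hashlib.sha256(f.read()).hexdigest()

    mod, params = fetch_model_from_url(
        url="https://example.com/my_model.tflite",  # placeholder URL
        model_format="tflite",
        sha256=digest,
    )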
Date: Tue, 5 Jul 2022 17:11:57 -0700
Subject: [PATCH 053/111] add aten::randn (#11994)

---
 python/tvm/relay/frontend/pytorch.py          |  9 +++++++++
 tests/python/frontend/pytorch/test_forward.py | 12 ++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index cb5392fa16abe..b1a760886037d 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -2504,6 +2504,14 @@ def empty_like(self, inputs, input_types):
         dtype = input_types[0]
         return _op.zeros(shape, dtype)
 
+    def randn(self, inputs, input_types):
+        import time  # use current time as seed
+
+        shape = inputs[0]
+        output = _op.random.normal(_op.random.threefry_key(int(time.time())), shape)
+        _, values = _expr.TupleWrapper(output, 2)
+        return values
+
     def bincount(self, inputs, input_types):
         data = inputs[0]
         weights = inputs[1]
@@ -3415,6 +3423,7 @@ def create_convert_map(self):
         "aten::numel": self.numel,
         "aten::empty": self.empty,
         "aten::empty_like": self.empty_like,
+        "aten::randn": self.randn,
         "aten::bincount": self.bincount,
         "aten::scatter_add": self.scatter_add,
         "aten::__not__": self.logical_not,
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index 80a5cd07f7b61..30ba713396572 100644
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -3895,6 +3895,18 @@ def test_func(data):
     verify_model_with_input(test_func, [torch.rand([1, 3, 10, 10]).float()], assert_shape_only=True)
 
 
+def test_randn():
+    def test_func():
+        return torch.randn([1, 3, 10, 10])
+
+    verify_model_with_input(test_func, [], assert_shape_only=True)
+
+    def test_func1():
+        return torch.randn(1, 3, 10, 10)
+
+    verify_model_with_input(test_func1, [], assert_shape_only=True)
+
+
 def test_forward_pretrained_bert_base_uncased():
     ######################################################################
     # This is an example how to run BERT models using TVM
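The randn converter above seeds Relay's threefry generator with wall-clock time, so its output is deliberately non-deterministic; that is why the new tests check shapes only (assert_shape_only=True). A standalone sketch mirroring the converter body (the shape is an arbitrary example):

    import time

    from tvm.relay import expr as _expr
    from tvm.relay import op as _op

    shape = [1, 3, 10, 10]                           # arbitrary example shape
    key = _op.random.threefry_key(int(time.time()))  # time-based seed, as in the converter
    output = _op.random.normal(key, shape)           # yields a (new_key, values) tuple
    _, values = _expr.TupleWrapper(output, 2)        # keep only the random values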
From ffd8c9377f09c7da3ddb0f673d8819535b409368 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek
Date: Tue, 5 Jul 2022 20:49:55 -0500
Subject: [PATCH 054/111] [TIR] Make conversion from Integer to int64_t
 explicit (#12010)

* [TIR] Make conversion from Integer to int64_t explicit

* Fix compilation errors

* Fix compilation issues in cpptest

* Fix SPIRV compilation errors
---
 include/tvm/ir/attrs.h                        |  2 +-
 include/tvm/ir/expr.h                         |  2 +-
 include/tvm/relay/feature.h                   |  2 +-
 include/tvm/topi/cuda/injective.h             |  2 +-
 include/tvm/topi/cuda/pooling.h               |  2 +-
 include/tvm/topi/cuda/reduction.h             |  2 +-
 include/tvm/topi/detail/strided_slice.h       | 16 ++---
 include/tvm/topi/transform.h                  |  4 +-
 src/auto_scheduler/transform_step.cc          | 36 +++++-----
 src/contrib/ethosu/cascader/parts/ethosu.cc   |  5 +-
 src/meta_schedule/arg_info.cc                 |  5 +-
 src/meta_schedule/database/json_database.cc   |  2 +-
 .../postproc/rewrite_unbound_block.cc         |  2 +-
 src/meta_schedule/postproc/verify_gpu_code.cc |  2 +-
 src/meta_schedule/schedule_rule/auto_bind.cc  |  2 +-
 .../schedule_rule/multi_level_tiling.cc       |  4 +-
 src/meta_schedule/utils.h                     |  2 +-
 src/parser/parser.cc                          |  2 +-
 src/parser/token.h                            |  4 +-
 .../analysis/extract_fake_quantized_ops.cc    |  2 +-
 src/relay/analysis/extract_operators.cc       |  2 +-
 src/relay/backend/build_module.cc             |  4 +-
 .../backend/contrib/ethosu/source_module.cc   |  2 +-
 src/relay/backend/contrib/tensorrt/codegen.cc |  4 +-
 src/relay/backend/utils.cc                    |  4 +-
 src/relay/ir/expr.cc                          |  2 +-
 src/relay/op/tensor/transform.cc              | 66 ++++++++++---------
 src/relay/op/vision/yolo.cc                   |  2 +-
 src/relay/qnn/op/requantize.cc                |  2 +-
 src/relay/transforms/fuse_ops.cc              |  3 +-
 src/relay/transforms/simplify_expr.cc         |  9 ++-
 src/target/build_common.h                     |  2 +-
 src/target/llvm/llvm_common.cc                |  2 +-
 src/target/metadata.h                         |  2 +-
 src/target/metadata_module.cc                 |  2 +-
 src/target/source/codegen_metal.cc            |  2 +-
 src/target/source/interface_c.cc              |  4 +-
 src/target/source/source_module.cc            |  8 +--
 src/target/spirv/build_vulkan.cc              |  5 +-
 src/target/spirv/spirv_support.cc             | 18 +++--
 src/target/target.cc                          |  2 +-
 src/tir/analysis/calculate_workspace.cc       |  3 +-
 src/tir/contrib/ethosu/passes.cc              |  2 +-
 .../schedule/primitive/cache_read_write.cc    |  2 +-
 .../primitive/layout_transformation.cc        |  4 +-
 src/tir/schedule/primitive/sampling.cc        |  2 +-
 src/tir/schedule/transform.cc                 |  2 +-
 .../transforms/inject_software_pipeline.cc    |  2 +-
 src/tir/transforms/lower_thread_allreduce.cc  |  3 +-
 src/tir/transforms/lower_warp_memory.cc       |  2 +-
 src/tir/usmp/algo/greedy.cc                   |  7 +-
 src/tir/usmp/algo/hill_climb.cc               | 13 ++--
 src/tir/usmp/analysis/extract_buffer_info.cc  | 12 ++--
 .../convert_pool_allocations_to_offsets.cc    |  4 +-
 src/tir/usmp/utils.cc                         |  4 +-
 tests/cpp/container_test.cc                   | 14 +++-
 56 files changed, 179 insertions(+), 144 deletions(-)

diff --git a/include/tvm/ir/attrs.h b/include/tvm/ir/attrs.h
index d2eda659a5d1e..35afed7dd267f 100644
--- a/include/tvm/ir/attrs.h
+++ b/include/tvm/ir/attrs.h
@@ -296,7 +296,7 @@ class DictAttrs : public Attrs {
    * \endcode
    */
   bool HasNonzeroAttr(const std::string& attr_key) const {
-    return GetAttr<Integer>(attr_key, 0) != 0;
+    return GetAttr<Integer>(attr_key, 0).value_or(0).IntValue() != 0;
   }
TVM_DEFINE_OBJECT_REF_METHODS(DictAttrs, Attrs, DictAttrsNode); diff --git a/include/tvm/ir/expr.h b/include/tvm/ir/expr.h index b54a067e1c941..b2cfc295b6b52 100644 --- a/include/tvm/ir/expr.h +++ b/include/tvm/ir/expr.h @@ -438,7 +438,7 @@ class Integer : public IntImm { /*! * \brief convert to int64_t */ - operator int64_t() const { + int64_t IntValue() const { ICHECK(data_ != nullptr) << " Trying to reference a null Integer"; return (*this)->value; } diff --git a/include/tvm/relay/feature.h b/include/tvm/relay/feature.h index 751593f94cc0b..136dcfa87c682 100644 --- a/include/tvm/relay/feature.h +++ b/include/tvm/relay/feature.h @@ -68,7 +68,7 @@ class FeatureSet { explicit FeatureSet(Feature ft) { bs_.set(static_cast(ft)); } explicit FeatureSet(const tvm::Array& ft) { for (Integer i : ft) { - (*this) += Feature(static_cast(i)); + *this += Feature(i.IntValue()); } } explicit operator Array() const { diff --git a/include/tvm/topi/cuda/injective.h b/include/tvm/topi/cuda/injective.h index 010fa2ce85671..79ec338aae0e5 100644 --- a/include/tvm/topi/cuda/injective.h +++ b/include/tvm/topi/cuda/injective.h @@ -48,7 +48,7 @@ namespace cuda { inline Schedule schedule_injective_from_existing(Schedule sch, const Tensor& out) { auto fused = detail::Fuse(sch[out], sch[out]->op.as()->axis); auto target = Target::Current(false); - int num_thread = target->GetAttr("max_num_threads").value(); + int num_thread = target->GetAttr("max_num_threads").value().IntValue(); IterVar bx, tx; sch[out].split(fused, num_thread, &bx, &tx); sch[out].bind(bx, thread_axis(Range(), "blockIdx.x")); diff --git a/include/tvm/topi/cuda/pooling.h b/include/tvm/topi/cuda/pooling.h index 0bb9df4a35d18..92be03123602d 100644 --- a/include/tvm/topi/cuda/pooling.h +++ b/include/tvm/topi/cuda/pooling.h @@ -57,7 +57,7 @@ inline Schedule schedule_pool(const Target& target, const Array& outs) { if (padded_input->op->IsInstance()) { s[padded_input].compute_inline(); } - int num_thread = target->GetAttr("max_num_threads").value(); + int num_thread = target->GetAttr("max_num_threads").value().IntValue(); Tensor out; Tensor OL; if (detail::contains(s->outputs, pool->op)) { diff --git a/include/tvm/topi/cuda/reduction.h b/include/tvm/topi/cuda/reduction.h index 51f35ed8dc25c..b1905d844250a 100644 --- a/include/tvm/topi/cuda/reduction.h +++ b/include/tvm/topi/cuda/reduction.h @@ -80,7 +80,7 @@ Schedule ScheduleReduce(const Target& target, Operation op, Schedule sch, thread_y = tvm::te::thread_axis(Range(0, num_thread), "threadIdx.y"); } else { all_reduce = true; - num_thread = target->GetAttr("max_num_threads").value(); + num_thread = target->GetAttr("max_num_threads").value().IntValue(); thread_x = tvm::te::thread_axis(Range(0, num_thread), "threadIdx.x"); } diff --git a/include/tvm/topi/detail/strided_slice.h b/include/tvm/topi/detail/strided_slice.h index da76022c552bc..a69f8f99ae38b 100644 --- a/include/tvm/topi/detail/strided_slice.h +++ b/include/tvm/topi/detail/strided_slice.h @@ -95,12 +95,12 @@ inline Array StridedSliceCanonicalizeBegin(const Array& isha std::string slice_mode = "end") { Array begin_expr; for (size_t i = 0; i < axes.size(); ++i) { - if (ishape[axes[i]]->IsInstance()) { - int64_t dim_i = GetConstInt(ishape[axes[i]]); + if (ishape[axes[i].IntValue()]->IsInstance()) { + int64_t dim_i = GetConstInt(ishape[axes[i].IntValue()]); int64_t begin_i = CanonicalizeIndex(begin[i], dim_i, strides[i]); begin_expr.push_back(make_const(dtype, begin_i)); } else { - auto idim = ishape[axes[i]]; + auto idim = ishape[axes[i].IntValue()]; 
auto b_expr = make_const(dtype, begin[i]); PrimExpr b = begin[i] < 0 ? b_expr + idim : b_expr; auto s = strides[i]; @@ -129,8 +129,8 @@ inline Array StridedSliceOutputShape(const Array& ishape, } for (size_t i = 0; i < axes.size(); ++i) { - if (ishape[axes[i]]->IsInstance()) { - const int64_t dim_i = GetConstInt(ishape[axes[i]]); + if (ishape[axes[i].IntValue()]->IsInstance()) { + const int64_t dim_i = GetConstInt(ishape[axes[i].IntValue()]); ICHECK(begin_canonicalized[i]->IsInstance()); int64_t begin_i = GetConstInt(begin_canonicalized[i]); int64_t end_i = CanonicalizeIndex(end[i], dim_i, strides[i]); @@ -139,11 +139,11 @@ inline Array StridedSliceOutputShape(const Array& ishape, static_cast((interval + std::abs(strides[i]) - 1) / std::abs(strides[i])); ICHECK(strides[i] < 0 ? (end_i <= begin_i) : (begin_i <= end_i)) << ": Input [Begin=" << begin[i] << ", End=" << end[i] << "] is invalid for axis=" << i; - out_shape.Set(axes[i], cast(out_shape[i].dtype(), PrimExpr(slice_size))); + out_shape.Set(axes[i].IntValue(), cast(out_shape[i].dtype(), PrimExpr(slice_size))); } else if (use_any) { - out_shape.Set(axes[i], tvm::tir::Any()); + out_shape.Set(axes[i].IntValue(), tvm::tir::Any()); } else { - out_shape.Set(axes[i], tvm::tir::Var("dim", out_shape[i]->dtype)); + out_shape.Set(axes[i].IntValue(), tvm::tir::Var("dim", out_shape[i]->dtype)); } } diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h index 75070e119f1f6..86a8856469519 100644 --- a/include/tvm/topi/transform.h +++ b/include/tvm/topi/transform.h @@ -790,8 +790,8 @@ inline Tensor strided_slice_with_axes(const Tensor& x, const Array& beg for (size_t i = 0; i < out_shape.size(); ++i) real_indices.push_back(indices[i]); for (size_t i = 0; i < axes.size(); ++i) { auto stride = make_const(strides[i].dtype(), strides_vec[i]); - PrimExpr ind = indices[axes[i]] * stride + begin_expr[i]; - real_indices.Set(axes[i], ind); + PrimExpr ind = indices[axes[i].IntValue()] * stride + begin_expr[i]; + real_indices.Set(axes[i].IntValue(), ind); } return x(real_indices); }, diff --git a/src/auto_scheduler/transform_step.cc b/src/auto_scheduler/transform_step.cc index b67d5cdd7bd93..b821cf892aa78 100644 --- a/src/auto_scheduler/transform_step.cc +++ b/src/auto_scheduler/transform_step.cc @@ -501,10 +501,9 @@ Iterator FuseStepNode::ApplyToState(State* state) const { if (i > 0) { ICHECK_EQ(fused_ids[i]->value, fused_ids[i - 1]->value + 1); } - if (i != fused_ids.size() - 1) { const auto& iter_to_attached_stage = (*state)->attach_map->iter_to_attached_stages; - if (iter_to_attached_stage.find(std::make_pair(stage_id, fused_ids[i])) != + if (iter_to_attached_stage.find(std::make_pair(stage_id, fused_ids[i].IntValue())) != iter_to_attached_stage.end()) { LOG(FATAL) << "Invalid Fuse. Trying to fuse iterators that have been attached by some " << "stages. 
State before fusion:\n" @@ -512,7 +511,7 @@ Iterator FuseStepNode::ApplyToState(State* state) const { } } - const Iterator& it = stage->iters[fused_ids[i]]; + const Iterator& it = stage->iters[fused_ids[i].IntValue()]; orig_iters.push_back(it); new_name = new_name + it->name + "@"; @@ -543,9 +542,9 @@ Iterator FuseStepNode::ApplyToState(State* state) const { new_iters.push_back(new_it); } else { new_iters.insert(new_iters.end(), stage->iters.begin(), - stage->iters.begin() + fused_ids.front()); + stage->iters.begin() + fused_ids.front().IntValue()); new_iters.push_back(new_it); - new_iters.insert(new_iters.end(), stage->iters.begin() + fused_ids.back() + 1, + new_iters.insert(new_iters.end(), stage->iters.begin() + fused_ids.back().IntValue() + 1, stage->iters.end()); } @@ -561,7 +560,7 @@ Iterator FuseStepNode::ApplyToState(State* state) const { // The original iterators in AttachMap will be updated with the new iterators std::vector from_iters; std::vector to_iters; - const size_t begin_id = fused_ids.front(), end_id = fused_ids.back(); + const size_t begin_id = fused_ids.front().IntValue(), end_id = fused_ids.back().IntValue(); for (size_t i = 0; i < old_iter_size; ++i) { if (i <= begin_id) { continue; @@ -587,7 +586,7 @@ IterVar FuseStepNode::ApplyToSchedule(Array* stages, Array to_fuse; for (const auto& i : fused_ids) { - to_fuse.push_back(axes[i]); + to_fuse.push_back(axes[i.IntValue()]); } IterVar fused_axis; stage.fuse(to_fuse, &fused_axis); @@ -596,9 +595,9 @@ IterVar FuseStepNode::ApplyToSchedule(Array* stages, if (fused_ids.empty()) { new_axes.push_back(fused_axis); } else { - new_axes.insert(new_axes.end(), axes.begin(), axes.begin() + fused_ids.front()); + new_axes.insert(new_axes.end(), axes.begin(), axes.begin() + fused_ids.front().IntValue()); new_axes.push_back(fused_axis); - new_axes.insert(new_axes.end(), axes.begin() + fused_ids.back() + 1, axes.end()); + new_axes.insert(new_axes.end(), axes.begin() + fused_ids.back().IntValue() + 1, axes.end()); } stage_to_axes->Set(stage, std::move(new_axes)); @@ -613,7 +612,8 @@ String FuseStepNode::PrintAsPythonAPI(Array* stages, std::stringstream to_fuse; for (size_t i = 0; i < fused_ids.size(); ++i) { - to_fuse << CleanName(stage_to_axes->at(stage)[fused_ids[i]]->var->name_hint, op_name); + to_fuse << CleanName(stage_to_axes->at(stage)[fused_ids[i].IntValue()]->var->name_hint, + op_name); if (i != fused_ids.size() - 1) { to_fuse << ", "; } @@ -773,7 +773,7 @@ void ReorderStepNode::ApplyToState(State* state) const { const Stage& stage = (*state)->stages[stage_id]; Array iters; for (auto x : after_ids) { - iters.push_back(stage->iters[x]); + iters.push_back(stage->iters[x.IntValue()]); } state->CopyOnWrite()->stages.Set( stage_id, Stage(stage->op, stage->op_type, iters, stage->compute_at, stage->attrs)); @@ -788,7 +788,7 @@ void ReorderStepNode::ApplyToSchedule(Array* stages, Array new_axes; new_axes.reserve(axes.size()); for (auto i : after_ids) { - new_axes.push_back(axes[i]); + new_axes.push_back(axes[i.IntValue()]); } stage.reorder(new_axes); @@ -804,7 +804,7 @@ String ReorderStepNode::PrintAsPythonAPI(Array* stages, ss << "s[" << op_name << "].reorder("; for (size_t i = 0; i < after_ids.size(); ++i) { - ss << CleanName((*stage_to_axes)[stage][after_ids[i]]->var->name_hint, op_name); + ss << CleanName((*stage_to_axes)[stage][after_ids[i].IntValue()]->var->name_hint, op_name); if (i != after_ids.size() - 1) { ss << ", "; } @@ -1180,10 +1180,10 @@ Optional FollowFusedSplitStepNode::ExtractSplitLength( const Array& transform_steps) 
const { PrimExpr ret(1); - for (int src_step_id : src_step_ids) { + for (auto src_step_id : src_step_ids) { // Make sure the src_step_id is within the range of transform_steps. - ICHECK_LT(src_step_id, transform_steps.size()); - auto ps = transform_steps[src_step_id].as(); + ICHECK_LT(src_step_id.IntValue(), transform_steps.size()); + auto ps = transform_steps[src_step_id.IntValue()].as(); ICHECK(ps != nullptr); // Multiple the splitting factor on corresponding splitting level of src_steps. if (ps->lengths[level] && ret.defined()) { @@ -1572,7 +1572,7 @@ te::Tensor CacheReadStepNode::ApplyToSchedule(Array* stages, const te::Stage& stage = (*stages)[stage_id]; Array readers; for (const auto& i : reader_stage_ids) { - readers.push_back((*stages)[i]->origin_op); + readers.push_back((*stages)[i.IntValue()]->origin_op); } auto out = schedule->cache_read(stage->origin_op.output(0), scope_name, readers); @@ -1591,7 +1591,7 @@ String CacheReadStepNode::PrintAsPythonAPI(Array* stages, StageToAxes auto stage = (*stages)[stage_id]; Array reader_stages; for (size_t i = 0; i < reader_stage_ids.size(); ++i) { - reader_stages.push_back((*stages)[reader_stage_ids[i]]); + reader_stages.push_back((*stages)[reader_stage_ids[i].IntValue()]); } auto out = ApplyToSchedule(stages, stage_to_axes, schedule); diff --git a/src/contrib/ethosu/cascader/parts/ethosu.cc b/src/contrib/ethosu/cascader/parts/ethosu.cc index 33d9b3b452dff..4fb6dbd052033 100644 --- a/src/contrib/ethosu/cascader/parts/ethosu.cc +++ b/src/contrib/ethosu/cascader/parts/ethosu.cc @@ -181,7 +181,10 @@ TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.EthosuPart") Array valid_block_configs, int weight_tensor_idx) { std::vector vsubgraph_inputs(subgraph_inputs.begin(), subgraph_inputs.end()); std::vector vpropagators(propagators.begin(), propagators.end()); - std::vector voutput_quantum(output_quantum.begin(), output_quantum.end()); + std::vector voutput_quantum; + std::transform(output_quantum.begin(), output_quantum.end(), + std::back_inserter(voutput_quantum), + [](auto&& val) { return val.IntValue(); }); TESubgraph subgraph; subgraph.input_tensors = vsubgraph_inputs; subgraph.output_tensor = subgraph_output; diff --git a/src/meta_schedule/arg_info.cc b/src/meta_schedule/arg_info.cc index 21de9d719d00d..84d861cb59c3d 100644 --- a/src/meta_schedule/arg_info.cc +++ b/src/meta_schedule/arg_info.cc @@ -142,7 +142,10 @@ TensorInfo TensorInfo::FromJSON(const ObjectRef& json_obj) { LOG(FATAL) << "ValueError: Unable to parse the JSON object: " << json_obj << "\nThe error is: " << e.what(); } - return TensorInfo(DataType(dtype), ShapeTuple(shape.begin(), shape.end())); + std::vector s; + std::transform(shape.begin(), shape.end(), std::back_inserter(s), + [](Integer i) { return i.IntValue(); }); + return TensorInfo(DataType(dtype), ShapeTuple(s.begin(), s.end())); } /******** Repr ********/ diff --git a/src/meta_schedule/database/json_database.cc b/src/meta_schedule/database/json_database.cc index 5e7c9119c95ac..a55ffa8b283af 100644 --- a/src/meta_schedule/database/json_database.cc +++ b/src/meta_schedule/database/json_database.cc @@ -198,7 +198,7 @@ Database Database::JSONDatabase(String path_workload, String path_tuning_record, try { const ArrayNode* arr = json_obj.as(); ICHECK_EQ(arr->size(), 2); - workload = workloads[Downcast(arr->at(0))]; + workload = workloads[Downcast(arr->at(0)).IntValue()]; records[task_id] = TuningRecord::FromJSON(arr->at(1), workload); } catch (std::runtime_error& e) { LOG(FATAL) << "ValueError: Unable to parse TuningRecord, on 
line " << (task_id + 1) diff --git a/src/meta_schedule/postproc/rewrite_unbound_block.cc b/src/meta_schedule/postproc/rewrite_unbound_block.cc index 183f04e7ba239..eb57e90f82f66 100644 --- a/src/meta_schedule/postproc/rewrite_unbound_block.cc +++ b/src/meta_schedule/postproc/rewrite_unbound_block.cc @@ -91,7 +91,7 @@ class RewriteUnboundBlockNode : public PostprocNode { context->target.value()->GetAttr("max_threads_per_block"); CHECK(max_threads_per_block.defined()) << "ValueError: missing attribute `max_threads_per_block` in the target"; - this->max_threads_per_block_ = max_threads_per_block.value(); + this->max_threads_per_block_ = max_threads_per_block.value().IntValue(); } // Inherited from PostprocNode diff --git a/src/meta_schedule/postproc/verify_gpu_code.cc b/src/meta_schedule/postproc/verify_gpu_code.cc index 57e58e6a79ffb..857b732c98047 100644 --- a/src/meta_schedule/postproc/verify_gpu_code.cc +++ b/src/meta_schedule/postproc/verify_gpu_code.cc @@ -125,7 +125,7 @@ class VerifyGPUCodeNode : public PostprocNode { {"max_vthread", Integer(8)}, {"max_vector_bytes", Integer(16)}, }; - thread_warp_size_ = Extract(target, "thread_warp_size"); + thread_warp_size_ = Extract(target, "thread_warp_size").IntValue(); } bool Verify(const IRModule& mod) const { diff --git a/src/meta_schedule/schedule_rule/auto_bind.cc b/src/meta_schedule/schedule_rule/auto_bind.cc index 2bc90f3c2e5cf..a67432ebc5da6 100644 --- a/src/meta_schedule/schedule_rule/auto_bind.cc +++ b/src/meta_schedule/schedule_rule/auto_bind.cc @@ -168,7 +168,7 @@ class AutoBindNode : public ScheduleRuleNode { context->target.value()->GetAttr("max_threads_per_block"); CHECK(max_threads_per_block.defined()) << "ValueError: missing attribute `max_threads_per_block` in the target"; - this->max_threads_per_block_ = max_threads_per_block.value(); + this->max_threads_per_block_ = max_threads_per_block.value().IntValue(); } // Inherited from ScheduleRuleNode diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling.cc b/src/meta_schedule/schedule_rule/multi_level_tiling.cc index 28c1a0fdb66e2..2f2eb219e8c79 100644 --- a/src/meta_schedule/schedule_rule/multi_level_tiling.cc +++ b/src/meta_schedule/schedule_rule/multi_level_tiling.cc @@ -118,7 +118,9 @@ std::vector MultiLevelTilingNode::AddWriteReuse(State state) const { if (Optional> ann = tir::GetAnn>( state->sch->GetSRef(state->block_rv), "meta_schedule.write_cache_level")) { req = ReuseType::kMustReuse; - levels = std::vector(ann.value().begin(), ann.value().end()); + levels.clear(); + std::transform(ann.value().begin(), ann.value().end(), std::back_inserter(levels), + [](auto&& v) { return v.IntValue(); }); } std::vector results; if (req == ReuseType::kMayReuse) { diff --git a/src/meta_schedule/utils.h b/src/meta_schedule/utils.h index b5cb73c26e001..e3d726652e0b2 100644 --- a/src/meta_schedule/utils.h +++ b/src/meta_schedule/utils.h @@ -328,7 +328,7 @@ struct ThreadedTraceApply { * \return The number of cores. 
*/ inline int GetTargetNumCores(const Target& target) { - int num_cores = target->GetAttr("num-cores").value_or(-1); + int num_cores = target->GetAttr("num-cores").value_or(-1).IntValue(); if (num_cores == -1) { static const auto* f_cpu_count = runtime::Registry::Get("meta_schedule.cpu_count"); ICHECK(f_cpu_count) diff --git a/src/parser/parser.cc b/src/parser/parser.cc index f51e3e5c9737f..cd208eea5d5c9 100644 --- a/src/parser/parser.cc +++ b/src/parser/parser.cc @@ -1540,7 +1540,7 @@ class Parser { } case TokenType::kBoolean: { Consume(TokenType::kBoolean); - int64_t value = Downcast(next->data); + int64_t value = Downcast(next->data).IntValue(); Expr e = Constant(support::BoolToNDArray(value), next->span); ICHECK(e->span.defined()) << "constant spans must be defined"; return e; diff --git a/src/parser/token.h b/src/parser/token.h index 31e974355e4b8..14e553d358f4f 100644 --- a/src/parser/token.h +++ b/src/parser/token.h @@ -387,7 +387,9 @@ Token::Token(Span span, TokenType token_type, ObjectRef data) { Token Token::Null() { return Token(Span(SourceName(), 0, 0, 0, 0), TokenType::kNull); } -int64_t Token::ToNumber() const { return Downcast(this->operator->()->data); } +int64_t Token::ToNumber() const { + return Downcast(this->operator->()->data).IntValue(); +} std::string Token::ToString() const { return Downcast(this->operator->()->data); } diff --git a/src/relay/analysis/extract_fake_quantized_ops.cc b/src/relay/analysis/extract_fake_quantized_ops.cc index 68cee85f4305c..d66bbd635480b 100644 --- a/src/relay/analysis/extract_fake_quantized_ops.cc +++ b/src/relay/analysis/extract_fake_quantized_ops.cc @@ -55,7 +55,7 @@ class ExtractFakeQuantizedOpsWrapper : private MixedModeVisitor { if (op != dequantize_op_) { if (fake_quantized_op_freqs_.find(op->name) != fake_quantized_op_freqs_.end()) { fake_quantized_op_freqs_.Set(op->name, - int64_t(fake_quantized_op_freqs_.at(op->name)) + 1); + fake_quantized_op_freqs_.at(op->name).IntValue() + 1); } else { fake_quantized_op_freqs_.Set(op->name, 1); } diff --git a/src/relay/analysis/extract_operators.cc b/src/relay/analysis/extract_operators.cc index f150453ba0b66..051c1971f20e3 100644 --- a/src/relay/analysis/extract_operators.cc +++ b/src/relay/analysis/extract_operators.cc @@ -54,7 +54,7 @@ class OperatorExtractorWrapper : private MixedModeVisitor { auto it = operator_freqs_.find(op->name); ICHECK(it != operator_freqs_.end()) << "Call's OpNode must be visited and registered before access"; - operator_freqs_.Set(op->name, 1 + operator_freqs_.at(op->name)); + operator_freqs_.Set(op->name, 1 + operator_freqs_.at(op->name).IntValue()); } MixedModeVisitor::VisitExpr_(n); diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index 9a68b567305d1..39f2e7761a428 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -334,7 +334,9 @@ class RelayBuildModule : public runtime::ModuleNode { if (config_->optional_homogeneous_target.defined()) { // This pass currently only supports the homogeneous case. 
pass_seqs.push_back(transform::SplitArgs( - config_->optional_homogeneous_target->GetAttr("max_function_args", -1).value())); + config_->optional_homogeneous_target->GetAttr("max_function_args", -1) + .value() + .IntValue())); } // Always plan devices so the remaining passes don't need to distinguish homogeneous vs diff --git a/src/relay/backend/contrib/ethosu/source_module.cc b/src/relay/backend/contrib/ethosu/source_module.cc index eb4b779ecd815..f66ebd5ed2b29 100644 --- a/src/relay/backend/contrib/ethosu/source_module.cc +++ b/src/relay/backend/contrib/ethosu/source_module.cc @@ -199,7 +199,7 @@ class EthosUModuleNode : public ModuleNode { std::unordered_map param_idx_to_base_address; for (const relay::contrib::ethosu::BaseAddress& base_address : artifact->base_addresses) { if (base_address->primfunc_param_idx.defined()) { - param_idx_to_base_address[base_address->primfunc_param_idx] = base_address; + param_idx_to_base_address[base_address->primfunc_param_idx.IntValue()] = base_address; } } for (unsigned int i = 0; i < param_idx_to_base_address.size(); i++) { diff --git a/src/relay/backend/contrib/tensorrt/codegen.cc b/src/relay/backend/contrib/tensorrt/codegen.cc index 1c4a8d78062e7..f4babad50a3ec 100644 --- a/src/relay/backend/contrib/tensorrt/codegen.cc +++ b/src/relay/backend/contrib/tensorrt/codegen.cc @@ -291,8 +291,8 @@ class TensorRTJSONSerializer : public JSONSerializer { } ICHECK_EQ(target_attr.size(), 3); SetAttr(node, "tensorrt_version", - {std::to_string(target_attr[0]), std::to_string(target_attr[1]), - std::to_string(target_attr[2])}); + {std::to_string(target_attr[0]->value), std::to_string(target_attr[1]->value), + std::to_string(target_attr[2]->value)}); } { diff --git a/src/relay/backend/utils.cc b/src/relay/backend/utils.cc index bd3047e2862c1..fe8127d60dc9a 100644 --- a/src/relay/backend/utils.cc +++ b/src/relay/backend/utils.cc @@ -73,7 +73,7 @@ TVM_REGISTER_GLOBAL("relay.ir.StorageInfo") std::vector sids_v; sids_v.reserve(sids.size()); for (auto s : sids) { - sids_v.push_back(s); + sids_v.push_back(s.IntValue()); } std::vector virtual_devices_v; virtual_devices_v.reserve(device_types.size()); @@ -83,7 +83,7 @@ TVM_REGISTER_GLOBAL("relay.ir.StorageInfo") std::vector size_in_bytes_v; size_in_bytes_v.reserve(sizes_in_bytes.size()); for (auto s : sizes_in_bytes) { - size_in_bytes_v.push_back(s); + size_in_bytes_v.push_back(s.IntValue()); } return StorageInfo(std::move(sids_v), std::move(virtual_devices_v), std::move(size_in_bytes_v)); diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc index 85892e8223af1..5c85b3b29df79 100644 --- a/src/relay/ir/expr.cc +++ b/src/relay/ir/expr.cc @@ -401,7 +401,7 @@ TupleGetItem WithFields(TupleGetItem tuple_get_item, Optional opt_tuple, if (!unchanged) { TupleGetItemNode* cow_tuple_get_item_node = tuple_get_item.CopyOnWrite(); cow_tuple_get_item_node->tuple = tuple; - cow_tuple_get_item_node->index = index; + cow_tuple_get_item_node->index = index.IntValue(); cow_tuple_get_item_node->span = span; cow_tuple_get_item_node->virtual_device_ = virtual_device; } diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 4d5f52e61cf0d..989ab2ad25d36 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -370,7 +370,7 @@ bool StackRel(const Array& types, int num_inputs, const Attrs& attrs, const int ndim = static_cast(first->shape.size()); // Sanity check: axis - int axis = param->axis; + int axis = param->axis.IntValue(); ICHECK(-(ndim + 1) <= axis && axis < ndim + 1) << 
"stack only accepts `axis` in [-(ndim+1), ndim+1)" << ", but got axis = " << axis << ", and ndim = " << ndim; @@ -414,7 +414,7 @@ Array StackCompute(const Attrs& attrs, const Array& inpu const Type& out_type) { const StackAttrs* param = attrs.as(); ICHECK(param != nullptr); - return {topi::stack(inputs, param->axis)}; + return {topi::stack(inputs, param->axis.IntValue())}; } Expr MakeStack(Expr data, int axis) { @@ -473,7 +473,7 @@ bool TransposeRel(const Array& types, int num_inputs, const Attrs& attrs, } else { std::vector axis_used(ndim, 0); for (const Integer& e : axes) { - int64_t axis = e; + int64_t axis = e.IntValue(); // sanity check for axis and ndim ICHECK(-ndim <= axis && axis < ndim) << "transpose only allows each `axis` in `axes` in range [-data.ndim, data.ndim)" @@ -1337,10 +1337,11 @@ Array TakeCompute(const Attrs& attrs, const Array& input const auto* param = attrs.as(); ICHECK(param != nullptr); if (!param->axis.defined()) { - return Array{topi::take(inputs[0], inputs[1], param->batch_dims, param->mode)}; - } else { return Array{ - topi::take(inputs[0], inputs[1], param->batch_dims, param->axis, param->mode)}; + topi::take(inputs[0], inputs[1], param->batch_dims.IntValue(), param->mode)}; + } else { + return Array{topi::take(inputs[0], inputs[1], param->batch_dims.IntValue(), + param->axis.IntValue(), param->mode)}; } } @@ -1658,8 +1659,8 @@ bool RepeatRel(const Array& types, int num_inputs, const Attrs& attrs, } const auto* param = attrs.as(); const int ndim = static_cast(data->shape.size()); - const int repeats = param->repeats; - const int axis = param->axis; + const int repeats = param->repeats.IntValue(); + const int axis = param->axis.IntValue(); ICHECK(repeats >= 1) << "repeat only accepts `repeats >= 1`" << ", but got repeats = " << repeats; ICHECK(-ndim - 1 <= axis && axis <= ndim) @@ -1687,7 +1688,7 @@ Array RepeatCompute(const Attrs& attrs, const Array& inp const Type& out_type) { const RepeatAttrs* param = attrs.as(); ICHECK(param != nullptr); - return {topi::repeat(inputs[0], param->repeats, param->axis)}; + return {topi::repeat(inputs[0], param->repeats.IntValue(), param->axis.IntValue())}; } Expr MakeRepeat(Expr data, int repeats, int axis) { @@ -2068,7 +2069,7 @@ bool ReverseRel(const Array& types, int num_inputs, const Attrs& attrs, } const auto* param = attrs.as(); const int ndim = static_cast(data->shape.size()); - const int axis = param->axis; + const int axis = param->axis.IntValue(); ICHECK(-ndim <= axis && axis < ndim) << "reverse only accepts `axis` in [-data.ndim, data.ndim - 1]" << ", but got axis = " << axis << ", and data.ndim = " << ndim; @@ -2081,7 +2082,7 @@ Array ReverseCompute(const Attrs& attrs, const Array& in const ReverseAttrs* param = attrs.as(); ICHECK(param != nullptr); // pass empty seq_length tensor to reverse_sequence - return {topi::reverse_sequence(inputs[0], te::Tensor(), param->axis)}; + return {topi::reverse_sequence(inputs[0], te::Tensor(), param->axis.IntValue())}; } Expr MakeReverse(Expr data, int axis) { @@ -2136,7 +2137,7 @@ bool ReverseSequenceRel(const Array& types, int num_inputs, const Attrs& a const auto* param = attrs.as(); const int ndim = static_cast(data->shape.size()); - int batch_axis = param->batch_axis; + int batch_axis = param->batch_axis.IntValue(); ICHECK(-ndim <= batch_axis && batch_axis < ndim) << "reverse_sequence only accepts `batch_axis` in [-data.ndim, data.ndim - 1]" << ", but got batch_axis = " << batch_axis << ", and data.ndim = " << ndim; @@ -2149,7 +2150,7 @@ bool ReverseSequenceRel(const 
Array& types, int num_inputs, const Attrs& a << ", but got dimension of batch_axis = " << data->shape[batch_axis] << ", and seq_length size = " << seq_lengths->shape[0]; - const int seq_axis = param->seq_axis; + const int seq_axis = param->seq_axis.IntValue(); ICHECK(-ndim <= seq_axis && seq_axis < ndim) << "reverse_sequnece only accepts `seq_axis` in [-data.ndim, data.ndim - 1]" << ", but got seq_axis = " << seq_axis << ", and data.ndim = " << ndim; @@ -2162,7 +2163,8 @@ Array ReverseSequenceCompute(const Attrs& attrs, const Array(); ICHECK(param != nullptr); - return {topi::reverse_sequence(inputs[0], inputs[1], param->seq_axis, param->batch_axis)}; + return {topi::reverse_sequence(inputs[0], inputs[1], param->seq_axis.IntValue(), + param->batch_axis.IntValue())}; } Expr MakeReverseSequence(Expr data, Expr seq_lengths, int seq_axis, int batch_axis) { @@ -2374,7 +2376,7 @@ InferCorrectLayoutOutput SqueezeInferCorrectLayout(const Attrs& attrs, if (new_in_layouts.defined() && old_in_layouts.defined()) { Array new_axis; for (const auto& e : axis) { - const auto& dim = old_in_layouts[0][e]; + const auto& dim = old_in_layouts[0][e.IntValue()]; new_axis.push_back((new_in_layouts[0]).IndexOf(dim)); } params->axis = new_axis; @@ -2714,7 +2716,7 @@ InferCorrectLayoutOutput StridedSliceInferCorrectLayout( Array new_axes; for (size_t i = 0; i < axes.size(); ++i) { - auto old_idx = axes[i]; + auto old_idx = axes[i].IntValue(); auto new_idx = new_layout.IndexOf(layout[old_idx]); new_begin.push_back(begin[i]); new_end.push_back(end[i]); @@ -2765,7 +2767,7 @@ InferCorrectLayoutOutput StridedSliceInferCorrectLayout( auto axes = params->axes.value(); Array new_axes; for (size_t i = 0; i < axes.size(); ++i) { - auto old_idx = axes[i]; + auto old_idx = axes[i].IntValue(); auto new_idx = new_layout.IndexOf(layout[old_idx]); new_axes.push_back(new_idx); @@ -2783,8 +2785,8 @@ InferCorrectLayoutOutput StridedSliceInferCorrectLayout( return out_default; } } - int64_t bg = begin[i]; - int64_t ed = end[i]; + int64_t bg = begin[i].IntValue(); + int64_t ed = end[i].IntValue(); if (bg % factor || ed % factor) { // transform to original layout return out_default; @@ -2801,8 +2803,8 @@ InferCorrectLayoutOutput StridedSliceInferCorrectLayout( ICHECK(axis.IsPrimal()); auto factor = new_layout.FactorOf(axis); if (factor == -1) { - new_begin.push_back(IntImm(begin[i]->dtype, begin[i])); - new_end.push_back(IntImm(end[i]->dtype, end[i])); + new_begin.push_back(IntImm(begin[i]->dtype, begin[i].IntValue())); + new_end.push_back(IntImm(end[i]->dtype, end[i].IntValue())); } else { if (strides.defined() && i < strides.size()) { auto stride = strides[i]; @@ -3251,17 +3253,17 @@ Array SliceLikeCompute(const Attrs& attrs, const Array& } } } else { - for (int axis : param->axes) { - if (axis < 0) { - axis = static_cast(src_shape.size()) + axis; + for (Integer axis : param->axes) { + int a = axis.IntValue(); + if (a < 0) { + a = static_cast(src_shape.size()) + a; } - ICHECK(target_shape[axis]->IsInstance()) + ICHECK(target_shape[a]->IsInstance()) << "slice_like does not support dynamic output shape"; - end_idx.Set(axis, topi::GetConstInt(target_shape[axis])); - ICHECK_LE(topi::GetConstInt(end_idx[axis]), topi::GetConstInt(src_shape[axis])) - << "End index of axis " << axis - << " exceeds input shape: " << topi::GetConstInt(end_idx[axis]) << " vs " - << topi::GetConstInt(src_shape[axis]); + end_idx.Set(a, topi::GetConstInt(target_shape[a])); + ICHECK_LE(topi::GetConstInt(end_idx[a]), topi::GetConstInt(src_shape[a])) + << "End index of 
axis " << a << " exceeds input shape: " << topi::GetConstInt(end_idx[a]) + << " vs " << topi::GetConstInt(src_shape[a]); } } return Array{topi::strided_slice(inputs[0], begin_idx, end_idx, strides, "end")}; @@ -3515,7 +3517,7 @@ bool GatherRel(const Array& types, int num_inputs, const Attrs& attrs, Array GatherCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { const auto* param = attrs.as(); - return {topi::gather(inputs[0], param->axis, inputs[1])}; + return {topi::gather(inputs[0], param->axis.IntValue(), inputs[1])}; } Expr MakeGather(Expr data, Integer axis, Expr indices) { @@ -3594,7 +3596,7 @@ Array GatherNDCompute(const Attrs& attrs, const Array& i const Type& out_type) { const auto* param = attrs.as(); ICHECK(param); - return {topi::gather_nd(inputs[0], inputs[1], param->batch_dims)}; + return {topi::gather_nd(inputs[0], inputs[1], param->batch_dims.IntValue())}; } Expr MakeGatherND(Expr data, Expr indices, int batch_dims = 0, diff --git a/src/relay/op/vision/yolo.cc b/src/relay/op/vision/yolo.cc index 70d8820612995..8979f939c32ed 100644 --- a/src/relay/op/vision/yolo.cc +++ b/src/relay/op/vision/yolo.cc @@ -81,7 +81,7 @@ Its function is mostly shape transform.")doc" TVM_ADD_FILELINE) const Type& out_type) { const auto* params = attrs.as(); ICHECK(params != nullptr); - return Array{topi::vision::reorg(inputs[0], params->stride)}; + return Array{topi::vision::reorg(inputs[0], params->stride.IntValue())}; }); } // namespace relay diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index 8601264f53130..2a6153e810963 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -91,7 +91,7 @@ InferCorrectLayoutOutput RequantizeInferCorrectLayout(const Attrs& attrs, Layout channel_layout = Layout("C"); input_layouts = {new_layout, channel_layout, channel_layout, channel_layout, channel_layout}; output_layouts = {new_layout}; - param->axis = new_axis; + param->axis = new_axis.IntValue(); } else if (old_in_layouts.defined()) { // If the new layout is undefined, set the old layout as the inferred layout. ICHECK_EQ(old_in_layouts.size(), 5); diff --git a/src/relay/transforms/fuse_ops.cc b/src/relay/transforms/fuse_ops.cc index e25b8db152c49..1ced0883a14cb 100644 --- a/src/relay/transforms/fuse_ops.cc +++ b/src/relay/transforms/fuse_ops.cc @@ -1057,7 +1057,8 @@ Pass FuseOps(int fuse_opt_level) { link_params = pc->GetConfig("relay.FuseOps.link_params", Bool(link_params)).value(); int opt_level = fuse_opt_level == -1 ? pc->opt_level : fuse_opt_level; auto max_fuse_depth = pc->GetConfig("relay.FuseOps.max_depth", Integer(kMaxFusedOps)); - return Downcast(FuseOps(f, opt_level, max_fuse_depth.value(), link_params, m)); + return Downcast( + FuseOps(f, opt_level, max_fuse_depth.value().IntValue(), link_params, m)); }; return CreateFunctionPass(pass_func, 0, "FuseOps", {"InferType"}); } diff --git a/src/relay/transforms/simplify_expr.cc b/src/relay/transforms/simplify_expr.cc index 209639dd8f83a..04d0edb26d753 100644 --- a/src/relay/transforms/simplify_expr.cc +++ b/src/relay/transforms/simplify_expr.cc @@ -30,6 +30,7 @@ #include #include +#include #include #include #include @@ -334,7 +335,7 @@ class SimplifyTranspose : public DFPatternRewrite { if (auto attr = call->attrs.as()) { if (attr->axes.defined()) { for (int i = 0; i < ndim; ++i) { - int64_t axis = attr->axes[i]; + int64_t axis = attr->axes[i].IntValue(); axis += (axis < 0) ? 
ndim : 0; attr_axes.push_back(axis); } @@ -546,8 +547,10 @@ class ConcretizeCollapseSumLikeRewrite : public ConcretizeLikeRewrite { static const Op& op = Op::Get("collapse_sum_to"); auto attrs = make_object(); attrs->shape = shape; - auto cshape = - MakeConstantTensor(DataType::Int(32), {static_cast(shape.size())}, shape); + std::vector s; + std::transform(shape.begin(), shape.end(), std::back_inserter(s), + [](Integer i) { return i.IntValue(); }); + auto cshape = MakeConstantTensor(DataType::Int(32), {static_cast(shape.size())}, s); return Call(op, {node_map[data_pat_][0], cshape}, Attrs(attrs)); } }; diff --git a/src/target/build_common.h b/src/target/build_common.h index 6c94ec8703b73..35b3d92eb8149 100644 --- a/src/target/build_common.h +++ b/src/target/build_common.h @@ -57,7 +57,7 @@ inline std::unordered_map ExtractFuncInfo(co } } if (auto opt = f->GetAttr(tir::attr::kDeviceUseDynSharedMemory)) { - if (opt.value()) { + if (opt.value().IntValue() != 0) { info.launch_param_tags.push_back(runtime::launch_param::kUseDynamicSharedMemoryTag); } } diff --git a/src/target/llvm/llvm_common.cc b/src/target/llvm/llvm_common.cc index 3d9ac835dc50f..83de839a926e1 100644 --- a/src/target/llvm/llvm_common.cc +++ b/src/target/llvm/llvm_common.cc @@ -159,7 +159,7 @@ std::unique_ptr GetLLVMTargetMachine(const Target& target, return nullptr; } - Integer llvm_opt_level = target->GetAttr("opt-level").value_or(Integer(3)); + int llvm_opt_level = target->GetAttr("opt-level").value_or(Integer(3)).IntValue(); llvm::CodeGenOpt::Level llvm_opt; if (llvm_opt_level <= 0) { llvm_opt = llvm::CodeGenOpt::None; diff --git a/src/target/metadata.h b/src/target/metadata.h index 7551592ac5ab4..b761f7ff2bbb8 100644 --- a/src/target/metadata.h +++ b/src/target/metadata.h @@ -154,7 +154,7 @@ class InMemoryMetadataNode : public ::tvm::target::metadata::VisitableMetadataNo storage_.num_constant_pools = constant_pools.size(); for (size_t i = 0; i < constant_pools.size(); ++i) { constant_pools_.get()[i].name_hint = constant_pools[i]->name_hint.c_str(); - constant_pools_.get()[i].byte_offset = constant_pools[i]->byte_offset; + constant_pools_.get()[i].byte_offset = constant_pools[i]->byte_offset.IntValue(); std::string bytes; dmlc::MemoryStringStream stream(&bytes); diff --git a/src/target/metadata_module.cc b/src/target/metadata_module.cc index e5ca82d5c0996..c8c099171c967 100644 --- a/src/target/metadata_module.cc +++ b/src/target/metadata_module.cc @@ -118,7 +118,7 @@ static runtime::metadata::Metadata ConvertMetaData( if (api->pool_info.as()) { pools.push_back( runtime::metadata::TensorInfo(make_object( - var->name_hint, std::vector{api->allocated_size}, + var->name_hint, std::vector{api->allocated_size.IntValue()}, tvm::runtime::DataType{kDLUInt, 8, 1}))); } } diff --git a/src/target/source/codegen_metal.cc b/src/target/source/codegen_metal.cc index a76da36ea7250..0ec6179115195 100644 --- a/src/target/source/codegen_metal.cc +++ b/src/target/source/codegen_metal.cc @@ -67,7 +67,7 @@ void CodeGenMetal::AddFunction(const PrimFunc& f) { // Buffer arguments size_t num_buffer = 0; - int limit = target_->GetAttr("max_function_args").value(); + int limit = target_->GetAttr("max_function_args").value().IntValue(); if (static_cast(f->params.size()) > limit) { LOG(WARNING) << "Probably you won't be able to execute your kernel due to high number of " "buffers in the kernel"; diff --git a/src/target/source/interface_c.cc b/src/target/source/interface_c.cc index fef81c9bd69fe..fa38d9b9f4d15 100644 --- 
a/src/target/source/interface_c.cc +++ b/src/target/source/interface_c.cc @@ -177,14 +177,14 @@ class InterfaceCNode : public runtime::ModuleNode { return a->byte_offset->value < b->byte_offset->value; }); int64_t accumulated_pool_len = - const_info_vec.back()->byte_offset + + const_info_vec.back()->byte_offset.IntValue() + runtime::GetDataSize(*const_info_vec.back()->data.operator->()); const auto& accumulated_pool = runtime::NDArray::Empty( {accumulated_pool_len}, DataType::UInt(8), const_info_vec.back()->data->device); for (const auto& const_info : const_info_vec) { const auto& data = const_info->data; const auto& offs = const_info->byte_offset; - data.CopyToBytes(static_cast(accumulated_pool->data) + offs, + data.CopyToBytes(static_cast(accumulated_pool->data) + offs.IntValue(), runtime::GetDataSize(*data.operator->())); } diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index 6495c39ef1400..88a7a99b4c255 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -469,8 +469,8 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { String pool_name_tvmv = GenerateDLTensorStructWrapper(pool_name); code_ << "tensors[" << i << "] = " << pool_name_tvmv << ";\n"; } else { - code_ << "tensors[" << i << "] = ((TVMValue*)args)[" - << run_func_to_entry_point_args[Integer(i)] << "];\n"; + code_ << "tensors[" << i << "] = ((TVMValue*)args)[" << run_func_to_entry_point_args[i] + << "];\n"; } } } @@ -733,7 +733,7 @@ class MetadataSerializer : public AttrVisitor { switch (array->kind) { case MetadataKind::kUint64: { - int64_t i = Downcast(o); + int64_t i = Downcast(o).IntValue(); CHECK_GT(i, 0) << "Metadata is of type uint64_t, but array type contains a negative number"; uint64_t ui = static_cast(i); @@ -741,7 +741,7 @@ class MetadataSerializer : public AttrVisitor { continue; } case MetadataKind::kInt64: { - int64_t i = Downcast(o); + int64_t i = Downcast(o).IntValue(); Visit(nullptr, &i); continue; } diff --git a/src/target/spirv/build_vulkan.cc b/src/target/spirv/build_vulkan.cc index e922942e8acff..94f1bf16a25e7 100644 --- a/src/target/spirv/build_vulkan.cc +++ b/src/target/spirv/build_vulkan.cc @@ -42,8 +42,9 @@ class SPIRVTools { public: explicit SPIRVTools(Target target) { uint32_t vulkan_version = - target->GetAttr("vulkan_api_version").value_or(VK_API_VERSION_1_0); - uint32_t spirv_version = target->GetAttr("max_spirv_version").value_or(0x10000); + target->GetAttr("vulkan_api_version").value_or(VK_API_VERSION_1_0).IntValue(); + uint32_t spirv_version = + target->GetAttr("max_spirv_version").value_or(0x10000).IntValue(); spv_target_env validation_version; if (vulkan_version >= VK_API_VERSION_1_2) { diff --git a/src/target/spirv/spirv_support.cc b/src/target/spirv/spirv_support.cc index 33055e7399d56..a91a2a3384e0e 100644 --- a/src/target/spirv/spirv_support.cc +++ b/src/target/spirv/spirv_support.cc @@ -36,28 +36,32 @@ SPIRVSupport::SPIRVSupport(tvm::Target target) { << "SPIRVSupport can only be checked for vulkan device type"; if (target->GetAttr("vulkan_api_version")) { - vulkan_api_version = target->GetAttr("vulkan_api_version").value(); + vulkan_api_version = target->GetAttr("vulkan_api_version").value().IntValue(); } if (target->GetAttr("supported_subgroup_operations")) { supported_subgroup_operations = - target->GetAttr("supported_subgroup_operations").value(); + target->GetAttr("supported_subgroup_operations").value().IntValue(); } if (target->GetAttr("max_push_constants_size")) { - 
max_push_constants_size = target->GetAttr("max_push_constants_size").value(); + max_push_constants_size = + target->GetAttr("max_push_constants_size").value().IntValue(); } if (target->GetAttr("max_uniform_buffer_range")) { - max_uniform_buffer_range = target->GetAttr("max_uniform_buffer_range").value(); + max_uniform_buffer_range = + target->GetAttr("max_uniform_buffer_range").value().IntValue(); } if (target->GetAttr("max_storage_buffer_range")) { - max_storage_buffer_range = target->GetAttr("max_storage_buffer_range").value(); + max_storage_buffer_range = + target->GetAttr("max_storage_buffer_range").value().IntValue(); } if (target->GetAttr("max_shared_memory_per_block")) { - max_shared_memory_per_block = target->GetAttr("max_shared_memory_per_block").value(); + max_shared_memory_per_block = + target->GetAttr("max_shared_memory_per_block").value().IntValue(); } if (target->GetAttr("max_per_stage_descriptor_storage_buffer")) { max_per_stage_descriptor_storage_buffers = - target->GetAttr("max_per_stage_descriptor_storage_buffer").value(); + target->GetAttr("max_per_stage_descriptor_storage_buffer").value().IntValue(); } if (target->GetAttr("supports_storage_buffer_storage_class")) { supports_storage_buffer_storage_class = diff --git a/src/target/target.cc b/src/target/target.cc index 3cdfa0cc0d5e8..afdfad9b76b91 100644 --- a/src/target/target.cc +++ b/src/target/target.cc @@ -804,7 +804,7 @@ ObjectPtr TargetInternal::FromConfig(std::unordered_map(attrs.at("from_device")); + int device_id = Downcast(attrs.at("from_device")).IntValue(); attrs.erase("from_device"); auto device_params = QueryDevice(device_id, target.get()); diff --git a/src/tir/analysis/calculate_workspace.cc b/src/tir/analysis/calculate_workspace.cc index 11593bb443a75..a667e2354b9b6 100644 --- a/src/tir/analysis/calculate_workspace.cc +++ b/src/tir/analysis/calculate_workspace.cc @@ -55,7 +55,8 @@ size_t WorkspaceCalculator::operator()(const PrimFunc& func) { template size_t WorkspaceCalculator::GetByteAlignedSize(Integer non_aligned_size) { return non_aligned_size.defined() - ? ((non_aligned_size + byte_alignment - 1) / byte_alignment) * byte_alignment + ? 
((non_aligned_size.IntValue() + byte_alignment - 1) / byte_alignment) * + byte_alignment : 0; } diff --git a/src/tir/contrib/ethosu/passes.cc b/src/tir/contrib/ethosu/passes.cc index 09c359c55abba..609d986dbb84f 100644 --- a/src/tir/contrib/ethosu/passes.cc +++ b/src/tir/contrib/ethosu/passes.cc @@ -214,7 +214,7 @@ tvm::transform::Pass CopyComputeReordering(Optional max_copy_movements) "pass in conjunction with the LowerToTIR() pass."; auto value = max_copy_movements.value_or( ctx->GetConfig(kCopyComputeReorderingMaxCopyMovements, Integer(1)).value()); - return CopyComputeReorderingMutator(value)(f); + return CopyComputeReorderingMutator(value.IntValue())(f); }; return tvm::tir::transform::CreatePrimFuncPass(pass_func, 0, "tir.contrib.ethos-u.CopyComputeReordering", {}); diff --git a/src/tir/schedule/primitive/cache_read_write.cc b/src/tir/schedule/primitive/cache_read_write.cc index 5a8d452f14b85..6a7b59cfec96b 100644 --- a/src/tir/schedule/primitive/cache_read_write.cc +++ b/src/tir/schedule/primitive/cache_read_write.cc @@ -1233,7 +1233,7 @@ struct ReIndexTraits : public UnpackedInstTraits { static BlockRV UnpackedApplyToSchedule(Schedule sch, BlockRV block, Integer buffer_index, Integer buffer_index_type) { - return sch->ReIndex(block, buffer_index, + return sch->ReIndex(block, buffer_index.IntValue(), static_cast(buffer_index_type->value)); } diff --git a/src/tir/schedule/primitive/layout_transformation.cc b/src/tir/schedule/primitive/layout_transformation.cc index 692f68a600ae9..639593ab3e74c 100644 --- a/src/tir/schedule/primitive/layout_transformation.cc +++ b/src/tir/schedule/primitive/layout_transformation.cc @@ -548,7 +548,7 @@ struct TransformLayoutTraits : public UnpackedInstTraits static void UnpackedApplyToSchedule(Schedule sch, BlockRV block_rv, Integer buffer_index, Integer buffer_index_type, IndexMap index_map) { - return sch->TransformLayout(block_rv, buffer_index, + return sch->TransformLayout(block_rv, buffer_index.IntValue(), static_cast(buffer_index_type->value), index_map); } @@ -639,7 +639,7 @@ struct SetAxisSeparatorTraits : public UnpackedInstTraits axis_separators) { - return sch->SetAxisSeparator(block_rv, buffer_index, + return sch->SetAxisSeparator(block_rv, buffer_index.IntValue(), static_cast(buffer_index_type->value), axis_separators); } diff --git a/src/tir/schedule/primitive/sampling.cc b/src/tir/schedule/primitive/sampling.cc index b7ea3f539bce9..1961565aac75e 100644 --- a/src/tir/schedule/primitive/sampling.cc +++ b/src/tir/schedule/primitive/sampling.cc @@ -184,7 +184,7 @@ int64_t SampleCategorical(support::LinearCongruentialEngine::TRandState* rand_st } *decision = Integer(i); // decision is guaranteed not to be nullptr. 
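// A note on the pattern recurring throughout this series: implicit conversions
// from tvm::Integer to built-in integer types are replaced with the explicit
// IntValue() accessor, as in the hunk just below. A minimal standalone sketch
// of the two equivalent spellings (illustrative only, not part of this hunk;
// assumes only <tvm/ir/expr.h>):
#include <tvm/ir/expr.h>
inline int64_t IntegerConversionSketch() {
  tvm::Integer axis(3);
  int64_t via_accessor = axis.IntValue();  // the explicit accessor adopted here
  int64_t via_node_field = axis->value;    // same value via the IntImmNode field
  return via_accessor + via_node_field;    // 6
}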
- return candidates[i]; + return candidates[i].IntValue(); } std::function MakeMultinomialSampler( diff --git a/src/tir/schedule/transform.cc b/src/tir/schedule/transform.cc index 67d0f55f20b9f..436d529abdc55 100644 --- a/src/tir/schedule/transform.cc +++ b/src/tir/schedule/transform.cc @@ -284,7 +284,7 @@ Optional TileWithTensorIntrin(const tir::Schedule& sch, const tir::Block ICHECK_EQ(split.size(), 2); inner_loops.insert(sch->GetSRef(split[1]).operator->()); // The inner split will be reordered to the loop domain that is tensorized - int desc_loop_index = info->desc_loop_indexer.at(GetRef(desc_loop)); + int desc_loop_index = info->desc_loop_indexer.at(GetRef(desc_loop)).IntValue(); reorder_suffix[desc_loop_index] = split[1]; } // Reorder the loops diff --git a/src/tir/transforms/inject_software_pipeline.cc b/src/tir/transforms/inject_software_pipeline.cc index de9aa79583b46..b4a597fe97d86 100644 --- a/src/tir/transforms/inject_software_pipeline.cc +++ b/src/tir/transforms/inject_software_pipeline.cc @@ -772,7 +772,7 @@ class PipelineInjector : private StmtExprMutator { auto it = op->annotations.find(attr::double_buffer_scope); if (it != op->annotations.end()) { - int buffer_index = Downcast((*it).second); + int buffer_index = Downcast((*it).second).IntValue(); CHECK(buffer_index >= 0 && static_cast(buffer_index) < op->writes.size()) << "ValueError: Index of the buffer exceeds the size of the write regions of the block. (" << buffer_index << " vs. " << op->writes.size() << ")"; diff --git a/src/tir/transforms/lower_thread_allreduce.cc b/src/tir/transforms/lower_thread_allreduce.cc index 7e09943d01858..aeb819c5168d3 100644 --- a/src/tir/transforms/lower_thread_allreduce.cc +++ b/src/tir/transforms/lower_thread_allreduce.cc @@ -62,7 +62,8 @@ class UpdatePointerStorageScopeAllReduce final : public UpdatePointerStorageScop class ThreadAllreduceBuilder final : public StmtExprMutator { public: explicit ThreadAllreduceBuilder(const TargetNode* target) - : target_(target), warp_size_(target->GetAttr("thread_warp_size", 1).value()) {} + : target_(target), + warp_size_(target->GetAttr("thread_warp_size", 1).value().IntValue()) {} Stmt VisitStmt_(const AttrStmtNode* op) final { if (op->attr_key == attr::thread_extent) { diff --git a/src/tir/transforms/lower_warp_memory.cc b/src/tir/transforms/lower_warp_memory.cc index d8250cd09888e..408cdbd04ec77 100644 --- a/src/tir/transforms/lower_warp_memory.cc +++ b/src/tir/transforms/lower_warp_memory.cc @@ -472,7 +472,7 @@ Pass LowerWarpMemory() { auto* n = f.CopyOnWrite(); auto target = f->GetAttr(tvm::attr::kTarget); ICHECK(target.defined()) << "LowerWarpMemory: Require the target attribute"; - int warp_size = target.value()->GetAttr("thread_warp_size", 1).value(); + int warp_size = target.value()->GetAttr("thread_warp_size", 1).value().IntValue(); WarpMemoryRewriter warp_memory_rewriter(warp_size); auto stmt = warp_memory_rewriter.Rewrite(std::move(n->body)); n->body = UpdatePointerStorageScope(warp_memory_rewriter.new_storage_scopes_)(stmt); diff --git a/src/tir/usmp/algo/greedy.cc b/src/tir/usmp/algo/greedy.cc index cae01ee859696..ec4f5a5d7215a 100644 --- a/src/tir/usmp/algo/greedy.cc +++ b/src/tir/usmp/algo/greedy.cc @@ -74,7 +74,7 @@ bool GreedyBase::IsValidPlacement(const PoolInfo& candidate_pool, const size_t& // this means pool is not bounded return true; } - auto pool_size = static_cast(size_hint_bytes); + auto pool_size = static_cast(size_hint_bytes.IntValue()); auto max_address = next_offset + size_bytes; if (max_address <= pool_size) { return 
true; @@ -124,7 +124,8 @@ Map GreedyBase::PostSortAllocation( // We only look at already allocated BufferInfo in-terms of conflicts. if (pool_allocations.count(conflict_buf_info)) { auto pool_allocation = pool_allocations[conflict_buf_info]; - next_offset = pool_allocation->byte_offset + conflict_buf_info->size_bytes; + next_offset = + pool_allocation->byte_offset.IntValue() + conflict_buf_info->size_bytes.IntValue(); next_offset = round_up_to_byte_alignment(next_offset, conflict_buf_info->alignment->value); // Checks whether the next offset in the same pool as the conflicting BufferInfo is valid. if (IsValidPlacement(pool_allocation->pool_info, next_offset, @@ -169,7 +170,7 @@ class GreedySize : public GreedyBase { return a->conflicts.size() > b->conflicts.size(); } } - return a->size_bytes > b->size_bytes; + return a->size_bytes.IntValue() > b->size_bytes.IntValue(); }); return PostSortAllocation(buffer_info_vec); } diff --git a/src/tir/usmp/algo/hill_climb.cc b/src/tir/usmp/algo/hill_climb.cc index c4ed73eb2feb2..8234074f9c892 100644 --- a/src/tir/usmp/algo/hill_climb.cc +++ b/src/tir/usmp/algo/hill_climb.cc @@ -105,7 +105,8 @@ class HillClimbAllocator : public GreedyBase { for (const auto* conflict_buf_info : buf_conf) { size_t next_offset = 0; auto pool_allocation = pool_allocations[conflict_buf_info]; - next_offset = pool_allocation->byte_offset + conflict_buf_info->size_bytes; + next_offset = + pool_allocation->byte_offset.IntValue() + conflict_buf_info->size_bytes.IntValue(); next_offset = round_up_to_byte_alignment(next_offset, conflict_buf_info->alignment->value); if (!pool_offset_candidates.count(pool_allocation->pool_info)) { continue; @@ -114,8 +115,8 @@ class HillClimbAllocator : public GreedyBase { buf_info->size_bytes->value)) { if (next_offset > pool_offset_candidates[pool_allocation->pool_info] && pool_offset_candidates[pool_allocation->pool_info] + - static_cast(buf_info->size_bytes) > - static_cast(pool_allocation->byte_offset)) { + static_cast(buf_info->size_bytes.IntValue()) > + static_cast(pool_allocation->byte_offset.IntValue())) { pool_offset_candidates[pool_allocation->pool_info] = next_offset; } } else { @@ -138,7 +139,7 @@ class HillClimbAllocator : public GreedyBase { for (const auto& it : *pool_allocations) { const BufferInfoNode* buf = it.first; const PoolAllocation& pa = it.second; - size_t high_sz = pa->byte_offset + buf->size_bytes; + size_t high_sz = pa->byte_offset.IntValue() + buf->size_bytes.IntValue(); if (pool_sizes[pa->pool_info] <= high_sz) { pool_sizes[pa->pool_info] = high_sz; } @@ -277,7 +278,7 @@ class HillClimbAllocator : public GreedyBase { for (const auto& it : pool_allocations) { const auto* buf = it.first; const auto pa = it.second; - size_t high_sz = pa->byte_offset + buf->size_bytes; + size_t high_sz = pa->byte_offset.IntValue() + buf->size_bytes.IntValue(); if (pool_sizes[pa->pool_info] == high_sz) { max_pool_buf.push_back(buf); } @@ -325,7 +326,7 @@ class HillClimbAllocator : public GreedyBase { Map HillClimb(const Array& buffer_info_arr, const Integer& memory_pressure) { - return HillClimbAllocator(memory_pressure).PlanMemory(buffer_info_arr); + return HillClimbAllocator(memory_pressure.IntValue()).PlanMemory(buffer_info_arr); } TVM_REGISTER_GLOBAL("tir.usmp.algo.hill_climb") diff --git a/src/tir/usmp/analysis/extract_buffer_info.cc b/src/tir/usmp/analysis/extract_buffer_info.cc index 4e98116f8a17a..ba8f6aa911f14 100644 --- a/src/tir/usmp/analysis/extract_buffer_info.cc +++ b/src/tir/usmp/analysis/extract_buffer_info.cc @@ -369,11 
+369,11 @@ void BufferInfoExtractor::VisitStmt_(const ForNode* op) { update_call = ai.call; } if (scope_stack_.top().initial_stmt_of_the_nested_loops->value < - buffer_info_start_stmt_idx_[update_call][allocate]) { + buffer_info_start_stmt_idx_[update_call][allocate].IntValue()) { buffer_info_start_stmt_idx_[update_call].Set( allocate, scope_stack_.top().initial_stmt_of_the_nested_loops->value); } - if (current_stmt_idx_ > buffer_info_end_stmt_idx_[update_call][allocate]) { + if (current_stmt_idx_ > buffer_info_end_stmt_idx_[update_call][allocate].IntValue()) { buffer_info_end_stmt_idx_[update_call].Set(allocate, current_stmt_idx_); } } @@ -518,7 +518,7 @@ BufferInfoAnalysis BufferInfoExtractor::operator()(const PrimFunc& main_func) { LivenessEvent le_event_start; le_event_start.buffer_info = buffer_info; le_event_start.le_type = START; - le_event_start.tick = buffer_info_starts[allocate]; + le_event_start.tick = buffer_info_starts[allocate].IntValue(); le_events_timeline.push_back(le_event_start); } } @@ -529,7 +529,7 @@ BufferInfoAnalysis BufferInfoExtractor::operator()(const PrimFunc& main_func) { LivenessEvent le_event_end; le_event_end.buffer_info = buffer_info; le_event_end.le_type = END; - le_event_end.tick = buffer_info_ends[allocate]; + le_event_end.tick = buffer_info_ends[allocate].IntValue(); le_events_timeline.push_back(le_event_end); } } @@ -562,13 +562,13 @@ BufferInfoAnalysis BufferInfoExtractor::operator()(const PrimFunc& main_func) { le_event.buffer_info->conflicts.push_back(open_buffer_info); } } - open_set_size += le_event.buffer_info->size_bytes; + open_set_size += le_event.buffer_info->size_bytes.IntValue(); if (open_set_size > max_open_set_size) { max_open_set_size = open_set_size; } open_set.insert(le_event.buffer_info); } else { - open_set_size -= le_event.buffer_info->size_bytes; + open_set_size -= le_event.buffer_info->size_bytes.IntValue(); open_set.erase(le_event.buffer_info); } } diff --git a/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc b/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc index 24a55190d326e..601e347196327 100644 --- a/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc +++ b/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc @@ -57,10 +57,10 @@ class PoolAllocationToOffsetConverter : public StmtExprMutator { size_t extent_size = -1; if (kv.first->IsInstance()) { Allocate allocate_node = Downcast(kv.first); - extent_size = CalculateExtentsSize(allocate_node.operator->()); + extent_size = CalculateExtentsSize(allocate_node.operator->()).IntValue(); } else if (kv.first->IsInstance()) { AllocateConst allocate_const_node = Downcast(kv.first); - extent_size = CalculateExtentsSize(allocate_const_node.operator->()); + extent_size = CalculateExtentsSize(allocate_const_node.operator->()).IntValue(); } else { ICHECK(false) << "Not supported node type " << kv.first->GetTypeKey(); } diff --git a/src/tir/usmp/utils.cc b/src/tir/usmp/utils.cc index 6f95c7cbaf66c..3350ecc5d47f1 100644 --- a/src/tir/usmp/utils.cc +++ b/src/tir/usmp/utils.cc @@ -228,14 +228,14 @@ class ModuleWorkspaceSizeCalculator : public StmtExprVisitor { Integer workspace_byte_alignment = tgt->GetAttr("workspace-byte-alignment").value_or(16); Integer workspace_req = CalculateWorkspaceBytes(func, workspace_byte_alignment); - if (workspace_req) { + if (workspace_req.IntValue() != 0) { current_workspace_size_ += workspace_req->value; } if (max_workspace_size < current_workspace_size_) { max_workspace_size = current_workspace_size_; } 
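// The bookkeeping in the hunks above (open_set_size / max_open_set_size in the
// liveness sweep, current_workspace_size_ / max_workspace_size here) is a
// running sum with a tracked maximum over START/END events. A self-contained
// sketch of that idea (hypothetical LiveEvent type, not a TVM data structure):
#include <cstdint>
#include <vector>
struct LiveEvent {
  int64_t size_bytes;  // buffer size contributed by this event
  bool is_start;       // true at a buffer's first use, false at its last
};
// Assumes `events` is already ordered along the timeline.
inline int64_t PeakOpenSetSize(const std::vector<LiveEvent>& events) {
  int64_t open = 0;
  int64_t peak = 0;
  for (const LiveEvent& e : events) {
    open += e.is_start ? e.size_bytes : -e.size_bytes;
    if (open > peak) peak = open;
  }
  return peak;
}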
this->VisitStmt(func->body); - if (workspace_req) { + if (workspace_req.IntValue() != 0) { current_workspace_size_ -= workspace_req->value; } } diff --git a/tests/cpp/container_test.cc b/tests/cpp/container_test.cc index 32ec346c8796a..f6c4fb4b67d64 100644 --- a/tests/cpp/container_test.cc +++ b/tests/cpp/container_test.cc @@ -26,8 +26,14 @@ #include #include +#include +#include +#include +#include #include +#include #include +#include #include using namespace tvm; @@ -342,7 +348,7 @@ TEST(Map, Insert) { ICHECK_EQ(result.size(), expected.size()); for (const auto& kv : result) { ICHECK(expected.count(kv.first)); - ICHECK_EQ(expected[kv.first], kv.second.operator int64_t()); + ICHECK_EQ(expected[kv.first], kv.second.IntValue()); expected.erase(kv.first); } }; @@ -364,12 +370,14 @@ TEST(Map, Erase) { ICHECK_EQ(result.size(), expected.size()); for (const auto& kv : result) { ICHECK(expected.count(kv.first)); - ICHECK_EQ(expected[kv.first], kv.second.operator int64_t()); + ICHECK_EQ(expected[kv.first], kv.second.IntValue()); expected.erase(kv.first); } }; Map map{{"a", 1}, {"b", 2}, {"c", 3}, {"d", 4}, {"e", 5}}; - std::unordered_map stl(map.begin(), map.end()); + std::unordered_map stl; + std::transform(map.begin(), map.end(), std::inserter(stl, stl.begin()), + [](auto&& p) { return std::make_pair(p.first, p.second.IntValue()); }); for (char c = 'a'; c <= 'e'; ++c) { Map result = map; std::unordered_map expected(stl); From 111169c7df2831ab8ee40d5388ebcfcf551fd86f Mon Sep 17 00:00:00 2001 From: Jinkun Lin Date: Tue, 5 Jul 2022 21:50:25 -0400 Subject: [PATCH 055/111] Fix InferCorrectLayout in LayoutRewriter and improve naming. (#12007) * Fix InferCorrectLayout in LayoutRewriter. * Compatibility issue. * Fix lint. * Better naming and detailed comments. * Add unittest. --- src/relay/transforms/transform_layout.h | 82 ++++++++++++------- .../python/relay/test_pass_alter_op_layout.py | 12 +++ 2 files changed, 65 insertions(+), 29 deletions(-) diff --git a/src/relay/transforms/transform_layout.h b/src/relay/transforms/transform_layout.h index 66689ae38f661..117096e1334ab 100644 --- a/src/relay/transforms/transform_layout.h +++ b/src/relay/transforms/transform_layout.h @@ -319,14 +319,23 @@ Expr LayoutRewriter(const Call& ref_call, const Array& new_args, const Obj } } - // old_in, new_in = state[inputs] - // naming rule: - // old_in, new_in: the input layouts given by downstream node. - // old_in2, new_in2: the input layouts inferred by the current node. - Array old_in, old_in2, old_out, new_in, new_out, new_in2; + // old_prd, new_prd = state[inputs] + // different ops can view a tensor with different layouts, e.g. conv_1->transpose(H, W)->conv_2 + // transpose views its output as having NCWH layout, but conv_2 still views it as NCHW to operate + // old_prd, new_prd: the input layouts from the perspective of the producer (transpose) + // old_cur, new_cur: the input layouts from the perspective of the current node (conv_2) + // old_prd->new_prd tells how the producer changed the layout + // old_cur->new_cur tells what change the current node wants to see + // No layout transforms are needed when they mean the same (NCHW->NCHW4c == NCWH->NCWH4c) + + // The workflow: + // 1. Run InferCorrectLayouts(NULL, old_prd) to get old_cur + // 2.
Run InferCorrectLayouts(new_prd, old_prd) to get new_cur and rewrite the current op + + Array old_prd, old_cur, old_out, new_prd, new_out, new_cur; for (auto inp : inputs) { - old_in.push_back(inp->old_layout); - new_in.push_back(inp->new_layout); + old_prd.push_back(inp->old_layout); + new_prd.push_back(inp->new_layout); } // Collect input types to pass on to Infer Correct Layout. @@ -338,30 +347,39 @@ Expr LayoutRewriter(const Call& ref_call, const Array& new_args, const Obj bool success = false; InferCorrectLayoutOutput infer_out; std::tie(infer_out, success) = - InferCorrectLayouts(ref_call, Array(nullptr), old_in, types); - old_in2 = infer_out->input_layouts; + InferCorrectLayouts(ref_call, Array(nullptr), old_prd, types); + old_cur = infer_out->input_layouts; old_out = infer_out->output_layouts; if (!success) { return Expr(nullptr); } - ICHECK_EQ(old_in2.size(), new_in.size()); - - Array new_in_tmp = new_in; // for backward compatibility of InferCorrectLayouts - // if new_in_tmp == 'undef': new_in_tmp = old_in2 - for (size_t i = 0; i < new_in_tmp.size(); ++i) { - if (!new_in_tmp[i].defined()) { - new_in_tmp.Set(i, old_in2[i]); + ICHECK_EQ(old_cur.size(), new_prd.size()); + + // for backward compatibility of InferCorrectLayouts + Array new_prd_inferred = new_prd; + // if new_prd_inferred == 'undef': new_prd_inferred = old_cur + for (size_t i = 0; i < new_prd_inferred.size(); ++i) { + if (!new_prd_inferred[i].defined()) { + new_prd_inferred.Set(i, old_cur[i]); + } + } + Array old_prd_inferred = old_prd; + // if old_prd_inferred == 'undef': old_prd_inferred = old_cur + for (size_t i = 0; i < old_prd_inferred.size(); ++i) { + if (!old_prd_inferred[i].defined()) { + old_prd_inferred.Set(i, old_cur[i]); } } // new_op = alter(op) Call new_call = memorizer->CallWithNewLayouts(ref_call, infer_out->new_attrs, normal_new_args); - // new_in2, new_out = op.infer(new_in) + // new_cur, new_out = op.infer(new_prd) if (new_call->op->IsInstance()) { success = false; - std::tie(infer_out, success) = InferCorrectLayouts(new_call, new_in_tmp, old_in2, types); - new_in2 = infer_out->input_layouts; + std::tie(infer_out, success) = + InferCorrectLayouts(new_call, new_prd_inferred, old_prd_inferred, types); + new_cur = infer_out->input_layouts; new_out = infer_out->output_layouts; if (!success) { return Expr(nullptr); @@ -372,21 +390,27 @@ Expr LayoutRewriter(const Call& ref_call, const Array& new_args, const Obj ICHECK_EQ(new_out.size(), old_out.size()) << "The number of output nodes should keep the same during alter_op_layout"; - ICHECK_EQ(new_in.size(), new_in2.size()) + ICHECK_EQ(new_prd.size(), new_cur.size()) << "The number of input nodes should keep the same during alter_op_layout"; - auto transform_layout = [&memorizer](Expr arg_item, const Layout& old_in, const Layout& old_in2, - const Layout& new_in, const Layout& new_in2) { - if (old_in2.Equals(old_in)) { // the two transforms can be fused to one - arg_item = memorizer.Transform(arg_item, new_in, new_in2); + auto transform_layout = [&memorizer](Expr arg_item, const Layout& old_prd, const Layout& old_cur, + const Layout& new_prd, const Layout& new_cur) { + if (old_cur.Equals(old_prd)) { // the two transforms can be fused to one + arg_item = memorizer.Transform(arg_item, new_prd, new_cur); } else { - if (old_in.defined()) arg_item = memorizer.Transform(arg_item, new_in, old_in); - arg_item = memorizer.Transform(arg_item, old_in2, new_in2); + if (old_prd.defined()) arg_item = memorizer.Transform(arg_item, new_prd, old_prd); + arg_item = 
memorizer.Transform(arg_item, old_cur, new_cur); } return arg_item; }; - // if (new_in != new_in2): insert transform (new_in -> new_in2) + DLOG(INFO) << "Transforming layout for `" << ref_call->op << "`"; + DLOG(INFO) << " old_prd=" << old_prd; + DLOG(INFO) << " new_prd=" << new_prd; + DLOG(INFO) << " old_cur=" << old_cur; + DLOG(INFO) << " new_cur=" << new_cur; + + // if (new_prd != new_cur): insert transform (new_prd -> new_cur) Array transformed_args; size_t pt = 0; for (auto arg : new_call->args) { @@ -396,13 +420,13 @@ Expr LayoutRewriter(const Call& ref_call, const Array& new_args, const Obj transformed_tuple_arg.reserve(tuple_arg->fields.size()); for (auto arg_item : tuple_arg->fields) { transformed_tuple_arg.push_back( - transform_layout(arg_item, old_in[pt], old_in2[pt], new_in[pt], new_in2[pt])); + transform_layout(arg_item, old_prd[pt], old_cur[pt], new_prd[pt], new_cur[pt])); pt++; } transformed_args.push_back(WithFields(tuple_arg, transformed_tuple_arg)); } else { transformed_args.push_back( - transform_layout(arg, old_in[pt], old_in2[pt], new_in[pt], new_in2[pt])); + transform_layout(arg, old_prd[pt], old_cur[pt], new_prd[pt], new_cur[pt])); pt++; } } diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py index 5aff77ad36f56..3fd7cb69771b1 100644 --- a/tests/python/relay/test_pass_alter_op_layout.py +++ b/tests/python/relay/test_pass_alter_op_layout.py @@ -1935,5 +1935,17 @@ def test_alter_with_subfunc(): assert tvm.ir.structural_equal(relay.transform.AlterOpLayout()(mod), mod) +def test_alter_with_reduce(): + x = relay.var("x", shape=(1, 1, 1, 1)) + y = relay.image.resize2d(x, (2, 4)) + z = relay.mean(y, axis=0) + a = relay.image.resize1d(z, (1,)) + func = relay.Function((x,), a) + mod = tvm.IRModule.from_expr(func) + mod = relay.transform.InferType()(mod) + with tvm.transform.PassContext(opt_level=4): + relay.build(mod, target="llvm") + + if __name__ == "__main__": pytest.main([__file__]) From c57320bd9d399d72f7c919c17e7876a4bbdc1c12 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Tue, 5 Jul 2022 23:52:06 -0500 Subject: [PATCH 056/111] [CI] Allow command-line argument or TVM_BUILD_PATH for C++ unittests (#12011) * [CI] Use command-line argument or TVM_BUILD_PATH for C++ unittests Previously, the `ci.py` script would execute all C++ unit tests in the `"build"` directory, regardless of the docker image being used. This change allows a caller to specify the build directory to be used by `task_cpp_unittest.sh`, either by the command line or by using the same `TVM_BUILD_PATH` environment variable as used by the top-level Makefile, and passes this argument from `ci.py`. To preserve the existing behavior for the pre-commit CI, if no argument is passed and if `TVM_BUILD_PATH` is undefined, `task_cpp_unittest.sh` defaults to the `"build"` directory. Python unit tests executed through `ci.py` used the `TVM_LIBRARY_PATH` environment variable, and were not similarly affected. * Remove `name=name` in format script Co-authored-by: driazati <9407960+driazati@users.noreply.github.com> * Fix lint error * Use default expansion of TVM_BUILD_PATH Otherwise, `set -u` rightly errors out for it being undefined.
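For reference, a quick shell illustration of the expansions involved (assuming TVM_BUILD_PATH is unset):

    set -u
    echo "$TVM_BUILD_PATH"           # error: TVM_BUILD_PATH: unbound variable
    echo "${TVM_BUILD_PATH:-}"       # expands to the empty string
    echo "${TVM_BUILD_PATH:-build}"  # falls back to "build"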
Co-authored-by: driazati <9407960+driazati@users.noreply.github.com> --- tests/scripts/ci.py | 15 ++++++++++----- tests/scripts/task_cpp_unittest.sh | 17 ++++++++++++++--- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/tests/scripts/ci.py b/tests/scripts/ci.py index 1ffd2d20e7ae9..022d192002325 100755 --- a/tests/scripts/ci.py +++ b/tests/scripts/ci.py @@ -372,12 +372,14 @@ def fn( if precheck is not None: precheck() + build_dir = get_build_dir(name) + if skip_build: scripts = [] else: scripts = [ - f"./tests/scripts/task_config_build_{name}.sh {get_build_dir(name)}", - f"./tests/scripts/task_build.py --build-dir {get_build_dir(name)}", + f"./tests/scripts/task_config_build_{name}.sh {build_dir}", + f"./tests/scripts/task_build.py --build-dir {build_dir}", ] if post_build is not None: @@ -394,7 +396,7 @@ def fn( # Add named test suites for option_name, (_, extra_scripts) in options.items(): if kwargs.get(option_name, False): - scripts += extra_scripts + scripts.extend(script.format(build_dir=build_dir) for script in extra_scripts) docker( name=gen_name(f"ci-{name}"), @@ -553,7 +555,7 @@ def add_subparser( return subparser -CPP_UNITTEST = ("run c++ unitests", ["./tests/scripts/task_cpp_unittest.sh"]) +CPP_UNITTEST = ("run c++ unitests", ["./tests/scripts/task_cpp_unittest.sh {build_dir}"]) generated = [ generate_command( @@ -610,7 +612,10 @@ def add_subparser( generate_command( name="wasm", help="Run WASM build and test(s)", - options={"test": ("run WASM tests", ["./tests/scripts/task_web_wasm.sh"])}, + options={ + "cpp": CPP_UNITTEST, + "test": ("run WASM tests", ["./tests/scripts/task_web_wasm.sh"]), + }, ), generate_command( name="qemu", diff --git a/tests/scripts/task_cpp_unittest.sh b/tests/scripts/task_cpp_unittest.sh index a28efb0328ec0..8ae2e9b1109f0 100755 --- a/tests/scripts/task_cpp_unittest.sh +++ b/tests/scripts/task_cpp_unittest.sh @@ -18,6 +18,16 @@ set -euxo pipefail +if [ $# -gt 0 ]; then + BUILD_DIR="$1" +elif [ -n "${TVM_BUILD_PATH:-}" ]; then + # TVM_BUILD_PATH may contain multiple space-separated paths. If + # so, use the first one. 
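+ # A quick illustration of the idiom on the next line, assuming
+ # TVM_BUILD_PATH="build-a build-b": inside the substitution, `set --`
+ # re-splits the value into positional parameters, so `echo $1` prints
+ # "build-a", the first path.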
+ BUILD_DIR=$(IFS=" "; set -- $TVM_BUILD_PATH; echo $1) +else + BUILD_DIR=build +fi + # Python is required by apps/bundle_deploy source tests/scripts/setup-pytest-env.sh @@ -32,16 +42,17 @@ export OMP_NUM_THREADS=1 # Build cpptest suite python3 tests/scripts/task_build.py \ --sccache-bucket tvm-sccache-prod \ - --cmake-target cpptest + --cmake-target cpptest \ + --build-dir "${BUILD_DIR}" # crttest requires USE_MICRO to be enabled, which is currently the case # with all CI configs -pushd build +pushd "${BUILD_DIR}" ninja crttest popd -pushd build +pushd "${BUILD_DIR}" ctest --gtest_death_test_style=threadsafe popd From c98626cbfa1936740dc829bb2e1d800094c10424 Mon Sep 17 00:00:00 2001 From: Dmitriy Smirnov Date: Wed, 6 Jul 2022 09:39:18 +0100 Subject: [PATCH 057/111] [USMP] HillClimb stability patch (#10547) This patch increases stability of the hill climb allocation algorithm Change-Id: I56414ae661fa856baeddce00f4717a9f5a9e2954 --- src/tir/usmp/algo/hill_climb.cc | 50 ++++++++----------- tests/python/relay/aot/test_crt_aot_usmp.py | 50 ++++++++++++++----- .../unittest/test_tir_usmp_algo_hill_climb.py | 12 +++-- 3 files changed, 67 insertions(+), 45 deletions(-) diff --git a/src/tir/usmp/algo/hill_climb.cc b/src/tir/usmp/algo/hill_climb.cc index 8234074f9c892..ed90430277ec2 100644 --- a/src/tir/usmp/algo/hill_climb.cc +++ b/src/tir/usmp/algo/hill_climb.cc @@ -44,6 +44,7 @@ namespace algo { * Works by continiously invoking 'greedy-by-size' allocation, * assessing the result, and introducing permutations to the allocation * order which hopefully will led to more 'compact' memory allocation. + * Do not forget to use srand for repeatable results */ class HillClimbAllocator : public GreedyBase { private: @@ -59,18 +60,18 @@ class HillClimbAllocator : public GreedyBase { /* * Initial sorting routine */ - void sort_vector(std::vector* buffer_info_vec) { - std::sort(buffer_info_vec->begin(), buffer_info_vec->end(), - [](const BufferInfo& a, const BufferInfo& b) { - if (a->size_bytes->value == b->size_bytes->value) { - if (a->conflicts.size() == b->conflicts.size()) { - return std::string(a->name_hint->data) > std::string(b->name_hint->data); - } else { - return a->conflicts.size() > b->conflicts.size(); - } - } - return a->size_bytes->value > b->size_bytes->value; - }); + template + void sort_vector(std::vector* buffer_info_vec) { + std::sort(buffer_info_vec->begin(), buffer_info_vec->end(), [](const T& a, const T& b) { + if (a->size_bytes->value == b->size_bytes->value) { + if (a->conflicts.size() == b->conflicts.size()) { + return std::string(a->name_hint->data) > std::string(b->name_hint->data); + } else { + return a->conflicts.size() > b->conflicts.size(); + } + } + return a->size_bytes->value > b->size_bytes->value; + }); } /* @@ -156,33 +157,21 @@ class HillClimbAllocator : public GreedyBase { void collect_neighbor_lists(const BufferInfoNode* buf, std::vector* first_level, std::vector* second_level, const TPos& _pos) { - std::unordered_map first_level_set; - std::unordered_map second_level_set; - auto buf_pos = _pos(buf); for (const auto& c1 : buf->conflicts) { const auto* c1_buf = c1.as(); int c1_pos = _pos(c1_buf); if (buf_pos > c1_pos) { - first_level_set[c1_pos] = c1_buf; + first_level->push_back(c1_buf); } int c2_pos = -1; for (const auto& c2 : c1_buf->conflicts) { const auto c2_buf = c2.as(); if (c1_pos > (c2_pos = _pos(c2_buf))) { - second_level_set[c2_pos] = c2_buf; + second_level->push_back(c2_buf); } } } - - // std::vector first_level; - for (const auto& i : first_level_set) { - 
first_level->push_back(i.second); - } - // std::vector second_level; - for (const auto& i : second_level_set) { - second_level->push_back(i.second); - } } public: @@ -202,7 +191,7 @@ class HillClimbAllocator : public GreedyBase { buffer_info_vec.push_back(std::move(buffer_info)); } - sort_vector(&buffer_info_vec); + sort_vector(&buffer_info_vec); // populate positional index map std::unordered_map _pos_map; @@ -283,12 +272,17 @@ class HillClimbAllocator : public GreedyBase { max_pool_buf.push_back(buf); } } - + sort(max_pool_buf.begin(), max_pool_buf.end(), + [&_pos](const auto* a, const auto* b) { return _pos(a) < _pos(b); }); // pick highest const BufferInfoNode* node = max_pool_buf[rnd_func() % max_pool_buf.size()]; std::vector first_level; std::vector second_level; collect_neighbor_lists(node, &first_level, &second_level, _pos); + sort(first_level.begin(), first_level.end(), + [&_pos](const auto* a, const auto* b) { return _pos(a) < _pos(b); }); + sort(second_level.begin(), second_level.end(), + [&_pos](const auto* a, const auto* b) { return _pos(a) < _pos(b); }); // retry if no first level neightbors were collected if (!first_level.size()) { diff --git a/tests/python/relay/aot/test_crt_aot_usmp.py b/tests/python/relay/aot/test_crt_aot_usmp.py index 0d3426dceeaf2..724932183a541 100644 --- a/tests/python/relay/aot/test_crt_aot_usmp.py +++ b/tests/python/relay/aot/test_crt_aot_usmp.py @@ -18,6 +18,8 @@ from collections import OrderedDict import re + +import random import numpy as np import pytest @@ -100,23 +102,47 @@ def test_synthetic(interface_api, use_unpacked_api, test_runner): @pytest.mark.parametrize( - "workspace_byte_alignment,constant_byte_alignment,main_workspace_size,main_constant_size", + "workspace_byte_alignment,constant_byte_alignment," + "main_workspace_size,main_constant_size,usmp_algo", [ - (8, 8, 17280, 948), - (16, 8, 17280, 948), - (256, 8, 17792, 948), - (8, 16, 17280, 956), - (16, 16, 17280, 956), - (256, 16, 17792, 956), - (8, 256, 17280, 1804), - (16, 256, 17280, 1804), - (256, 256, 17792, 1804), + (8, 8, 17280, 948, "greedy_by_conflicts"), + (16, 8, 17280, 948, "greedy_by_conflicts"), + (256, 8, 17792, 948, "greedy_by_conflicts"), + (8, 16, 17280, 956, "greedy_by_conflicts"), + (16, 16, 17280, 956, "greedy_by_conflicts"), + (256, 16, 17792, 956, "greedy_by_conflicts"), + (8, 256, 17280, 1804, "greedy_by_conflicts"), + (16, 256, 17280, 1804, "greedy_by_conflicts"), + (256, 256, 17792, 1804, "greedy_by_conflicts"), + (8, 8, 22032, 948, "greedy_by_size"), + (16, 8, 22032, 948, "greedy_by_size"), + (256, 8, 22976, 948, "greedy_by_size"), + (8, 16, 22032, 956, "greedy_by_size"), + (16, 16, 22032, 956, "greedy_by_size"), + (256, 16, 22976, 956, "greedy_by_size"), + (8, 256, 22032, 1804, "greedy_by_size"), + (16, 256, 22032, 1804, "greedy_by_size"), + (256, 256, 22976, 1804, "greedy_by_size"), + (8, 8, 11424, 948, "hill_climb"), + (16, 8, 11424, 948, "hill_climb"), + (256, 8, 11920, 948, "hill_climb"), + (8, 16, 11424, 956, "hill_climb"), + (16, 16, 11424, 956, "hill_climb"), + (256, 16, 11920, 956, "hill_climb"), + (8, 256, 11424, 1804, "hill_climb"), + (16, 256, 11424, 1804, "hill_climb"), + (256, 256, 11920, 1804, "hill_climb"), ], ) def test_memory_planning( - workspace_byte_alignment, constant_byte_alignment, main_workspace_size, main_constant_size + workspace_byte_alignment, + constant_byte_alignment, + main_workspace_size, + main_constant_size, + usmp_algo, ): """Checks calculated workspace against known values""" + random.seed(0) mod, params = 
tvm.relay.testing.synthetic.get_workload() target = "c" runtime = Runtime("crt") @@ -133,7 +159,7 @@ def test_memory_planning( "tir.disable_vectorize": True, "tir.disable_storage_rewrite": True, "tir.usmp.enable": True, - "tir.usmp.algorithm": "greedy_by_conflicts", + "tir.usmp.algorithm": usmp_algo, }, ): lib = tvm.relay.build(mod, target, executor=executor, runtime=runtime, params=params) diff --git a/tests/python/unittest/test_tir_usmp_algo_hill_climb.py b/tests/python/unittest/test_tir_usmp_algo_hill_climb.py index b486581064f9f..6450673e71ddc 100644 --- a/tests/python/unittest/test_tir_usmp_algo_hill_climb.py +++ b/tests/python/unittest/test_tir_usmp_algo_hill_climb.py @@ -23,7 +23,7 @@ from tvm import WorkspacePoolInfo, PoolInfoProperties -def _check_max_workspace_size(buffer_pool_allocations, pool_info, size): +def _check_max_workspace_size(buffer_pool_allocations, pool_info, size, tolerance=0): """Helper to check maximum allocated memory size""" max_workspace_size = 0 for buffer_info, pool_allocation in buffer_pool_allocations.items(): @@ -33,7 +33,7 @@ def _check_max_workspace_size(buffer_pool_allocations, pool_info, size): max_workspace_size = size_candidate _diff = max_workspace_size.value - size return ( - (max_workspace_size.value == size), + (max_workspace_size.value == size if tolerance == 0 else tolerance > 100 * _diff / size), "'{}': expected {} got {}, diff {:0.2f}% ({} bytes)".format( pool_info.pool_name, size, max_workspace_size, 100 * _diff / size, _diff ), @@ -335,7 +335,7 @@ def find_maximum_from_intervals(intervals): def test_intervals(intervals): """Tests supplied intervals""" random.seed(0) - result = run_intervals(intervals) + result = run_intervals(intervals, 5) assert result["tir.usmp.algo.hill_climb"] == True, f" {result}" @@ -355,7 +355,7 @@ def test_random_intervals(interval_len=16): return run_intervals(intervals) -def run_intervals(intervals): +def run_intervals(intervals, tolerance=0): """Helper to run intervals""" expected_mem = find_maximum_from_intervals(intervals) pools = [WorkspacePoolInfo("default", [])] @@ -391,7 +391,9 @@ def run_intervals(intervals): print() _verify_all_conflicts(buffer_info_arr) - result[alg], msg = _check_max_workspace_size(buffer_info_arr, pools[0], expected_mem) + result[alg], msg = _check_max_workspace_size( + buffer_info_arr, pools[0], expected_mem, tolerance + ) if not result[alg]: print(alg, msg) From 95f578912f8e6a6f7199188e52ce98966b919f05 Mon Sep 17 00:00:00 2001 From: Anirudh Sundar Date: Wed, 6 Jul 2022 20:22:44 +0530 Subject: [PATCH 058/111] [Topi] [Hexagon] Conv2d slice op initial version (#11489) --- python/tvm/topi/hexagon/slice_ops/__init__.py | 1 + python/tvm/topi/hexagon/slice_ops/conv2d.py | 242 +++++++++++++ python/tvm/topi/hexagon/utils.py | 14 + .../test_hexagon/topi/test_conv2d_slice.py | 339 ++++++++++++++++++ 4 files changed, 596 insertions(+) create mode 100644 python/tvm/topi/hexagon/slice_ops/conv2d.py create mode 100755 tests/python/contrib/test_hexagon/topi/test_conv2d_slice.py diff --git a/python/tvm/topi/hexagon/slice_ops/__init__.py b/python/tvm/topi/hexagon/slice_ops/__init__.py index 5b5c0b84214eb..ce1641bfda35a 100755 --- a/python/tvm/topi/hexagon/slice_ops/__init__.py +++ b/python/tvm/topi/hexagon/slice_ops/__init__.py @@ -23,3 +23,4 @@ from .batch_flatten import batch_flatten_compute, batch_flatten_stir_schedule from .softmax_slice import * from .clip import * +from .conv2d import * diff --git a/python/tvm/topi/hexagon/slice_ops/conv2d.py b/python/tvm/topi/hexagon/slice_ops/conv2d.py new 
file mode 100644 index 0000000000000..439fd80648f9d --- /dev/null +++ b/python/tvm/topi/hexagon/slice_ops/conv2d.py @@ -0,0 +1,242 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=line-too-long + +"""Hexagon slice conv2d compute and schedule""" +import typing + +import tvm +from tvm import te + +from ..utils import get_layout_transform_fn + + +def conv2d_compute( + activations: te.Tensor, + weights: te.Tensor, + out_shape: typing.Tuple, + stride: typing.Tuple, + dilation: typing.Tuple, + dtype: str, + output_name: str, + weights_width_reversed: bool = True, +) -> te.Tensor: + """Compute for slice conv2d op for Hexagon. + + This op makes the following assumptions: + 1. This op is written for a sliced convolution with 2d physical buffers + 2. The input activations are assumed to be in NHWC layout and the filter in HWIO layout + 3. Grouped convolutions are not supported, and there will be a separate compute definition for depthwise convolution + 4. In order to get grouped convolutions, it is assumed that the op will be sliced according to the groups and that multiple calls to this compute will be placed. + + + Parameters + ---------- + activations : te.Tensor + Input activations padded for inner dimension size + weights : te.Tensor + Weights without dilation + out_shape : typing.Tuple + The logical output shape without considering input padding + stride : typing.Tuple + stride + dilation : typing.Tuple + dilation + dtype : str + dtype + output_name : str + The name to be given to output.
This would become the block name for the corresponding STIR compute + weights_width_reversed : bool + The width axis of the weights is expected in reverse order if weights_width_reversed is True + + Returns + ------- + output : te.Tensor + Output of applying 2D convolution of Weights on Input + """ + + filt_shape = weights.shape + + reduce_channel = tvm.te.reduce_axis((0, filt_shape[2]), name="reduce_channel") + reduce_height = tvm.te.reduce_axis((0, filt_shape[0]), name="reduce_height") + reduce_width = tvm.te.reduce_axis((0, filt_shape[1]), name="reduce_width") + stride_height, stride_width = stride + dilation_height, dilation_width = dilation + + if weights_width_reversed: + weights_width_var = filt_shape[1] - reduce_width - 1 + else: + weights_width_var = reduce_width + + output = tvm.te.compute( + out_shape, + lambda n, h, w, c: tvm.te.sum( + ( + activations[ + n, + h * stride_height + reduce_height * dilation_height, + w * stride_width + reduce_width * dilation_width, + reduce_channel, + ] + * weights[reduce_height, weights_width_var, reduce_channel, c] + ).astype(dtype), + axis=[reduce_channel, reduce_height, reduce_width], + ), + name=output_name, + ) + return output + + +def conv2d_te_schedule( + out: te.Tensor, + ins: typing.List[te.Tensor], + transform_activation_layout: str, + transform_weights_layout: str, + transform_output_layout: str, +) -> te.Schedule: + """TE Schedule for the sliced conv2d op + + This schedule makes the following assumptions: + 1. There is only one output tensor + 2. The activations and weights have specific layouts defined by the last 2 arguments + 3. All transformation functions are expected to be bijections for now + + Parameters + ---------- + out : te.Tensor + The output tensor returned by a call to conv2d_compute + ins : typing.List[te.Tensor] + The list of 2 Tensors, which are the input activations and weights + transform_activation_layout : str + The expected activations layout + transform_weights_layout : str + String representing the weights layout as defined in get_layout_transform_fn + transform_output_layout: str + String representing the output layout as defined in get_layout_transform_fn + + Returns + ------- + sch : te.Schedule + The TE schedule for slice conv2d + """ + activations, weights = ins + output = out + sch = tvm.te.create_schedule(output.op) + reduce_channel, reduce_height, reduce_width = sch[output].op.reduce_axis + sch[activations].transform_layout(get_layout_transform_fn(transform_activation_layout)) + sch[weights].transform_layout(get_layout_transform_fn(transform_weights_layout)) + transformed_axis = sch[output].transform_layout( + get_layout_transform_fn(transform_output_layout) + ) + fused_out_axis = sch[output].fuse(transformed_axis[-1], transformed_axis[-2]) + sch[output].reorder( + *[*transformed_axis[:-2], reduce_height, reduce_width, reduce_channel, fused_out_axis] + ) + # The below code doesn't work yet as vectorization across 2D boundary is not yet supported + # s[output].vectorize(fused_out_axis) + return sch + + +def conv2d_schedule( + outs: te.Tensor, + ins: typing.List[te.Tensor], + transform_activation_layout: str, + transform_weights_layout: str, + transform_output_layout: str, + output_name: str, +) -> tvm.tir.Schedule: + """STIR schedule definition for the compute defined above by conv2d_compute.
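+ (In brief: the schedule below splits height by 8, width by 4 and then 2x2, and channel by 32, and reorders the loops so that the innermost dimensions line up with the 8h2w32c2w output layout; the TVMScript that follows is the auto-generated prim_func before those primitives are applied.)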
+ + - Auto-generated prim_func before applying schedule primitives for reference + - The below TVMScript code is for conv2d with padded input dimensions and a stride of 1x1 + + # from tvm.script import tir as T + @T.prim_func + def func(InputTensor: T.Buffer[(1, 24, 12, 32), "float16"], Weights: T.Buffer[(3, 3, 32, 32), "float16"], compute: T.Buffer[(1, 16, 8, 32), "float16"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + for i0, i1, i2, i3, i4, i5, i6 in T.grid(1, 16, 8, 32, 32, 3, 3): + with T.block("compute"): + n, h, w, c, rc, rh, rw = T.axis.remap("SSSSRRR", [i0, i1, i2, i3, i4, i5, i6]) + T.reads(InputTensor[n, h + rh, w + rw, rc], Weights[rh, rw, rc, c]) + T.writes(compute[n, h, w, c]) + with T.init(): + compute[n, h, w, c] = T.float16(0) + compute[n, h, w, c] = compute[n, h, w, c] + InputTensor[n, h + rh, w + rw, rc] * Weights[rh, rw, rc, c] + + Parameters + ---------- + outs : te.Tensor + The output Tensor as returned by a call to conv2d_compute + ins : typing.List[te.Tensor] + This is a list of 2 tensors - Input activations and Weights + transform_activation_layout : str + String representing the activations layout as defined in get_layout_transform_fn + transform_weights_layout : str + String representing the weights layout as defined in get_layout_transform_fn + transform_output_layout: str + String representing the output layout as defined in get_layout_transform_fn + output_name : str + The name that was given to the output compute and which can be used to get the block name + + Returns + ------- + sch : tvm.tir.Schedule + The STIR schedule for slice conv2d compute + """ + + assert len(ins) == 2, "This schedule expects only 2 inputs - Activations and Weights" + source_expr = ins + [outs] + prim_func = tvm.te.create_prim_func(source_expr) + sch = tvm.tir.Schedule(prim_func) + + compute = sch.get_block(output_name) + # Apply layout_transform for activation + sch.transform_layout(compute, ins[0].name, get_layout_transform_fn(transform_activation_layout)) + + # Apply layout_transform for weights + sch.transform_layout(compute, ins[1].name, get_layout_transform_fn(transform_weights_layout)) + + # Apply layout_transform for output + sch.transform_layout(compute, outs.name, get_layout_transform_fn(transform_output_layout)) + + batch, height, width, channel, reduce_channel, reduce_height, reduce_width = sch.get_loops( + compute + ) # This still returns the original 7d loop + h_outer, h_inner = sch.split(height, [None, 8]) + w_outer, w_inner = sch.split(width, [None, 4]) + w_inner_outer, w_inner_inner = sch.split(w_inner, [2, 2]) + c_outer, c_inner = sch.split(channel, [None, 32]) + sch.reorder( + batch, + h_outer, + w_outer, + c_outer, + h_inner, + w_inner_outer, + reduce_height, + reduce_width, + reduce_channel, + c_inner, + w_inner_inner, + ) + sch.decompose_reduction(compute, reduce_height) + # ci_wii = s.fuse(ci, wii) + # s.vectorize(ci_wii) + return sch diff --git a/python/tvm/topi/hexagon/utils.py b/python/tvm/topi/hexagon/utils.py index 092bce87119ac..58792fc3294fb 100644 --- a/python/tvm/topi/hexagon/utils.py +++ b/python/tvm/topi/hexagon/utils.py @@ -77,6 +77,18 @@ def nc_1024_2d(n, c): return [n, c // 1024, te.AXIS_SEPARATOR, c % 1024] +def iohw_16i32o2i_1d(height, width, in_channel, out_channel): + return [ + in_channel // 32, + out_channel // 32, + height, + width, + (in_channel % 32) // 2, + out_channel % 32, + in_channel % 2, + ] + + def get_layout_transform_fn(layout): """Return index 
map function as per the layout string""" if layout == "nhwc-8h2w32c2w-2d": @@ -101,4 +113,6 @@ def get_layout_transform_fn(layout): return nc_512c_2d if layout == "nc-512c-1d": return nc_512c_1d + if layout == "iohw-16i32o2i-1d": + return iohw_16i32o2i_1d raise RuntimeError(f"Unexpected layout '{layout}'") diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_slice.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_slice.py new file mode 100755 index 0000000000000..a03c35cb9e78f --- /dev/null +++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_slice.py @@ -0,0 +1,339 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=line-too-long, redefined-outer-name + +"""Test conv2d slice op for hexagon""" + +import numpy as np + +import tvm +import tvm.testing +from tvm.topi.hexagon.slice_ops.conv2d import conv2d_compute, conv2d_schedule +from tvm.topi.testing import conv2d_nhwc_python + +from ..infrastructure import allocate_hexagon_array, transform_numpy + +input_layout = tvm.testing.parameter( + "nhwc-8h2w32c2w-2d", +) + +output_layout = tvm.testing.parameter( + "nhwc-8h2w32c2w-2d", +) + +weights_layout = tvm.testing.parameter("iohw-16i32o2i-1d") + + +@tvm.testing.fixture +def input_np(in_shape, dtype): + return np.random.uniform(size=in_shape).astype(dtype) + + +@tvm.testing.fixture +def weights_np(filt_shape, dtype): + return (np.random.uniform(size=filt_shape)).astype(dtype) + + +@tvm.testing.fixture +def dilated_filt_shape(filt_shape, dilation): + """Compute the dilated filter shape when dilation > 1""" + filt_height, filt_width, in_channel, out_channel = filt_shape + dilation_height, dilation_width = dilation + if dilation_height == 1 and dilation_width == 1: + return filt_shape + dilated_height, dilated_width = ( + dilation_height * (filt_height - 1) + 1, + dilation_width * (filt_width - 1) + 1, + ) + return dilated_height, dilated_width, in_channel, out_channel + + +@tvm.testing.fixture +def dilated_weights_np(weights_np, dilation, dilated_filt_shape): + """Get dilated weights from original weights for testing""" + filt_height, filt_width, in_channels, out_channels = weights_np.shape + dilation_height, dilation_width = dilation + if dilation_height == 1 and dilation_width == 1: + return weights_np + dilated_height, dilated_width = dilated_filt_shape[0], dilated_filt_shape[1] + dilated_weights = np.zeros(dilated_filt_shape, dtype="float16") + for in_channel in range(in_channels): + for out_channel in range(out_channels): + for dilation_i, height_i in zip( + range(0, dilated_height, dilation_height), range(filt_height) + ): + for dilation_j, width_j in zip( + range(0, dilated_width, dilation_width), range(filt_width) + ): + dilated_weights[dilation_i, dilation_j, in_channel, out_channel] = weights_np[ + height_i, width_j, 
in_channel, out_channel + ] + + return dilated_weights + + +@tvm.testing.fixture +def input_np_padded(input_np, in_shape, padded_in_shape): + pad_height = padded_in_shape[1] - in_shape[1] + pad_width = padded_in_shape[2] - in_shape[2] + pad_channel = padded_in_shape[3] - in_shape[3] + input_padded = np.pad( + input_np, ((0, 0), (0, pad_height), (0, pad_width), (0, pad_channel)), "constant" + ) + return input_padded + + +@tvm.testing.fixture +def padded_filt_shape(filt_shape): + filt_height, filt_width, in_channels, out_channels = filt_shape + in_channels = ((in_channels + 31) // 32) * 32 + out_channels = ((out_channels + 31) // 32) * 32 + return filt_height, filt_width, in_channels, out_channels + + +@tvm.testing.fixture +def weights_np_padded(weights_np, filt_shape, padded_filt_shape): + pad_in_channels = padded_filt_shape[2] - filt_shape[2] + pad_out_channels = padded_filt_shape[3] - filt_shape[3] + filt_padded = np.pad(weights_np, ((0, 0), (0, 0), (0, pad_in_channels), (0, pad_out_channels))) + return filt_padded + + +@tvm.testing.fixture +def weights_np_transformed(weights_np_padded): + height, width, in_channel, out_channel = weights_np_padded.shape + weights_np_reverse_width = weights_np_padded[:, ::-1, :, :] + transformed_weights_np = weights_np_reverse_width.reshape( + [height, width, in_channel // 32, 16, 2, out_channel // 32, 32] + ).transpose(2, 5, 0, 1, 3, 6, 4) + return transformed_weights_np + + +def generate_test_config(test_params): + """Utility function to generate test config with meaningful ids""" + test_config = {} + + dims = lambda vals: "x".join(map(str, vals)) + + for param in test_params: + in_shape, filt_shape, stride, dilation = param + test_name = f"nhwc{dims(in_shape)}-hwio{dims(filt_shape)}-stride{dims(stride)}-dilation{dims(dilation)}" + test_config[test_name] = param + + return test_config + + +class TestConv2dSlice: + """Test class that defines the conv2d slice test""" + + test_params = [ + [ + (1, 10, 6, 32), + (3, 3, 32, 32), + (1, 1), + (1, 1), + ], + [ + (1, 18, 10, 32), + (3, 3, 32, 32), + (1, 1), + (1, 1), + ], + [ + (1, 10, 6, 64), + (3, 3, 64, 64), + (1, 1), + (1, 1), + ], + [ + (1, 12, 8, 4), + (3, 3, 4, 32), + (1, 1), + (2, 2), + ], + [ + (1, 12, 8, 32), + (5, 5, 32, 32), + (1, 1), + (1, 1), + ], + [ + (1, 16, 12, 32), + (5, 5, 32, 32), + (1, 1), + (2, 2), + ], + [ + (1, 13, 9, 32), + (6, 6, 32, 32), + (1, 1), + (1, 1), + ], + [ + (1, 18, 10, 32), + (3, 3, 32, 32), + (2, 2), + (1, 1), + ], + [ + (1, 20, 12, 32), + (5, 5, 32, 32), + (2, 2), + (1, 1), + ], + [ + (1, 22, 14, 32), + (7, 7, 32, 32), + (2, 2), + (1, 1), + ], + [ + (1, 28, 20, 32), + (7, 7, 32, 32), + (2, 2), + (2, 2), + ], + [ + (1, 10, 4, 4), + (3, 1, 4, 32), + (1, 1), + (1, 1), + ], + [ + (1, 18, 8, 4), + (3, 1, 4, 32), + (2, 2), + (1, 1), + ], + [ + (1, 20, 8, 4), + (3, 1, 4, 32), + (2, 2), + (2, 2), + ], + ] + + test_config = generate_test_config(test_params) + + in_shape, filt_shape, stride, dilation = tvm.testing.parameters( + *test_config.values(), ids=test_config.keys() + ) + dtype = tvm.testing.parameter("float16") + working_scope = tvm.testing.parameter("global.vtcm") + + @tvm.testing.fixture + def padded_in_shape(self, in_shape): + in_batch, in_height, in_width, in_channel = in_shape + in_height = ((in_height + 7) // 8) * 8 + in_width = ((in_width + 3) // 4) * 4 + in_channel = ((in_channel + 31) // 32) * 32 + return in_batch, in_height, in_width, in_channel + + @tvm.testing.fixture + def out_shape(self, in_shape, dilated_filt_shape, stride): + in_batch, in_height, in_width, _ 
= in_shape + filt_height, filt_width, _, num_filt = dilated_filt_shape + out_height = (in_height - filt_height) // stride[0] + 1 + out_width = (in_width - filt_width) // stride[1] + 1 + out_channel = num_filt + return in_batch, out_height, out_width, out_channel + + @tvm.testing.fixture + def expected_output_np(self, input_np, dilated_weights_np, stride): + ref_np = conv2d_nhwc_python( + input_np.astype("float32"), dilated_weights_np.astype("float32"), stride, padding=0 + ).astype("float16") + return ref_np + + @tvm.testing.requires_hexagon + def test_conv2d( + self, + padded_in_shape, + padded_filt_shape, + stride, + dilation, + dtype, + out_shape, + input_layout, + weights_layout, + output_layout, + input_np_padded, + weights_np_transformed, + expected_output_np, + target, + working_scope, + hexagon_session, + ): + """Main test function that tests the conv2d slice op""" + input_tensor = tvm.te.placeholder(padded_in_shape, name="InputTensor", dtype=dtype) + weights = tvm.te.placeholder(padded_filt_shape, name="Weights", dtype=dtype) + output_name = "output" + + output_tensor = conv2d_compute( + input_tensor, weights, out_shape, stride, dilation, dtype, output_name + ) + + target_hexagon = tvm.target.hexagon("v69") + target = tvm.target.Target(target_hexagon, host=target_hexagon) + + tir_schedule = conv2d_schedule( + output_tensor, + [input_tensor, weights], + input_layout, + weights_layout, + output_layout, + output_name, + ) + + func_name = f"fconv2d_{dtype}" + with tvm.transform.PassContext(opt_level=3): + runtime_module = tvm.build( + tir_schedule.mod, + target=target, + name=func_name, + ) + + input_np_transformed = transform_numpy(input_np_padded, "nhwc", input_layout) + output_np_transformed = transform_numpy(expected_output_np, "nhwc", output_layout) + + input_arr = allocate_hexagon_array( + hexagon_session.device, + data=input_np_transformed, + axis_separators=[4], + mem_scope=working_scope, + ) + + weights_arr = allocate_hexagon_array( + hexagon_session.device, data=weights_np_transformed, mem_scope=working_scope + ) + + output_arr = allocate_hexagon_array( + hexagon_session.device, + tensor_shape=output_np_transformed.shape, + dtype=output_np_transformed.dtype, + axis_separators=[4], + mem_scope=working_scope, + ) + + mod = hexagon_session.load_module(runtime_module) + mod(input_arr, weights_arr, output_arr) + output_np = output_arr.numpy() + np.testing.assert_allclose(output_np, output_np_transformed, atol=1.0, rtol=0.05) + + +if __name__ == "__main__": + tvm.testing.main() From cfe8318990e799215c9baac4ef2c4ecb20a91d9f Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Wed, 6 Jul 2022 16:46:29 +0100 Subject: [PATCH 059/111] [microNPU] Calculate memory pressure for microNPU external functions (#11209) * [microNPU] Calculate memory pressure for microNPU external functions During the microNPU compilation stage, the "used_memory" annotations on external microNPU functions are read to determine a memory pressure value. This value is passed to the cascader to better approximate the memory available for the optimization. 
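In outline the calculation works as in the following sketch, written as a small self-contained Python model rather than against the TVM API; `estimate_pressure` and `remaining_pool_size` are illustrative names only and do not exist in this patch. Each external function carries a "used_memory" annotation with one value per callsite; the per-function maxima are summed, main's "io_used_memory" is subtracted when I/O tensors are not placed in the workspace, and the remainder shrinks the pool's size hint handed to the cascader.

def estimate_pressure(used_memory_lists, io_used_memory, use_workspace_io):
    # Each function contributes the largest of its "used_memory" annotations
    # (one value per callsite); the per-function maxima are summed.
    pressure = sum(max(used) for used in used_memory_lists)
    if not use_workspace_io:
        # I/O tensors live outside the workspace, so their share is removed.
        pressure -= io_used_memory
    return pressure

def remaining_pool_size(size_hint_bytes, pressure):
    # The cascader is then given the reduced size hint.
    return size_hint_bytes - pressure

# Two NPU functions annotated with 32 and 48 bytes, with 32 bytes of I/O:
pressure = estimate_pressure([[32], [32 + 16]], io_used_memory=32, use_workspace_io=False)
print(remaining_pool_size(2000, pressure))  # 2000 - (32 + 48 - 32) = 1952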
Change-Id: I11a311b0005e785637014cb451f4aed96edcda26 * fix get size from memory region Change-Id: I41acfc83f05b2204075edb99f86a0eecaba00f71 * add test case for full offload Change-Id: If3e672d402ab237fa82e34761bb972d2e9483ba9 --- .../tvm/contrib/ethosu/cascader/scheduler.py | 8 +- .../relay/backend/contrib/ethosu/codegen.py | 59 ++++-- python/tvm/tir/usmp/utils.py | 9 + .../test_calculate_memory_pressure.py | 186 ++++++++++++++++++ .../cascader/test_memory_reduction.py | 163 ++++++++++++++- 5 files changed, 407 insertions(+), 18 deletions(-) create mode 100644 tests/python/contrib/test_ethosu/cascader/test_calculate_memory_pressure.py diff --git a/python/tvm/contrib/ethosu/cascader/scheduler.py b/python/tvm/contrib/ethosu/cascader/scheduler.py index fd247e660a8d8..2c804a3b3b645 100644 --- a/python/tvm/contrib/ethosu/cascader/scheduler.py +++ b/python/tvm/contrib/ethosu/cascader/scheduler.py @@ -225,21 +225,21 @@ def choose_proposal( return proposal_choice -def extract_memory_info(memory_pool: PoolInfo) -> MemoryRegion: +def extract_memory_info(memory_pool: PoolInfo, memory_pressure: int) -> MemoryRegion: "Create a MemoryRegion based on the info in the memory pool" - size = int(memory_pool.size_hint_bytes) + size = int(memory_pool.size_hint_bytes - memory_pressure) read_bandwidth = int(memory_pool.read_bandwidth_bytes_per_cycle) write_bandwidth = int(memory_pool.write_bandwidth_bytes_per_cycle) for param in (size, read_bandwidth, write_bandwidth): assert param != -1, f"{param} needs to be specified for the cascader." - name_to_burst_lenght = { + name_to_burst_length = { target.kind.name: burst for target, burst in memory_pool.target_burst_bytes.items() } try: - burst_length = int(name_to_burst_lenght["ethos-u"]) + burst_length = int(name_to_burst_length["ethos-u"]) except KeyError: burst_length = 1 diff --git a/python/tvm/relay/backend/contrib/ethosu/codegen.py b/python/tvm/relay/backend/contrib/ethosu/codegen.py index 423834daa8763..5119c04edba45 100644 --- a/python/tvm/relay/backend/contrib/ethosu/codegen.py +++ b/python/tvm/relay/backend/contrib/ethosu/codegen.py @@ -381,6 +381,46 @@ def _ethos_u55_cascader(sram, enable_striping) -> Callable: ) +def _calculate_memory_pressure(mod: tvm.ir.IRModule) -> int: + """ + Calculates a worst-case estimate of the memory consumed at the callsite of + each microNPU function. This value can be used as a hint to guide the cascader, + indicating how aggressively it will need to optimize the input module to fit + into the memory that remains in the memory workspace. + + Parameters + ---------- + mod : tvm.ir.IRModule + The input module + + Returns + ------- + int + Memory pressure value for the module. + """ + memory_pressure = 0 + + @util.create_npu_function_pass(opt_level=1) + class CalculateMemoryPressure: + """ + Traverse the module and get total memory used by external NPU functions. 
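+        Each function contributes the maximum of its "used_memory" values
+        (one entry per callsite), and these maxima are summed across functions.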
+ """ + + def transform_npu_function(self, _, func: relay.Function) -> relay.Function: + nonlocal memory_pressure + max_val = max(func.attrs["used_memory"]) + memory_pressure += max_val + return func + + CalculateMemoryPressure()(mod) # pylint: disable=not-callable + + io_used_memory = 0 + if not tvm.tir.usmp.utils.use_workspace_io_is_enabled(): + io_used_memory = int(mod["main"].attrs["io_used_memory"]) + + return memory_pressure - io_used_memory + + @tvm._ffi.register_func("relay.ext.ethos-u.relay_to_tir") def relay_to_tir(mod: tvm.ir.IRModule) -> tvm.ir.IRModule: """ @@ -413,21 +453,18 @@ def relay_to_tir(mod: tvm.ir.IRModule) -> tvm.ir.IRModule: # Use the cascader if it is enabled for the U55 accelerator, otherwise use copy_constants # scheduler if util.is_cascader_enabled(): - assert ( - util.get_accelerator_config() != "ethos-u65-256" - ), "Cascading is not supported for the U65 accelerator" + if util.get_accelerator_config() == "ethos-u65-256": + raise ValueError("Cascading is not supported for the U65 accelerator") workspace_memory_pools = mod.attrs["workspace_memory_pools"] - assert ( - workspace_memory_pools - ), "Workspace memory pool needs to be provided for the U55 cascader" - - assert ( - len(workspace_memory_pools.pools) == 1 - ), "Exactly one workspace pool needs to be provided for the U55 cascader" + if not workspace_memory_pools: + raise ValueError("Workspace memory pool needs to be provided for the U55 cascader") + if len(workspace_memory_pools.pools) != 1: + raise ValueError("Exactly one workspace pool needs to be provided for the U55 cascader") - sram = extract_memory_info(workspace_memory_pools.pools[0]) + memory_pressure = _calculate_memory_pressure(mod) + sram = extract_memory_info(workspace_memory_pools.pools[0], memory_pressure) tir_mod = LowerToTIR(_ethos_u55_cascader(sram, util.is_striping_enabled()))(mod) else: tir_mod = LowerToTIR(copy_constants())(mod) diff --git a/python/tvm/tir/usmp/utils.py b/python/tvm/tir/usmp/utils.py index a7221cfe6f8ee..024922e85b295 100644 --- a/python/tvm/tir/usmp/utils.py +++ b/python/tvm/tir/usmp/utils.py @@ -19,6 +19,7 @@ from typing import Optional, List +import tvm from tvm._ffi import register_object from tvm.runtime import Object from . import _ffi_api @@ -31,6 +32,14 @@ CANDIDATE_MEMORY_POOL_ATTR = "candidate_memory_pools" +def use_workspace_io_is_enabled() -> bool: + """ + Check whether placing I/O tensors in the workspace is enabled. + """ + ctx = tvm.transform.PassContext.current() + return bool(ctx.config.get("tir.usmp.use_workspace_io", False)) + + @register_object("tir.usmp.BufferInfo") class BufferInfo(Object): """BufferInfo object holds information related to buffers diff --git a/tests/python/contrib/test_ethosu/cascader/test_calculate_memory_pressure.py b/tests/python/contrib/test_ethosu/cascader/test_calculate_memory_pressure.py new file mode 100644 index 0000000000000..255ec4bba8929 --- /dev/null +++ b/tests/python/contrib/test_ethosu/cascader/test_calculate_memory_pressure.py @@ -0,0 +1,186 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=wrong-import-position + +""" +Test memory pressure is calculated correctly from used memory annotations. +""" + +import pytest + +pytest.importorskip("ethosu.vela") + +import tvm +from tvm import relay +from tvm.relay.backend.contrib.ethosu.codegen import _calculate_memory_pressure +from tvm.contrib.ethosu.cascader.scheduler import extract_memory_info +from tvm import WorkspacePoolInfo, PoolInfoProperties + + +def _npu_and_non_npu_functions(): + mod = tvm.IRModule({}) + + # NPU function 1 + x = relay.var("x", shape=(1, 2, 2, 4), dtype="int8") + max_pool = relay.nn.max_pool2d(x) + composite_func = relay.Function([x], max_pool) + composite_func = composite_func.with_attr("Composite", "ethos-u.pooling") + inp = relay.var("input", shape=(1, 2, 2, 4), dtype="int8") + compiler_func = relay.Function([inp], composite_func) + compiler_func = compiler_func.with_attr("used_memory", [32]) + npu_compiler_func1 = compiler_func.with_attr("Compiler", "ethos-u") + g1 = relay.GlobalVar("g1") + mod[g1] = npu_compiler_func1 + + # Non-NPU function + x = relay.var("x", shape=(1, 2, 2, 4), dtype="int8") + max_pool = relay.abs(x) + composite_func = relay.Function([x], max_pool) + composite_func = composite_func.with_attr("Composite", "foo.unary_elementwise") + inp = relay.var("input", shape=(1, 2, 2, 4), dtype="int8") + compiler_func = relay.Function([inp], composite_func) + compiler_func = compiler_func.with_attr("used_memory", [32]) + non_npu_compiler_func = compiler_func.with_attr("Compiler", "foo") + g2 = relay.GlobalVar("g2") + mod[g2] = non_npu_compiler_func + + # NPU function 2 + x = relay.var("x", shape=(1, 2, 2, 4), dtype="int8") + max_pool = relay.abs(x) + composite_func = relay.Function([x], max_pool) + composite_func = composite_func.with_attr("Composite", "ethos-u.unary_elementwise") + inp = relay.var("input", shape=(1, 2, 2, 4), dtype="int8") + compiler_func = relay.Function([inp], composite_func) + compiler_func = compiler_func.with_attr("used_memory", [32]) + npu_compiler_func2 = compiler_func.with_attr("Compiler", "ethos-u") + g3 = relay.GlobalVar("g3") + mod[g3] = npu_compiler_func2 + + # Main + inp = relay.var("main_input", shape=(1, 2, 2, 4), dtype="int8") + call1 = relay.Call(g1, [inp]) + call2 = relay.Call(g2, [call1]) + call3 = relay.Call(g3, [call2]) + main_func = relay.Function([inp], call3) + main_func = main_func.with_attr("io_used_memory", 32) + mod["main"] = main_func + return mod + + +def _parallel_npu_functions(): + mod = tvm.IRModule({}) + + # NPU function 1 + x = relay.var("x", shape=(1, 2, 2, 4), dtype="int8") + max_pool = relay.nn.max_pool2d(x) + composite_func = relay.Function([x], max_pool) + composite_func = composite_func.with_attr("Composite", "ethos-u.pooling") + inp = relay.var("input", shape=(1, 2, 2, 4), dtype="int8") + compiler_func = relay.Function([inp], composite_func) + compiler_func = compiler_func.with_attr("used_memory", [32]) + npu_compiler_func1 = compiler_func.with_attr("Compiler", "ethos-u") + g1 = relay.GlobalVar("g1") + mod[g1] = npu_compiler_func1 + + # NPU function 2 + x = relay.var("x", shape=(1, 2, 2, 4), dtype="int8") + 
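+    # This second function is annotated with 32 + 16 = 48 bytes; together with
+    # g1's 32 bytes, the parametrized test below expects a pressure of 80 bytes
+    # before the 32 bytes of I/O memory are subtracted.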
abs_op = relay.abs(x) + composite_func = relay.Function([x], abs_op) + composite_func = composite_func.with_attr("Composite", "ethos-u.unary_elementwise") + inp = relay.var("input", shape=(1, 2, 2, 4), dtype="int8") + compiler_func = relay.Function([inp], composite_func) + compiler_func = compiler_func.with_attr("used_memory", [32 + 16]) + npu_compiler_func2 = compiler_func.with_attr("Compiler", "ethos-u") + g2 = relay.GlobalVar("g2") + mod[g2] = npu_compiler_func2 + + # Main + inp = relay.var("main_input", shape=(1, 2, 2, 4), dtype="int8") + call1 = relay.Call(g1, [inp]) + call2 = relay.Call(g2, [inp]) + concat = relay.concatenate([call1, call2], axis=3) + main_func = relay.Function([inp], concat) + main_func = main_func.with_attr("io_used_memory", 32) + mod["main"] = main_func + return mod + + +def _full_offload(): + mod = tvm.IRModule({}) + + # NPU function + x = relay.var("x", shape=(1, 4, 4, 16), dtype="int8") + max_pool = relay.nn.max_pool2d(x) + composite_func = relay.Function([x], max_pool) + composite_func = composite_func.with_attr("Composite", "ethos-u.pooling") + inp = relay.var("input", shape=(1, 4, 4, 16), dtype="int8") + compiler_func = relay.Function([inp], composite_func) + compiler_func = compiler_func.with_attr("used_memory", [256 + 256]) + npu_compiler_func = compiler_func.with_attr("Compiler", "ethos-u") + g1 = relay.GlobalVar("g1") + mod[g1] = npu_compiler_func + + # Main + inp = relay.var("main_input", shape=(1, 4, 4, 16), dtype="int8") + call = relay.Call(g1, [inp]) + main_func = relay.Function([inp], call) + main_func = main_func.with_attr("io_used_memory", 256 + 256) + mod["main"] = main_func + return mod + + +@pytest.mark.parametrize( + "model_func,use_workspace_io,expected_memory_pressure", + [ + (_npu_and_non_npu_functions, True, (16 + 16) + (16 + 16)), + (_npu_and_non_npu_functions, False, (16 + 16) + (16 + 16) - (16 + 16)), + (_parallel_npu_functions, True, (16 + 16) + (16 + 16 + 16)), + (_parallel_npu_functions, False, (16 + 16) + (16 + 16 + 16) - (16 + 16)), + (_full_offload, True, (256 + 256)), + (_full_offload, False, (256 + 256) - (256 + 256)), + ], +) +def test_calculate_memory_pressure_pass(model_func, use_workspace_io, expected_memory_pressure): + """ + Test that memory pressure is correctly calculated for NPU external functions. + """ + + mod = model_func() + with tvm.transform.PassContext(config={"tir.usmp.use_workspace_io": use_workspace_io}): + memory_pressure = _calculate_memory_pressure(mod) + assert memory_pressure == expected_memory_pressure + + +def test_extract_memory_info(): + """ + Test memory pressure value correctly reduces the workspace size. 
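+    The pool's size_hint_bytes is reduced by the memory pressure before the
+    region is handed to the cascader.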
+ """ + initial_pool_size = 2000 + memory_pressure = 500 + memory_pool = WorkspacePoolInfo( + "SRAM", + [tvm.target.Target("c"), tvm.target.Target("ethos-u")], + PoolInfoProperties( + size_hint_bytes=initial_pool_size, + read_bandwidth_bytes_per_cycle=16, + write_bandwidth_bytes_per_cycle=16, + target_burst_bytes={tvm.target.Target("ethos-u"): 1}, + ), + ) + + sram = extract_memory_info(memory_pool, memory_pressure) + assert sram.size == initial_pool_size - memory_pressure diff --git a/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py b/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py index 5c3b745cb423a..e88282240510e 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py +++ b/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py @@ -28,13 +28,12 @@ from tvm.relay.op.contrib.ethosu import partition_for_ethosu import tvm from tvm import WorkspaceMemoryPools, WorkspacePoolInfo, PoolInfoProperties +from tvm.relay.backend.contrib.ethosu.codegen import extract_memory_info from .. import infra -def _get_ethosu_workspace_size( - mod, params, accel_type, pool_size, enable_cascader, enable_striping -): +def _get_compilation_config(accel_type, enable_cascader, enable_striping): enable_usmp = True target = tvm.target.Target("c") @@ -61,6 +60,17 @@ def _get_ethosu_workspace_size( "tir.disable_storage_rewrite": enable_usmp, } + return target, ethosu_target, runtime, executor, pass_config + + +def _get_ethosu_workspace_size( + mod, params, accel_type, pool_size, enable_cascader, enable_striping +): + + target, ethosu_target, runtime, executor, pass_config = _get_compilation_config( + accel_type, enable_cascader, enable_striping + ) + workspace_memory_pools = WorkspaceMemoryPools( [ WorkspacePoolInfo( @@ -234,3 +244,150 @@ def tf_graph(x): assert workspace_size_cascader_disabled == expected_ws_size_without_striping assert workspace_size_cascader_enabled_striping_enabled == expected_ws_size_with_striping + + +def test_multiple_memory_pools(): + """ + The cascader does not support multiple workspace memory + pools. Check the correct error is thrown. 
+ """ + np.random.seed(2) + ifm_shape = (1, 80, 75, 3) + + target, ethosu_target, runtime, executor, pass_config = _get_compilation_config( + "ethos-u55-256", True, True + ) + workspace_memory_pools = WorkspaceMemoryPools( + [ + WorkspacePoolInfo( + "SRAM", + [target, ethosu_target], + PoolInfoProperties( + size_hint_bytes=1, + read_bandwidth_bytes_per_cycle=16, + write_bandwidth_bytes_per_cycle=16, + target_burst_bytes={ethosu_target: 1}, + ), + ), + WorkspacePoolInfo( + "SRAM", + [target, ethosu_target], + PoolInfoProperties( + size_hint_bytes=1, + read_bandwidth_bytes_per_cycle=16, + write_bandwidth_bytes_per_cycle=16, + target_burst_bytes={ethosu_target: 1}, + ), + ), + ] + ) + + @tf.function + def tf_graph(x): + return tf.nn.max_pool(x, (3, 3), (1, 1), "SAME") + + _, tflite_graph = infra.get_tflite_graph(tf_graph, [ifm_shape]) + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_graph, 0) + relay_module, params = relay.frontend.from_tflite(tflite_model) + mod = partition_for_ethosu(relay_module, params) + + with pytest.raises(ValueError) as e: + with tvm.transform.PassContext(opt_level=3, config=pass_config): + tvm.relay.build( + mod, + target, + executor=executor, + runtime=runtime, + workspace_memory_pools=workspace_memory_pools, + params=params, + ) + + expected_reason = "Exactly one workspace pool needs to be provided for the U55 cascader" + on_error = "A ValueError was caught but its reason is not the expected one." + assert expected_reason in str(e.value), on_error + + +def test_missing_memory_pools(): + """ + The cascader requires memory pools to be present, check the correct error + is thrown when there aren't any. + """ + np.random.seed(2) + ifm_shape = (1, 80, 75, 3) + + target, _, runtime, executor, pass_config = _get_compilation_config("ethos-u55-256", True, True) + + @tf.function + def tf_graph(x): + return tf.nn.max_pool(x, (3, 3), (1, 1), "SAME") + + _, tflite_graph = infra.get_tflite_graph(tf_graph, [ifm_shape]) + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_graph, 0) + relay_module, params = relay.frontend.from_tflite(tflite_model) + mod = partition_for_ethosu(relay_module, params) + + with pytest.raises(ValueError) as e: + with tvm.transform.PassContext(opt_level=3, config=pass_config): + tvm.relay.build( + mod, + target, + executor=executor, + runtime=runtime, + workspace_memory_pools=None, + params=params, + ) + + expected_reason = "Workspace memory pool needs to be provided for the U55 cascader" + on_error = "A ValueError was caught but its reason is not the expected one." + assert expected_reason in str(e.value), on_error + + +def test_invalid_accelerator(): + """ + Check an error is thrown when an unsupported accelerator configuration + is used. 
+ """ + np.random.seed(2) + ifm_shape = (1, 80, 75, 3) + + target, ethosu_target, runtime, executor, pass_config = _get_compilation_config( + "ethos-u65-256", True, True + ) + workspace_memory_pools = WorkspaceMemoryPools( + [ + WorkspacePoolInfo( + "SRAM", + [target, ethosu_target], + PoolInfoProperties( + size_hint_bytes=1, + read_bandwidth_bytes_per_cycle=16, + write_bandwidth_bytes_per_cycle=16, + target_burst_bytes={ethosu_target: 1}, + ), + ), + ] + ) + + @tf.function + def tf_graph(x): + return tf.nn.max_pool(x, (3, 3), (1, 1), "SAME") + + _, tflite_graph = infra.get_tflite_graph(tf_graph, [ifm_shape]) + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_graph, 0) + relay_module, params = relay.frontend.from_tflite(tflite_model) + mod = partition_for_ethosu(relay_module, params) + + with pytest.raises(ValueError) as e: + with tvm.transform.PassContext(opt_level=3, config=pass_config): + tvm.relay.build( + mod, + target, + executor=executor, + runtime=runtime, + workspace_memory_pools=workspace_memory_pools, + params=params, + ) + + expected_reason = "Cascading is not supported for the U65 accelerator" + on_error = "A ValueError was caught but its reason is not the expected one." + assert expected_reason in str(e.value), on_error From 1392e64e0bd9f55238256f5feb95eb2af90b6b97 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Wed, 6 Jul 2022 22:08:23 -0700 Subject: [PATCH 060/111] [Arith] Allow constant values in InverseAffineIterMap (#12026) --- src/arith/iter_affine_map.cc | 4 +++- tests/python/unittest/test_arith_iter_affine_map.py | 13 +++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/arith/iter_affine_map.cc b/src/arith/iter_affine_map.cc index e1d6d316b4233..d2aa16ded1f6f 100644 --- a/src/arith/iter_affine_map.cc +++ b/src/arith/iter_affine_map.cc @@ -2163,7 +2163,9 @@ class InverseAffineIterMapTransformer { * descending order of lower_factor. 
*/ void CheckFusePattern(const IterSumExpr sum_expr) { - ICHECK(sum_expr->args.size()); + if (sum_expr->args.empty()) { + return; + } PrimExpr expected_scale = sum_expr->args.back()->scale; for (size_t i = sum_expr->args.size(); i > 0; i--) { ICHECK(analyzer_->CanProveEqual(sum_expr->args[i - 1]->scale, expected_scale)); diff --git a/tests/python/unittest/test_arith_iter_affine_map.py b/tests/python/unittest/test_arith_iter_affine_map.py index 472ecac44f1b0..7bc5ead2984ac 100644 --- a/tests/python/unittest/test_arith_iter_affine_map.py +++ b/tests/python/unittest/test_arith_iter_affine_map.py @@ -869,6 +869,19 @@ def test_inverse_affine_iter_map(): assert analyzer.can_prove_equal(res[l0[0]], l0_inverse) +def test_inverse_affine_map_trivial_iter(): + analyzer = tvm.arith.Analyzer() + l0 = create_iter("l0", 64) + l1 = create_iter("l1", 64) + iter_map = tvm.arith.detect_iter_map([0, l0[0], l1[0]], var_dom([l0, l1])).indices + outputs = [tvm.tir.Var("output_{}".format(i), "int32") for i in range(len(iter_map))] + res = tvm.arith.inverse_affine_iter_map(iter_map, outputs) + # output_0 is expected to be constant and it is not included in the inverse map + assert len(res) == 2 + assert analyzer.can_prove_equal(res[l0[0]], outputs[1]) + assert analyzer.can_prove_equal(res[l1[0]], outputs[2]) + + def test_free_variables(): x = tvm.tir.Var("x", "int32") y = tvm.tir.Var("y", "int32") From 9f4bf38b5766609317e9a52bc60d66679ceddf02 Mon Sep 17 00:00:00 2001 From: Lite Ye Date: Thu, 7 Jul 2022 01:11:10 -0400 Subject: [PATCH 061/111] [TVMScript] Doc Base Class & DocPrinter Scaffolding (#11971) This PR addes: - Doc base class - DocPrinter base class - PythonDocPrinter - LiteralDoc and its support in DocPrinter Tracking issue: #11912 --- CMakeLists.txt | 1 + include/tvm/script/printer/doc.h | 165 ++++++++++++++++++ include/tvm/script/printer/doc_printer.h | 43 +++++ python/tvm/script/printer/__init__.py | 26 +++ python/tvm/script/printer/_ffi_api.py | 20 +++ python/tvm/script/printer/doc.py | 49 ++++++ python/tvm/script/printer/doc_printer.py | 39 +++++ src/script/printer/base_doc_printer.cc | 49 ++++++ src/script/printer/base_doc_printer.h | 131 ++++++++++++++ src/script/printer/doc.cc | 43 +++++ src/script/printer/python_doc_printer.cc | 70 ++++++++ .../unittest/test_tvmscript_printer_doc.py | 33 ++++ ...st_tvmscript_printer_python_doc_printer.py | 53 ++++++ tests/scripts/task_mypy.sh | 3 + 14 files changed, 725 insertions(+) create mode 100644 include/tvm/script/printer/doc.h create mode 100644 include/tvm/script/printer/doc_printer.h create mode 100644 python/tvm/script/printer/__init__.py create mode 100644 python/tvm/script/printer/_ffi_api.py create mode 100644 python/tvm/script/printer/doc.py create mode 100644 python/tvm/script/printer/doc_printer.py create mode 100644 src/script/printer/base_doc_printer.cc create mode 100644 src/script/printer/base_doc_printer.h create mode 100644 src/script/printer/doc.cc create mode 100644 src/script/printer/python_doc_printer.cc create mode 100644 tests/python/unittest/test_tvmscript_printer_doc.py create mode 100644 tests/python/unittest/test_tvmscript_printer_python_doc_printer.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 306a8be308584..46de8f5d07fa0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -281,6 +281,7 @@ tvm_file_glob(GLOB_RECURSE COMPILER_SRCS src/parser/*.cc src/printer/*.cc src/support/*.cc + src/script/*.cc ) tvm_file_glob(GLOB CODEGEN_SRCS diff --git a/include/tvm/script/printer/doc.h b/include/tvm/script/printer/doc.h new file mode 
100644 index 0000000000000..67c27bd45a1d8 --- /dev/null +++ b/include/tvm/script/printer/doc.h @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_SCRIPT_PRINTER_DOC_H_ +#define TVM_SCRIPT_PRINTER_DOC_H_ + +#include +#include +#include + +namespace tvm { +namespace script { +namespace printer { + +/*! + * \brief The base class of all Doc. + * + * Doc is an intermediate representation between IR from TVM + * and the TVMScript code. + * During printing, IR graph is first translated into Doc tree, + * then the Doc tree is translated to the target language in + * text format. + * + * \sa Doc + */ +class DocNode : public Object { + public: + void VisitAttrs(AttrVisitor* v) {} + + static constexpr const char* _type_key = "script.printer.Doc"; + TVM_DECLARE_BASE_OBJECT_INFO(DocNode, Object); + + public: + virtual ~DocNode() = default; +}; + +/*! + * \brief Reference type of DocNode. + * + * \sa DocNode + */ +class Doc : public ObjectRef { + protected: + Doc() = default; + + public: + virtual ~Doc() = default; + TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(Doc, ObjectRef, DocNode); +}; + +/*! + * \brief The base class of expression doc. + * + * \sa ExprDoc + */ +class ExprDocNode : public DocNode { + public: + void VisitAttrs(AttrVisitor* v) { DocNode::VisitAttrs(v); } + + static constexpr const char* _type_key = "script.printer.ExprDoc"; + TVM_DECLARE_BASE_OBJECT_INFO(ExprDocNode, DocNode); +}; + +/*! + * \brief Reference type of ExprDocNode. + * + * \sa ExprDocNode + */ +class ExprDoc : public Doc { + protected: + ExprDoc() = default; + + public: + TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(ExprDoc, Doc, ExprDocNode); +}; + +/*! + * \brief Doc that represents literal value. + * + * \sa LiteralDoc + */ +class LiteralDocNode : public ExprDocNode { + public: + /*! + * \brief the internal representation of the literal value. + * + * Possible actual types: + * - IntImm (integer or boolean) + * - FloatImm + * - String + * - null + */ + ObjectRef value; + + void VisitAttrs(AttrVisitor* v) { + ExprDocNode::VisitAttrs(v); + v->Visit("value", &value); + } + + static constexpr const char* _type_key = "script.printer.LiteralDoc"; + TVM_DECLARE_FINAL_OBJECT_INFO(LiteralDocNode, ExprDocNode); +}; + +/*! + * \brief Reference type of LiteralDocNode. + * + * \sa LiteralDocNode + */ +class LiteralDoc : public ExprDoc { + protected: + explicit LiteralDoc(ObjectRef value); + + public: + /*! + * \brief Create a LiteralDoc to represent None/null/empty value. + */ + static LiteralDoc None() { return LiteralDoc(ObjectRef(nullptr)); } + + /*! + * \brief Create a LiteralDoc to represent integer. + * \param v The integer value. + */ + static LiteralDoc Int(int v) { return LiteralDoc(IntImm(DataType::Int(64), v)); } + + /*! 
+ * \brief Create a LiteralDoc to represent boolean. + * \param v The boolean value. + */ + static LiteralDoc Boolean(bool v) { return LiteralDoc(IntImm(DataType::Bool(), v)); } + + /*! + * \brief Create a LiteralDoc to represent float. + * \param v The float value. + */ + static LiteralDoc Float(double v) { return LiteralDoc(FloatImm(DataType::Float(64), v)); } + + /*! + * \brief Create a LiteralDoc to represent string. + * \param v The string value. + */ + static LiteralDoc Str(const String& v) { return LiteralDoc(v); } + + TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(LiteralDoc, ExprDoc, LiteralDocNode); +}; + +} // namespace printer +} // namespace script +} // namespace tvm + +#endif // TVM_SCRIPT_PRINTER_DOC_H_ diff --git a/include/tvm/script/printer/doc_printer.h b/include/tvm/script/printer/doc_printer.h new file mode 100644 index 0000000000000..6bf502fab910c --- /dev/null +++ b/include/tvm/script/printer/doc_printer.h @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_SCRIPT_PRINTER_DOC_PRINTER_H_ +#define TVM_SCRIPT_PRINTER_DOC_PRINTER_H_ + +#include + +namespace tvm { +namespace script { +namespace printer { + +/*! + * \brief Convert Doc into Python script. + * + * This function unpacks the DocPrinterOptions into function arguments + * to be FFI friendly. + * + * \param doc the doc to be converted + * \param indent_spaces the number of spaces used for indention + */ +String DocToPythonScript(Doc doc, int indent_spaces = 4); + +} // namespace printer +} // namespace script +} // namespace tvm + +#endif // TVM_SCRIPT_PRINTER_DOC_PRINTER_H_ diff --git a/python/tvm/script/printer/__init__.py b/python/tvm/script/printer/__init__.py new file mode 100644 index 0000000000000..84ab7b0ba8369 --- /dev/null +++ b/python/tvm/script/printer/__init__.py @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +TVMScript Unified Printer + +This package provides a set of APIs to print supported TVM IR into TVMScript +in a roundtrippable way. 
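+The printer first translates the IR into a Doc tree and then renders that tree
+as text, so the same Doc layer can serve multiple target languages.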
+ +https://github.com/apache/tvm-rfcs/blob/main/rfcs/0074-tvmscript-unified-printer.md +""" + +from . import _ffi_api diff --git a/python/tvm/script/printer/_ffi_api.py b/python/tvm/script/printer/_ffi_api.py new file mode 100644 index 0000000000000..baa639fe2d679 --- /dev/null +++ b/python/tvm/script/printer/_ffi_api.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""FFI APIs for tvm.script.printer""" +import tvm._ffi + +tvm._ffi._init_api("script.printer", __name__) diff --git a/python/tvm/script/printer/doc.py b/python/tvm/script/printer/doc.py new file mode 100644 index 0000000000000..f6179d7351b2c --- /dev/null +++ b/python/tvm/script/printer/doc.py @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Doc types for TVMScript Unified Printer""" + +import tvm._ffi +from tvm.runtime import Object + +from . 
import _ffi_api + + +class Doc(Object): + """Base class of all Docs""" + + +class ExprDoc(Object): + """Base class of all expression Docs""" + + +@tvm._ffi.register_object("script.printer.LiteralDoc") +class LiteralDoc(ExprDoc): + """Doc that represents literal value""" + + def __init__(self, value): + if value is None: + self.__init_handle_by_constructor__(_ffi_api.LiteralDocNone) # type: ignore + elif isinstance(value, str): + self.__init_handle_by_constructor__(_ffi_api.LiteralDocStr, value) # type: ignore + elif isinstance(value, float): + self.__init_handle_by_constructor__(_ffi_api.LiteralDocFloat, value) # type: ignore + elif isinstance(value, bool): + self.__init_handle_by_constructor__(_ffi_api.LiteralDocBoolean, value) # type: ignore + elif isinstance(value, int): + self.__init_handle_by_constructor__(_ffi_api.LiteralDocInt, value) # type: ignore + else: + raise TypeError(f"Unsupported type {type(value)} for LiteralDoc") diff --git a/python/tvm/script/printer/doc_printer.py b/python/tvm/script/printer/doc_printer.py new file mode 100644 index 0000000000000..404632b44c07b --- /dev/null +++ b/python/tvm/script/printer/doc_printer.py @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Functions to print doc into text format""" + +from . import _ffi_api +from .doc import Doc + + +def to_python_script(doc: Doc, indent_spaces: int = 4) -> str: + """ + Convert Doc into Python script. + + Parameters + ---------- + doc : Doc + The doc to convert into Python script + indent_spaces : int + The number of indent spaces to use in the output + + Returns + ------- + script : str + The text representation of Doc in Python syntax + """ + return _ffi_api.DocToPythonScript(doc, indent_spaces) # type: ignore diff --git a/src/script/printer/base_doc_printer.cc b/src/script/printer/base_doc_printer.cc new file mode 100644 index 0000000000000..f6874ba1a2ee3 --- /dev/null +++ b/src/script/printer/base_doc_printer.cc @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "./base_doc_printer.h" + +namespace tvm { +namespace script { +namespace printer { + +DocPrinter::DocPrinter(int indent_spaces) : indent_spaces_(indent_spaces) {} + +void DocPrinter::Append(const Doc& doc) { PrintDoc(doc); } + +String DocPrinter::GetString() const { + std::string text = output_.str(); + if (!text.empty() && text.back() != '\n') { + text.push_back('\n'); + } + return text; +} + +void DocPrinter::PrintDoc(const Doc& doc) { + if (const auto* doc_node = doc.as()) { + PrintTypedDoc(GetRef(doc_node)); + } else { + LOG(FATAL) << "Do not know how to print " << doc->GetTypeKey(); + throw; + } +} + +} // namespace printer +} // namespace script +} // namespace tvm diff --git a/src/script/printer/base_doc_printer.h b/src/script/printer/base_doc_printer.h new file mode 100644 index 0000000000000..128fcef2ea326 --- /dev/null +++ b/src/script/printer/base_doc_printer.h @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_SCRIPT_PRINTER_BASE_DOC_PRINTER_H_ +#define TVM_SCRIPT_PRINTER_BASE_DOC_PRINTER_H_ + +#include +#include + +#include +#include +#include + +namespace tvm { +namespace script { +namespace printer { + +/*! + * \brief DocPrinter is responsible for printing Doc tree into text format + * \details This is the base class for translating Doc into string. + * Each target language needs to have its subclass of DocPrinter + * to define the actual logic of printing Doc. + * + * \sa Doc + */ +class DocPrinter { + public: + /*! + * \brief The constructor of DocPrinter + * + * \param options the option for printer + */ + explicit DocPrinter(int indent_spaces = 4); + virtual ~DocPrinter() = default; + + /*! + * \brief Append a doc into the final content + * + * \param doc the Doc to be printed + * + * \sa GetString + */ + void Append(const Doc& doc); + + /*! + * \brief Get the printed string of all Doc appended + * + * The content of each Doc in the returned string will + * appear in the same order as they are appended. + * + * \sa Append + */ + String GetString() const; + + protected: + /*! + * \brief Get the printed string + * + * It will dispatch to the PrintTypedDoc method based on + * the actual type of Doc. + * + * \sa PrintTypedDoc + */ + void PrintDoc(const Doc& doc); + + /*! + * \brief Virtual method to print a LiteralDoc + */ + virtual void PrintTypedDoc(const LiteralDoc& doc) = 0; + + /*! + * \brief Increase the indent level of any content to be + * printed after this call + */ + void IncreaseIndent() { indent_ += indent_spaces_; } + + /*! + * \brief Decrease the indent level of any content to be + * printed after this call + */ + void DecreaseIndent() { indent_ -= indent_spaces_; } + + /*! 
+ * \brief Add a new line into the output stream + * + * \sa output_ + */ + std::ostream& NewLine() { + output_ << "\n"; + output_ << std::string(indent_, ' '); + return output_; + } + + /*! + * \brief The output stream of printer + * + * All printed content will be stored in this stream and returned + * when GetString is called. + * + * \sa GetString + */ + std::ostringstream output_; + + private: + /*! \brief the number of spaces for one level of indentation */ + int indent_spaces_ = 4; + + /*! \brief the current level of indent */ + int indent_ = 0; +}; + +} // namespace printer +} // namespace script +} // namespace tvm + +#endif // TVM_SCRIPT_PRINTER_BASE_DOC_PRINTER_H_ diff --git a/src/script/printer/doc.cc b/src/script/printer/doc.cc new file mode 100644 index 0000000000000..e54adbd36b4c2 --- /dev/null +++ b/src/script/printer/doc.cc @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include +#include + +namespace tvm { +namespace script { +namespace printer { + +LiteralDoc::LiteralDoc(ObjectRef value) { + ObjectPtr n = make_object(); + n->value = value; + this->data_ = std::move(n); +} + +TVM_REGISTER_NODE_TYPE(DocNode); +TVM_REGISTER_NODE_TYPE(ExprDocNode); +TVM_REGISTER_NODE_TYPE(LiteralDocNode); +TVM_REGISTER_GLOBAL("script.printer.LiteralDocNone").set_body_typed(LiteralDoc::None); +TVM_REGISTER_GLOBAL("script.printer.LiteralDocInt").set_body_typed(LiteralDoc::Int); +TVM_REGISTER_GLOBAL("script.printer.LiteralDocBoolean").set_body_typed(LiteralDoc::Boolean); +TVM_REGISTER_GLOBAL("script.printer.LiteralDocFloat").set_body_typed(LiteralDoc::Float); +TVM_REGISTER_GLOBAL("script.printer.LiteralDocStr").set_body_typed(LiteralDoc::Str); + +} // namespace printer +} // namespace script +} // namespace tvm diff --git a/src/script/printer/python_doc_printer.cc b/src/script/printer/python_doc_printer.cc new file mode 100644 index 0000000000000..cd816e4f70106 --- /dev/null +++ b/src/script/printer/python_doc_printer.cc @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include + +#include "../../support/str_escape.h" +#include "./base_doc_printer.h" + +namespace tvm { +namespace script { +namespace printer { + +class PythonDocPrinter : public DocPrinter { + public: + explicit PythonDocPrinter(int indent_spaces = 4) : DocPrinter(indent_spaces) {} + + protected: + using DocPrinter::PrintDoc; + + void PrintTypedDoc(const LiteralDoc& doc) final; +}; + +void PythonDocPrinter::PrintTypedDoc(const LiteralDoc& doc) { + const ObjectRef& value = doc->value; + if (!value.defined()) { + output_ << "None"; + } else if (const auto* int_imm = value.as()) { + if (int_imm->dtype.is_bool()) { + output_ << (int_imm->value ? "True" : "False"); + } else { + output_ << int_imm->value; + } + } else if (const auto* float_imm = value.as()) { + // TODO(yelite): Make float number printing roundtrippable + output_.precision(17); + output_ << float_imm->value; + } else if (const auto* string_obj = value.as()) { + output_ << "\"" << support::StrEscape(string_obj->data, string_obj->size) << "\""; + } else { + LOG(FATAL) << "TypeError: Unsupported literal value type: " << value->GetTypeKey(); + } +} + +String DocToPythonScript(Doc doc, int indent_spaces) { + PythonDocPrinter printer(indent_spaces); + printer.Append(doc); + return printer.GetString(); +} + +TVM_REGISTER_GLOBAL("script.printer.DocToPythonScript").set_body_typed(DocToPythonScript); + +} // namespace printer +} // namespace script +} // namespace tvm diff --git a/tests/python/unittest/test_tvmscript_printer_doc.py b/tests/python/unittest/test_tvmscript_printer_doc.py new file mode 100644 index 0000000000000..6330d33bf25ad --- /dev/null +++ b/tests/python/unittest/test_tvmscript_printer_doc.py @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import pytest + +from tvm.tir import IntImm +from tvm.script.printer.doc import LiteralDoc + + +@pytest.mark.parametrize( + "value", + [None, "test", 0, 1, -2, 0.0, 1.5, -1.3, True, False], +) +def test_literal_doc_construction(value): + doc = LiteralDoc(value) + if isinstance(value, float): + # FloatImm cannot be compared with Python's float directly + assert float(doc.value) == pytest.approx(value) + else: + assert doc.value == value diff --git a/tests/python/unittest/test_tvmscript_printer_python_doc_printer.py b/tests/python/unittest/test_tvmscript_printer_python_doc_printer.py new file mode 100644 index 0000000000000..55b5e88c88c88 --- /dev/null +++ b/tests/python/unittest/test_tvmscript_printer_python_doc_printer.py @@ -0,0 +1,53 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import pytest + +from tvm.script.printer.doc_printer import to_python_script +from tvm.script.printer.doc import LiteralDoc + + +def format_script(s: str) -> str: + """ + Remove leading and trailing blank lines, and make the minimum idention 0 + """ + s = s.strip("\n") + non_empty_lines = [line for line in s.splitlines() if line and not line.isspace()] + line_indents = [len(line) - len(line.lstrip(" ")) for line in non_empty_lines] + spaces_to_remove = min(line_indents) + return "\n".join(line[spaces_to_remove:] for line in s.splitlines()) + + +@pytest.mark.parametrize( + "doc,expected", + [ + (LiteralDoc(None), "None"), + (LiteralDoc(True), "True"), + (LiteralDoc(False), "False"), + (LiteralDoc("test"), '"test"'), + (LiteralDoc(""), '""'), + (LiteralDoc('""'), r'"\"\""'), + (LiteralDoc("\n\t\\test\r"), r'"\n\t\\test\r"'), + # TODO: fix the roundatrippable problem caused by utf8 + pytest.param(LiteralDoc("\x88"), r'"\x88"', marks=pytest.mark.xfail), + (LiteralDoc(0), "0"), + (LiteralDoc(-1), "-1"), + (LiteralDoc(3.25), "3.25"), + (LiteralDoc(-0.5), "-0.5"), + ], +) +def test_print_literal_doc(doc, expected): + assert to_python_script(doc).rstrip("\n") == format_script(expected) diff --git a/tests/scripts/task_mypy.sh b/tests/scripts/task_mypy.sh index 1ef7db5894322..f165adfe1bc45 100755 --- a/tests/scripts/task_mypy.sh +++ b/tests/scripts/task_mypy.sh @@ -32,6 +32,9 @@ mypy --check-untyped-defs python/tvm/tir/analysis/ echo "Checking MyPy Type defs in the transform package." mypy --check-untyped-defs python/tvm/tir/transform/ +echo "Checking MyPy Type defs in the tvmscript printer package." 
+mypy --check-untyped-defs python/tvm/script/printer
+
 echo "Checking MyPy Type defs in the TIR package with unittest"
 MYPYPATH=$TVM_PATH/python mypy --check-untyped-defs tests/python/unittest/test_tvmscript_type.py

From 40d242a3c8f9630223e5775c1f1bf23362c8850e Mon Sep 17 00:00:00 2001
From: yuanfz <42092999+yuanfz98@users.noreply.github.com>
Date: Thu, 7 Jul 2022 08:37:48 +0200
Subject: [PATCH 062/111] [Pytorch] add aten::rnn_tanh, aten::rnn_relu (#12017)

* emptycommit 2nd try

* dev

* comments

* format

* format

Co-authored-by: yuanfz <42092999+FZYUAN-1@users.noreply.github.com>
---
 python/tvm/relay/frontend/common.py        |  40 +++++
 python/tvm/relay/frontend/pytorch.py       | 189 ++++++++++++++++++++-
 tests/python/frontend/pytorch/test_rnns.py |  79 +++++++++
 3 files changed, 307 insertions(+), 1 deletion(-)

diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py
index 7a1e98402996e..5f961f1ae0e88 100755
--- a/python/tvm/relay/frontend/common.py
+++ b/python/tvm/relay/frontend/common.py
@@ -686,6 +686,46 @@ def unbind(data, axis=0):
     return _expr.TupleWrapper(_expr.Tuple(ret), selections)


+def rnn_cell(
+    input_seqs, hidden_state, w_inp, w_hid, b_inp=None, b_hid=None, backwards=False, act=_op.tanh
+):
+    """
+    Common implementation of RNN cell for all frontends of TVM
+
+    Parameters
+    ----------
+    input_seqs : List[relay.Expr]
+        The sequence of input tensors
+        Input tensor should be 2d while issue #8412 is not resolved
+        Shape = (batch, feature_size)
+    hidden_state : relay.Expr
+        Hidden state. shape = (batch_size, hidden_size)
+    w_inp, w_hid : relay.Expr
+        weight matrices. shape = (hidden_size, feature_size), (hidden_size, hidden_size)
+    b_inp, b_hid : relay.Expr
+        bias matrices. The same order of internal parts as for weights. shape = (1 * hidden_size)
+    backwards : bool
+        Flag for reverse pass of RNN
+    act : relay.op
+        activation function. It is tanh by default.
+
+    Returns
+    -------
+    result : List[relay.Expr], relay.Expr
+        The sequence of computed results and the final hidden state
+    """
+    outputs_list = []
+    for x_t in input_seqs if not backwards else reversed(input_seqs):
+        # h_t = act(x_t * W_inp^T + b_inp + h_{t-1} * W_hid^T + b_hid)
+        xwt = _op.nn.dense(x_t, w_inp)
+        hwt = _op.nn.dense(hidden_state, w_hid)
+        if b_inp is not None and b_hid is not None:
+            xwt += b_inp
+            hwt += b_hid
+        hidden_state = act(xwt + hwt)
+        outputs_list.append(hidden_state)  # [seq_num, (batch, hidden_size)]
+    return outputs_list, hidden_state
+
+
 def gru_cell(
     input_seqs,
     hidden_state,
diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index b1a760886037d..d7e1a5dd1ddb7 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -40,7 +40,7 @@ from ..prelude import Prelude, StaticTensorArrayOps
 from ..ty import Any, TensorType, TupleType
 from .
-from .common import AttrCvt, get_relay_op, gru_cell, logger
+from .common import AttrCvt, get_relay_op, gru_cell, logger, rnn_cell
 from .common import infer_shape as _infer_shape
 from .common import infer_value as _infer_value
 from .common import infer_value_simulated as _infer_value_simulated
@@ -2630,6 +2630,191 @@ def flip(self, inputs, input_types):
         axis = inputs[1]
         return _op.transform.reverse(data, axis=axis[0])

+    def bidir_rnn_cell(self, input_seqs, weights_dicts, act=_op.tanh):
+        """
+        Bidirectional RNN cell
+        """
+        seq_len = len(input_seqs)
+        forward_outputs, fw_H_t = rnn_cell(input_seqs, **weights_dicts[0], backwards=False, act=act)
+
+        reverse_outputs, rev_H_t = rnn_cell(input_seqs, **weights_dicts[1], backwards=True, act=act)
+
+        final_outputs = []
+        for i in range(seq_len):
+            final_outputs.append(
+                _op.concatenate([forward_outputs[i], reverse_outputs[seq_len - 1 - i]], axis=-1)
+            )
+
+        return final_outputs, _op.stack([fw_H_t, rev_H_t], axis=0)
+
+    def rnn_layers(self, input_data, layer_weights_dicts, bidirectional, act, dropout_p=0.0):
+        """
+        Method that iterates over the layers of a stacked RNN
+        """
+        layers_num = len(layer_weights_dicts)
+        # split the input sequence into per-step samples
+        input_seqs = unbind(input_data, 0)  # [seq_num, (batch, feature_size)]
+        output_hiddens = []
+        for i in range(layers_num):
+            weights_dicts = layer_weights_dicts[i]
+            # input_seqs shape = [seq_num, (batch, feature_size)] or
+            # [seq_num, (batch, 2*feature_size)] for bidirectional
+            if bidirectional:
+                input_seqs, H_t = self.bidir_rnn_cell(input_seqs, weights_dicts, act=act)
+            else:
+                input_seqs, H_t = rnn_cell(input_seqs, **weights_dicts[0], act=act)
+
+            output_hiddens.append(H_t)
+
+            # TODO (yuanfz98): in pytorch implementation train is also checked
+            # see https://github.com/pytorch/pytorch/blob/70c8daf43946b53af6493d058899ef952d27d339
+            # /aten/src/ATen/native/RNN.cpp#L1054
+            if dropout_p != 0 and i < layers_num - 1:
+                # for input in input_seqs:
+                #     input = _op.dropout(input, dropout_p)
+                raise NotImplementedError("Dropout for RNN has not been supported yet!")
+        output_hiddens = (
+            _op.concatenate(output_hiddens, 0) if bidirectional else _op.stack(output_hiddens, 0)
+        )
+        return _op.stack(input_seqs, 0), output_hiddens
+
+    def rnn(self, inputs, input_types, nonlinearity):
+        """
+        Description of RNN in pytorch:
+        https://pytorch.org/docs/stable/generated/torch.nn.RNN.html#torch.nn.RNN
+        Description of inputs:
+        https://github.com/pytorch/pytorch/blob/736fb7d22cc948b739db2c35aeb5ad4d19aea4f4/torch/overrides.py#L937
+        """
+        # TODO (yuanfz98): support dropout
+        assert len(inputs) == 9, "Input of size 9 is expected"
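# [editorial aside] rnn_cell (added to common.py above) computes the classic
# Elman recurrence h_t = act(x_t @ W_i^T + b_i + h_{t-1} @ W_h^T + b_h);
# a minimal NumPy reference sketch follows. Names mirror the docstring, and
# nothing below is a TVM API:
import numpy as np

def rnn_cell_ref(xs, h, w_inp, w_hid, b_inp, b_hid, act=np.tanh):
    outs = []
    for x_t in xs:  # x_t: (batch, feature_size)
        h = act(x_t @ w_inp.T + b_inp + h @ w_hid.T + b_hid)
        outs.append(h)  # h: (batch, hidden_size)
    return outs, h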
+        # Unpack inputs; optional inputs that were not provided arrive as None.
+        _X = inputs[0]
+        # _X shape (seq_num, batch, feature_size) or (batch, seq_num, feature_size)
+
+        hidden_state = inputs[1]
+        # Hidden state shape (hidden_layers_num, batch, hidden_size)
+
+        _weights = inputs[2]
+        # Wi layer[0] shape (hidden_size, feature_size)
+        # Wh layer[0] shape (hidden_size, hidden_size)
+        # Bi layer[0] shape (hidden_size)
+        # Bh layer[0] shape (hidden_size)
+
+        # Wi layer[>0] shape (hidden_size, hidden_size * num_directions)
+        # Wh layer[>0] shape (hidden_size, hidden_size)
+        # Bi layer[>0] shape (hidden_size)
+        # Bh layer[>0] shape (hidden_size)
+
+        # Scalar inputs
+        has_biases = inputs[3]
+        num_layers = inputs[4]
+        dropout_p = inputs[5]  # dropout probability; 0.0 means no dropout
+        # train = inputs[6]
+        bidirectional = inputs[7]
+        batch_first = inputs[8]
+
+        num_directions = 1
+        if bidirectional:
+            num_directions = 2
+
+        rsd = len(_weights) % num_layers
+        assert rsd == 0, "The number of weights must be a multiple of the number of layers!"
+        rsd = (len(_weights) / num_layers) % num_directions
+        assert (
+            rsd == 0
+        ), "The number of weights in a layer must be a multiple of the number of directions!"
+
+        weights_num = int(len(_weights) / num_layers / num_directions)
+        if has_biases:
+            assert weights_num == 4, "4 weights per layer are expected when biases are present"
+        else:
+            assert weights_num == 2, "2 weights per layer are expected without biases"
+        if nonlinearity == "tanh":
+            act = _op.tanh
+        elif nonlinearity == "relu":
+            act = _op.nn.relu
+        else:
+            raise AssertionError(f"The nonlinearity '{nonlinearity}' is unknown")
+        X = (
+            _op.transpose(_X, (1, 0, 2)) if batch_first else _X
+        )  # always (seq_num, batch, feature_size)
+        # TODO (yuanfz98): Which data type should be used? from input or weights?
+        # Instead of it _infer_type(X).checked_type.dtype can be used
+        X_dtype = input_types[0]
+        X_shape = _infer_shape(X)  # (seq_num, batch, feature_size)
+
+        hidden_size = int(_infer_shape(_weights[0])[0])
+        batch_size = X_shape[1]
+
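# [editorial aside] the weight grouping below assumes torch.nn.RNN's flat
# parameter order: per layer and direction, weight_ih, weight_hh and (with
# biases) bias_ih, bias_hh. An illustrative listing for a hypothetical
# 2-layer bidirectional RNN with biases:
flat_weight_names = [
    "weight_ih_l0", "weight_hh_l0", "bias_ih_l0", "bias_hh_l0",
    "weight_ih_l0_reverse", "weight_hh_l0_reverse", "bias_ih_l0_reverse", "bias_hh_l0_reverse",
    "weight_ih_l1", "weight_hh_l1", "bias_ih_l1", "bias_hh_l1",
    "weight_ih_l1_reverse", "weight_hh_l1_reverse", "bias_ih_l1_reverse", "bias_hh_l1_reverse",
]
assert len(flat_weight_names) == 2 * 2 * 4  # num_layers * num_directions * weights_num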
+        # Initialize hidden states if not provided.
+        layers_h = []
+        hidden_layers_num = num_directions * num_layers
+        if hidden_state is None:
+            h_0 = _op.zeros((batch_size, hidden_size), X_dtype)
+            for i in range(hidden_layers_num):
+                layers_h.append(h_0)
+        else:
+            layers_h = unbind(hidden_state, 0)
+
+        layer_weights_dicts = []
+        k = 0  # layer counter
+        if has_biases:
+            names = ["hidden_state", "w_inp", "w_hid", "b_inp", "b_hid"]
+            if bidirectional:
+                rsd = len(_weights) % (2 * weights_num)
+                assert rsd == 0, "got an incorrect number of RNN weights"
+                for i in range(0, len(_weights), 2 * weights_num):
+                    fw_tensors = [layers_h[2 * k], *_weights[i : i + 4]]
+                    fw_weights_dict = dict(zip(names, fw_tensors))
+                    j = i + weights_num
+                    rev_tensors = [layers_h[2 * k + 1], *_weights[j : j + 4]]
+                    rev_weights_dict = dict(zip(names, rev_tensors))
+                    layer_weights_dicts.append([fw_weights_dict, rev_weights_dict])
+                    k += 1
+            else:
+                assert len(_weights) % weights_num == 0, "got an incorrect number of RNN weights"
+                for i in range(0, len(_weights), weights_num):
+                    fw_tensors = [layers_h[k], *_weights[i : i + 4]]
+                    fw_weights_dict = dict(zip(names, fw_tensors))
+                    layer_weights_dicts.append([fw_weights_dict])
+                    k += 1
+        else:
+            names = ["hidden_state", "w_inp", "w_hid"]
+            if bidirectional:
+                rsd = len(_weights) % (2 * weights_num)
+                assert rsd == 0, "got an incorrect number of RNN weights"
+                for i in range(0, len(_weights), 2 * weights_num):
+                    fw_tensors = [layers_h[2 * k], *_weights[i : i + 2]]
+                    fw_weights_dict = dict(zip(names, fw_tensors))
+                    j = i + weights_num
+                    rev_tensors = [layers_h[2 * k + 1], *_weights[j : j + 2]]
+                    rev_weights_dict = dict(zip(names, rev_tensors))
+                    layer_weights_dicts.append([fw_weights_dict, rev_weights_dict])
+                    k += 1
+            else:
+                assert len(_weights) % weights_num == 0, "got an incorrect number of RNN weights"
+                for i in range(0, len(_weights), weights_num):
+                    fw_tensors = [layers_h[k], *_weights[i : i + 2]]
+                    fw_weights_dict = dict(zip(names, fw_tensors))
+                    layer_weights_dicts.append([fw_weights_dict])
+                    k += 1
+        assert (
+            len(layer_weights_dicts) == num_layers and k == num_layers
+        ), "For a stacked RNN, the number of weight sets should equal the number of layers!"
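# [editorial aside] expected result shapes for the rnn_layers call below,
# following the torch.nn.RNN convention (plain-Python illustration, no TVM
# calls; the numbers are made up):
def rnn_result_shapes(seq_num, batch, hidden_size, num_layers, num_directions):
    output = (seq_num, batch, num_directions * hidden_size)  # stacked step outputs
    h_n = (num_layers * num_directions, batch, hidden_size)  # final hidden states
    return output, h_n

assert rnn_result_shapes(2, 2, 16, 1, 2) == ((2, 2, 32), (2, 2, 16))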
+ output, out_hidden_state = self.rnn_layers( + X, + layer_weights_dicts, + bidirectional, + act, + dropout_p=dropout_p, + ) + + # output shape = (seq_num, batch, hidden_size) or + # (seq_num, batch, 2*feature_size) for bidirectional + if batch_first: + output = _op.transpose(output, (1, 0, 2)) + + return (output, out_hidden_state) + def bidir_gru_cell( self, input_seqs, @@ -3442,6 +3627,8 @@ def create_convert_map(self): "aten::l1_loss": self.l1_loss, "aten::mse_loss": self.mse_loss, "aten::flip": self.flip, + "aten::rnn_tanh": functools.partial(self.rnn, nonlinearity="tanh"), + "aten::rnn_relu": functools.partial(self.rnn, nonlinearity="relu"), "aten::gru": self.gru, "aten::lstm": self.lstm, "aten::all": functools.partial(self.all_any_common, _op.all), diff --git a/tests/python/frontend/pytorch/test_rnns.py b/tests/python/frontend/pytorch/test_rnns.py index b0180a7a99d4b..fba55b9c4c8f6 100644 --- a/tests/python/frontend/pytorch/test_rnns.py +++ b/tests/python/frontend/pytorch/test_rnns.py @@ -40,6 +40,10 @@ seqs_length = 2 batch_size = 2 +##RNN parameters +rnn_feature_size = 8 +rnn_hidden_size = 16 + class RNN_Model(nn.Module): """ @@ -93,6 +97,72 @@ def get_tvm_inputs(self, dtype): raise NotImplementedError("subclasses must override get_tvm_inputs(dtype)!") +class RNN_Model_Impl(RNN_Model): + def __init__( + self, + seq_len=seqs_length, + batch_size=batch_size, + feature_size=rnn_feature_size, + hidden_size=rnn_hidden_size, + batch_first=False, + layer_num=1, + bidirectional=False, + use_bias=True, + rnd_weights_init=False, + nonlinearity="tanh", + dropout=0.0, + ): + super().__init__() + # Shapes + self.shape = [seq_len, batch_size, feature_size] + if batch_first: + self.shape = [batch_size, seq_len, feature_size] + layers_num = 2 * layer_num if bidirectional else layer_num + self.h0_shape = [layers_num, batch_size, hidden_size] + # Dummy inputs + self.dummy_inputs = (torch.rand(self.shape), torch.zeros(self.h0_shape)) + + self.model = nn.RNN( + input_size=feature_size, + hidden_size=hidden_size, + num_layers=layer_num, + nonlinearity=nonlinearity, + bias=use_bias, + batch_first=batch_first, + dropout=dropout, + bidirectional=bidirectional, + ) + + if rnd_weights_init: + self.gen_rnd_weights() + + def gen_rnd_weights(self): + super().gen_rnd_weights() + + def get_dummy_inputs(self): + return self.dummy_inputs + + def get_input_names(self): + return ["input", "h0"] + + def get_shape_desc(self, frontend_type): + shape_desc = None + if frontend_type == "pt": # PyTorch + shape_desc = [("input", self.shape)] + elif frontend_type == "onnx": # ONNX + shape_desc = { + "input": self.shape, + "h0": self.h0_shape, + } + return shape_desc + + def get_tvm_inputs(self, dtype): + return { + "input": tvm.nd.array(self.dummy_inputs[0].numpy().astype(dtype)), + "h0": tvm.nd.array(self.dummy_inputs[1].numpy().astype(dtype)), + } + + class GRU_Model(RNN_Model): def __init__( self, @@ -331,6 +401,10 @@ def get_model( args["bidirectional"] = True if "s" in rnn_mod: args["layer_num"] = num_layers + if "tanh" in rnn_mod: + args["nonlinearity"] = "tanh" + if "relu" in rnn_mod: + args["nonlinearity"] = "relu" if rnn_type == "GRU": RNN_Model_selector = GRU_Model @@ -338,6 +412,8 @@ def get_model( RNN_Model_selector = LSTM_Model if "p" in rnn_mod: args["proj_size"] = lstm_projection_size + elif rnn_type == "RNN": + RNN_Model_selector = RNN_Model_Impl return RNN_Model_selector(**args) @@ -425,6 +501,9 @@ def test_rnns(): for mod_type in ["uni", "s", "b", "sb"]: check_rnn("LSTM", mod_type, target, dev) + for 
mod_type in ["uni", "s", "b", "sb", "tanh", "relu"]: + check_rnn("RNN", mod_type, target, dev) + if __name__ == "__main__": test_rnns() From b9aa3564dcde74a4d8aa7b70cd6f09ed476cb67c Mon Sep 17 00:00:00 2001 From: wrongtest Date: Fri, 8 Jul 2022 00:29:58 +0800 Subject: [PATCH 063/111] [TIR] Revert #11428 and move loop dependent alloc extent check after region union (#12019) --- src/tir/transforms/compact_buffer_region.cc | 58 +++++++++--------- ...est_tir_transform_compact_buffer_region.py | 60 +++++++++++++++++++ 2 files changed, 90 insertions(+), 28 deletions(-) diff --git a/src/tir/transforms/compact_buffer_region.cc b/src/tir/transforms/compact_buffer_region.cc index 46f64d4edf094..2844f1b35e9e8 100644 --- a/src/tir/transforms/compact_buffer_region.cc +++ b/src/tir/transforms/compact_buffer_region.cc @@ -45,17 +45,36 @@ using support::NDIntSet; * \brief simplify and return the region collected by NDIntSet. return the original * buffer shape if the int_set is empty. */ -Region SimplifyAndNarrowBufferRegionFromNDIntSet(const NDIntSet& nd_int_set, - const Array& original_shape, - arith::Analyzer* analyzer) { +Region SimplifyAndNarrowBufferRegionFromNDIntSet( + const NDIntSet& nd_int_set, const Array& original_shape, arith::Analyzer* analyzer, + const std::vector& ancestor_loops) { Array result; result.reserve(nd_int_set.size()); for (size_t i = 0; i < nd_int_set.size(); ++i) { const arith::IntSet& int_set = nd_int_set[i]; Range range = int_set.CoverRange(Range(/*begin=*/0, /*end=*/original_shape[i])); - result.push_back( - Range::FromMinExtent(analyzer->Simplify(max(0, range->min)), - analyzer->Simplify(min(original_shape[i], range->extent)))); + PrimExpr min = analyzer->Simplify(tvm::max(0, range->min)); + PrimExpr extent = analyzer->Simplify(tvm::min(original_shape[i], range->extent)); + + // Check the buffer region is not loop dependent, since loop dependent + // allocation is not supported yet. + auto is_loop_var = [&ancestor_loops](const VarNode* v) { + return std::any_of(ancestor_loops.begin(), ancestor_loops.end(), + [v](const ForNode* n) { return n->loop_var.get() == v; }); + }; + if (UsesVar(extent, is_loop_var)) { + // try estimate a constant upperbound on region's extent + int64_t upperbound = analyzer->const_int_bound(extent)->max_value; + if (upperbound != arith::ConstIntBound::kPosInf) { + extent = make_const(extent->dtype, upperbound); + } else { + // or else we have to fallback to full region + min = make_zero(original_shape[i]->dtype); + extent = original_shape[i]; + } + } + + result.push_back(Range::FromMinExtent(min, extent)); } return result; } @@ -63,7 +82,6 @@ Region SimplifyAndNarrowBufferRegionFromNDIntSet(const NDIntSet& nd_int_set, /*! \brief a more constrained bound estimate for n-dimentional int set */ NDIntSet NDIntSetEval(Region region, PrimExpr predicate, const std::unordered_map& dom_map, - const std::vector& ancestor_loop_vars, arith::Analyzer* analyzer) { std::unordered_map var_dom; for (const auto& it : dom_map) { @@ -72,21 +90,7 @@ NDIntSet NDIntSetEval(Region region, PrimExpr predicate, Optional> eval_res = arith::EstimateRegionLowerBound(region, var_dom, predicate, analyzer); if (eval_res.defined()) { - NDIntSet res(0); - for (const auto& it : eval_res.value()) { - PrimExpr extent = analyzer->Simplify(it.max() - it.min() + 1); - // skip accurate region analysis result if there are outer loop dependencies. 
- if (UsesVar(extent, [&ancestor_loop_vars](const VarNode* v) { - return std::find(ancestor_loop_vars.begin(), ancestor_loop_vars.end(), v) != - ancestor_loop_vars.end(); - })) { - break; - } - res.push_back(it); - } - if (res.size() == region.size()) { - return res; - } + return NDIntSet(eval_res.value().begin(), eval_res.value().end()); } return support::NDIntSetEval(support::NDIntSetFromRegion(region), dom_map); } @@ -247,8 +251,8 @@ class BufferAccessRegionCollector : public StmtExprVisitor { ICHECK(it != relaxed_accesses_.end()) << buffer << " is allocated but not accessed within block scope"; const NDIntSet& nd_int_set = it->second; - buffer_access_region_[buffer] = - SimplifyAndNarrowBufferRegionFromNDIntSet(nd_int_set, buffer->shape, &dom_analyzer_); + buffer_access_region_[buffer] = SimplifyAndNarrowBufferRegionFromNDIntSet( + nd_int_set, buffer->shape, &dom_analyzer_, ancestor_loops_); } } @@ -270,7 +274,6 @@ class BufferAccessRegionCollector : public StmtExprVisitor { // Step 1. Stop ancestor loop vars out of the allocation block from // being relaxed unless NeedRelaxThread() is true. std::vector non_relaxed(n_ancestor_loops); - std::vector ancestor_loop_vars(n_ancestor_loops); for (size_t i = 0; i < n_ancestor_loops; ++i) { const ForNode* loop = ancestor_loops_[i]; const VarNode* v = loop->loop_var.get(); @@ -281,12 +284,11 @@ class BufferAccessRegionCollector : public StmtExprVisitor { ICHECK(dom_it != dom_map_.end()) << "Could not find domain for loop variable " << v->name_hint; non_relaxed[i] = dom_it->second; - ancestor_loop_vars[i] = v; dom_map_.erase(dom_it); } // Step 2. Relax the access region - NDIntSet nd_int_set = NDIntSetEval(buffer_region->region, predicate_in_scope, dom_map_, - ancestor_loop_vars, &dom_analyzer_); + NDIntSet nd_int_set = + NDIntSetEval(buffer_region->region, predicate_in_scope, dom_map_, &dom_analyzer_); // Step 3. 
Restore the non-relaxed ancestor loops domain for (size_t i = 0; i < n_ancestor_loops; ++i) { const VarNode* v = ancestor_loops_[i]->loop_var.get(); diff --git a/tests/python/unittest/test_tir_transform_compact_buffer_region.py b/tests/python/unittest/test_tir_transform_compact_buffer_region.py index af206ef1862c0..5d8b99e7d0557 100644 --- a/tests/python/unittest/test_tir_transform_compact_buffer_region.py +++ b/tests/python/unittest/test_tir_transform_compact_buffer_region.py @@ -849,5 +849,65 @@ def compacted_spatial_tiled_pad_and_pooling( _check(spatial_tiled_pad_and_pooling, compacted_spatial_tiled_pad_and_pooling) +def test_complex_case_1(): + """Meta-schedule matmul case for compact shared A, B matrix""" + + # fmt: off + @T.prim_func + def func(A: T.Buffer[(960, 770), "float32"], B: T.Buffer[(770, 2304), "float32"], C: T.Buffer[(960, 2304), "float32"]) -> None: + for bx in T.thread_binding(144, thread="blockIdx.x"): + for vx in T.thread_binding(2, thread="vthread.x"): + for tx_p in T.thread_binding(256, thread="threadIdx.x"): + with T.block(): + for k_0 in T.serial(193): + with T.block(): + A_shared = T.alloc_buffer([960, 770], dtype="float32", scope="shared") + B_shared = T.alloc_buffer([770, 2304], dtype="float32", scope="shared") + for _u in T.serial(1): + for tx in T.thread_binding(256, thread="threadIdx.x"): + for vec in T.vectorized(3): + with T.block("A_shared"): + T.where(bx // 18 * 128 + ((_u * 256 + tx) * 3 + vec) // 4 < 960 and k_0 * 4 + ((_u * 256 + tx) * 3 + vec) % 4 < 770 and (_u * 256 + tx) * 3 + vec < 512) + A_shared[bx // 18 * 128 + (_u * 768 + tx * 3 + vec) // 4, k_0 * 4 + (_u * 768 + tx * 3 + vec) % 4] = A[bx // 18 * 128 + (_u * 768 + tx * 3 + vec) // 4, k_0 * 4 + (_u * 768 + tx * 3 + vec) % 4] + for _u in T.serial(1): + for tx in T.thread_binding(256, thread="threadIdx.x"): + for vec in T.vectorized(4): + with T.block("B_shared"): + T.where(k_0 * 4 + ((_u * 256 + tx) * 4 + vec) // 128 < 770 and (_u * 256 + tx) * 4 + vec < 512) + B_shared[k_0 * 4 + (_u * 1024 + tx * 4 + vec) // 128, bx % 18 * 128 + (_u * 1024 + tx * 4 + vec) % 128] = B[k_0 * 4 + (_u * 1024 + tx * 4 + vec) // 128, bx % 18 * 128 + (_u * 1024 + tx * 4 + vec) % 128] + for k_1, i_3, j_3, k_2, i_4, j_4 in T.grid(1, 8, 1, 4, 2, 2): + with T.block("update_update"): + C[(((bx // 18 + 0) * 8 + tx_p // 32) * 8 + i_3) * 2 + i_4, ((bx % 18 * 2 + vx % 2) * 32 + tx_p % 32 + j_3) * 2 + j_4] = C[(((bx // 18 + 0) * 8 + tx_p // 32) * 8 + i_3) * 2 + i_4, ((bx % 18 * 2 + vx % 2) * 32 + tx_p % 32 + j_3) * 2 + j_4] + A_shared[(((bx // 18 + 0) * 8 + tx_p // 32) * 8 + i_3) * 2 + i_4, (k_0 + k_1) * 4 + k_2] * B_shared[(k_0 + k_1) * 4 + k_2, ((bx % 18 * 2 + vx % 2) * 32 + tx_p % 32 + j_3) * 2 + j_4] + + @T.prim_func + def compacted_func(A: T.Buffer[(960, 770), "float32"], B: T.Buffer[(770, 2304), "float32"], C: T.Buffer[(960, 2304), "float32"]) -> None: + for bx in T.thread_binding(144, thread="blockIdx.x"): + for vx in T.thread_binding(2, thread="vthread.x"): + for tx_p in T.thread_binding(256, thread="threadIdx.x"): + with T.block(): + for k_0 in T.serial(193): + with T.block(): + A_shared = T.alloc_buffer([128, 4], dtype="float32", scope="shared") + B_shared = T.alloc_buffer([4, 128], dtype="float32", scope="shared") + for v_u in T.serial(1): + for tx in T.thread_binding(256, thread="threadIdx.x"): + for vec in T.vectorized(3): + with T.block("A_shared"): + T.where(bx // 18 * 128 + (tx * 3 + vec) // 4 < 960 and k_0 * 4 + (tx * 3 + vec) % 4 < 770 and tx * 3 + vec < 512) + A_shared[(tx * 3 + vec) // 4, (tx * 3 + vec) % 4] = 
A[bx // 18 * 128 + (tx * 3 + vec) // 4, k_0 * 4 + (tx * 3 + vec) % 4] + for v_u in T.serial(1): + for tx in T.thread_binding(256, thread="threadIdx.x"): + for vec in T.vectorized(4): + with T.block("B_shared"): + T.where(k_0 * 4 + tx // 32 < 770 and tx * 4 + vec < 512) + B_shared[tx // 32, tx % 32 * 4 + vec] = B[k_0 * 4 + tx // 32, bx % 18 * 128 + tx % 32 * 4 + vec] + for k_1, i_3, j_3, k_2, i_4, j_4 in T.grid(1, 8, 1, 4, 2, 2): + with T.block("update_update"): + C[bx // 18 * 128 + tx_p // 32 * 16 + i_3 * 2 + i_4, bx % 18 * 128 + vx * 64 + tx_p % 32 * 2 + j_4] = C[bx // 18 * 128 + tx_p // 32 * 16 + i_3 * 2 + i_4, bx % 18 * 128 + vx * 64 + tx_p % 32 * 2 + j_4] + A_shared[tx_p // 32 * 16 + i_3 * 2 + i_4, k_2] * B_shared[k_2, vx * 64 + tx_p % 32 * 2 + j_4] + # fmt: on + + _check(func, compacted_func) + + if __name__ == "__main__": tvm.testing.main() From 7874bf806eb98ec4b1e7a4265f695af5a12400b3 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Thu, 7 Jul 2022 09:31:41 -0700 Subject: [PATCH 064/111] [MetaSchedule] Support ApplyHistoryBest Direct Dispatch (#12016) This PR introduced a new argument for `ApplyHistoryBest`'s `Query` interface to allow direct dispatch without querying the database, would be useful for debugging and benchmarking without interference. --- .../tvm/meta_schedule/apply_history_best.h | 8 +++++-- .../tvm/meta_schedule/apply_history_best.py | 9 ++++++-- src/meta_schedule/apply_history_best.cc | 10 +++++++- .../test_meta_schedule_integration.py | 23 +++++++++++++++++++ 4 files changed, 45 insertions(+), 5 deletions(-) diff --git a/include/tvm/meta_schedule/apply_history_best.h b/include/tvm/meta_schedule/apply_history_best.h index 8405ebbacf085..08c259ea18120 100644 --- a/include/tvm/meta_schedule/apply_history_best.h +++ b/include/tvm/meta_schedule/apply_history_best.h @@ -44,6 +44,7 @@ class ApplyHistoryBestNode : public runtime::Object { runtime::TypedPackedFunc(const Array&)>; /*! \brief A callback function that takes a tuning record and does something with it */ using FTakeTuningRecord = runtime::TypedPackedFunc; + using FDirectDispatch = runtime::TypedPackedFunc(const IRModule&)>; /*! \brief The database to be queried from */ Database database{nullptr}; @@ -64,11 +65,14 @@ class ApplyHistoryBestNode : public runtime::Object { * \param target The target to be queried * \param dispatched The IRs after dispatch * \param f_take_tuning_record A callback function that takes a tuning record and does something - * with it + * with it. + * \param f_direct_dispatch A function that directly dispatches an IRModule to the given workload + * as result if available, skipping the database query. 
*/ Optional Query(runtime::String task_name, IRModule mod, Target target, Optional> dispatched, - FTakeTuningRecord f_take_tuning_record); + FTakeTuningRecord f_take_tuning_record, + FDirectDispatch f_direct_dispatch = nullptr); static constexpr const char* _type_key = "meta_schedule.ApplyHistoryBest"; TVM_DECLARE_FINAL_OBJECT_INFO(ApplyHistoryBestNode, runtime::Object); diff --git a/python/tvm/meta_schedule/apply_history_best.py b/python/tvm/meta_schedule/apply_history_best.py index 1a8ab2d358397..43a6ffe37620a 100644 --- a/python/tvm/meta_schedule/apply_history_best.py +++ b/python/tvm/meta_schedule/apply_history_best.py @@ -71,7 +71,8 @@ def query( mod: IRModule, target: Target, dispatched: Optional[List[IRModule]], - f_take_tuning_record: Callable[[TuningRecord], None] = None, + f_take_tuning_record: Optional[Callable[[TuningRecord], None]] = None, + f_direct_dispatch: Optional[Callable[[IRModule], Optional[IRModule]]] = None, ) -> Union[IRModule, None]: """The entry point of the integration @@ -85,8 +86,11 @@ def query( Target Info dispatched : Optional[List[IRModule]] A list of low-level IRs that the high-level IR could potentially dispatch to - f_take_tuning_record : Callable[[TuningRecord], None] = None + f_take_tuning_record : Optional[Callable[[TuningRecord], None]] = None A callback function that takes a tuning record and does something with it + f_direct_dispatch : Optional[Callable[[IRModule], Optional[IRModule]]] = None + A function that directly dispatches an IRModule to the given workload as result if + available, skipping the database query. Returns ------- @@ -101,6 +105,7 @@ def query( target, dispatched, f_take_tuning_record, + f_direct_dispatch, ) @staticmethod diff --git a/src/meta_schedule/apply_history_best.cc b/src/meta_schedule/apply_history_best.cc index 22445a9cf76a8..62db293067774 100644 --- a/src/meta_schedule/apply_history_best.cc +++ b/src/meta_schedule/apply_history_best.cc @@ -104,7 +104,8 @@ ApplyHistoryBest::ApplyHistoryBest(Database database, Optional ApplyHistoryBestNode::Query(runtime::String task_name, IRModule mod, Target target, Optional> dispatched, - FTakeTuningRecord f_take_tuning_record) { + FTakeTuningRecord f_take_tuning_record, + FDirectDispatch f_direct_dispatch) { ICHECK(dispatched.defined()); ICHECK_EQ(dispatched.value().size(), 1); ICHECK(HasOnlyOneFunction(mod)) << mod; @@ -119,6 +120,13 @@ Optional ApplyHistoryBestNode::Query(runtime::String task_name, IRModu ICHECK(parse_mod_func) << "Parse mod function not defined!"; prim_mod = (*parse_mod_func)(prim_mod); + if (f_direct_dispatch != nullptr) { + Optional mod = f_direct_dispatch(prim_mod); + if (mod.defined()) { + TVM_PY_LOG(INFO, logging_func) << "Direct dispatch applied for workload: " << task_name; + return mod.value(); + } + } if (database->HasWorkload(prim_mod)) { Array records = database->GetTopK(database->CommitWorkload(prim_mod), 1); if (records.size() == 1) { diff --git a/tests/python/unittest/test_meta_schedule_integration.py b/tests/python/unittest/test_meta_schedule_integration.py index 4868640adeadf..50456dfd24421 100644 --- a/tests/python/unittest/test_meta_schedule_integration.py +++ b/tests/python/unittest/test_meta_schedule_integration.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
"""Integration test for MetaSchedule""" +from typing import Optional import numpy as np import pytest import tvm @@ -287,6 +288,28 @@ def test_meta_schedule_integration_apply_history_best(): assert tvm.ir.structural_equal(mod, workload.mod) +@requires_torch +def test_meta_schedule_integration_apply_history_best_direct_dispatch(): + def direct_dispatch(mod: IRModule) -> Optional[IRModule]: + if tvm.ir.structural_equal(mod, MockModule): + return MockModule + return None + + mod, _, _ = get_network(name="resnet_18", input_shape=[1, 3, 224, 224]) + database = ms.database.MemoryDatabase() + env = ms.ApplyHistoryBest(database) + target = Target("llvm") + workload = database.commit_workload(MockModule) + mod = env.query( + task_name="mock-task-direct-dispatch", + mod=mod, + target=target, + dispatched=[MockModule], + f_direct_dispatch=direct_dispatch, + ) + assert tvm.ir.structural_equal(mod, workload.mod) + + @pytest.mark.skip("Too slow on CI") def extract_task_qbert(): mod, params, _ = load_quantized_bert_base(batch_size=1, seq_len=128) From c76d8e2bdb0a11cbdc30bdfd631963ba9813662a Mon Sep 17 00:00:00 2001 From: abhikran-quic <63697863+abhikran-quic@users.noreply.github.com> Date: Thu, 7 Jul 2022 23:49:01 +0530 Subject: [PATCH 065/111] [TOPI] [Hexagon] Reshape slice op (#11983) * Reshape slice op. This patch adds the initial python implementation reshape slice op for hexagon. * Add tests for reshape op --- python/tvm/topi/hexagon/slice_ops/__init__.py | 1 + python/tvm/topi/hexagon/slice_ops/reshape.py | 108 +++++++++++ .../test_hexagon/topi/test_batch_flatten.py | 101 ----------- .../contrib/test_hexagon/topi/test_reshape.py | 168 ++++++++++++++++++ 4 files changed, 277 insertions(+), 101 deletions(-) mode change 100755 => 100644 python/tvm/topi/hexagon/slice_ops/__init__.py create mode 100644 python/tvm/topi/hexagon/slice_ops/reshape.py delete mode 100644 tests/python/contrib/test_hexagon/topi/test_batch_flatten.py create mode 100644 tests/python/contrib/test_hexagon/topi/test_reshape.py diff --git a/python/tvm/topi/hexagon/slice_ops/__init__.py b/python/tvm/topi/hexagon/slice_ops/__init__.py old mode 100755 new mode 100644 index ce1641bfda35a..617aaed920d7d --- a/python/tvm/topi/hexagon/slice_ops/__init__.py +++ b/python/tvm/topi/hexagon/slice_ops/__init__.py @@ -24,3 +24,4 @@ from .softmax_slice import * from .clip import * from .conv2d import * +from .reshape import reshape_compute, reshape_stir_schedule diff --git a/python/tvm/topi/hexagon/slice_ops/reshape.py b/python/tvm/topi/hexagon/slice_ops/reshape.py new file mode 100644 index 0000000000000..374c20bb72df8 --- /dev/null +++ b/python/tvm/topi/hexagon/slice_ops/reshape.py @@ -0,0 +1,108 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +"""Hexagon slice reshape compute and schedule""" +from tvm import te, tir, topi +from ..utils import get_layout_transform_fn + + +def reshape_compute(inp: te.Tensor, new_shape: tuple) -> te.Tensor: + """Compute for slice reshape op for hexagon. + This op makes the following assumptions: + 1. This op is written for a sliced reshape operation. + 2. The input is assumed to be in NHWC layout. + + Parameters + ---------- + Input : te.Tensor + Input tensor + New Shape: tuple + Output shape + Returns + ------- + Output : te.Tensor + Output of applying reshape operation on input + """ + return topi.transform.reshape(inp, new_shape) + + +def stir_schedule_nhwc_1024c( + out: te.Tensor, + inp: te.Tensor, + out_layout: str, + in_layout: str, +) -> tir.Schedule: + """Schedule for output layout: nhwc-1024c-2d""" + reshape_func = te.create_prim_func([inp, out]) + sch = tir.Schedule(reshape_func, debug_mask="all") + compute = sch.get_block("T_reshape") + + sch.transform_layout(compute, inp.name, get_layout_transform_fn(in_layout)) + sch.transform_layout(compute, out.name, get_layout_transform_fn(out_layout)) + i, j = sch.get_loops(compute) + jout, channel = sch.split(j, [None, inp.shape[3]]) + height, width = sch.split(jout, [inp.shape[1], inp.shape[2]]) + channelo, channeli = sch.split(channel, [None, 1024]) + channelio, channelii = sch.split(channeli, [None, 64]) + sch.reorder(i, height, width, channelo, channelio, channelii) + sch.vectorize(channelii) + return sch + + +def stir_schedule_nhwc_8h2w32c2w( + out: te.Tensor, + inp: te.Tensor, + out_layout: str, + in_layout: str, +) -> tir.Schedule: + """Schedule for input and output layout nhwc-8h2w32c2w""" + reshape_func = te.create_prim_func([inp, out]) + sch = tir.Schedule(reshape_func, debug_mask="all") + compute = sch.get_block("T_reshape") + + sch.transform_layout(compute, inp.name, get_layout_transform_fn(in_layout)) + sch.transform_layout(compute, out.name, get_layout_transform_fn(out_layout)) + return sch + + +def reshape_stir_schedule( + out: te.Tensor, + inp: te.Tensor, + output_layout: str, + input_layout: str, +) -> tir.Schedule: + """STIR schedule definition for the compute of reshape compute. + Parameters + ---------- + outputs : te.Tensor + The output tensor as returned by a call to reshape_compute + input : te.Tensor + Input tensor to reshape + out_layout: str + The transformation function definition for the expected output layout + in_layout: str + The transformation function definition for the input layout + Returns + ------- + sch : tvm.tir.Schedule + The STIR schedule for slice reshape compute + """ + if output_layout == "nhwc-8h2w32c2w-2d": + return stir_schedule_nhwc_8h2w32c2w(out, inp, output_layout, input_layout) + if output_layout == "nc-1024-2d": + return stir_schedule_nhwc_1024c(out, inp, output_layout, input_layout) + raise RuntimeError(f"Unexpected layout '{output_layout}'") diff --git a/tests/python/contrib/test_hexagon/topi/test_batch_flatten.py b/tests/python/contrib/test_hexagon/topi/test_batch_flatten.py deleted file mode 100644 index 3a056116d45c1..0000000000000 --- a/tests/python/contrib/test_hexagon/topi/test_batch_flatten.py +++ /dev/null @@ -1,101 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import numpy as np -import pytest - -import tvm -import tvm.testing -import tvm.topi.hexagon.slice_ops as sl -from tvm import te, topi -from tvm.contrib.hexagon.build import HexagonLauncher -from tvm.topi import testing - -from ..infrastructure import allocate_hexagon_array, transform_numpy - - -class BaseTestBatchFlatten: - input_shape = tvm.testing.parameter( - (1, 1, 1, 2048), - (1, 2, 4, 2048), - (1, 8, 8, 1024), - (2, 4, 8, 1024), - (2, 3, 5, 2048), - ) - input_layout, input_axis_sep = tvm.testing.parameters(("nhwc-1024c-2d", [4])) - output_layout, output_axis_sep = tvm.testing.parameters(("nc-1024-2d", [2])) - data_type = tvm.testing.parameter("float16") - - -class TestBatchFlatten(BaseTestBatchFlatten): - @tvm.testing.fixture - def output_shape(self, input_shape): - return input_shape[0], input_shape[1] * input_shape[2] * input_shape[3] - - @tvm.testing.requires_hexagon - def test_batch_flatten( - self, - data_type, - input_shape, - input_layout, - input_axis_sep, - output_shape, - output_layout, - output_axis_sep, - hexagon_session, - ): - target_hexagon = tvm.target.hexagon("v69") - target = tvm.target.Target(target_hexagon, host=target_hexagon) - A = te.placeholder(input_shape, name="A", dtype=data_type) - D = sl.batch_flatten_compute(A) - tir_s = sl.batch_flatten_stir_schedule( - D, - A, - output_layout, - input_layout, - ) - func_name = "batch_flatten" - with tvm.transform.PassContext(opt_level=3): - runtime_module = tvm.build(tir_s.mod, target=target, name=func_name) - - mod = hexagon_session.load_module(runtime_module) - - a_numpy = (np.random.uniform(-1, 1, input_shape)).astype(data_type) - ref = np.reshape(a_numpy, output_shape) - - input_np_transformed = transform_numpy(a_numpy, "nhwc", input_layout) - ref_np_transformed = transform_numpy(ref, "nhwc", output_layout) - - a_tvm = allocate_hexagon_array( - hexagon_session.device, - data=input_np_transformed, - axis_separators=input_axis_sep, - mem_scope="global.vtcm", - ) - output = allocate_hexagon_array( - hexagon_session.device, - ref_np_transformed.shape, - data_type, - axis_separators=output_axis_sep, - mem_scope="global.vtcm", - ) - mod(a_tvm, output) - np.testing.assert_allclose(output.numpy(), ref_np_transformed, atol=1e-07, rtol=0) - - -if __name__ == "__main__": - tvm.testing.main(pytest.main(sys.argv)) diff --git a/tests/python/contrib/test_hexagon/topi/test_reshape.py b/tests/python/contrib/test_hexagon/topi/test_reshape.py new file mode 100644 index 0000000000000..2def86ad83398 --- /dev/null +++ b/tests/python/contrib/test_hexagon/topi/test_reshape.py @@ -0,0 +1,168 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import numpy as np +import pytest + +import tvm +import tvm.testing +import tvm.topi.hexagon.slice_ops as sl +from tvm import te, topi +from tvm.contrib.hexagon.build import HexagonLauncher +from tvm.topi import testing + +from ..infrastructure import allocate_hexagon_array, transform_numpy + + +def reshape_helper( + func, + fcompute, + fschedule, + data_type, + input_shape, + input_layout, + output_shape, + output_layout, + hexagon_session, +): + + target_hexagon = tvm.target.hexagon("v69") + target = tvm.target.Target(target_hexagon, host=target_hexagon) + A = te.placeholder(input_shape, name="A", dtype=data_type) + if func == "reshape": + D = fcompute(A, output_shape) + elif func == "batch_flatten": + D = fcompute(A) + else: + raise RuntimeError(f"Unexpected func'{func}'") + tir_s = fschedule( + D, + A, + output_layout, + input_layout, + ) + with tvm.transform.PassContext(opt_level=3): + print("output of tvm.lower", tvm.lower(tir_s.mod, name=func)) + runtime_module = tvm.build(tir_s.mod, target=target, name=func) + + mod = hexagon_session.load_module(runtime_module) + + a_numpy = (np.random.uniform(-1, 1, input_shape)).astype(data_type) + ref = np.reshape(a_numpy, output_shape) + + input_np_transformed = transform_numpy(a_numpy, "nhwc", input_layout) + ref_np_transformed = transform_numpy(ref, "nhwc", output_layout) + input_axis_sep = [4] + if output_layout == "nhwc-8h2w32c2w-2d": + output_axis_sep = [4] + elif output_layout == "nc-1024-2d": + output_axis_sep = [2] + else: + raise RuntimeError(f"Unexpected layout '{output_layout}'") + a_tvm = allocate_hexagon_array( + hexagon_session.device, + data=input_np_transformed, + axis_separators=input_axis_sep, + mem_scope="global.vtcm", + ) + output = allocate_hexagon_array( + hexagon_session.device, + ref_np_transformed.shape, + data_type, + axis_separators=output_axis_sep, + mem_scope="global.vtcm", + ) + mod(a_tvm, output) + np.testing.assert_allclose(output.numpy(), ref_np_transformed, atol=1e-07, rtol=0) + + +batch_flatten_tests = ( + ([1, 1, 1, 2048], [1, 2048], "nhwc-1024c-2d", "nc-1024-2d", "float16"), + ([1, 2, 4, 2048], [1, 2 * 4 * 2048], "nhwc-1024c-2d", "nc-1024-2d", "float16"), + ([1, 8, 8, 1024], [1, 8 * 8 * 1024], "nhwc-1024c-2d", "nc-1024-2d", "float16"), + ([2, 4, 8, 1024], [2, 4 * 8 * 1024], "nhwc-1024c-2d", "nc-1024-2d", "float16"), +) + + +class BaseTestBatchFlatten: + ( + input_shape, + output_shape, + input_layout, + output_layout, + data_type, + ) = tvm.testing.parameters(*batch_flatten_tests) + + +class TestBatchFlatten(BaseTestBatchFlatten): + @tvm.testing.requires_hexagon + def test_batch_flatten( + self, + data_type, + input_shape, + input_layout, + output_shape, + output_layout, + hexagon_session, + ): + reshape_helper( + "batch_flatten", + sl.batch_flatten_compute, + sl.batch_flatten_stir_schedule, + data_type, + input_shape, + input_layout, + output_shape, + output_layout, + hexagon_session, + ) + + +class BaseTestReshape(BaseTestBatchFlatten): + (input_shape, output_shape, input_layout, output_layout, data_type,) = tvm.testing.parameters( + *batch_flatten_tests, + ([1, 8, 4, 64], [1, 8, 8, 32], 
"nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", "float16"), + ([1, 16, 8, 128], [1, 16, 16, 64], "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", "float16"), + ) + + +class TestReshape(BaseTestReshape): + @tvm.testing.requires_hexagon + def test_reshape( + self, + data_type, + input_shape, + input_layout, + output_shape, + output_layout, + hexagon_session, + ): + reshape_helper( + "reshape", + sl.reshape_compute, + sl.reshape_stir_schedule, + data_type, + input_shape, + input_layout, + output_shape, + output_layout, + hexagon_session, + ) + + +if __name__ == "__main__": + tvm.testing.main() From af4373f2fbb6afa6827f7e3e4964bad644644d39 Mon Sep 17 00:00:00 2001 From: Zhengqiang Yin Date: Fri, 8 Jul 2022 03:38:27 +0800 Subject: [PATCH 066/111] [Fix] fix python setup.py file bug (#12000) * fix setup.py bug Signed-off-by: Zhengqiang Yin * remove data_files field * keep a init setup_kwargs --- python/setup.py | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/python/setup.py b/python/setup.py index 87f533a329c6b..a7ba115e0a765 100644 --- a/python/setup.py +++ b/python/setup.py @@ -164,18 +164,8 @@ def is_pure(self): return False -include_libs = False -wheel_include_libs = False -if not CONDA_BUILD: - if "bdist_wheel" in sys.argv: - wheel_include_libs = True - else: - include_libs = True - setup_kwargs = {} - -# For bdist_wheel only -if wheel_include_libs: +if not CONDA_BUILD: with open("MANIFEST.in", "w") as fo: for path in LIB_LIST: if os.path.isfile(path): @@ -190,12 +180,6 @@ def is_pure(self): setup_kwargs = {"include_package_data": True} -if include_libs: - curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) - for i, path in enumerate(LIB_LIST): - LIB_LIST[i] = os.path.relpath(path, curr_path) - setup_kwargs = {"include_package_data": True, "data_files": [("tvm", LIB_LIST)]} - def get_package_data_files(): # Relay standard libraries @@ -253,7 +237,7 @@ def long_description_contents(): ) -if wheel_include_libs: +if not CONDA_BUILD: # Wheel cleanup os.remove("MANIFEST.in") for path in LIB_LIST: From 013d5e8fcbd94fb3a0c5c0cdcaea03af43c464aa Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Thu, 7 Jul 2022 12:44:18 -0700 Subject: [PATCH 067/111] [MetaSchedule][Minor] Stability Improvements (#12014) * Fix tuning util for uint8. * Change to check runner_result. * Revert change to let cost model learn. --- .../tvm/meta_schedule/testing/tune_utils.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/python/tvm/meta_schedule/testing/tune_utils.py b/python/tvm/meta_schedule/testing/tune_utils.py index aad8496a4661f..fe0984d51c509 100644 --- a/python/tvm/meta_schedule/testing/tune_utils.py +++ b/python/tvm/meta_schedule/testing/tune_utils.py @@ -48,21 +48,21 @@ def generate_input_data( """ if input_dtype.startswith("float"): return np.random.uniform(size=input_shape).astype(input_dtype) - if input_dtype in ["uint8", "int8"]: - return np.random.randint( - low=0, - high=127, - size=input_shape, - dtype="int32", # TODO(zxybazh): fix the datatype when int8 / uint8 is supported better + if low is None or high is None: + warnings.warn( + f"Model input value range for shape {input_shape} of {input_dtype} is not set!" ) - if input_dtype in ["int32", "int64"]: - if low is None or high is None: - warnings.warn( - "Model input value range for shape {input_shape} of {input_dtype} is not set!" 
- ) + range_map = { + "uint8": (0, 255), + "int8": (-128, 127), + "int32": (0, 10000), + "int64": (0, 10000), + } + if input_dtype in range_map: + _low, _high = range_map[input_dtype] return np.random.randint( - low=0 if low is None else low, - high=10000 if high is None else high, + low=_low if low is None else low, + high=_high if high is None else high, size=input_shape, dtype=input_dtype, ) From 99d42b22382d19cfd2e2e1ec65d92f1fe41e4c10 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Thu, 7 Jul 2022 14:05:59 -0700 Subject: [PATCH 068/111] [MetaSchedule][Testing] Test search space of conv1d (#12032) * [MetaSchedule][Testing] Test search space of conv1d * Add checks for trace roundtripping --- .../meta_schedule/testing/space_generation.py | 65 +++++++++- .../unittest/test_meta_schedule_space_cuda.py | 115 ++++++++++++++++++ 2 files changed, 179 insertions(+), 1 deletion(-) create mode 100644 tests/python/unittest/test_meta_schedule_space_cuda.py diff --git a/python/tvm/meta_schedule/testing/space_generation.py b/python/tvm/meta_schedule/testing/space_generation.py index 10e31e7213cbd..2d846e244a86c 100644 --- a/python/tvm/meta_schedule/testing/space_generation.py +++ b/python/tvm/meta_schedule/testing/space_generation.py @@ -15,10 +15,12 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring -from typing import List +from typing import List, Optional, Tuple +from tvm.ir import IRModule, structural_equal from tvm.tir import Schedule from tvm.tir.schedule import Trace +from tvm.tir.schedule.testing import verify_trace_roundtrip def check_trace(spaces: List[Schedule], expected: List[List[str]]): @@ -31,3 +33,64 @@ def check_trace(spaces: List[Schedule], expected: List[List[str]]): actual_traces.add(str_trace) assert str_trace in expected_traces, "\n" + str_trace assert len(expected_traces) == len(actual_traces) + + +def _find_match_sketch_id( + mod: IRModule, + sketches: List[Schedule], + expected_mod: IRModule, + expected_decision: List[Tuple[str, List[int]]], +) -> Optional[int]: + for sketch_id, sketch in enumerate(sketches): + i = 0 + new_decisions = {} + for inst in sketch.trace.insts: + if not inst.kind.name.startswith("Sample"): + continue + assert i < len(expected_decision) + if inst.kind.name == expected_decision[i][0]: + new_decisions[inst] = expected_decision[i][1] + i += 1 + if len(new_decisions) != len(expected_decision): + continue + sch = Schedule(mod, debug_mask="all") + Trace( + insts=sketch.trace.insts, + decisions=new_decisions, + ).apply_to_schedule(sch, remove_postproc=True) + if structural_equal(sch.mod, expected_mod): + verify_trace_roundtrip(sch=sch, mod=mod) + return sketch_id + return None + + +def check_sketches( + mod: IRModule, + sketches: List[Schedule], + expected_mods: List[IRModule], + expected_decisions: List[List[Tuple[str, List[int]]]], +): + assert len(expected_mods) == len(expected_decisions) + assert len(sketches) == len(expected_mods) + expected_mods = [ + IRModule({"main": m}) if not isinstance(m, IRModule) else m for m in expected_mods + ] + sketches = list(sketches) + for expected_id, (expected_mod, expected_decision) in enumerate( + zip(expected_mods, expected_decisions) + ): + sketch_id = _find_match_sketch_id(mod, sketches, expected_mod, expected_decision) + if sketch_id is None: + raise AssertionError( + f"Expected sketch #{expected_id} doesn't exist in the generated sketches." 
+ ) + sketches.pop(sketch_id) + + +def print_sketches(sketches: List[Schedule]): + for i, sch in enumerate(sketches): + print(f"###### {i}") + print(sch.mod.script()) + for inst in sch.trace.insts: + if inst in sch.trace.decisions: + print(f'("{inst.kind.name}", {sch.trace.decisions[inst]}),') diff --git a/tests/python/unittest/test_meta_schedule_space_cuda.py b/tests/python/unittest/test_meta_schedule_space_cuda.py new file mode 100644 index 0000000000000..e2c324cfda521 --- /dev/null +++ b/tests/python/unittest/test_meta_schedule_space_cuda.py @@ -0,0 +1,115 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Tests for MetaSchedule search space on CUDA""" +from tvm import meta_schedule as ms +from tvm.meta_schedule.testing.space_generation import check_sketches +from tvm.meta_schedule.testing.te_workload import create_te_workload +from tvm.script import tir as T +from tvm.target import Target + + +def _target(): + return Target("nvidia/geforce-rtx-3070") + + +def test_cuda_c1d(): + # fmt: off + @T.prim_func + def mod_0(inputs: T.Buffer[(1, 256, 64), "float32"], weight: T.Buffer[(3, 64, 128), "float32"], conv1d_nlc: T.Buffer[(1, 128, 128), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + with T.block("root"): + T.reads() + T.writes() + T.block_attr({"meta_schedule.unroll_explicit":16}) + conv1d_nlc_local = T.alloc_buffer([1, 128, 128], dtype="float32", scope="local") + PadInput_shared = T.alloc_buffer([1, 258, 64], dtype="float32", scope="shared") + weight_shared = T.alloc_buffer([3, 64, 128], dtype="float32", scope="shared") + for i0_0_i1_0_i2_0_fused in T.thread_binding(4, thread="blockIdx.x"): + for i0_1_i1_1_i2_1_fused in T.thread_binding(16, thread="vthread.x"): + for i0_2_i1_2_i2_2_fused in T.thread_binding(4, thread="threadIdx.x"): + for i3_0, i4_0 in T.grid(1, 16): + for ax0_ax1_ax2_fused in T.serial(260): + with T.block("PadInput_shared"): + v0 = T.axis.spatial(1, 0) + v1 = T.axis.spatial(258, i0_0_i1_0_i2_0_fused * 64 + ax0_ax1_ax2_fused % 260 // 4) + v2 = T.axis.spatial(64, i4_0 * 4 + ax0_ax1_ax2_fused % 4) + T.reads(inputs[v0, v1 - 1, v2]) + T.writes(PadInput_shared[v0, v1, v2]) + T.block_attr({"meta_schedule.cooperative_fetch":4}) + PadInput_shared[v0, v1, v2] = T.if_then_else(1 <= v1 and v1 < 257, inputs[v0, v1 - 1, v2], T.float32(0), dtype="float32") + for ax0_ax1_ax2_fused in T.serial(1536): + with T.block("weight_shared"): + v0 = T.axis.spatial(3, ax0_ax1_ax2_fused // 512) + v1 = T.axis.spatial(64, i4_0 * 4 + ax0_ax1_ax2_fused % 512 // 128) + v2 = T.axis.spatial(128, ax0_ax1_ax2_fused % 128) + T.reads(weight[v0, v1, v2]) + T.writes(weight_shared[v0, v1, v2]) + T.block_attr({"meta_schedule.cooperative_fetch":3}) + weight_shared[v0, v1, v2] = weight[v0, v1, v2] + for i3_1, 
i4_1, i0_3, i1_3, i2_3, i3_2, i4_2, i0_4, i1_4, i2_4 in T.grid(1, 2, 1, 1, 2, 3, 2, 1, 4, 8): + with T.block("conv1d_nlc"): + n = T.axis.spatial(1, i0_4 + i0_3 + 0 + 0 + 0) + l = T.axis.spatial(128, (i0_0_i1_0_i2_0_fused % 4 * 8 + i0_1_i1_1_i2_1_fused % 16 // 2 + 0 + i1_3) * 4 + i1_4) + co = T.axis.spatial(128, (((0 * 2 + i0_1_i1_1_i2_1_fused % 2) * 4 + i0_2_i1_2_i2_2_fused % 4) * 2 + i2_3) * 8 + i2_4) + rl = T.axis.reduce(3, (i3_0 + i3_1) * 3 + i3_2) + rc = T.axis.reduce(64, (i4_0 * 2 + i4_1) * 2 + i4_2) + T.reads(PadInput_shared[n, l * 2 + rl, co // 128 * 64 + rc], weight_shared[rl, rc, co]) + T.writes(conv1d_nlc_local[n, l, co]) + T.block_attr({"meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS"}) + with T.init(): + conv1d_nlc_local[n, l, co] = T.float32(0) + conv1d_nlc_local[n, l, co] = conv1d_nlc_local[n, l, co] + PadInput_shared[n, l * 2 + rl, co // 128 * 64 + rc] * weight_shared[rl, rc, co] + for ax0, ax1, ax2 in T.grid(1, 4, 16): + with T.block("conv1d_nlc_local"): + v0 = T.axis.spatial(1, ax0) + v1 = T.axis.spatial(128, i0_0_i1_0_i2_0_fused * 32 + i0_1_i1_1_i2_1_fused // 2 * 4 + ax1) + v2 = T.axis.spatial(128, i0_1_i1_1_i2_1_fused % 2 * 64 + i0_2_i1_2_i2_2_fused * 16 + ax2) + T.reads(conv1d_nlc_local[v0, v1, v2]) + T.writes(conv1d_nlc[v0, v1, v2]) + conv1d_nlc[v0, v1, v2] = conv1d_nlc_local[v0, v1, v2] + # fmt: on + + decision_0 = [ + ("SamplePerfectTile", [1, 1, 1, 1, 1]), + ("SamplePerfectTile", [4, 8, 1, 1, 4]), + ("SamplePerfectTile", [1, 2, 4, 2, 8]), + ("SamplePerfectTile", [1, 1, 3]), + ("SamplePerfectTile", [16, 2, 2]), + ("SampleCategorical", 3), + ("SampleCategorical", 2), + ("SampleCategorical", 1), + ] + + mod = create_te_workload("C1D", 0) + actual = ms.TuneContext( + mod=mod, + target=_target(), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules="default", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[mod_0], + expected_decisions=[decision_0], + ) + + +if __name__ == "__main__": + test_cuda_c1d() From a8e329443db3cc5abe0ae2870dde1eebe26030ac Mon Sep 17 00:00:00 2001 From: AndrewZhaoLuo Date: Thu, 7 Jul 2022 16:45:17 -0700 Subject: [PATCH 069/111] [Pylint] Pylint integration_tests folder (#11672) * add folder to pylint * add init py * lint test_arm_mrpofile_dsp.py * one more change to tests/python/integratoin/test_arm_mprofile_dsp.py * add test_dot * test_ewise_fpga.py * test_ewise.py * test gemm * test_lower.py * test_meta_schedule_auto_tensorize.py * test_reduce.py pt1 * test_reduce.py pt2 * test_scan.py * test_tuning.py * test_winograd_nnpack.py * final test pass * comments * clean up test_lower more --- tests/lint/pylint.sh | 1 + tests/python/integration/__init__.py | 17 + .../integration/test_arm_mprofile_dsp.py | 10 +- tests/python/integration/test_dot.py | 43 +- tests/python/integration/test_ewise.py | 278 +++++---- tests/python/integration/test_ewise_fpga.py | 75 ++- tests/python/integration/test_gemm.py | 115 ++-- tests/python/integration/test_lower.py | 360 ++++++----- .../test_meta_schedule_auto_tensorize.py | 61 +- tests/python/integration/test_reduce.py | 585 ++++++++++-------- tests/python/integration/test_scan.py | 59 +- tests/python/integration/test_tuning.py | 188 +++--- .../integration/test_winograd_nnpack.py | 67 +- 13 files changed, 1089 insertions(+), 770 deletions(-) create mode 100644 tests/python/integration/__init__.py diff --git a/tests/lint/pylint.sh b/tests/lint/pylint.sh index 
39568fd3417ef..61ffb0fd92542 100755 --- a/tests/lint/pylint.sh +++ b/tests/lint/pylint.sh @@ -23,3 +23,4 @@ python3 -m pylint tests/python/unittest/test_tvmscript_type.py --rcfile="$(dirna python3 -m pylint tests/python/contrib/test_cmsisnn --rcfile="$(dirname "$0")"/pylintrc python3 -m pylint tests/python/relay/aot/*.py --rcfile="$(dirname "$0")"/pylintrc python3 -m pylint tests/python/ci --rcfile="$(dirname "$0")"/pylintrc +python3 -m pylint tests/python/integration/ --rcfile="$(dirname "$0")"/pylintrc diff --git a/tests/python/integration/__init__.py b/tests/python/integration/__init__.py new file mode 100644 index 0000000000000..56984ac615350 --- /dev/null +++ b/tests/python/integration/__init__.py @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Infrastructure and tests for e2e integration tests.""" diff --git a/tests/python/integration/test_arm_mprofile_dsp.py b/tests/python/integration/test_arm_mprofile_dsp.py index 2bcf284f3d770..22b4ebaab832a 100644 --- a/tests/python/integration/test_arm_mprofile_dsp.py +++ b/tests/python/integration/test_arm_mprofile_dsp.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -import sys +"""Test arm mprofile dsp.""" import numpy as np import pytest import tvm @@ -173,16 +173,16 @@ def test_conv1d(data_shape_nwc, kernel_size, num_filter, strides, padding, dtype @tvm.testing.requires_corstone300 @pytest.mark.parametrize( - "M, K, N", + "dim_m, dim_k, dim_n", [ (1, 32, 64), (3, 12, 10), ], ) -def test_dense(M, K, N): +def test_dense(dim_m, dim_k, dim_n): """Test a subgraph with a single dense operator.""" - ishape = (M, K) - wshape = (N, K) + ishape = (dim_m, dim_k) + wshape = (dim_n, dim_k) input0 = relay.var("input", relay.TensorType(ishape, "int8")) dense_f = relay.op.nn.batch_flatten(input0) diff --git a/tests/python/integration/test_dot.py b/tests/python/integration/test_dot.py index 41abb51a2e994..20e628c8c14be 100644 --- a/tests/python/integration/test_dot.py +++ b/tests/python/integration/test_dot.py @@ -14,31 +14,46 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+"""Test scheduling and running a dot product.""" +import numpy as np + import tvm import tvm.testing from tvm import te -import numpy as np @tvm.testing.requires_llvm def test_dot(): - nn = 12 - n = tvm.runtime.convert(nn) - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - k = te.reduce_axis((0, n), "k") - C = te.compute((), lambda: te.sum(A[k] * B[k], axis=k), name="C") - s = te.create_schedule(C.op) + """Test dot product.""" + arr_length = 12 + arr_length_tvm = tvm.runtime.convert(arr_length) + placeholder_a = te.placeholder((arr_length_tvm,), name="A") + placeholder_b = te.placeholder((arr_length_tvm,), name="B") + reduce_axis_k = te.reduce_axis((0, arr_length_tvm), "k") + result_c = te.compute( + (), + lambda: te.sum( + placeholder_a[reduce_axis_k] * placeholder_b[reduce_axis_k], axis=reduce_axis_k + ), + name="C", + ) + schedule = te.create_schedule(result_c.op) def verify(target): - f = tvm.driver.build(s, [A, B, C], target) + f = tvm.driver.build(schedule, [placeholder_a, placeholder_b, result_c], target) # verify dev = tvm.cpu(0) - a = tvm.nd.array(np.random.uniform(size=(nn,)).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=(nn,)).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros((), dtype=C.dtype), dev) - f(a, b, c) - tvm.testing.assert_allclose(c.numpy(), np.dot(a.numpy(), b.numpy()), rtol=1e-4) + buff_a = tvm.nd.array( + np.random.uniform(size=(arr_length,)).astype(placeholder_a.dtype), dev + ) + buff_b = tvm.nd.array( + np.random.uniform(size=(arr_length,)).astype(placeholder_b.dtype), dev + ) + buff_c = tvm.nd.array(np.zeros((), dtype=result_c.dtype), dev) + f(buff_a, buff_b, buff_c) + tvm.testing.assert_allclose( + buff_c.numpy(), np.dot(buff_a.numpy(), buff_b.numpy()), rtol=1e-4 + ) verify("llvm") diff --git a/tests/python/integration/test_ewise.py b/tests/python/integration/test_ewise.py index 3250efc3f71e9..8bfa6b17175db 100644 --- a/tests/python/integration/test_ewise.py +++ b/tests/python/integration/test_ewise.py @@ -14,26 +14,29 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""Test elementwise integration.""" +import numpy as np + import tvm +import tvm.testing from tvm import te from tvm.contrib import nvcc -import numpy as np -import time -import tvm.testing @tvm.testing.requires_gpu def test_exp(): + """Test scheduling and running exponent.""" # graph - n = tvm.runtime.convert(1024) - A = te.placeholder((n,), name="A") - B = te.compute(A.shape, lambda *i: te.exp(A(*i)), name="B") - s = te.create_schedule(B.op) + arr_length = 1024 + arr_length_tvm = tvm.runtime.convert(arr_length) + placeholder_a = te.placeholder((arr_length_tvm,), name="A") + placeholder_b = te.compute(placeholder_a.shape, lambda *i: te.exp(placeholder_a(*i)), name="B") + schedule = te.create_schedule(placeholder_b.op) # create iter var and assign them tags. num_thread = 8 - bx, tx = s[B].split(B.op.axis[0], factor=num_thread) - s[B].bind(bx, te.thread_axis("blockIdx.x")) - s[B].bind(tx, te.thread_axis("threadIdx.x")) + axis1, axis2 = schedule[placeholder_b].split(placeholder_b.op.axis[0], factor=num_thread) + schedule[placeholder_b].bind(axis1, te.thread_axis("blockIdx.x")) + schedule[placeholder_b].bind(axis2, te.thread_axis("threadIdx.x")) # one line to build the function. def check_device(device, host="stackvm"): @@ -43,14 +46,13 @@ def check_device(device, host="stackvm"): if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." 
% device) return - fexp = tvm.build(s, [A, B], device, host, name="myexp") + fexp = tvm.build(schedule, [placeholder_a, placeholder_b], device, host, name="myexp") dev = tvm.device(device, 0) # launch the kernel. - n = 1024 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) - fexp(a, b) - tvm.testing.assert_allclose(b.numpy(), np.exp(a.numpy()), rtol=1e-5) + buff_a = tvm.nd.array(np.random.uniform(size=arr_length).astype(placeholder_a.dtype), dev) + buff_b = tvm.nd.array(np.zeros(arr_length, dtype=placeholder_b.dtype), dev) + fexp(buff_a, buff_b) + tvm.testing.assert_allclose(buff_b.numpy(), np.exp(buff_a.numpy()), rtol=1e-5) check_device("opencl -device=intel_graphics") check_device("cuda", "llvm") @@ -59,16 +61,19 @@ def check_device(device, host="stackvm"): @tvm.testing.requires_gpu def test_fmod(): + """Test scheduling and running fmod.""" # graph def run(dtype): - n = te.size_var("n") - A = te.placeholder((n,), name="A", dtype=dtype) - B = te.placeholder((n,), name="B", dtype=dtype) - C = te.compute(A.shape, lambda *i: te.fmod(A(*i), B(*i)), name="C") - s = te.create_schedule(C.op) + size_var_n = te.size_var("n") + placeholder_a = te.placeholder((size_var_n,), name="A", dtype=dtype) + placeholder_b = te.placeholder((size_var_n,), name="B", dtype=dtype) + result_c = te.compute( + placeholder_a.shape, lambda *i: te.fmod(placeholder_a(*i), placeholder_b(*i)), name="C" + ) + schedule = te.create_schedule(result_c.op) # create iter var and assign them tags. num_thread = 8 - bx, tx = s[C].split(C.op.axis[0], factor=num_thread) + axis0, axis1 = schedule[result_c].split(result_c.op.axis[0], factor=num_thread) def check_device(device): dev = tvm.device(device, 0) @@ -77,26 +82,29 @@ def check_device(device): return target = tvm.target.Target(device) if "cpu" not in target.keys: - s[C].bind(bx, te.thread_axis("blockIdx.x")) - s[C].bind(tx, te.thread_axis("threadIdx.x")) - fmod = tvm.build(s, [A, B, C], device, name="myfmod") + schedule[result_c].bind(axis0, te.thread_axis("blockIdx.x")) + schedule[result_c].bind(axis1, te.thread_axis("threadIdx.x")) + fmod = tvm.build( + schedule, [placeholder_a, placeholder_b, result_c], device, name="myfmod" + ) # launch the kernel. 
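Aside: the hunks above keep rewriting the same two-step GPU idiom (split one data-parallel axis, then bind the halves to blocks and threads). A minimal standalone sketch of that idiom, assuming an LLVM-enabled TVM build of this era with the te.create_schedule API; the names are illustrative and not part of the patch:

    import tvm
    from tvm import te

    length = te.size_var("n")
    data = te.placeholder((length,), name="data")
    out = te.compute(data.shape, lambda *i: te.exp(data(*i)), name="out")
    sch = te.create_schedule(out.op)
    # Split the lone axis, then map the outer part to GPU blocks and the
    # inner part to threads, as the renamed axis1/axis2 pairs do above.
    outer, inner = sch[out].split(out.op.axis[0], factor=8)
    sch[out].bind(outer, te.thread_axis("blockIdx.x"))
    sch[out].bind(inner, te.thread_axis("threadIdx.x"))
    print(tvm.lower(sch, [data, out], simple_mode=True))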
- n = 1024 - a_np = (np.random.uniform(size=n) * 256).astype(A.dtype) - b_np = (np.random.uniform(size=n) * 256).astype(B.dtype) + value_n = 1024 + a_np = (np.random.uniform(size=value_n) * 256).astype(placeholder_a.dtype) + b_np = (np.random.uniform(size=value_n) * 256).astype(placeholder_b.dtype) # "fix" the values in a and b to avoid the result being too small b_np += (b_np < 2.0) * 2 a_np[np.abs(np.fmod(a_np, b_np)) < 1] += 1 - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) + buff_a = tvm.nd.array(a_np, dev) + buff_b = tvm.nd.array(b_np, dev) + buff_c = tvm.nd.array(np.zeros(value_n, dtype=result_c.dtype), dev) ftimer = fmod.time_evaluator(fmod.entry_name, dev, number=1) - tcost = ftimer(a, b, c).mean - # fmod(a, b, c) - np.testing.assert_allclose(c.numpy(), np.mod(a.numpy(), b.numpy()), rtol=1e-5) + _ = ftimer(buff_a, buff_b, buff_c).mean + np.testing.assert_allclose( + buff_c.numpy(), np.mod(buff_a.numpy(), buff_b.numpy()), rtol=1e-5 + ) check_device("cuda") check_device("opencl -device=intel_graphics") @@ -107,21 +115,30 @@ def check_device(device): @tvm.testing.requires_gpu def test_multiple_cache_write(): + """Test multiple cache writes.""" # graph - n = tvm.runtime.convert(1024) - A0 = te.placeholder((n,), name="A0", dtype="float32") - A1 = te.placeholder((n,), name="A1", dtype="float32") - B0, B1 = te.compute((n,), lambda *i: (A0(*i) + A1(*i), A0(*i) * A1(*i)), name="B") - C = te.compute((n,), lambda *i: B0(*i) + B1(*i), name="C") - s = te.create_schedule(C.op) + arr_length = 1024 + arr_length_tvm = tvm.runtime.convert(arr_length) + placeholder_a0 = te.placeholder((arr_length_tvm,), name="A0", dtype="float32") + placeholder_a1 = te.placeholder((arr_length_tvm,), name="A1", dtype="float32") + result_b0, result_b1 = te.compute( + (arr_length_tvm,), + lambda *i: ( + placeholder_a0(*i) + placeholder_a1(*i), + placeholder_a0(*i) * placeholder_a1(*i), + ), + name="B", + ) + result_c = te.compute((arr_length_tvm,), lambda *i: result_b0(*i) + result_b1(*i), name="C") + schedule = te.create_schedule(result_c.op) # create iter var and assign them tags. num_thread = 8 - B0_cache, B1_cache = s.cache_write([B0, B1], "local") - bx, tx = s[C].split(C.op.axis[0], factor=num_thread) - s[B0].compute_at(s[C], bx) - s[B0_cache].compute_at(s[C], bx) - s[C].bind(bx, te.thread_axis("blockIdx.x")) - s[C].bind(tx, te.thread_axis("threadIdx.x")) + cache_b0, _ = schedule.cache_write([result_b0, result_b1], "local") + axis0, axis1 = schedule[result_c].split(result_c.op.axis[0], factor=num_thread) + schedule[result_b0].compute_at(schedule[result_c], axis0) + schedule[cache_b0].compute_at(schedule[result_c], axis0) + schedule[result_c].bind(axis0, te.thread_axis("blockIdx.x")) + schedule[result_c].bind(axis1, te.thread_axis("threadIdx.x")) # one line to build the function. def check_device(device, host="stackvm"): if not tvm.testing.device_enabled(host): @@ -129,16 +146,23 @@ def check_device(device, host="stackvm"): dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): return - func = tvm.build(s, [A0, A1, C], device, host, name="multiple_cache_write") + func = tvm.build( + schedule, + [placeholder_a0, placeholder_a1, result_c], + device, + host, + name="multiple_cache_write", + ) dev = tvm.device(device, 0) # launch the kernel. 
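Aside: the "fix the values" lines in the fmod hunk above are numerical conditioning, not test logic: remainders near zero are where fmod is most sensitive to rounding, so the inputs are nudged away from them before comparing against np.mod. A numpy-only sketch of the same conditioning, illustrative and not part of the patch:

    import numpy as np

    a_np = (np.random.uniform(size=1024) * 256).astype("float32")
    b_np = (np.random.uniform(size=1024) * 256).astype("float32")
    # Lift tiny divisors so fmod never divides by a near-zero value.
    b_np += (b_np < 2.0) * 2
    # Bump any element whose remainder would be nearly zero; since every
    # divisor is now at least 2, the adjusted remainder lands in [1, 2).
    a_np[np.abs(np.fmod(a_np, b_np)) < 1] += 1
    assert np.all(np.abs(np.fmod(a_np, b_np)) >= 1)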
-        n = 1024
-        a0 = tvm.nd.array(np.random.uniform(size=n).astype(A0.dtype), dev)
-        a1 = tvm.nd.array(np.random.uniform(size=n).astype(A1.dtype), dev)
-        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
-        func(a0, a1, c)
+        buff_a0 = tvm.nd.array(np.random.uniform(size=arr_length).astype(placeholder_a0.dtype), dev)
+        buff_a1 = tvm.nd.array(np.random.uniform(size=arr_length).astype(placeholder_a1.dtype), dev)
+        buff_c = tvm.nd.array(np.zeros(arr_length, dtype=result_c.dtype), dev)
+        func(buff_a0, buff_a1, buff_c)
         tvm.testing.assert_allclose(
-            c.numpy(), a0.numpy() + a1.numpy() + (a0.numpy() * a1.numpy()), rtol=1e-5
+            buff_c.numpy(),
+            buff_a0.numpy() + buff_a1.numpy() + (buff_a0.numpy() * buff_a1.numpy()),
+            rtol=1e-5,
         )

     check_device("cuda", "llvm")
@@ -147,41 +171,49 @@ def check_device(device, host="stackvm"):


 def test_log_pow_llvm():
+    """Test log pow using llvm to lower."""
     # graph
-    n = te.size_var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.compute(A.shape, lambda *i: te.power(te.log(A(*i)), 2.0), name="B")
-    s = te.create_schedule(B.op)
+    size_var_n = te.size_var("n")
+    placeholder_a = te.placeholder((size_var_n,), name="A")
+    result_b = te.compute(
+        placeholder_a.shape, lambda *i: te.power(te.log(placeholder_a(*i)), 2.0), name="B"
+    )
+    schedule = te.create_schedule(result_b.op)
     # create iter var and assign them tags.
-    bx, tx = s[B].split(B.op.axis[0], factor=32)
+    schedule[result_b].split(result_b.op.axis[0], factor=32)
     # one line to build the function.
     if not tvm.testing.device_enabled("llvm"):
         return

-    flog = tvm.build(s, [A, B], "llvm", name="mylog")
+    flog = tvm.build(schedule, [placeholder_a, result_b], "llvm", name="mylog")
     dev = tvm.cpu(0)
     # launch the kernel.
-    n = 1028
-    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-    b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev)
+    arr_length = 1028
+    buff_a = tvm.nd.array(np.random.uniform(size=arr_length).astype(placeholder_a.dtype), dev)
+    buff_b = tvm.nd.array(np.zeros(arr_length, dtype=result_b.dtype), dev)
     repeat = 10
     ftimer = flog.time_evaluator(flog.entry_name, dev, number=1, repeat=repeat)
-    res = ftimer(a, b)
+    res = ftimer(buff_a, buff_b)
     assert len(res.results) == repeat
-    tvm.testing.assert_allclose(b.numpy(), np.power(np.log(a.numpy()), 2.0), rtol=1e-5)
+    tvm.testing.assert_allclose(buff_b.numpy(), np.power(np.log(buff_a.numpy()), 2.0), rtol=1e-5)


 @tvm.testing.uses_gpu
 def test_popcount():
+    """Test popcount."""
+
     def run(dtype):
         # graph
-        n = tvm.runtime.convert(1024)
-        A = te.placeholder((n,), name="A", dtype=dtype)
-        B = te.compute(A.shape, lambda *i: tvm.tir.popcount(A(*i)), name="B")
-        s = te.create_schedule(B.op)
+        arr_length = 1024
+        arr_length_tvm = tvm.runtime.convert(arr_length)
+        placeholder_a = te.placeholder((arr_length_tvm,), name="A", dtype=dtype)
+        placeholder_b = te.compute(
+            placeholder_a.shape, lambda *i: tvm.tir.popcount(placeholder_a(*i)), name="B"
+        )
+        schedule = te.create_schedule(placeholder_b.op)
         # simple schedule
         num_thread = 8
-        bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
+        axis1, axis2 = schedule[placeholder_b].split(placeholder_b.op.axis[0], factor=num_thread)

         def check_device(device):
             dev = tvm.device(device, 0)
@@ -190,16 +222,17 @@ def check_device(device):
                 return
             target = tvm.target.Target(device)
             if "cpu" not in target.keys:
-                s[B].bind(bx, te.thread_axis("blockIdx.x"))
-                s[B].bind(tx, te.thread_axis("threadIdx.x"))
-            func = tvm.build(s, [A, B], device)
+                schedule[placeholder_b].bind(axis1, te.thread_axis("blockIdx.x"))
+                schedule[placeholder_b].bind(axis2,
te.thread_axis("threadIdx.x")) + func = tvm.build(schedule, [placeholder_a, placeholder_b], device) # launch the kernel. - n = 1024 - a = tvm.nd.array(np.random.randint(low=0, high=1000, size=n, dtype=A.dtype), dev) - b = tvm.nd.array(np.zeros(shape=n, dtype=B.dtype), dev) - func(a, b) + buff_a = tvm.nd.array( + np.random.randint(low=0, high=1000, size=arr_length, dtype=placeholder_a.dtype), dev + ) + buff_b = tvm.nd.array(np.zeros(shape=arr_length, dtype=placeholder_b.dtype), dev) + func(buff_a, buff_b) tvm.testing.assert_allclose( - b.numpy(), list(map(lambda x: bin(x).count("1"), a.numpy())), rtol=1e-5 + buff_b.numpy(), list(map(lambda x: bin(x).count("1"), buff_a.numpy())), rtol=1e-5 ) check_device("llvm") @@ -215,24 +248,26 @@ def check_device(device): @tvm.testing.requires_gpu def test_add(): + """Test addition.""" + def run(dtype): # graph - n = te.size_var("n") - A = te.placeholder((n,), name="A", dtype=dtype) - B = te.placeholder((n,), name="B", dtype=dtype) - bias = te.var("bias", dtype=dtype) - scale = te.var("scale", dtype=dtype) - C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C") + size_var_n = te.size_var("n") + placeholder_a = te.placeholder((size_var_n,), name="A", dtype=dtype) + placeholder_b = te.placeholder((size_var_n,), name="B", dtype=dtype) + result_c = te.compute( + placeholder_a.shape, lambda *i: placeholder_a(*i) + placeholder_b(*i), name="C" + ) # schedule - s = te.create_schedule(C.op) + schedule = te.create_schedule(result_c.op) # create iter var and assign them tags. num_thread = 16 - bx, x = s[C].split(C.op.axis[0], factor=num_thread * 4) - tx, x = s[C].split(x, nparts=num_thread) - _, x = s[C].split(x, factor=4) - s[C].bind(bx, te.thread_axis("blockIdx.x")) - s[C].bind(tx, te.thread_axis("threadIdx.x")) - s[C].vectorize(x) + axis_bx, axis_x = schedule[result_c].split(result_c.op.axis[0], factor=num_thread * 4) + axis_tx, axis_x = schedule[result_c].split(axis_x, nparts=num_thread) + _, axis_x = schedule[result_c].split(axis_x, factor=4) + schedule[result_c].bind(axis_bx, te.thread_axis("blockIdx.x")) + schedule[result_c].bind(axis_tx, te.thread_axis("threadIdx.x")) + schedule[result_c].vectorize(axis_x) # one line to build the function. def check_device(device): @@ -240,16 +275,22 @@ def check_device(device): if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return - fadd = tvm.build(s, [A, B, C], device, name="myadd") + fadd = tvm.build( + schedule, [placeholder_a, placeholder_b, result_c], device, name="myadd" + ) # launch the kernel. 
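Aside: test_add above is the one place in this file with a three-level split, and the renames make its intent easier to state: factor=num_thread * 4 fixes the work per block, nparts=num_thread carves that into one chunk per thread, and the last factor=4 split leaves a 4-wide inner loop for vector loads and stores. A standalone sketch of that schedule, with illustrative names not taken from the patch:

    import tvm
    from tvm import te

    n = te.size_var("n")
    a = te.placeholder((n,), name="a")
    b = te.placeholder((n,), name="b")
    c = te.compute(a.shape, lambda *i: a(*i) + b(*i), name="c")
    sch = te.create_schedule(c.op)
    num_thread = 16
    axis_bx, axis_x = sch[c].split(c.op.axis[0], factor=num_thread * 4)
    axis_tx, axis_x = sch[c].split(axis_x, nparts=num_thread)
    _, axis_x = sch[c].split(axis_x, factor=4)
    sch[c].bind(axis_bx, te.thread_axis("blockIdx.x"))
    sch[c].bind(axis_tx, te.thread_axis("threadIdx.x"))
    # The surviving inner axis has a fixed extent of 4, so it vectorizes cleanly.
    sch[c].vectorize(axis_x)
    print(tvm.lower(sch, [a, b, c], simple_mode=True))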
             n = 1024
-            a = tvm.nd.array((np.random.uniform(size=n) * 256).astype(A.dtype), dev)
-            b = tvm.nd.array((np.random.uniform(size=n) * 256).astype(B.dtype), dev)
-            c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
+            buff_a = tvm.nd.array(
+                (np.random.uniform(size=n) * 256).astype(placeholder_a.dtype), dev
+            )
+            buff_b = tvm.nd.array(
+                (np.random.uniform(size=n) * 256).astype(placeholder_b.dtype), dev
+            )
+            buff_c = tvm.nd.array(np.zeros(n, dtype=result_c.dtype), dev)
             ftimer = fadd.time_evaluator(fadd.entry_name, dev, number=1)
-            tcost = ftimer(a, b, c).mean
-            tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy(), rtol=1e-6)
+            _ = ftimer(buff_a, buff_b, buff_c).mean
+            tvm.testing.assert_allclose(buff_c.numpy(), buff_a.numpy() + buff_b.numpy(), rtol=1e-6)

         check_device("opencl")
         check_device("cuda")
@@ -265,25 +306,26 @@ def check_device(device):

 @tvm.testing.requires_gpu
 def try_warp_memory():
-    """skip this in default test because it require higher arch"""
-    m = 128
-    A = te.placeholder((m,), name="A")
-    B = te.compute((m,), lambda i: A[i] + 3, name="B")
+    """Test using warp memory.
+    Skip this in default test runs because it requires a higher arch."""
+    arr_size = 128
+    placeholder_a = te.placeholder((arr_size,), name="A")
+    result_b = te.compute((arr_size,), lambda i: placeholder_a[i] + 3, name="B")
     warp_size = 32
-    s = te.create_schedule(B.op)
-    AA = s.cache_read(A, "warp", [B])
-    xo, xi = s[B].split(B.op.axis[0], warp_size * 2)
-    xi0, xi1 = s[B].split(xi, factor=warp_size)
-    tx = te.thread_axis("threadIdx.x")
-    s[B].bind(xi1, tx)
-    s[B].bind(xo, te.thread_axis("blockIdx.x"))
-    s[AA].compute_at(s[B], xo)
-    xo, xi = s[AA].split(s[AA].op.axis[0], warp_size)
-    s[AA].bind(xi, tx)
+    schedule = te.create_schedule(result_b.op)
+    cache_read_aa = schedule.cache_read(placeholder_a, "warp", [result_b])
+    axis_x0, axis_xi = schedule[result_b].split(result_b.op.axis[0], warp_size * 2)
+    _, axis_xi1 = schedule[result_b].split(axis_xi, factor=warp_size)
+    thread_axis_tx = te.thread_axis("threadIdx.x")
+    schedule[result_b].bind(axis_xi1, thread_axis_tx)
+    schedule[result_b].bind(axis_x0, te.thread_axis("blockIdx.x"))
+    schedule[cache_read_aa].compute_at(schedule[result_b], axis_x0)
+    axis_x0, axis_xi = schedule[cache_read_aa].split(schedule[cache_read_aa].op.axis[0], warp_size)
+    schedule[cache_read_aa].bind(axis_xi, thread_axis_tx)

     @tvm.register_func("tvm_callback_cuda_compile", override=True)
-    def tvm_callback_cuda_compile(code):
-        ptx = nvcc.compile_cuda(code, target_format="ptx")
+    def tvm_callback_cuda_compile(code):  # pylint: disable=unused-variable
+        ptx = nvcc.compile_cuda(code)
         return ptx

     # one line to build the function.
@@ -292,11 +334,13 @@ def check_device(device):
         if not tvm.testing.device_enabled(device):
            print("skip because %s is not enabled..
% device)
             return
-        f = tvm.build(s, [A, B], device)
-        a = tvm.nd.array((np.random.uniform(size=m) * 256).astype(A.dtype), dev)
-        b = tvm.nd.array(np.zeros(m, dtype=B.dtype), dev)
-        f(a, b)
-        tvm.testing.assert_allclose(b.numpy(), a.numpy() + 3, rtol=1e-6)
+        myfunc = tvm.build(schedule, [placeholder_a, result_b], device)
+        buff_a = tvm.nd.array(
+            (np.random.uniform(size=arr_size) * 256).astype(placeholder_a.dtype), dev
+        )
+        buff_b = tvm.nd.array(np.zeros(arr_size, dtype=result_b.dtype), dev)
+        myfunc(buff_a, buff_b)
+        tvm.testing.assert_allclose(buff_b.numpy(), buff_a.numpy() + 3, rtol=1e-6)

     check_device("cuda")

diff --git a/tests/python/integration/test_ewise_fpga.py b/tests/python/integration/test_ewise_fpga.py
index 6171c37b16725..7b247d7d527f2 100644
--- a/tests/python/integration/test_ewise_fpga.py
+++ b/tests/python/integration/test_ewise_fpga.py
@@ -14,11 +14,14 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
+"""Test elementwise ops on fpga."""
+import os
+
+import numpy as np
+
 import tvm
 import tvm.testing
 from tvm import te
-import numpy as np
-import os

 os.environ["XCL_EMULATION_MODE"] = "1"
 os.environ["CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA"] = "1"
@@ -32,28 +35,29 @@ def tvm_callback_vhls_postproc(code):


 def test_exp():
+    """Test scheduling and running exp function."""
     # graph
-    n = tvm.runtime.convert(1024)
-    A = te.placeholder((n,), name="A")
-    B = te.compute(A.shape, lambda *i: te.exp(A(*i)), name="B")
-    s = te.create_schedule(B.op)
+    arr_length = 1024
+    arr_length_tvm = tvm.runtime.convert(arr_length)
+    placeholder_a = te.placeholder((arr_length_tvm,), name="A")
+    result_b = te.compute(placeholder_a.shape, lambda *i: te.exp(placeholder_a(*i)), name="B")
+    schedule = te.create_schedule(result_b.op)
     # create iter var and assign them tags.
-    px, x = s[B].split(B.op.axis[0], nparts=1)
-    s[B].bind(px, te.thread_axis("pipeline"))
+    axis1, _ = schedule[result_b].split(result_b.op.axis[0], nparts=1)
+    schedule[result_b].bind(axis1, te.thread_axis("pipeline"))

     # one line to build the function.
     def check_device(device, host="llvm"):
         if not tvm.testing.device_enabled(device):
             return
         dev = tvm.device(device, 0)
-        fexp = tvm.build(s, [A, B], device, host, name="myexp")
+        fexp = tvm.build(schedule, [placeholder_a, result_b], device, host, name="myexp")
         dev = tvm.device(device, 0)
         # launch the kernel.
-        n = 1024
-        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-        b = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev)
-        fexp(a, b)
-        tvm.testing.assert_allclose(b.numpy(), np.exp(a.numpy()), rtol=1e-5)
+        buff_a = tvm.nd.array(np.random.uniform(size=arr_length).astype(placeholder_a.dtype), dev)
+        buff_b = tvm.nd.array(np.zeros(arr_length, dtype=result_b.dtype), dev)
+        fexp(buff_a, buff_b)
+        tvm.testing.assert_allclose(buff_b.numpy(), np.exp(buff_a.numpy()), rtol=1e-5)

     check_device("sdaccel")
     if "AWS_PLATFORM" in os.environ:
@@ -63,34 +67,41 @@ def check_device(device, host="llvm"):


 def test_multi_kernel():
+    """Test scheduling with multiple computes."""
     # graph
-    n = tvm.runtime.convert(1024)
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C")
-    D = te.compute(A.shape, lambda *i: A(*i) + C(*i), name="D")
-    s = te.create_schedule(D.op)
+    arr_length = 1024
+    arr_length_tvm = tvm.runtime.convert(arr_length)
+    placeholder_a = te.placeholder((arr_length_tvm,), name="A")
+    placeholder_b = te.placeholder((arr_length_tvm,), name="B")
+    result_c = te.compute(
+        placeholder_a.shape, lambda *i: placeholder_a(*i) + placeholder_b(*i), name="C"
+    )
+    result_d = te.compute(
+        placeholder_a.shape, lambda *i: placeholder_a(*i) + result_c(*i), name="D"
+    )
+    schedule = te.create_schedule(result_d.op)
     # create iter var and assign them tags.
-    px, x = s[C].split(C.op.axis[0], nparts=1)
-    s[C].bind(px, te.thread_axis("pipeline"))
-    px, x = s[D].split(D.op.axis[0], nparts=1)
-    s[D].bind(px, te.thread_axis("pipeline"))
+    axis1, _ = schedule[result_c].split(result_c.op.axis[0], nparts=1)
+    schedule[result_c].bind(axis1, te.thread_axis("pipeline"))
+    axis1, _ = schedule[result_d].split(result_d.op.axis[0], nparts=1)
+    schedule[result_d].bind(axis1, te.thread_axis("pipeline"))

     # one line to build the function.
     def check_device(device, host="llvm"):
         if not tvm.testing.device_enabled(device):
             return
         dev = tvm.device(device, 0)
-        fadd = tvm.build(s, [A, B, C, D], device, host, name="myadd")
+        fadd = tvm.build(
+            schedule, [placeholder_a, placeholder_b, result_c, result_d], device, host, name="myadd"
+        )
         dev = tvm.device(device, 0)
         # launch the kernel.
-        n = 1024
-        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
-        c = tvm.nd.array(np.random.uniform(size=n).astype(C.dtype), dev)
-        d = tvm.nd.array(np.random.uniform(size=n).astype(D.dtype), dev)
-        fadd(a, b, c, d)
-        tvm.testing.assert_allclose(d.numpy(), a.numpy() * 2 + b.numpy(), rtol=1e-5)
+        buff_a = tvm.nd.array(np.random.uniform(size=arr_length).astype(placeholder_a.dtype), dev)
+        buff_b = tvm.nd.array(np.random.uniform(size=arr_length).astype(placeholder_b.dtype), dev)
+        buff_c = tvm.nd.array(np.random.uniform(size=arr_length).astype(result_c.dtype), dev)
+        buff_d = tvm.nd.array(np.random.uniform(size=arr_length).astype(result_d.dtype), dev)
+        fadd(buff_a, buff_b, buff_c, buff_d)
+        tvm.testing.assert_allclose(buff_d.numpy(), buff_a.numpy() * 2 + buff_b.numpy(), rtol=1e-5)

     check_device("sdaccel")
     check_device("aocl_sw_emu")

diff --git a/tests/python/integration/test_gemm.py b/tests/python/integration/test_gemm.py
index aa6c5a1e74e15..66d777989d8c8 100644
--- a/tests/python/integration/test_gemm.py
+++ b/tests/python/integration/test_gemm.py
@@ -14,27 +14,32 @@
 # KIND, either express or implied.
See the License for the # specific language governing permissions and limitations # under the License. -import tvm -from tvm import te +"""Test scheduling and running a gemm!""" import numpy as np -import time + +import tvm import tvm.testing +from tvm import te @tvm.testing.requires_gpu def test_gemm(): + """Test the gemm!""" # graph - nn = 1024 - n = tvm.runtime.convert(nn) - m = n - l = n - A = te.placeholder((n, l), name="A") - B = te.placeholder((m, l), name="B") - k = te.reduce_axis((0, l), name="k") - C = te.compute((n, m), lambda ii, jj: te.sum(A[ii, k] * B[jj, k], axis=k), name="CC") + dim1_length = 1024 + dim_n = tvm.runtime.convert(dim1_length) + dim_m = dim_n + dim_l = dim_n + placeholder_a = te.placeholder((dim_n, dim_l), name="A") + placeholder_b = te.placeholder((dim_m, dim_l), name="B") + axis_k = te.reduce_axis((0, dim_l), name="k") + result_c = te.compute( + (dim_n, dim_m), + lambda ii, jj: te.sum(placeholder_a[ii, axis_k] * placeholder_b[jj, axis_k], axis=axis_k), + name="CC", + ) # schedule - s = te.create_schedule(C.op) - xtile, ytile = 32, 32 + schedule = te.create_schedule(result_c.op) scale = 8 num_thread = 8 block_factor = scale * num_thread @@ -43,39 +48,43 @@ def test_gemm(): block_y = te.thread_axis("blockIdx.y") thread_y = te.thread_axis("threadIdx.y") - CC = s.cache_write(C, "local") - AA = s.cache_read(A, "shared", [CC]) - BB = s.cache_read(B, "shared", [CC]) - by, yi = s[C].split(C.op.axis[0], factor=block_factor) - bx, xi = s[C].split(C.op.axis[1], factor=block_factor) - s[C].reorder(by, bx, yi, xi) - s[C].bind(by, block_y) - s[C].bind(bx, block_x) - ty, yi = s[C].split(yi, nparts=num_thread) - tx, xi = s[C].split(xi, nparts=num_thread) - s[C].reorder(ty, tx, yi, xi) - s[C].bind(ty, thread_y) - s[C].bind(tx, thread_x) - yo, xo = CC.op.axis - s[CC].reorder(k, yo, xo) + cache_write = schedule.cache_write(result_c, "local") + cache_read_a = schedule.cache_read(placeholder_a, "shared", [cache_write]) + cache_read_b = schedule.cache_read(placeholder_b, "shared", [cache_write]) + axis_by, axis_yi = schedule[result_c].split(result_c.op.axis[0], factor=block_factor) + axis_bx, axis_xi = schedule[result_c].split(result_c.op.axis[1], factor=block_factor) + schedule[result_c].reorder(axis_by, axis_bx, axis_yi, axis_xi) + schedule[result_c].bind(axis_by, block_y) + schedule[result_c].bind(axis_bx, block_x) + axis_ty, axis_yi = schedule[result_c].split(axis_yi, nparts=num_thread) + axis_tx, axis_xi = schedule[result_c].split(axis_xi, nparts=num_thread) + schedule[result_c].reorder(axis_ty, axis_tx, axis_yi, axis_xi) + schedule[result_c].bind(axis_ty, thread_y) + schedule[result_c].bind(axis_tx, thread_x) + axis_yo, axis_xo = cache_write.op.axis + schedule[cache_write].reorder(axis_k, axis_yo, axis_xo) - s[CC].compute_at(s[C], tx) - s[AA].compute_at(s[CC], k) - s[BB].compute_at(s[CC], k) - s[AA].double_buffer() - s[BB].double_buffer() - ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread) - tx, xi = s[AA].split(xi, nparts=num_thread) - s[AA].bind(ty, thread_y) - s[AA].bind(tx, thread_x) + schedule[cache_write].compute_at(schedule[result_c], axis_tx) + schedule[cache_read_a].compute_at(schedule[cache_write], axis_k) + schedule[cache_read_b].compute_at(schedule[cache_write], axis_k) + schedule[cache_read_a].double_buffer() + schedule[cache_read_b].double_buffer() + axis_ty, axis_xi = schedule[cache_read_a].split( + schedule[cache_read_a].op.axis[0], nparts=num_thread + ) + axis_tx, axis_xi = schedule[cache_read_a].split(axis_xi, nparts=num_thread) + 
schedule[cache_read_a].bind(axis_ty, thread_y) + schedule[cache_read_a].bind(axis_tx, thread_x) - ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread) - tx, xi = s[BB].split(xi, nparts=num_thread) - s[BB].bind(ty, thread_y) - s[BB].bind(tx, thread_x) + axis_ty, axis_xi = schedule[cache_read_b].split( + schedule[cache_read_b].op.axis[0], nparts=num_thread + ) + axis_tx, axis_xi = schedule[cache_read_b].split(axis_xi, nparts=num_thread) + schedule[cache_read_b].bind(axis_ty, thread_y) + schedule[cache_read_b].bind(axis_tx, thread_x) # lowering test - s = s.normalize() + schedule = schedule.normalize() # one line to build the function. def check_device(device): @@ -85,21 +94,21 @@ def check_device(device): return with tvm.target.Target(device): - f = tvm.build(s, [A, B, C]) + f = tvm.build(schedule, [placeholder_a, placeholder_b, result_c]) # launch the kernel. - n = nn - m = n - l = n - a_np = np.random.uniform(size=(n, l)).astype(A.dtype) - b_np = np.random.uniform(size=(m, l)).astype(B.dtype) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev) + num_n = dim1_length + num_m = num_n + num_l = num_n + a_np = np.random.uniform(size=(num_n, num_l)).astype(placeholder_a.dtype) + b_np = np.random.uniform(size=(num_m, num_l)).astype(placeholder_b.dtype) + buff_a = tvm.nd.array(a_np, dev) + buff_b = tvm.nd.array(b_np, dev) + buff_c = tvm.nd.array(np.zeros((num_n, num_m), dtype=result_c.dtype), dev) ftimer = f.time_evaluator(f.entry_name, dev, number=1) - tcost = ftimer(a, b, c).mean + tcost = ftimer(buff_a, buff_b, buff_c).mean print("%s: exec=%g sec/op" % (dev, tcost)) - tvm.testing.assert_allclose(c.numpy(), np.dot(a_np, b_np.T), rtol=1e-5) + tvm.testing.assert_allclose(buff_c.numpy(), np.dot(a_np, b_np.T), rtol=1e-5) check_device("vulkan") check_device("nvptx -mcpu=sm_20") diff --git a/tests/python/integration/test_lower.py b/tests/python/integration/test_lower.py index 63733b05ab3fa..1ccdde8b13374 100644 --- a/tests/python/integration/test_lower.py +++ b/tests/python/integration/test_lower.py @@ -14,42 +14,52 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-# pylint: disable=invalid-name, too-many-locals, too-many-statements, unused-argument -"""Test workload for lowering and build""" +"""Test workload for lowering and build.""" +import numpy as np + import tvm -from tvm import tir -from tvm.script import tir as T import tvm.testing -import numpy as np +from tvm.script import tir as T @T.prim_func -def tensorcore_gemm(a: T.handle, b: T.handle, c: T.handle) -> None: +def tensorcore_gemm(handle_a: T.handle, handle_b: T.handle, handle_c: T.handle) -> None: + # pylint: disable=missing-function-docstring # match buffer - A = T.match_buffer(a, [1024, 1024], "float16") - B = T.match_buffer(b, [1024, 1024], "float16") - C = T.match_buffer(c, [1024, 1024], "float32") + match_buffer_a = T.match_buffer(handle_a, [1024, 1024], "float16") + match_buffer_b = T.match_buffer(handle_b, [1024, 1024], "float16") + match_buffer_c = T.match_buffer(handle_c, [1024, 1024], "float32") # body - for blockIdx_x in T.thread_binding(0, 16, "blockIdx.x"): - for blockIdx_y in T.thread_binding(0, 8, "blockIdx.y"): + for block_idx_x in T.thread_binding(0, 16, "blockIdx.x"): + for block_idx_y in T.thread_binding(0, 8, "blockIdx.y"): with T.block(): - bx, by = T.axis.remap("SS", [blockIdx_x, blockIdx_y]) - shared_A = T.alloc_buffer([1024, 1024], "float16", scope="shared") - shared_B = T.alloc_buffer([1024, 1024], "float16", scope="shared") - wmma_A = T.alloc_buffer([1024, 1024], "float16", scope="wmma.matrix_a") - wmma_B = T.alloc_buffer([1024, 1024], "float16", scope="wmma.matrix_b") - wmma_C = T.alloc_buffer([1024, 1024], "float32", scope="wmma.accumulator") - for ty in T.thread_binding(0, 2, "threadIdx.y"): - for tz in T.thread_binding(0, 2, "threadIdx.z"): - for i, j in T.grid(2, 4): + axis_bx, axis_by = T.axis.remap("SS", [block_idx_x, block_idx_y]) + shared_a = T.alloc_buffer([1024, 1024], "float16", scope="shared") + shared_b = T.alloc_buffer([1024, 1024], "float16", scope="shared") + wmma_a = T.alloc_buffer([1024, 1024], "float16", scope="wmma.matrix_a") + wmma_b = T.alloc_buffer([1024, 1024], "float16", scope="wmma.matrix_b") + wmma_c = T.alloc_buffer([1024, 1024], "float32", scope="wmma.accumulator") + + # pylint: disable=too-many-nested-blocks + for thread_ty in T.thread_binding(0, 2, "threadIdx.y"): + for thread_tz in T.thread_binding(0, 2, "threadIdx.z"): + for index_i, index_jj in T.grid(2, 4): with T.block(): - vi = T.axis.S(64, bx * 4 + ty * 2 + i) - vj = T.axis.S(64, by * 8 + tz * 4 + j) + new_axis_vi = T.axis.S(64, axis_bx * 4 + thread_ty * 2 + index_i) + new_axis_vj = T.axis.S(64, axis_by * 8 + thread_tz * 4 + index_jj) T.reads([]) - T.writes(wmma_C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) - C0 = T.match_buffer( - wmma_C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16], + T.writes( + wmma_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, + ] + ) + match_buffer_c0 = T.match_buffer( + wmma_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, + ], (16, 16), "float32", strides=[16 * 4, 1], @@ -58,62 +68,92 @@ def tensorcore_gemm(a: T.handle, b: T.handle, c: T.handle) -> None: ) T.evaluate( T.tvm_fill_fragment( - C0.data, + match_buffer_c0.data, 16, 16, 16, - i * 4 + j, - T.float32(0), + index_i * 4 + index_jj, + T.float32(0), # pylint: disable=not-callable dtype="handle", ) ) - for ko in range(0, 32): + for k_o in range(0, 32): # copy data from global to shared - for tx in T.thread_binding(0, 32, "threadIdx.x"): - for i0, j0 in T.grid(1, 4): - for j1 in T.vectorized(0, 4): + 
for thread_tx in T.thread_binding(0, 32, "threadIdx.x"): + for index_i0, index_j0 in T.grid(1, 4): + for index_j1 in T.vectorized(0, 4): with T.block(): - vi = T.axis.S(1024, bx * 64 + ty * 32 + tx + i0) - vj = T.axis.S(1024, ko * 32 + tz * 16 + j0 * 4 + j1) - shared_A[vi, vj + 8] = A[vi, vj] + new_axis_vi = T.axis.S( + 1024, + axis_bx * 64 + + thread_ty * 32 + + thread_tx + + index_i0, + ) + new_axis_vj = T.axis.S( + 1024, + k_o * 32 + thread_tz * 16 + index_j0 * 4 + index_j1, + ) + shared_a[new_axis_vi, new_axis_vj + 8] = match_buffer_a[ + new_axis_vi, new_axis_vj + ] - for i0, j0 in T.grid(2, 4): - for j1 in T.vectorized(0, 4): + for index_i0, index_j0 in T.grid(2, 4): + for index_j1 in T.vectorized(0, 4): with T.block(): - vi = T.axis.S(1024, by * 128 + ty * 64 + tx * 2 + i0) - vj = T.axis.S(1024, ko * 32 + tz * 16 + j0 * 4 + j1) - shared_B[vi, vj + 8] = B[vi, vj] + new_axis_vi = T.axis.S( + 1024, + axis_by * 128 + + thread_ty * 64 + + thread_tx * 2 + + index_i0, + ) + new_axis_vj = T.axis.S( + 1024, + k_o * 32 + thread_tz * 16 + index_j0 * 4 + index_j1, + ) + shared_b[new_axis_vi, new_axis_vj + 8] = match_buffer_b[ + new_axis_vi, new_axis_vj + ] - for ki in range(0, 2): - for i in range(0, 2): + for k_i in range(0, 2): + for index_i in range(0, 2): with T.block(): - vi = T.axis.S(64, bx * 4 + ty * 2 + i) - vk = T.axis.S(64, ko * 2 + ki) + new_axis_vi = T.axis.S( + 64, axis_bx * 4 + thread_ty * 2 + index_i + ) + axis_vk = T.axis.S(64, k_o * 2 + k_i) T.reads( - shared_A[ - vi * 16 : vi * 16 + 16, - vk * 16 : vk * 16 + 16 + 8, + shared_a[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16 + 8, ] ) T.writes( - wmma_A[vi * 16 : vi * 16 + 16, vk * 16 : vk * 16 + 16] + wmma_a[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16, + ] ) - s0 = T.var("int32") - s1 = T.var("int32") - A0 = T.match_buffer( - shared_A[ - vi * 16 : vi * 16 + 16, - vk * 16 : vk * 16 + 16 + 8, + stride0 = T.var("int32") + stride1 = T.var("int32") + match_buffer_a0 = T.match_buffer( + shared_a[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16 + 8, ], (16, 16 + 8), "float16", - strides=[s0, s1], + strides=[stride0, stride1], scope="shared", offset_factor=1, ) - wmma_A0 = T.match_buffer( - wmma_A[vi * 16 : vi * 16 + 16, vk * 16 : vk * 16 + 16], + wmma_a0 = T.match_buffer( + wmma_a[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16, + ], (16, 16), "float16", strides=[16, 1], @@ -122,52 +162,60 @@ def tensorcore_gemm(a: T.handle, b: T.handle, c: T.handle) -> None: ) T.evaluate( T.tvm_load_matrix_sync( - wmma_A0.data, + wmma_a0.data, 16, 16, 16, - i, + index_i, T.tvm_access_ptr( T.type_annotation(dtype="float16"), - A0.data, - A0.elem_offset + 8, - A0.strides[0], + match_buffer_a0.data, + match_buffer_a0.elem_offset + 8, + match_buffer_a0.strides[0], 1, dtype="handle", ), - A0.strides[0], + match_buffer_a0.strides[0], "row_major", dtype="handle", ) ) - for j in range(0, 4): + for index_jj in range(0, 4): with T.block(): - vj = T.axis.S(64, by * 8 + tz * 4 + j) - vk = T.axis.S(64, ko * 2 + ki) + new_axis_vj = T.axis.S( + 64, axis_by * 8 + thread_tz * 4 + index_jj + ) + axis_vk = T.axis.S(64, k_o * 2 + k_i) T.reads( - shared_B[ - vj * 16 : vj * 16 + 16, - vk * 16 : vk * 16 + 16 + 8, + shared_b[ + new_axis_vj * 16 : new_axis_vj * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16 + 8, ] ) T.writes( - wmma_B[vj * 16 : vj * 16 + 16, vk * 16 : vk * 16 + 16] + wmma_b[ + new_axis_vj * 16 : new_axis_vj * 16 + 16, + axis_vk * 16 
: axis_vk * 16 + 16, + ] ) - s0 = T.var("int32") - s1 = T.var("int32") - B0 = T.match_buffer( - shared_B[ - vj * 16 : vj * 16 + 16, - vk * 16 : vk * 16 + 16 + 8, + stride0 = T.var("int32") + stride1 = T.var("int32") + match_buffer_b0 = T.match_buffer( + shared_b[ + new_axis_vj * 16 : new_axis_vj * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16 + 8, ], (16, 16 + 8), "float16", - strides=[s0, s1], + strides=[stride0, stride1], scope="shared", offset_factor=1, ) - wmma_B0 = T.match_buffer( - wmma_B[vj * 16 : vj * 16 + 16, vk * 16 : vk * 16 + 16], + wmma_b0 = T.match_buffer( + wmma_b[ + new_axis_vj * 16 : new_axis_vj * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16, + ], (16, 16), "float16", strides=[16, 1], @@ -176,63 +224,82 @@ def tensorcore_gemm(a: T.handle, b: T.handle, c: T.handle) -> None: ) T.evaluate( T.tvm_load_matrix_sync( - wmma_B0.data, + wmma_b0.data, 16, 16, 16, - j, + index_jj, T.tvm_access_ptr( T.type_annotation(dtype="float16"), - B0.data, - B0.elem_offset + 8, - B0.strides[0], + match_buffer_b0.data, + match_buffer_b0.elem_offset + 8, + match_buffer_b0.strides[0], 1, dtype="handle", ), - B0.strides[0], + match_buffer_b0.strides[0], "col_major", dtype="handle", ) ) - for i, j in T.grid(2, 4): + for index_i, index_jj in T.grid(2, 4): with T.block(): - vi = T.axis.S(64, bx * 4 + ty * 2 + i) - vj = T.axis.S(64, by * 8 + tz * 4 + j) - vk = T.axis.R(64, ko * 2 + ki) + new_axis_vi = T.axis.S( + 64, axis_bx * 4 + thread_ty * 2 + index_i + ) + new_axis_vj = T.axis.S( + 64, axis_by * 8 + thread_tz * 4 + index_jj + ) + axis_vk = T.axis.R(64, k_o * 2 + k_i) T.reads( [ - wmma_A[ - vi * 16 : vi * 16 + 16, vk * 16 : vk * 16 + 16 + wmma_a[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16, ], - wmma_B[ - vj * 16 : vj * 16 + 16, vk * 16 : vk * 16 + 16 + wmma_b[ + new_axis_vj * 16 : new_axis_vj * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16, ], - wmma_C[ - vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16 + wmma_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, ], ] ) T.writes( - wmma_C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16] + wmma_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, + ] ) - wmma_A1 = T.match_buffer( - wmma_A[vi * 16 : vi * 16 + 16, vk * 16 : vk * 16 + 16], + wmma_a1 = T.match_buffer( + wmma_a[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16, + ], (16, 16), "float16", strides=[16, 1], scope="wmma.matrix_a", offset_factor=1, ) - wmma_B1 = T.match_buffer( - wmma_B[vj * 16 : vj * 16 + 16, vk * 16 : vk * 16 + 16], + wmma_b1 = T.match_buffer( + wmma_b[ + new_axis_vj * 16 : new_axis_vj * 16 + 16, + axis_vk * 16 : axis_vk * 16 + 16, + ], (16, 16), "float16", strides=[16, 1], scope="wmma.matrix_b", offset_factor=1, ) - wmma_C1 = T.match_buffer( - wmma_C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16], + wmma_c1 = T.match_buffer( + wmma_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, + ], (16, 16), "float32", strides=[16 * 4, 1], @@ -241,56 +308,72 @@ def tensorcore_gemm(a: T.handle, b: T.handle, c: T.handle) -> None: ) T.evaluate( T.tvm_mma_sync( - wmma_C1.data, - i * 4 + j, - wmma_A1.data, - i, - wmma_B1.data, - j, - wmma_C1.data, - i * 4 + j, + wmma_c1.data, + index_i * 4 + index_jj, + wmma_a1.data, + index_i, + wmma_b1.data, + index_jj, + wmma_c1.data, + index_i * 4 + index_jj, dtype="handle", ) ) - for i, j in T.grid(2, 4): + for index_i, index_jj in T.grid(2, 4): with T.block(): - vi = T.axis.S(64, bx 
* 4 + ty * 2 + i) - vj = T.axis.S(64, by * 8 + tz * 4 + j) - T.reads(wmma_C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) - T.writes(C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) - s0 = T.var("int32") - s1 = T.var("int32") - wmma_C2 = T.match_buffer( - wmma_C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16], + new_axis_vi = T.axis.S(64, axis_bx * 4 + thread_ty * 2 + index_i) + new_axis_vj = T.axis.S(64, axis_by * 8 + thread_tz * 4 + index_jj) + T.reads( + wmma_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, + ] + ) + T.writes( + match_buffer_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, + ] + ) + stride0 = T.var("int32") + stride1 = T.var("int32") + wmma_c2 = T.match_buffer( + wmma_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, + ], (16, 16), "float32", strides=[16 * 4, 1], scope="wmma.accumulator", offset_factor=1, ) - C1 = T.match_buffer( - C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16], + match_buffer_c1 = T.match_buffer( + match_buffer_c[ + new_axis_vi * 16 : new_axis_vi * 16 + 16, + new_axis_vj * 16 : new_axis_vj * 16 + 16, + ], (16, 16), "float32", - strides=[s0, s1], + strides=[stride0, stride1], offset_factor=1, ) T.evaluate( T.tvm_store_matrix_sync( - wmma_C2.data, + wmma_c2.data, 16, 16, 16, - i * 4 + j, + index_i * 4 + index_jj, T.tvm_access_ptr( T.type_annotation(dtype="float32"), - C1.data, - C1.elem_offset, - C1.strides[0], + match_buffer_c1.data, + match_buffer_c1.elem_offset, + match_buffer_c1.strides[0], 1, dtype="handle", ), - C1.strides[0], + match_buffer_c1.strides[0], "row_major", dtype="handle", ) @@ -299,22 +382,23 @@ def tensorcore_gemm(a: T.handle, b: T.handle, c: T.handle) -> None: @tvm.testing.requires_cuda def test_gemm_tensorcore(): + """Test running gemm on tensorcore.""" dev = tvm.device("cuda", 0) a_np = np.random.uniform(size=(1024, 1024)).astype("float16") b_np = np.random.uniform(size=(1024, 1024)).astype("float16") c_np = np.dot(a_np.astype("float32"), b_np.T.astype("float32")) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros((1024, 1024), dtype="float32"), dev) - f = tvm.build(tensorcore_gemm, target="cuda", name="dense") - f(a, b, c) - tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-3) + buff_a = tvm.nd.array(a_np, dev) + buff_b = tvm.nd.array(b_np, dev) + buff_c = tvm.nd.array(np.zeros((1024, 1024), dtype="float32"), dev) + myfunc = tvm.build(tensorcore_gemm, target="cuda", name="dense") + myfunc(buff_a, buff_b, buff_c) + tvm.testing.assert_allclose(buff_c.numpy(), c_np, rtol=1e-3) - evaluator = f.time_evaluator(f.entry_name, dev, number=100) - t = evaluator(a, b, c).mean + evaluator = myfunc.time_evaluator(myfunc.entry_name, dev, number=100) + time_elapsed = evaluator(buff_a, buff_b, buff_c).mean num_flops = 2 * 1024 * 1024 * 1024 - gflops = num_flops / (t * 1e3) / 1e6 - print("gemm with tensor core: %f ms" % (t * 1e3)) + gflops = num_flops / (time_elapsed * 1e3) / 1e6 + print("gemm with tensor core: %f ms" % (time_elapsed * 1e3)) print("GFLOPS: %f" % gflops) diff --git a/tests/python/integration/test_meta_schedule_auto_tensorize.py b/tests/python/integration/test_meta_schedule_auto_tensorize.py index 511e75723b031..b855dc6fa09e9 100644 --- a/tests/python/integration/test_meta_schedule_auto_tensorize.py +++ b/tests/python/integration/test_meta_schedule_auto_tensorize.py @@ -14,34 +14,32 @@ # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. +"""Integration test for metascheduler's auto tensorization.""" +import tempfile + +import numpy as np import pytest + import tvm -from tvm import relay import tvm.testing -import numpy as np -from tvm.meta_schedule.tune import tune_extracted_tasks +import tvm.topi.testing +from tvm import meta_schedule as ms +from tvm import relay +from tvm.meta_schedule import ApplyHistoryBest, postproc, schedule_rule from tvm.meta_schedule.relay_integration import extract_task_from_relay -from tvm.meta_schedule import ApplyHistoryBest -from tvm.meta_schedule import schedule_rule, postproc from tvm.meta_schedule.testing.tlcbench import load_quantized_bert_base -from tvm import meta_schedule as ms -from tvm.tir.tensor_intrin import ( - VNNI_DOT_16x4_INTRIN as VNNI_INTRIN, - DP4A_INTRIN, - AMDGPU_SDOT4_INTRIN, -) -import tempfile -import tvm.topi.testing - +from tvm.meta_schedule.tune import tune_extracted_tasks +from tvm.tir.tensor_intrin import AMDGPU_SDOT4_INTRIN, DP4A_INTRIN +from tvm.tir.tensor_intrin import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN -config = ms.TuneConfig( +CONFIG = ms.TuneConfig( strategy="evolutionary", num_trials_per_iter=32, max_trials_per_task=32, max_trials_global=20000, ) -sch_rules_for_vnni = [ +SCH_RULES_FOR_VNNI = [ schedule_rule.AutoInline( into_producer=False, into_consumer=True, @@ -113,17 +111,17 @@ def get_sch_rules_for_dp4a(intrin): ] -sch_rules_for_dp4a = get_sch_rules_for_dp4a(DP4A_INTRIN) -sch_rules_for_sdot4 = get_sch_rules_for_dp4a(AMDGPU_SDOT4_INTRIN) +SCH_RULES_FOR_DP4A = get_sch_rules_for_dp4a(DP4A_INTRIN) +SCH_RULES_FOR_SDOT4 = get_sch_rules_for_dp4a(AMDGPU_SDOT4_INTRIN) -postprocs_for_vnni = [ +POSTPROCS_FOR_VNNI = [ postproc.DisallowDynamicLoop(), postproc.RewriteParallelVectorizeUnroll(), postproc.RewriteReductionBlock(), postproc.RewriteTensorize(vectorize_init_loop=True), ] -postprocs_for_dp4a = [ +POSTPROCS_FOR_DP4A = [ postproc.DisallowDynamicLoop(), postproc.RewriteCooperativeFetch(), postproc.RewriteUnboundBlock(), @@ -135,6 +133,7 @@ def get_sch_rules_for_dp4a(intrin): def tune_and_test(relay_mod, data_np, weight_np, op_name, target, sch_rules, postprocs): + """Test tuning.""" tgt = "cuda" if "nvidia" in target else target dev = tvm.device(tgt, 0) @@ -158,7 +157,7 @@ def tune_and_test(relay_mod, data_np, weight_np, op_name, target, sch_rules, pos with tempfile.TemporaryDirectory() as work_dir: database = tune_extracted_tasks( tune_tasks, - config, + CONFIG, work_dir=work_dir, sch_rules=lambda: sch_rules, postprocs=lambda: postprocs, @@ -186,9 +185,9 @@ def tune_and_test(relay_mod, data_np, weight_np, op_name, target, sch_rules, pos def _test_dense(data_dtype, sch_rules, postprocs, target): - M, N, K = 1024, 1024, 1024 - data_shape = (M, K) - weight_shape = (N, K) + dim_m, dim_n, dim_k = 1024, 1024, 1024 + data_shape = (dim_m, dim_k) + weight_shape = (dim_n, dim_k) weight_dtype = "int8" out_dtype = "int32" @@ -255,7 +254,7 @@ def _test_bert_int8(target, sch_rules, postprocs): with tempfile.TemporaryDirectory() as work_dir: database = tune_extracted_tasks( tune_tasks, - config, + CONFIG, work_dir=work_dir, sch_rules=lambda: sch_rules, postprocs=lambda: postprocs, @@ -284,14 +283,14 @@ def _test_bert_int8(target, sch_rules, postprocs): @pytest.mark.skip("Requires cascadelake") def test_vnni_dense(): _test_dense( - "uint8", sch_rules_for_vnni, postprocs_for_vnni, "llvm -mcpu=cascadelake -num-cores 4" + "uint8", SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI, "llvm 
-mcpu=cascadelake -num-cores 4" ) @pytest.mark.skip("Only tested locally on sm_86 (for cuda) which is not supported by CI") @tvm.testing.requires_gpu def test_dp4a_dense(): - _test_dense("int8", sch_rules_for_dp4a, postprocs_for_dp4a, "nvidia/geforce-rtx-3070") + _test_dense("int8", SCH_RULES_FOR_DP4A, POSTPROCS_FOR_DP4A, "nvidia/geforce-rtx-3070") # Uncomment to test on vulkan or rocm target # _test_dense( @@ -305,14 +304,14 @@ def test_dp4a_dense(): @pytest.mark.skip("Requires cascadelake") def test_vnni_conv2d(): _test_conv2d( - "uint8", sch_rules_for_vnni, postprocs_for_vnni, "llvm -mcpu=cascadelake -num-cores 4" + "uint8", SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI, "llvm -mcpu=cascadelake -num-cores 4" ) @pytest.mark.skip("Only tested locally on sm_86 (for cuda) which is not supported by CI") @tvm.testing.requires_gpu def test_dp4a_conv2d(): - _test_conv2d("int8", sch_rules_for_dp4a, postprocs_for_dp4a, "nvidia/geforce-rtx-3070") + _test_conv2d("int8", SCH_RULES_FOR_DP4A, POSTPROCS_FOR_DP4A, "nvidia/geforce-rtx-3070") # Uncomment to test on vulkan or rocm target # _test_conv2d( @@ -325,13 +324,13 @@ def test_dp4a_conv2d(): @pytest.mark.skip("Requires cascadelake") def test_vnni_bert_int8(): - _test_bert_int8("llvm -mcpu=cascadelake -num-cores 4", sch_rules_for_vnni, postprocs_for_vnni) + _test_bert_int8("llvm -mcpu=cascadelake -num-cores 4", SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI) @tvm.testing.requires_gpu @pytest.mark.skip("Slow on CI") def test_dp4a_bert_int8(): - _test_bert_int8("nvidia/geforce-rtx-3070", sch_rules_for_dp4a, postprocs_for_dp4a) + _test_bert_int8("nvidia/geforce-rtx-3070", SCH_RULES_FOR_DP4A, POSTPROCS_FOR_DP4A) # Uncomment to test on vulkan or rocm target # _test_bert_int8("vulkan -from_device=0", sch_rules_for_dp4a, postprocs_for_dp4a) diff --git a/tests/python/integration/test_reduce.py b/tests/python/integration/test_reduce.py index f3886374ccb65..eaac8ed266841 100644 --- a/tests/python/integration/test_reduce.py +++ b/tests/python/integration/test_reduce.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""Test scheduling of reduction operations.""" import pytest import numpy as np @@ -26,22 +27,28 @@ @tvm.testing.requires_gpu def test_reduce_prims(): + """Test reduction operations.""" + def test_prim(reducer, np_reducer): # graph - n = tvm.te.size_var("n") - m = tvm.te.size_var("m") - A = te.placeholder((n, m), name="A") - R = te.compute((n,), lambda i: tvm.tir.Select((i > 1), 1, 0), name="R") - k = te.reduce_axis((0, m)) - B = te.compute((n,), lambda i: reducer(A[i, k], axis=k, where=(R[i] == 1)), name="B") + size_var_n = tvm.te.size_var("n") + size_var_m = tvm.te.size_var("m") + placeholder_a = te.placeholder((size_var_n, size_var_m), name="A") + result_r = te.compute((size_var_n,), lambda i: tvm.tir.Select((i > 1), 1, 0), name="R") + axis_k = te.reduce_axis((0, size_var_m)) + result_b = te.compute( + (size_var_n,), + lambda i: reducer(placeholder_a[i, axis_k], axis=axis_k, where=(result_r[i] == 1)), + name="B", + ) # schedule - s = te.create_schedule(B.op) + schedule = te.create_schedule(result_b.op) # create iter var and assign them tags. 
         num_thread = 1
-        xo, xi = s[B].split(B.op.axis[0], factor=num_thread)
-        s[B].bind(xo, te.thread_axis("blockIdx.x"))
-        s[B].bind(xi, te.thread_axis("threadIdx.x"))
-        s[R].compute_inline()
+        axis_x0, axis_x1 = schedule[result_b].split(result_b.op.axis[0], factor=num_thread)
+        schedule[result_b].bind(axis_x0, te.thread_axis("blockIdx.x"))
+        schedule[result_b].bind(axis_x1, te.thread_axis("threadIdx.x"))
+        schedule[result_r].compute_inline()

         # one line to build the function.
         def check_device(device, host="llvm"):
@@ -50,17 +57,22 @@ def check_device(device, host="llvm"):
                 print("skip because %s is not enabled.." % device)
                 return
             freduce = tvm.build(
-                s, args=[A, B], target=tvm.target.Target(device, host), name="myreduce"
+                schedule,
+                args=[placeholder_a, result_b],
+                target=tvm.target.Target(device, host),
+                name="myreduce",
             )
             # launch the kernel.
-            n = 1028
-            m = 129
-            x = tvm.nd.array(np.random.uniform(size=(n, m)).astype(A.dtype), dev)
-            y = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev)
-            freduce(x, y)
-            npy = y.numpy()
+            num_n = 1028
+            num_m = 129
+            buff_x = tvm.nd.array(
+                np.random.uniform(size=(num_n, num_m)).astype(placeholder_a.dtype), dev
+            )
+            buff_y = tvm.nd.array(np.zeros(num_n, dtype=result_b.dtype), dev)
+            freduce(buff_x, buff_y)
+            npy = buff_y.numpy()
             npy[:2] = 0
-            res = np_reducer(x.numpy(), axis=1)
+            res = np_reducer(buff_x.numpy(), axis=1)
             res[:2] = 0
             tvm.testing.assert_allclose(npy, res, rtol=1e-4)

@@ -76,192 +88,228 @@ def check_device(device, host="llvm"):


 def test_init_imm():
-    n = tvm.runtime.convert(1027)
-    A = te.placeholder((n,), name="A")
-    k = te.reduce_axis((0, n))
-    B = te.compute((), lambda: te.sum(A[k], axis=k, init=10.0), name="B")
+    """Test reductions whose initial value is an immediate constant."""
+    num_n = 1027
+    arr_length = tvm.runtime.convert(num_n)
+    placeholder_a = te.placeholder((arr_length,), name="A")
+    axis_k = te.reduce_axis((0, arr_length))
+    result_b = te.compute(
+        (), lambda: te.sum(placeholder_a[axis_k], axis=axis_k, init=10.0), name="B"
+    )
     # schedule
-    s = te.create_schedule(B.op)
+    schedule = te.create_schedule(result_b.op)

     # one line to build the function.
     def check_target(target="llvm"):
         if not tvm.runtime.enabled(target):
             return
         dev = tvm.cpu(0)
-        fapi = tvm.lower(s, args=[A, B])
+        fapi = tvm.lower(schedule, args=[placeholder_a, result_b])
         fsum = tvm.build(fapi, target=target, name="mysum")
         # launch the kernel.
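Aside: `init=10.0` in test_init_imm above seeds the reduction accumulator with an immediate constant, so the result is 10 plus the sum of the inputs rather than a sum into mutable state. A runnable sketch of that semantics, assuming an LLVM-enabled TVM build; names are illustrative and not part of the patch:

    import numpy as np
    import tvm
    from tvm import te

    length = 128
    data = te.placeholder((length,), name="data")
    axis_k = te.reduce_axis((0, length), name="k")
    # The accumulator starts at 10.0 instead of the additive identity 0.
    total = te.compute((), lambda: te.sum(data[axis_k], axis=axis_k, init=10.0), name="total")
    sch = te.create_schedule(total.op)
    func = tvm.build(sch, [data, total], "llvm")
    data_np = np.random.uniform(size=length).astype("float32")
    out = tvm.nd.array(np.zeros((), "float32"))
    func(tvm.nd.array(data_np), out)
    np.testing.assert_allclose(out.numpy(), 10.0 + data_np.sum(), rtol=1e-4)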
- n = 1027 - a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros((), dtype=B.dtype), dev) - fsum(a, b) - res = 10.0 + np.sum(a.numpy(), axis=0) - tvm.testing.assert_allclose(b.numpy(), res, rtol=1e-4) + buff_a = tvm.nd.array(np.random.uniform(size=(num_n,)).astype(placeholder_a.dtype), dev) + buff_b = tvm.nd.array(np.zeros((), dtype=result_b.dtype), dev) + fsum(buff_a, buff_b) + res = 10.0 + np.sum(buff_a.numpy(), axis=0) + tvm.testing.assert_allclose(buff_b.numpy(), res, rtol=1e-4) check_target() def test_init(): - n = tvm.runtime.convert(1027) - A = te.placeholder((n, n), name="A") - C = te.placeholder((n, n), name="C") - I = te.placeholder((n, n), name="I") - k = te.reduce_axis((0, n)) - B = te.compute((n, n), lambda i, j: te.sum(A[i, k] * C[k, j], axis=k, init=I[i, j]), name="B") + """Test initializer which is non-const.""" + num_n = 1027 + arr_length = tvm.runtime.convert(num_n) + placeholder_a = te.placeholder((arr_length, arr_length), name="A") + placeholder_c = te.placeholder((arr_length, arr_length), name="C") + placeholder_i = te.placeholder((arr_length, arr_length), name="I") + axis_k = te.reduce_axis((0, arr_length)) + result_b = te.compute( + (arr_length, arr_length), + lambda i, j: te.sum( + placeholder_a[i, axis_k] * placeholder_c[axis_k, j], + axis=axis_k, + init=placeholder_i[i, j], + ), + name="B", + ) # schedule - s = te.create_schedule(B.op) + schedule = te.create_schedule(result_b.op) # one line to build the function. def check_target(target="llvm"): if not tvm.runtime.enabled(target): return dev = tvm.cpu(0) - fapi = tvm.lower(s, args=[A, C, I, B]) + fapi = tvm.lower(schedule, args=[placeholder_a, placeholder_c, placeholder_i, result_b]) print(fapi) mmult = tvm.build(fapi, target=target, name="mmult") # launch the kernel. 
- n = 1027 - a = tvm.nd.array(np.random.uniform(size=(n, n)).astype(A.dtype), dev) - c = tvm.nd.array(np.random.uniform(size=(n, n)).astype(C.dtype), dev) - ii = tvm.nd.array(np.random.uniform(size=(n, n)).astype(B.dtype), dev) - b = tvm.nd.array(np.zeros((n, n), dtype=B.dtype), dev) - mmult(a, c, ii, b) - res = ii.numpy() + np.matmul(a.numpy(), c.numpy()) - tvm.testing.assert_allclose(b.numpy(), res, rtol=1e-4) + buff_a = tvm.nd.array( + np.random.uniform(size=(num_n, num_n)).astype(placeholder_a.dtype), dev + ) + buff_c = tvm.nd.array( + np.random.uniform(size=(num_n, num_n)).astype(placeholder_c.dtype), dev + ) + buff_i = tvm.nd.array(np.random.uniform(size=(num_n, num_n)).astype(result_b.dtype), dev) + buf_b = tvm.nd.array(np.zeros((num_n, num_n), dtype=result_b.dtype), dev) + mmult(buff_a, buff_c, buff_i, buf_b) + res = buff_i.numpy() + np.matmul(buff_a.numpy(), buff_c.numpy()) + tvm.testing.assert_allclose(buf_b.numpy(), res, rtol=1e-4) check_target() def test_rfactor(): - n = tvm.runtime.convert(1027) - A = te.placeholder((n,), name="A") - k = te.reduce_axis((0, n)) - B = te.compute((), lambda: te.sum(A[k], axis=k), name="B") + """Test rfactors.""" + num_n = 1027 + arr_length = tvm.runtime.convert(num_n) + placeholder_a = te.placeholder((arr_length,), name="A") + axis_k = te.reduce_axis((0, arr_length)) + placeholder_b = te.compute((), lambda: te.sum(placeholder_a[axis_k], axis=axis_k), name="B") # schedule - s = te.create_schedule(B.op) - kf, ki = s[B].split(k, nparts=4) - BF = s.rfactor(B, kf) - s[BF].parallel(BF.op.axis[0]) + schedule = te.create_schedule(placeholder_b.op) + axis_kf, _ = schedule[placeholder_b].split(axis_k, nparts=4) + rfactor_bf = schedule.rfactor(placeholder_b, axis_kf) + schedule[rfactor_bf].parallel(rfactor_bf.op.axis[0]) # one line to build the function. def check_target(target="llvm"): if not tvm.testing.device_enabled(target): return dev = tvm.cpu(0) - fapi = tvm.lower(s, args=[A, B]) + fapi = tvm.lower(schedule, args=[placeholder_a, placeholder_b]) fsum = tvm.build(fapi, target=target, name="mysum") # launch the kernel. 
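Aside: rfactor, which the following tests exercise, splits a reduction into a stage of partial results plus a combining stage so the partials can run in parallel. A minimal runnable sketch mirroring the split-nparts=4 pattern used above, assuming an LLVM-enabled TVM build; names are illustrative:

    import numpy as np
    import tvm
    from tvm import te

    length = 1027
    data = te.placeholder((length,), name="data")
    axis_k = te.reduce_axis((0, length), name="k")
    total = te.compute((), lambda: te.sum(data[axis_k], axis=axis_k), name="total")
    sch = te.create_schedule(total.op)
    # Split the reduction into 4 outer chunks and factor each chunk's
    # partial sum into its own stage; the original stage combines them.
    axis_ko, _ = sch[total].split(axis_k, nparts=4)
    partial = sch.rfactor(total, axis_ko)
    sch[partial].parallel(partial.op.axis[0])
    func = tvm.build(sch, [data, total], "llvm")
    data_np = np.random.uniform(size=length).astype("float32")
    out = tvm.nd.array(np.zeros((), "float32"))
    func(tvm.nd.array(data_np), out)
    np.testing.assert_allclose(out.numpy(), data_np.sum(), rtol=1e-4)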
- n = 1027 - a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros((), dtype=B.dtype), dev) - fsum(a, b) - res = np.sum(a.numpy(), axis=0) - tvm.testing.assert_allclose(b.numpy(), res, rtol=1e-4) + buff_a = tvm.nd.array(np.random.uniform(size=(num_n,)).astype(placeholder_a.dtype), dev) + buff_b = tvm.nd.array(np.zeros((), dtype=placeholder_b.dtype), dev) + fsum(buff_a, buff_b) + res = np.sum(buff_a.numpy(), axis=0) + tvm.testing.assert_allclose(buff_b.numpy(), res, rtol=1e-4) check_target() def test_rfactor_init(): - n = tvm.runtime.convert(1027) - A = te.placeholder((n, n), name="A") - C = te.placeholder((n, n), name="C") - I = te.placeholder((n, n), name="I") - k = te.reduce_axis((0, n)) - B = te.compute((n, n), lambda i, j: te.sum(A[i, k] * C[k, j], axis=k, init=I[i, j]), name="B") + """Test rfactors with constant inits.""" + num_n = 1027 + arr_length = tvm.runtime.convert(num_n) + placeholder_a = te.placeholder((arr_length, arr_length), name="A") + placeholder_c = te.placeholder((arr_length, arr_length), name="C") + placeholder_i = te.placeholder((arr_length, arr_length), name="I") + axis_k = te.reduce_axis((0, arr_length)) + result_b = te.compute( + (arr_length, arr_length), + lambda i, j: te.sum( + placeholder_a[i, axis_k] * placeholder_c[axis_k, j], + axis=axis_k, + init=placeholder_i[i, j], + ), + name="B", + ) # schedule - s = te.create_schedule(B.op) - kf, ki = s[B].split(k, nparts=4) - BF = s.rfactor(B, kf, 1) - s[BF].parallel(BF.op.axis[0]) + schedule = te.create_schedule(result_b.op) + axis_kf, _ = schedule[result_b].split(axis_k, nparts=4) + rfactor_bf = schedule.rfactor(result_b, axis_kf, 1) + schedule[rfactor_bf].parallel(rfactor_bf.op.axis[0]) # one line to build the function. def check_target(target="llvm"): if not tvm.runtime.enabled(target): return dev = tvm.cpu(0) - fapi = tvm.lower(s, args=[A, C, I, B]) + fapi = tvm.lower(schedule, args=[placeholder_a, placeholder_c, placeholder_i, result_b]) print(fapi) mmult = tvm.build(fapi, target=target, name="mmult") # launch the kernel. 
- n = 1027 - a = tvm.nd.array(np.random.uniform(size=(n, n)).astype(A.dtype), dev) - c = tvm.nd.array(np.random.uniform(size=(n, n)).astype(C.dtype), dev) - ii = tvm.nd.array(np.random.uniform(size=(n, n)).astype(B.dtype), dev) - b = tvm.nd.array(np.zeros((n, n), dtype=B.dtype), dev) - mmult(a, c, ii, b) - res = ii.numpy() + np.matmul(a.numpy(), c.numpy()) - tvm.testing.assert_allclose(b.numpy(), res, rtol=1e-4) + buff_a = tvm.nd.array( + np.random.uniform(size=(num_n, num_n)).astype(placeholder_a.dtype), dev + ) + buff_c = tvm.nd.array( + np.random.uniform(size=(num_n, num_n)).astype(placeholder_c.dtype), dev + ) + buff_i = tvm.nd.array(np.random.uniform(size=(num_n, num_n)).astype(result_b.dtype), dev) + buff_b = tvm.nd.array(np.zeros((num_n, num_n), dtype=result_b.dtype), dev) + mmult(buff_a, buff_c, buff_i, buff_b) + res = buff_i.numpy() + np.matmul(buff_a.numpy(), buff_c.numpy()) + tvm.testing.assert_allclose(buff_b.numpy(), res, rtol=1e-4) check_target() def test_rfactor_factor_axis(): - n = tvm.runtime.convert(1027) - A = te.placeholder((n,), name="A") - k = te.reduce_axis((0, n)) - B = te.compute((), lambda: te.sum(A[k], axis=k), name="B") + """Test rfactors across axis.""" + num_n = 1027 + arr_length = tvm.runtime.convert(num_n) + placeholder_a = te.placeholder((arr_length,), name="A") + axis_k = te.reduce_axis((0, arr_length)) + placeholder_b = te.compute((), lambda: te.sum(placeholder_a[axis_k], axis=axis_k), name="B") # schedule - s = te.create_schedule(B.op) - kf, ki = s[B].split(k, nparts=4) - BF = s.rfactor(B, kf, 0) - s[BF].parallel(BF.op.axis[0]) + schedule = te.create_schedule(placeholder_b.op) + axis_kf, _ = schedule[placeholder_b].split(axis_k, nparts=4) + rfactor_bf = schedule.rfactor(placeholder_b, axis_kf, 0) + schedule[rfactor_bf].parallel(rfactor_bf.op.axis[0]) # one line to build the function. def check_target(target="llvm"): if not tvm.testing.device_enabled(target): return dev = tvm.cpu(0) - fapi = tvm.lower(s, args=[A, B]) + fapi = tvm.lower(schedule, args=[placeholder_a, placeholder_b]) fsum = tvm.build(fapi, target=target, name="mysum") # launch the kernel. 
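Aside: rfactor's optional third argument, factor_axis (passed explicitly in test_rfactor_init and test_rfactor_factor_axis above), chooses where the factored reduction axis lands in the intermediate tensor. A small sketch, illustrative and not part of the patch:

    import tvm
    from tvm import te

    length = 128
    data = te.placeholder((length, length), name="data")
    axis_k = te.reduce_axis((0, length), name="k")
    row_sum = te.compute((length,), lambda i: te.sum(data[i, axis_k], axis=axis_k), name="row_sum")
    sch = te.create_schedule(row_sum.op)
    axis_ko, _ = sch[row_sum].split(axis_k, nparts=4)
    # factor_axis=0 gives the partial tensor shape (4, length); passing 1
    # instead would give (length, 4), changing the layout of the combine loop.
    partial = sch.rfactor(row_sum, axis_ko, 0)
    print(partial.shape)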
- n = 1027 - a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros((), dtype=B.dtype), dev) - fsum(a, b) - res = np.sum(a.numpy(), axis=0) - tvm.testing.assert_allclose(b.numpy(), res, rtol=1e-4) + buff_a = tvm.nd.array(np.random.uniform(size=(num_n,)).astype(placeholder_a.dtype), dev) + buff_b = tvm.nd.array(np.zeros((), dtype=placeholder_b.dtype), dev) + fsum(buff_a, buff_b) + res = np.sum(buff_a.numpy(), axis=0) + tvm.testing.assert_allclose(buff_b.numpy(), res, rtol=1e-4) check_target() @tvm.testing.requires_gpu def test_rfactor_threads(): - nn = 1027 - mm = 10 - n = tvm.runtime.convert(nn) - m = tvm.runtime.convert(mm) - A = te.placeholder((m, n), name="A") - k = te.reduce_axis((0, n)) + """Test rfactors across threads.""" + num_n = 1027 + num_m = 10 + length_n = tvm.runtime.convert(num_n) + length_m = tvm.runtime.convert(num_m) + placeholder_a = te.placeholder((length_m, length_n), name="A") + axis_k = te.reduce_axis((0, length_n)) nthread = 16 - B = te.compute((m,), lambda i: te.sum(A[i, k], axis=k, where=(i > 1)), name="B") + result_b = te.compute( + (length_m,), + lambda i: te.sum(placeholder_a[i, axis_k], axis=axis_k, where=(i > 1)), + name="B", + ) # schedule - s = te.create_schedule(B.op) - ko, kf = s[B].split(k, factor=nthread) - BF = s.rfactor(B, kf) - bx, ty = s[B].split(s[B].op.axis[0], factor=nthread) - s[B].bind(bx, te.thread_axis("blockIdx.x")) - s[B].bind(ty, te.thread_axis("threadIdx.y")) - tx = s[B].op.reduce_axis[0] + schedule = te.create_schedule(result_b.op) + _, axis_kf = schedule[result_b].split(axis_k, factor=nthread) + rfactor_bf = schedule.rfactor(result_b, axis_kf) + axis_bx, axis_ty = schedule[result_b].split(schedule[result_b].op.axis[0], factor=nthread) + schedule[result_b].bind(axis_bx, te.thread_axis("blockIdx.x")) + schedule[result_b].bind(axis_ty, te.thread_axis("threadIdx.y")) + axis_tx = schedule[result_b].op.reduce_axis[0] thread_x = te.thread_axis("threadIdx.x") - s[B].bind(tx, thread_x) - s[BF].compute_at(s[B], tx) - s[B].set_store_predicate(thread_x.var.equal(0)) + schedule[result_b].bind(axis_tx, thread_x) + schedule[rfactor_bf].compute_at(schedule[result_b], axis_tx) + schedule[result_b].set_store_predicate(thread_x.var.equal(0)) # one line to build the function. - def check_target(device, host="stackvm"): + def check_target(device): dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return - fapi = tvm.lower(s, args=[A, B]) + fapi = tvm.lower(schedule, args=[placeholder_a, result_b]) fsum = tvm.build(fapi, target=device, name="mysum") # launch the kernel. 
- n = nn - m = mm - a = tvm.nd.array(np.random.uniform(size=(m, n)).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(m, dtype=B.dtype), dev) - fsum(a, b) - res = np.sum(a.numpy(), axis=1) + buff_a = tvm.nd.array( + np.random.uniform(size=(num_m, num_n)).astype(placeholder_a.dtype), dev + ) + buff_b = tvm.nd.array(np.zeros(num_m, dtype=result_b.dtype), dev) + fsum(buff_a, buff_b) + res = np.sum(buff_a.numpy(), axis=1) res[:2] = 0 - tvm.testing.assert_allclose(b.numpy(), res, rtol=1e-4) + tvm.testing.assert_allclose(buff_b.numpy(), res, rtol=1e-4) check_target("vulkan") check_target("cuda") @@ -272,46 +320,51 @@ def check_target(device, host="stackvm"): @tvm.testing.requires_gpu def test_rfactor_elemwise_threads(): - n = 1025 - m = 10 - A = te.placeholder((m, n), name="A") - k = te.reduce_axis((0, n)) + """Test rfactor elemwise threads.""" + num_n = 1025 + num_m = 10 + placeholder_a = te.placeholder((num_m, num_n), name="A") + axis_k = te.reduce_axis((0, num_n)) nthread = 16 - B = te.compute((m,), lambda i: te.sum(A[i, k], axis=k), name="B") - BB = te.compute((m,), lambda i: B[i] + 1, name="BB") - C = te.compute((m,), lambda i: BB[i] + 1, name="C") + result_b = te.compute( + (num_m,), lambda i: te.sum(placeholder_a[i, axis_k], axis=axis_k), name="B" + ) + result_bb = te.compute((num_m,), lambda i: result_b[i] + 1, name="BB") + result_c = te.compute((num_m,), lambda i: result_bb[i] + 1, name="C") # schedule - s = te.create_schedule(C.op) - s[BB].compute_inline() - bx, ty = s[C].split(s[C].op.axis[0], factor=nthread) - ko, kf = s[B].split(k, factor=nthread) - BF = s.rfactor(B, kf) - s[B].compute_at(s[C], ty) - s[C].bind(bx, te.thread_axis("blockIdx.x")) - s[C].bind(ty, te.thread_axis("threadIdx.y")) - tx = s[B].op.reduce_axis[0] + schedule = te.create_schedule(result_c.op) + schedule[result_bb].compute_inline() + axis_bx, axis_ty = schedule[result_c].split(schedule[result_c].op.axis[0], factor=nthread) + _, axis_kf = schedule[result_b].split(axis_k, factor=nthread) + rfactor_bf = schedule.rfactor(result_b, axis_kf) + schedule[result_b].compute_at(schedule[result_c], axis_ty) + schedule[result_c].bind(axis_bx, te.thread_axis("blockIdx.x")) + schedule[result_c].bind(axis_ty, te.thread_axis("threadIdx.y")) + axis_tx = schedule[result_b].op.reduce_axis[0] thread_x = te.thread_axis("threadIdx.x") - s[B].bind(tx, thread_x) - s[BF].compute_at(s[B], tx) + schedule[result_b].bind(axis_tx, thread_x) + schedule[rfactor_bf].compute_at(schedule[result_b], axis_tx) # Since thread_x is shared across reductions # only one of them need to do write back - s[B].set_store_predicate(thread_x.var.equal(0)) - s[C].set_store_predicate(thread_x.var.equal(0)) + schedule[result_b].set_store_predicate(thread_x.var.equal(0)) + schedule[result_c].set_store_predicate(thread_x.var.equal(0)) # one line to build the function. - def check_target(device, host="stackvm"): + def check_target(device): dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return - fapi = tvm.lower(s, args=[A, C]) + fapi = tvm.lower(schedule, args=[placeholder_a, result_c]) fsum = tvm.build(fapi, target=device, name="mysum") # launch the kernel. 
- a = tvm.nd.array(np.random.uniform(size=(m, n)).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(m, dtype=B.dtype), dev) - fsum(a, b) - res = np.sum(a.numpy(), axis=1) + 2 - tvm.testing.assert_allclose(b.numpy(), res, rtol=1e-4) + buff_a = tvm.nd.array( + np.random.uniform(size=(num_m, num_n)).astype(placeholder_a.dtype), dev + ) + buff_b = tvm.nd.array(np.zeros(num_m, dtype=result_b.dtype), dev) + fsum(buff_a, buff_b) + res = np.sum(buff_a.numpy(), axis=1) + 2 + tvm.testing.assert_allclose(buff_b.numpy(), res, rtol=1e-4) check_target("vulkan") check_target("cuda") @@ -321,22 +374,26 @@ def check_target(device, host="stackvm"): def test_argmax(): - def fcombine(x, y): - lhs = tvm.tir.Select((x[1] >= y[1]), x[0], y[0]) - rhs = tvm.tir.Select((x[1] >= y[1]), x[1], y[1]) + """Test argmax.""" + + def fcombine(tensor_x, tensor_y): + lhs = tvm.tir.Select((tensor_x[1] >= tensor_y[1]), tensor_x[0], tensor_y[0]) + rhs = tvm.tir.Select((tensor_x[1] >= tensor_y[1]), tensor_x[1], tensor_y[1]) return lhs, rhs - def fidentity(t0, t1): - return tvm.tir.const(-1, t0), tvm.te.min_value(t1) + def fidentity(tensor1, tensor2): + return tvm.tir.const(-1, tensor1), tvm.te.min_value(tensor2) argmax = te.comm_reducer(fcombine, fidentity, name="argmax") - m = te.size_var("m") - n = te.size_var("n") - idx = te.placeholder((m, n), name="idx", dtype="int32") - val = te.placeholder((m, n), name="val", dtype="float32") - k = te.reduce_axis((0, n), "k") - T0, T1 = te.compute((m,), lambda i: argmax((idx[i, k], val[i, k]), axis=k), name="T") - s = te.create_schedule(T0.op) + size_var_m = te.size_var("m") + size_var_n = te.size_var("n") + idx = te.placeholder((size_var_m, size_var_n), name="idx", dtype="int32") + val = te.placeholder((size_var_m, size_var_n), name="val", dtype="float32") + axis_k = te.reduce_axis((0, size_var_n), "k") + result_t0, result_t1 = te.compute( + (size_var_m,), lambda i: argmax((idx[i, axis_k], val[i, axis_k]), axis=axis_k), name="T" + ) + schedule = te.create_schedule(result_t0.op) def check_target(): device = "cpu" @@ -344,19 +401,19 @@ def check_target(): print("skip because %s is not enabled.." 
% device) return dev = tvm.device(device, 0) - fapi = tvm.lower(s, args=[idx, val, T0, T1]) + fapi = tvm.lower(schedule, args=[idx, val, result_t0, result_t1]) fargmax = tvm.build(fapi, target="llvm", name="argmax") - mm = 12 - nn = 16 - np_idx = np.repeat(np.arange(nn, dtype="int32").reshape(1, nn), mm, axis=0) - np_val = np.random.uniform(size=(mm, nn)).astype("float32") + height = 12 + width = 16 + np_idx = np.repeat(np.arange(width, dtype="int32").reshape(1, width), height, axis=0) + np_val = np.random.uniform(size=(height, width)).astype("float32") np_res = np.argmax(np_val, axis=1) nd_idx = tvm.nd.array(np_idx, dev) nd_val = tvm.nd.array(np_val, dev) - nd_res0 = tvm.nd.array(np.zeros(mm, dtype="int32"), dev) - nd_res1 = tvm.nd.array(np.zeros(mm, dtype="float32"), dev) + nd_res0 = tvm.nd.array(np.zeros(height, dtype="int32"), dev) + nd_res1 = tvm.nd.array(np.zeros(height, dtype="float32"), dev) fargmax(nd_idx, nd_val, nd_res0, nd_res1) tvm.testing.assert_allclose(np_res, nd_res0.numpy()) @@ -365,55 +422,63 @@ def check_target(): @tvm.testing.requires_gpu def test_rfactor_argmax(): - def fcombine(x, y): - lhs = tvm.tir.Select((x[1] >= y[1]), x[0], y[0]) - rhs = tvm.tir.Select((x[1] >= y[1]), x[1], y[1]) + """Test rfactor argmax""" + + def fcombine(tensor0, tensor1): + lhs = tvm.tir.Select((tensor0[1] >= tensor1[1]), tensor0[0], tensor1[0]) + rhs = tvm.tir.Select((tensor0[1] >= tensor1[1]), tensor0[1], tensor1[1]) return lhs, rhs - def fidentity(t0, t1): - return tvm.tir.const(-1, t0), tvm.te.min_value(t1) + def fidentity(tensor0, tensor1): + return tvm.tir.const(-1, tensor0), tvm.te.min_value(tensor1) argmax = te.comm_reducer(fcombine, fidentity, name="argmax") - nn = 1027 - mm = 10 - n = tvm.runtime.convert(nn) - m = tvm.runtime.convert(mm) - A0 = te.placeholder((m, n), name="A0", dtype="int32") - A1 = te.placeholder((m, n), name="A1", dtype="float32") - k = te.reduce_axis((0, n)) - B0, B1 = te.compute((m,), lambda i: argmax((A0[i, k], A1[i, k]), axis=k), name="B") + num_width = 1027 + num_height = 10 + width = tvm.runtime.convert(num_width) + height = tvm.runtime.convert(num_height) + placeholder_a0 = te.placeholder((height, width), name="A0", dtype="int32") + placeholder_a1 = te.placeholder((height, width), name="A1", dtype="float32") + axis_k = te.reduce_axis((0, width)) + result_b0, result_b1 = te.compute( + (height,), + lambda i: argmax((placeholder_a0[i, axis_k], placeholder_a1[i, axis_k]), axis=axis_k), + name="B", + ) # schedule - s = te.create_schedule(B0.op) + schedule = te.create_schedule(result_b0.op) nthread = 16 - ko, kf = s[B0].split(k, factor=nthread) - BF0, BF1 = s.rfactor(B0, kf) - bx, ty = s[B0].split(s[B0].op.axis[0], factor=nthread) - s[B0].bind(bx, te.thread_axis("blockIdx.x")) - s[B0].bind(ty, te.thread_axis("threadIdx.y")) - tx = s[B0].op.reduce_axis[0] + _, axis_kf = schedule[result_b0].split(axis_k, factor=nthread) + rfactor_bf0, _ = schedule.rfactor(result_b0, axis_kf) + axis_bx, axis_ty = schedule[result_b0].split(schedule[result_b0].op.axis[0], factor=nthread) + schedule[result_b0].bind(axis_bx, te.thread_axis("blockIdx.x")) + schedule[result_b0].bind(axis_ty, te.thread_axis("threadIdx.y")) + axis_tx = schedule[result_b0].op.reduce_axis[0] thread_x = te.thread_axis("threadIdx.x") - s[B0].bind(tx, thread_x) - s[BF0.op].compute_at(s[B0], tx) - s[B0].set_store_predicate(thread_x.var.equal(0)) + schedule[result_b0].bind(axis_tx, thread_x) + schedule[rfactor_bf0.op].compute_at(schedule[result_b0], axis_tx) + 
schedule[result_b0].set_store_predicate(thread_x.var.equal(0)) def check_target(device): dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return - fapi = tvm.lower(s, args=[A0, A1, B0, B1]) + fapi = tvm.lower(schedule, args=[placeholder_a0, placeholder_a1, result_b0, result_b1]) fargmax = tvm.build(fapi, target=device, name="argmax") - np_idx = np.repeat(np.arange(nn, dtype="int32").reshape(1, nn), mm, axis=0) - np_val = np.random.uniform(size=(mm, nn)).astype("float32") + np_idx = np.repeat( + np.arange(num_width, dtype="int32").reshape(1, num_width), num_height, axis=0 + ) + np_val = np.random.uniform(size=(num_height, num_width)).astype("float32") np_res = np.argmax(np_val, axis=1) nd_idx = tvm.nd.array(np_idx, dev) nd_val = tvm.nd.array(np_val, dev) - nd_res0 = tvm.nd.array(np.zeros(mm, dtype="int32"), dev) - nd_res1 = tvm.nd.array(np.zeros(mm, dtype="float32"), dev) + nd_res0 = tvm.nd.array(np.zeros(num_height, dtype="int32"), dev) + nd_res1 = tvm.nd.array(np.zeros(num_height, dtype="float32"), dev) fargmax(nd_idx, nd_val, nd_res0, nd_res1) tvm.testing.assert_allclose(np_res, nd_res0.numpy()) @@ -424,6 +489,7 @@ def check_target(device): @tvm.testing.requires_gpu def test_warp_reduction1(): + """Test warp reductions.""" nthx = 32 nthy = 4 block_x = te.thread_axis("blockIdx.x") @@ -437,30 +503,34 @@ def check_target(device, m, n): return # compute - A = te.placeholder((m, n), name="A") - k = te.reduce_axis((0, n)) - B = te.compute((m,), lambda i: te.max(A[i][k], axis=k), name="B") - s = te.create_schedule(B.op) + placeholder_a = te.placeholder((m, n), name="A") + axis_k = te.reduce_axis((0, n)) + placeholder_b = te.compute( + (m,), lambda i: te.max(placeholder_a[i][axis_k], axis=axis_k), name="B" + ) + schedule = te.create_schedule(placeholder_b.op) # schedule - k = s[B].op.reduce_axis[0] - ko, _ = s[B].split(k, nparts=nthx) - s[B].bind(ko, thread_x) - xo, xi = s[B].split(s[B].op.axis[0], factor=nthy) - s[B].bind(xi, thread_y) - s[B].bind(xo, block_x) + axis_k = schedule[placeholder_b].op.reduce_axis[0] + axis_ko, _ = schedule[placeholder_b].split(axis_k, nparts=nthx) + schedule[placeholder_b].bind(axis_ko, thread_x) + axis_xo, axis_xi = schedule[placeholder_b].split( + schedule[placeholder_b].op.axis[0], factor=nthy + ) + schedule[placeholder_b].bind(axis_xi, thread_y) + schedule[placeholder_b].bind(axis_xo, block_x) - tvm.lower(s, [A, B], simple_mode=True) + tvm.lower(schedule, [placeholder_a, placeholder_b], simple_mode=True) # validation - func = tvm.build(s, [A, B], device, name="warp_reduction") - a_np = np.random.uniform(size=(m, n)).astype(A.dtype) - b_np = np.zeros((m,), dtype=A.dtype) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) + func = tvm.build(schedule, [placeholder_a, placeholder_b], device, name="warp_reduction") + a_np = np.random.uniform(size=(m, n)).astype(placeholder_a.dtype) + b_np = np.zeros((m,), dtype=placeholder_a.dtype) + buff_a = tvm.nd.array(a_np, dev) + buff_b = tvm.nd.array(b_np, dev) b_np = np.max(a_np, axis=1) - func(a, b) - tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-3, atol=1e-3) + func(buff_a, buff_b) + tvm.testing.assert_allclose(buff_b.numpy(), b_np, rtol=1e-3, atol=1e-3) check_target("cuda", m=32, n=256) check_target("cuda", m=10, n=20) @@ -472,21 +542,29 @@ def check_target(device, m, n): @tvm.testing.requires_gpu def test_warp_reduction2(): - def fcombine(x, y): - return x[0] + y[0], x[1] * y[1] + """Test warp reductions.""" + + def fcombine(tensor1, 
tensor2): + return tensor1[0] + tensor2[0], tensor1[1] * tensor2[1] - def fidentity(t0, t1): - return tvm.tir.const(0, t0), tvm.tir.const(1, t1) + def fidentity(tensor1, tensor2): + return tvm.tir.const(0, tensor1), tvm.tir.const(1, tensor2) add_mul_reducer = te.comm_reducer(fcombine, fidentity, name="add_mul_reducer") # compute - m = 16 - n = 256 - A0 = te.placeholder((m, n), name="A0", dtype="float32") - A1 = te.placeholder((m, n), name="Al", dtype="float32") - k = te.reduce_axis((0, n), "k") - T0, T1 = te.compute((m,), lambda i: add_mul_reducer((A0[i, k], A1[i, k]), axis=k), name="T") + num_m = 16 + num_n = 256 + placeholder_a0 = te.placeholder((num_m, num_n), name="A0", dtype="float32") + placeholder_a1 = te.placeholder((num_m, num_n), name="Al", dtype="float32") + axis_k = te.reduce_axis((0, num_n), "k") + result0, result1 = te.compute( + (num_m,), + lambda i: add_mul_reducer( + (placeholder_a0[i, axis_k], placeholder_a1[i, axis_k]), axis=axis_k + ), + name="T", + ) nthdx, nthdy = 32, 2 block_x = te.thread_axis("blockIdx.x") @@ -500,29 +578,31 @@ def check_target(device): return # schedule - s = te.create_schedule(T0.op) - ko, _ = s[T0].split(k, nparts=nthdx) - xo, xi = s[T0].split(s[T0].op.axis[0], factor=nthdy) - s[T0].bind(ko, thread_x) - s[T0].bind(xi, thread_y) - s[T0].bind(xo, block_x) + schedule = te.create_schedule(result0.op) + axis_ko, _ = schedule[result0].split(axis_k, nparts=nthdx) + axis_xo, axis_xi = schedule[result0].split(schedule[result0].op.axis[0], factor=nthdy) + schedule[result0].bind(axis_ko, thread_x) + schedule[result0].bind(axis_xi, thread_y) + schedule[result0].bind(axis_xo, block_x) # validation dev = tvm.device(device, 0) - a0_np = np.random.uniform(size=(m, n)).astype(A0.dtype) - a1_np = np.random.uniform(size=(m, n)).astype(A1.dtype) - t0_np = np.zeros((m,), dtype=A0.dtype) - t1_np = np.zeros((m,), dtype=A1.dtype) - a0 = tvm.nd.array(a0_np, dev) - a1 = tvm.nd.array(a1_np, dev) - t0 = tvm.nd.array(t0_np, dev) - t1 = tvm.nd.array(t1_np, dev) - func = tvm.build(s, [A0, A1, T0, T1], device, name="reduction") - func(a0, a1, t0, t1) + a0_np = np.random.uniform(size=(num_m, num_n)).astype(placeholder_a0.dtype) + a1_np = np.random.uniform(size=(num_m, num_n)).astype(placeholder_a1.dtype) + t0_np = np.zeros((num_m,), dtype=placeholder_a0.dtype) + t1_np = np.zeros((num_m,), dtype=placeholder_a1.dtype) + buff_a0 = tvm.nd.array(a0_np, dev) + buff_a1 = tvm.nd.array(a1_np, dev) + buff_t0 = tvm.nd.array(t0_np, dev) + buff_t1 = tvm.nd.array(t1_np, dev) + func = tvm.build( + schedule, [placeholder_a0, placeholder_a1, result0, result1], device, name="reduction" + ) + func(buff_a0, buff_a1, buff_t0, buff_t1) t0_np = np.sum(a0_np, axis=1) t1_np = np.product(a1_np, axis=1) - tvm.testing.assert_allclose(t0.numpy(), t0_np, rtol=1e-3, atol=1e-3) - tvm.testing.assert_allclose(t1.numpy(), t1_np, rtol=1e-3, atol=1e-3) + tvm.testing.assert_allclose(buff_t0.numpy(), t0_np, rtol=1e-3, atol=1e-3) + tvm.testing.assert_allclose(buff_t1.numpy(), t1_np, rtol=1e-3, atol=1e-3) check_target("cuda") check_target("rocm") @@ -530,6 +610,7 @@ def check_target(device): @tvm.testing.requires_cuda def test_reduce_storage_reuse(): + """Test reduction reuses storage.""" target = tvm.target.Target("cuda") def run_passes(sch, args): @@ -547,13 +628,13 @@ def run_passes(sch, args): dev = tvm.device(target.kind.name, 0) shape = (16, 16) - A = te.placeholder(shape, dtype="float32", name="A") - B = topi.nn.softmax(A, axis=1) + 1.0 + placeholder_a = te.placeholder(shape, dtype="float32", name="A") + 
placeholder_b = topi.nn.softmax(placeholder_a, axis=1) + 1.0 with tvm.target.Target(target): - s = topi.cuda.schedule_softmax(B) + schedule = topi.cuda.schedule_softmax(placeholder_b) - mod = run_passes(s, [A, B]) + mod = run_passes(schedule, [placeholder_a, placeholder_b]) # Due to the storage rewrite pass, the reduction output storage reduce_temp0 can be reused as # the storage of the next compute. @@ -586,12 +667,12 @@ def check_store_dst_remapped(op): inp = np.random.uniform(size=shape).astype("float32") ref = tvm.topi.testing.softmax_python(inp) + 1.0 - f = tvm.build(s, [A, B], target) - a = tvm.nd.array(inp, dev) - b = tvm.nd.array(np.zeros(shape, dtype=B.dtype), dev) - f(a, b) - tvm.testing.assert_allclose(b.numpy(), ref, rtol=1e-5) + func = tvm.build(schedule, [placeholder_a, placeholder_b], target) + buff_a = tvm.nd.array(inp, dev) + buff_b = tvm.nd.array(np.zeros(shape, dtype=placeholder_b.dtype), dev) + func(buff_a, buff_b) + tvm.testing.assert_allclose(buff_b.numpy(), ref, rtol=1e-5) if __name__ == "__main__": - pytest.main([__pfile__]) + pytest.main([__file__]) diff --git a/tests/python/integration/test_scan.py b/tests/python/integration/test_scan.py index edeb862cd5fcd..fa920e5135023 100644 --- a/tests/python/integration/test_scan.py +++ b/tests/python/integration/test_scan.py @@ -14,38 +14,43 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -import tvm -from tvm import te +"""Test scheduling and running scan operators.""" import numpy as np + +import tvm +import tvm.testing +from tvm import te @tvm.testing.requires_gpu def test_scan(): - m = te.size_var("m") - n = te.size_var("n") - X = te.placeholder((m, n), name="X") - s_state = te.placeholder((m, n)) - s_init = te.compute((1, n), lambda _, i: X[0, i]) - s_update = te.compute((m, n), lambda t, i: s_state[t - 1, i] + X[t, i]) + """Test scan operators.""" + size_var_m = te.size_var("m") + size_var_n = te.size_var("n") + placeholder_x = te.placeholder((size_var_m, size_var_n), name="X") + s_state = te.placeholder((size_var_m, size_var_n)) + s_init = te.compute((1, size_var_n), lambda _, i: placeholder_x[0, i]) + s_update = te.compute( + (size_var_m, size_var_n), lambda t, i: s_state[t - 1, i] + placeholder_x[t, i] + ) scan = tvm.te.scan(s_init, s_update, s_state) # test scan + compute case - res = te.compute((m, n), lambda i, j: scan[i, j]) + res = te.compute((size_var_m, size_var_n), lambda i, j: scan[i, j]) # schedule - s = te.create_schedule(res.op) + schedule = te.create_schedule(res.op) num_thread = 256 block_x = te.thread_axis(None, "blockIdx.x") thread_x = te.thread_axis((0, num_thread), "threadIdx.x") - xo, xi = s[s_init].split(s_init.op.axis[1], factor=num_thread) - s[s_init].bind(xo, block_x) - s[s_init].bind(xi, thread_x) - xo, xi = s[s_update].split(s_update.op.axis[1], factor=num_thread) - s[s_update].bind(xo, block_x) - s[s_update].bind(xi, thread_x) - xo, xi = s[res].split(res.op.axis[1], factor=num_thread) - s[res].bind(xo, block_x) - s[res].bind(xi, thread_x) + axis_xo, axis_xi = schedule[s_init].split(s_init.op.axis[1], factor=num_thread) + schedule[s_init].bind(axis_xo, block_x) + schedule[s_init].bind(axis_xi, thread_x) + axis_xo, axis_xi = schedule[s_update].split(s_update.op.axis[1], factor=num_thread) + schedule[s_update].bind(axis_xo, block_x) + schedule[s_update].bind(axis_xi, thread_x) + axis_xo, axis_xi = schedule[res].split(res.op.axis[1], factor=num_thread) + schedule[res].bind(axis_xo, block_x) +
schedule[res].bind(axis_xi, thread_x) # one line to build the function. def check_device(device): @@ -53,15 +58,15 @@ def check_device(device): if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return - fscan = tvm.build(s, [X, res], device, name="myscan") + fscan = tvm.build(schedule, [placeholder_x, res], device, name="myscan") # launch the kernel. - n = 1024 - m = 10 - a_np = np.random.uniform(size=(m, n)).astype(res.dtype) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(np.zeros((m, n), dtype=res.dtype), dev) - fscan(a, b) - tvm.testing.assert_allclose(b.numpy(), np.cumsum(a_np, axis=0)) + num_n = 1024 + num_m = 10 + a_np = np.random.uniform(size=(num_m, num_n)).astype(res.dtype) + buff_a = tvm.nd.array(a_np, dev) + buff_b = tvm.nd.array(np.zeros((num_m, num_n), dtype=res.dtype), dev) + fscan(buff_a, buff_b) + tvm.testing.assert_allclose(buff_b.numpy(), np.cumsum(a_np, axis=0)) check_device("vulkan") check_device("cuda") diff --git a/tests/python/integration/test_tuning.py b/tests/python/integration/test_tuning.py index 963609ea59017..04c5f85ce5d4c 100644 --- a/tests/python/integration/test_tuning.py +++ b/tests/python/integration/test_tuning.py @@ -19,11 +19,8 @@ """ import logging import multiprocessing as mp -import sys import textwrap -import time -import pytest import tvm import tvm.relay import tvm.testing @@ -34,100 +31,138 @@ from tvm.ir.instrument import pass_instrument from tvm.ir.transform import PassContext from tvm.target import Target +from tvm.tir.analysis import _ffi_api as _analysis_ffi_api def setup_module(): + """Setup the module used for testing.""" + @autotvm.template("testing/conv2d_no_batching") - def conv2d_no_batching(N, H, W, CI, CO, KH, KW): + def conv2d_no_batching( # pylint: disable=unused-variable + batch_size, input_h, input_w, channels_in, channels_out, kernel_h, kernel_w + ): """An example template for testing""" - assert N == 1, "Only consider batch_size = 1 in this template" + assert batch_size == 1, "Only consider batch_size = 1 in this template" - data = te.placeholder((N, CI, H, W), name="data") - kernel = te.placeholder((CO, CI, KH, KW), name="kernel") + data = te.placeholder((batch_size, channels_in, input_h, input_w), name="data") + kernel = te.placeholder((channels_out, channels_in, kernel_h, kernel_w), name="kernel") - rc = te.reduce_axis((0, CI), name="rc") - ry = te.reduce_axis((0, KH), name="ry") - rx = te.reduce_axis((0, KW), name="rx") + axis_rc = te.reduce_axis((0, channels_in), name="rc") + axis_ry = te.reduce_axis((0, kernel_h), name="ry") + axis_rx = te.reduce_axis((0, kernel_w), name="rx") conv = te.compute( - (N, CO, H - KH + 1, W - KW + 1), + (batch_size, channels_out, input_h - kernel_h + 1, input_w - kernel_w + 1), lambda nn, ff, yy, xx: te.sum( - data[nn, rc, yy + ry, xx + rx] * kernel[ff, rc, ry, rx], axis=[rc, ry, rx] + data[nn, axis_rc, yy + axis_ry, xx + axis_rx] + * kernel[ff, axis_rc, axis_ry, axis_rx], + axis=[axis_rc, axis_ry, axis_rx], ), tag="conv2d_nchw", ) - s = te.create_schedule([conv.op]) + schedule = te.create_schedule([conv.op]) output = conv - OL = s.cache_write(conv, "local") + cache_write_ol = schedule.cache_write(conv, "local") # create cache stage - AA = s.cache_read(data, "shared", [OL]) - WW = s.cache_read(kernel, "shared", [OL]) - AL = s.cache_read(AA, "local", [OL]) - WL = s.cache_read(WW, "local", [OL]) + cache_read_aa = schedule.cache_read(data, "shared", [cache_write_ol]) + cache_read_ww = schedule.cache_read(kernel, "shared", [cache_write_ol]) + 
cache_read_al = schedule.cache_read(cache_read_aa, "local", [cache_write_ol]) + cache_read_wl = schedule.cache_read(cache_read_ww, "local", [cache_write_ol]) # tile and bind spatial axes - n, f, y, x = s[output].op.axis + axis_n, axis_f, axis_y, axis_x = schedule[output].op.axis cfg = autotvm.get_config() - cfg.define_split("tile_f", cfg.axis(f), num_outputs=4) - cfg.define_split("tile_y", cfg.axis(y), num_outputs=4) - cfg.define_split("tile_x", cfg.axis(x), num_outputs=4) - bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) - by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) - bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) - kernel_scope = n # this is the scope to attach global config inside this kernel - - s[output].bind(bf, te.thread_axis("blockIdx.z")) - s[output].bind(by, te.thread_axis("blockIdx.y")) - s[output].bind(bx, te.thread_axis("blockIdx.x")) - s[output].bind(vf, te.thread_axis("vthread")) - s[output].bind(vy, te.thread_axis("vthread")) - s[output].bind(vx, te.thread_axis("vthread")) - s[output].bind(tf, te.thread_axis("threadIdx.z")) - s[output].bind(ty, te.thread_axis("threadIdx.y")) - s[output].bind(tx, te.thread_axis("threadIdx.x")) - s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) - s[OL].compute_at(s[output], tx) + cfg.define_split("tile_f", cfg.axis(axis_f), num_outputs=4) + cfg.define_split("tile_y", cfg.axis(axis_y), num_outputs=4) + cfg.define_split("tile_x", cfg.axis(axis_x), num_outputs=4) + axis_bf, axis_vf, axis_tf, axis_fi = cfg["tile_f"].apply(schedule, output, axis_f) + axis_by, axis_vy, axis_ty, axis_yi = cfg["tile_y"].apply(schedule, output, axis_y) + axis_bx, axis_vx, axis_tx, axis_xi = cfg["tile_x"].apply(schedule, output, axis_x) + kernel_scope = axis_n # this is the scope to attach global config inside this kernel + + schedule[output].bind(axis_bf, te.thread_axis("blockIdx.z")) + schedule[output].bind(axis_by, te.thread_axis("blockIdx.y")) + schedule[output].bind(axis_bx, te.thread_axis("blockIdx.x")) + schedule[output].bind(axis_vf, te.thread_axis("vthread")) + schedule[output].bind(axis_vy, te.thread_axis("vthread")) + schedule[output].bind(axis_vx, te.thread_axis("vthread")) + schedule[output].bind(axis_tf, te.thread_axis("threadIdx.z")) + schedule[output].bind(axis_ty, te.thread_axis("threadIdx.y")) + schedule[output].bind(axis_tx, te.thread_axis("threadIdx.x")) + schedule[output].reorder( + axis_n, + axis_bf, + axis_by, + axis_bx, + axis_vf, + axis_vy, + axis_vx, + axis_tf, + axis_ty, + axis_tx, + axis_fi, + axis_yi, + axis_xi, + ) + schedule[cache_write_ol].compute_at(schedule[output], axis_tx) # tile and bind reduction axes - n, f, y, x = s[OL].op.axis - rc, ry, rx = s[OL].op.reduce_axis - cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=3) - cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=3) - cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=3) - rco, rcm, rci = cfg["tile_rc"].apply(s, OL, rc) - ryo, rym, ryi = cfg["tile_rx"].apply(s, OL, ry) - rxo, rxm, rxi = cfg["tile_ry"].apply(s, OL, rx) - s[OL].reorder(rco, ryo, rxo, rcm, rym, rxm, rci, ryi, rxi, n, f, y, x) - - s[AA].compute_at(s[OL], rxo) - s[WW].compute_at(s[OL], rxo) - s[AL].compute_at(s[OL], rxm) - s[WL].compute_at(s[OL], rxm) + axis_n, axis_f, axis_y, axis_x = schedule[cache_write_ol].op.axis + axis_rc, axis_ry, axis_rx = schedule[cache_write_ol].op.reduce_axis + cfg.define_split("tile_rc", cfg.axis(axis_rc), num_outputs=3) + cfg.define_split("tile_ry", cfg.axis(axis_ry), num_outputs=3) + cfg.define_split("tile_rx", cfg.axis(axis_rx), 
num_outputs=3) + axis_rco, axis_rcm, axis_rci = cfg["tile_rc"].apply(schedule, cache_write_ol, axis_rc) + axis_ryo, axis_rym, axis_ryi = cfg["tile_rx"].apply(schedule, cache_write_ol, axis_ry) + axis_rxo, axis_rxm, axis_rxi = cfg["tile_ry"].apply(schedule, cache_write_ol, axis_rx) + schedule[cache_write_ol].reorder( + axis_rco, + axis_ryo, + axis_rxo, + axis_rcm, + axis_rym, + axis_rxm, + axis_rci, + axis_ryi, + axis_rxi, + axis_n, + axis_f, + axis_y, + axis_x, + ) + + schedule[cache_read_aa].compute_at(schedule[cache_write_ol], axis_rxo) + schedule[cache_read_ww].compute_at(schedule[cache_write_ol], axis_rxo) + schedule[cache_read_al].compute_at(schedule[cache_write_ol], axis_rxm) + schedule[cache_read_wl].compute_at(schedule[cache_write_ol], axis_rxm) # cooperative fetching - for load in [AA, WW]: - n, f, y, x = s[load].op.axis - fused = s[load].fuse(n, f, y, x) - tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2]) - ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) - tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) - s[load].bind(tz, te.thread_axis("threadIdx.z")) - s[load].bind(ty, te.thread_axis("threadIdx.y")) - s[load].bind(tx, te.thread_axis("threadIdx.x")) + for load in [cache_read_aa, cache_read_ww]: + axis_n, axis_f, axis_y, axis_x = schedule[load].op.axis + fused = schedule[load].fuse(axis_n, axis_f, axis_y, axis_x) + axis_tz, fused = schedule[load].split(fused, nparts=cfg["tile_f"].size[2]) + axis_ty, fused = schedule[load].split(fused, nparts=cfg["tile_y"].size[2]) + axis_tx, fused = schedule[load].split(fused, nparts=cfg["tile_x"].size[2]) + schedule[load].bind(axis_tz, te.thread_axis("threadIdx.z")) + schedule[load].bind(axis_ty, te.thread_axis("threadIdx.y")) + schedule[load].bind(axis_tx, te.thread_axis("threadIdx.x")) # tune unroll cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) cfg.define_knob("unroll_explicit", [0, 1]) - s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) - s[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val) + schedule[output].pragma( + kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val + ) + schedule[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val) - return s, [data, kernel, conv] + return schedule, [data, kernel, conv] def teardown_module(): + """Remove the module from the autotvm task tables.""" # TODO(areusch): Tasks should not be registered into a global. del autotvm.task.task.TASK_TABLE["testing/conv2d_no_batching"] @@ -158,8 +193,10 @@ def run_test_with_all_multiprocessing(func, *args, **kwargs): @tvm.testing.parametrize_targets("cuda", "opencl") -def test_tuning_gpu(target, dev): - def runner(target, dev): +def test_tuning_gpu(target): + """Test gpu tuning.""" + + def runner(target): # init task task, target = get_sample_task(target, None) logging.info("task config space: %s", task.config_space) @@ -181,22 +218,21 @@ def runner(target, dev): r for r in results if r.error_no == autotvm.MeasureErrorNo.NO_ERROR - # Autotvm can filter some records before building if we know they won't work ahead of time. - # We can't guarantee we sample at least one good record so we count these as success too + # We filter records before building if we know they won't work ahead of time. 
+ # We can't guarantee we get one good record so we count these as success too or r.error_no == autotvm.MeasureErrorNo.INSTANTIATION_ERROR ] assert len(successful_results) > 0, f"No successful tuning runs: {results!r}" - run_test_with_all_multiprocessing(runner, target, dev) + run_test_with_all_multiprocessing(runner, target) @tvm.testing.parametrize_targets("cuda", "opencl") -def test_tuning_gpu_inherits_pass_context(target, dev): +def test_tuning_gpu_inherits_pass_context(target): """Autotvm tuner inherits PassContexts but also adds a gpu verification pass by default. Test that using PassContext inherits passes properly but also runs gpu verification pass. """ - from tvm.tir.analysis import _ffi_api as _analysis_ffi_api @pass_instrument class PassInstrumentChecker: @@ -205,7 +241,7 @@ class PassInstrumentChecker: def __init__(self): self.has_been_run = False - def run_after_pass(self, mod, info): + def run_after_pass(self, *_): self.has_been_run = True class GPUVerifyPassMocked: @@ -274,10 +310,12 @@ def __init__( do_fork=False, runtime=None, ): + # pylint: disable=too-many-function-args super().__init__(timeout, n_parallel, build_kwargs, build_func, do_fork, runtime) + self.build_func = OverwrittenBuildFunc(tar.tar, runtime) - def runner(target, dev): + def runner(target): task, target = get_sample_task(target, None) logging.info("task config space: %s", task.config_space) @@ -295,10 +333,12 @@ def runner(target, dev): assert len(results) == 1 - run_test_with_all_multiprocessing(runner, target, dev) + run_test_with_all_multiprocessing(runner, target) def test_tuning_cpu(): + """Test tuning on cpu.""" + def runner(): ir_mod = tvm.parser.fromtext( textwrap.dedent( diff --git a/tests/python/integration/test_winograd_nnpack.py b/tests/python/integration/test_winograd_nnpack.py index 71091f69d9649..b088b350c9f0c 100644 --- a/tests/python/integration/test_winograd_nnpack.py +++ b/tests/python/integration/test_winograd_nnpack.py @@ -14,18 +14,18 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+"""Test winograd convolution using nnpack impl.""" import numpy as np +from pytest import skip + import tvm -from tvm import te -from tvm import autotvm +import tvm.testing +import tvm.topi.testing +from tvm import autotvm, te, topi from tvm.autotvm.task.space import FallbackConfigEntity from tvm.contrib import nnpack from tvm.contrib.pickle_memoize import memoize -from tvm import topi -import tvm.topi.testing from tvm.topi.utils import get_const_tuple -from pytest import skip -import tvm.testing def verify_conv2d_nchw( @@ -36,11 +36,12 @@ def verify_conv2d_nchw( kernel, stride, padding, + devices, dilation=1, add_bias=False, add_relu=False, - devices=["cuda", "llvm -device=arm_cpu", "opencl -device=mali"], ): + """Verify conv2d nchw workload.""" print( "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation) @@ -48,14 +49,14 @@ def verify_conv2d_nchw( in_height = in_width = in_size - A = te.placeholder((batch, in_channel, in_height, in_width), name="A") - W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W") + placholder_a = te.placeholder((batch, in_channel, in_height, in_width), name="A") + placeholder_w = te.placeholder((num_filter, in_channel, kernel, kernel), name="W") bias = te.placeholder((num_filter, 1, 1), name="bias") - a_shape = get_const_tuple(A.shape) - w_shape = get_const_tuple(W.shape) + a_shape = get_const_tuple(placholder_a.shape) + w_shape = get_const_tuple(placeholder_w.shape) bias_shape = get_const_tuple(bias.shape) - dtype = A.dtype + dtype = placholder_a.dtype @memoize("topi.tests.test_topi_conv2d_nchw.verify_conv2d_nchw") def get_ref_data(): @@ -79,42 +80,52 @@ def check_device(device): print("Skipping %s becuase it is not enabled" % device) print("Running on target: %s" % device) with tvm.target.Target(device): - C = topi.nn.conv2d(A, W, stride, padding, dilation, layout="NCHW", out_dtype=dtype) + result_c = topi.nn.conv2d( + placholder_a, + placeholder_w, + stride, + padding, + dilation, + layout="NCHW", + out_dtype=dtype, + ) if add_bias: - C = topi.add(C, bias) + result_c = topi.add(result_c, bias) if add_relu: - C = topi.nn.relu(C) - s = topi.generic.schedule_conv2d_nchw([C]) + result_c = topi.nn.relu(result_c) + schedule = topi.generic.schedule_conv2d_nchw([result_c]) - a = tvm.nd.array(a_np, dev) - w = tvm.nd.array(w_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) + buff_a = tvm.nd.array(a_np, dev) + buff_w = tvm.nd.array(w_np, dev) + buff_b = tvm.nd.array(b_np, dev) + buff_c = tvm.nd.array(np.zeros(get_const_tuple(result_c.shape), dtype=result_c.dtype), dev) if add_bias: func = tvm.build( - s, - [A, W, bias, C], + schedule, + [placholder_a, placeholder_w, bias, result_c], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation), ) - func(a, w, b, c) + func(buff_a, buff_w, buff_b, buff_c) else: func = tvm.build( - s, - [A, W, C], + schedule, + [placholder_a, placeholder_w, result_c], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation), ) - func(a, w, c) - tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-4) + func(buff_a, buff_w, buff_c) + tvm.testing.assert_allclose(buff_c.numpy(), c_np, rtol=1e-4) for device in devices: check_device(device) class WinogradFallback(autotvm.FallbackContext): + """Winograd fallbacks.""" + def _query_inside(self, target, workload): key = (target, 
workload) if key in self.memory: @@ -126,6 +137,8 @@ def _query_inside(self, target, workload): def test_conv2d_nchw(): + """Verify conv2d nchw winograd works.""" + if not tvm.get_global_func( "tvm.contrib.nnpack.convolution_inference_without_weight_transform", True ): From 2c365b49b821484a988f82e805510b64809c4d1c Mon Sep 17 00:00:00 2001 From: Rafael Stahl Date: Fri, 8 Jul 2022 02:12:37 +0200 Subject: [PATCH 070/111] [TIR] fix crash when comparing IntImm to None (#12034) * [TIR] fix crash when comparing IntImm to None * [TIR] raise ValueError when comparing IntImm to None * fix: add test for non-pytest run --- src/tir/op/op.cc | 2 ++ tests/python/unittest/test_tir_base.py | 12 ++++++++++++ 2 files changed, 14 insertions(+) diff --git a/src/tir/op/op.cc b/src/tir/op/op.cc index 73249921bf3be..456453a274290 100644 --- a/src/tir/op/op.cc +++ b/src/tir/op/op.cc @@ -99,6 +99,8 @@ PrimExpr q_multiply_shift(PrimExpr x, PrimExpr y, PrimExpr q, PrimExpr s, Span s // The public function with a quick checking path. void BinaryOpMatchTypes(PrimExpr& lhs, PrimExpr& rhs, Span span) { // NOLINT(*) + CHECK(lhs.defined()) << "ValueError: `lhs` is null in the binary operator"; + CHECK(rhs.defined()) << "ValueError: `rhs` is null in the binary operator"; if (lhs.dtype() == rhs.dtype()) return; DataType ltype = lhs.dtype(); DataType rtype = rhs.dtype(); diff --git a/tests/python/unittest/test_tir_base.py b/tests/python/unittest/test_tir_base.py index 66f3ef9e599f3..3a67502313307 100644 --- a/tests/python/unittest/test_tir_base.py +++ b/tests/python/unittest/test_tir_base.py @@ -118,8 +118,20 @@ def test_exception(): x = tir.Var(name=1, dtype="int") +def test_eq_ops(): + a = tir.IntImm("int8", 1) + with pytest.raises(ValueError): + assert a != None + with pytest.raises(ValueError): + assert not a == None + b = tir.StringImm("abc") + assert b != None + assert not b == None + + if __name__ == "__main__": test_scalar_add() test_ret_const() test_control_flow_jump() test_exception() + test_eq_ops() From 3c4d8870ca2782e5dfb8c7b10180668e25c08150 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Thu, 7 Jul 2022 17:31:04 -0700 Subject: [PATCH 071/111] [MetaSchedule][Testing] Add unittests for C1D search space (#12036) --- .../unittest/test_meta_schedule_space_cpu.py | 179 ++++++++++++++++++ .../unittest/test_meta_schedule_space_cuda.py | 4 +- 2 files changed, 181 insertions(+), 2 deletions(-) create mode 100644 tests/python/unittest/test_meta_schedule_space_cpu.py diff --git a/tests/python/unittest/test_meta_schedule_space_cpu.py b/tests/python/unittest/test_meta_schedule_space_cpu.py new file mode 100644 index 0000000000000..c4cfc222e42d6 --- /dev/null +++ b/tests/python/unittest/test_meta_schedule_space_cpu.py @@ -0,0 +1,179 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Tests for MetaSchedule search space on CPU""" +from tvm import meta_schedule as ms +from tvm.meta_schedule.testing.space_generation import check_sketches, print_sketches +from tvm.meta_schedule.testing.te_workload import create_te_workload +from tvm.script import tir as T +from tvm.target import Target + + +def _target(): + return Target("aws/cpu/c5.9xlarge") + + +def test_cpu_c1d(): + # fmt: off + @T.prim_func + def c1d_0(inputs: T.Buffer[(1, 256, 64), "float32"], weight: T.Buffer[(3, 64, 128), "float32"], conv1d_nlc: T.Buffer[(1, 128, 128), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + with T.block("root"): + T.reads() + T.writes() + T.block_attr({"meta_schedule.parallel":288, "meta_schedule.unroll_explicit":512, "meta_schedule.vectorize":64}) + PadInput = T.alloc_buffer([1, 258, 64], dtype="float32") + conv1d_nlc_global = T.alloc_buffer([1, 128, 128], dtype="float32") + for i0, i1, i2 in T.grid(1, 258, 64): + with T.block("PadInput"): + i0_1, i1_1, i2_1 = T.axis.remap("SSS", [i0, i1, i2]) + T.reads(inputs[i0_1, i1_1 - 1, i2_1]) + T.writes(PadInput[i0_1, i1_1, i2_1]) + PadInput[i0_1, i1_1, i2_1] = T.if_then_else(1 <= i1_1 and i1_1 < 257, inputs[i0_1, i1_1 - 1, i2_1], T.float32(0), dtype="float32") + for i0_0, i1_0, i2_0, i0_1_1, i1_1_1, i2_1_1 in T.grid(1, 1, 2, 1, 1, 8): + for i3_0, i4_0, i0_2, i1_2, i2_2, i3_1, i4_1, i0_3, i1_3, i2_3 in T.grid(1, 64, 1, 64, 8, 3, 1, 1, 2, 1): + with T.block("conv1d_nlc"): + n = T.axis.spatial(1, i0_0 + i0_1_1 + i0_2 + i0_3) + l = T.axis.spatial(128, i1_1_1 * 128 + i1_0 * 128 + i1_2 * 2 + i1_3) + co = T.axis.spatial(128, (i2_0 * 8 + i2_1_1) * 8 + i2_2 + i2_3) + rl = T.axis.reduce(3, i3_0 * 3 + i3_1) + rc = T.axis.reduce(64, i4_0 + i4_1) + T.reads(PadInput[n, l * 2 + rl, co // 128 * 64 + rc], weight[rl, rc, co]) + T.writes(conv1d_nlc_global[n, l, co]) + T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"}) + with T.init(): + conv1d_nlc_global[n, l, co] = T.float32(0) + conv1d_nlc_global[n, l, co] = conv1d_nlc_global[n, l, co] + PadInput[n, l * 2 + rl, co // 128 * 64 + rc] * weight[rl, rc, co] + for ax0, ax1, ax2 in T.grid(1, 128, 8): + with T.block("conv1d_nlc_global"): + v0, v1 = T.axis.remap("SS", [ax0, ax1]) + v2 = T.axis.spatial(128, i2_0 * 64 + i2_1_1 * 8 + ax2) + T.reads(conv1d_nlc_global[v0, v1, v2]) + T.writes(conv1d_nlc[v0, v1, v2]) + conv1d_nlc[v0, v1, v2] = conv1d_nlc_global[v0, v1, v2] + @T.prim_func + def c1d_1(inputs: T.Buffer[(1, 256, 64), "float32"], weight: T.Buffer[(3, 64, 128), "float32"], conv1d_nlc: T.Buffer[(1, 128, 128), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + with T.block("root"): + T.reads() + T.writes() + T.block_attr({"meta_schedule.parallel":288, "meta_schedule.unroll_explicit":512, "meta_schedule.vectorize":64}) + PadInput = T.alloc_buffer([1, 258, 64], dtype="float32") + conv1d_nlc_global = T.alloc_buffer([1, 128, 128], dtype="float32") + for i0_0, i1_0, i2_0 in T.grid(1, 1, 2): + for i0_1, i1_1, i2_1 in T.grid(1, 1, 8): + for ax0, ax1, ax2 in T.grid(1, 257, 64): + with T.block("PadInput"): + i0 = T.axis.spatial(1, ax0) + i1 = T.axis.spatial(258, ax1) + i2 = T.axis.spatial(64, ax2) + T.reads(inputs[i0, i1 - 1, i2]) + T.writes(PadInput[i0, i1, i2]) + PadInput[i0, i1, i2] = T.if_then_else(1 <= i1 and i1 < 257, inputs[i0, i1 - 1, i2], T.float32(0), dtype="float32") + for i3_0, i4_0, i0_2, i1_2, i2_2, i3_1, i4_1, i0_3, i1_3, i2_3 in T.grid(1, 64, 1, 64, 8, 3, 1, 1, 2, 1): + 
with T.block("conv1d_nlc"): + n = T.axis.spatial(1, i0_0 + i0_1 + i0_2 + i0_3) + l = T.axis.spatial(128, i1_1 * 128 + i1_0 * 128 + i1_2 * 2 + i1_3) + co = T.axis.spatial(128, (i2_0 * 8 + i2_1) * 8 + i2_2 + i2_3) + rl = T.axis.reduce(3, i3_0 * 3 + i3_1) + rc = T.axis.reduce(64, i4_0 + i4_1) + T.reads(PadInput[n, l * 2 + rl, co // 128 * 64 + rc], weight[rl, rc, co]) + T.writes(conv1d_nlc_global[n, l, co]) + T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"}) + with T.init(): + conv1d_nlc_global[n, l, co] = T.float32(0) + conv1d_nlc_global[n, l, co] = conv1d_nlc_global[n, l, co] + PadInput[n, l * 2 + rl, co // 128 * 64 + rc] * weight[rl, rc, co] + for ax0, ax1, ax2 in T.grid(1, 128, 64): + with T.block("conv1d_nlc_global"): + v0, v1 = T.axis.remap("SS", [ax0, ax1]) + v2 = T.axis.spatial(128, i2_0 * 64 + ax2) + T.reads(conv1d_nlc_global[v0, v1, v2]) + T.writes(conv1d_nlc[v0, v1, v2]) + conv1d_nlc[v0, v1, v2] = conv1d_nlc_global[v0, v1, v2] + + @T.prim_func + def c1d_2(inputs: T.Buffer[(1, 256, 64), "float32"], weight: T.Buffer[(3, 64, 128), "float32"], conv1d_nlc: T.Buffer[(1, 128, 128), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + with T.block("root"): + T.reads() + T.writes() + T.block_attr({"meta_schedule.parallel":288, "meta_schedule.unroll_explicit":16, "meta_schedule.vectorize":64}) + for i0_0, i1_0, i2_0, i0_1, i1_1, i2_1, i3_0, i4_0, i0_2, i1_2, i2_2, i3_1, i4_1, i0_3, i1_3, i2_3 in T.grid(1, 1, 2, 1, 1, 8, 1, 64, 1, 64, 8, 3, 1, 1, 2, 1): + with T.block("conv1d_nlc"): + n = T.axis.spatial(1, i0_0 + i0_1 + i0_2 + i0_3) + l = T.axis.spatial(128, i1_1 * 128 + i1_0 * 128 + i1_2 * 2 + i1_3) + co = T.axis.spatial(128, (i2_0 * 8 + i2_1) * 8 + i2_2 + i2_3) + rl = T.axis.reduce(3, i3_0 * 3 + i3_1) + rc = T.axis.reduce(64, i4_0 + i4_1) + T.reads(inputs[n, l * 2 + rl - 1, co // 128 * 64 + rc], weight[rl, rc, co]) + T.writes(conv1d_nlc[n, l, co]) + T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"}) + with T.init(): + conv1d_nlc[n, l, co] = T.float32(0) + conv1d_nlc[n, l, co] = conv1d_nlc[n, l, co] + T.if_then_else(1 <= l * 2 + rl and l * 2 + rl < 257, inputs[n, l * 2 + rl - 1, co // 128 * 64 + rc], T.float32(0), dtype="float32") * weight[rl, rc, co] + # fmt: on + + decision_0 = [ + ("SamplePerfectTile", [1, 1, 1, 1]), + ("SamplePerfectTile", [1, 1, 64, 2]), + ("SamplePerfectTile", [2, 8, 8, 1]), + ("SamplePerfectTile", [1, 3]), + ("SamplePerfectTile", [64, 1]), + ("SampleCategorical", 3), + ("SampleComputeLocation", -1), + ] + decision_1 = [ + ("SamplePerfectTile", [1, 1, 1, 1]), + ("SamplePerfectTile", [1, 1, 64, 2]), + ("SamplePerfectTile", [2, 8, 8, 1]), + ("SamplePerfectTile", [1, 3]), + ("SamplePerfectTile", [64, 1]), + ("SampleCategorical", 3), + ("SampleComputeLocation", 5), + ] + decision_2 = [ + ("SamplePerfectTile", [1, 1, 1, 1]), + ("SamplePerfectTile", [1, 1, 64, 2]), + ("SamplePerfectTile", [2, 8, 8, 1]), + ("SamplePerfectTile", [1, 3]), + ("SamplePerfectTile", [64, 1]), + ("SampleCategorical", 1), + ("SampleComputeLocation", -2), + ] + + mod = create_te_workload("C1D", 0) + actual = ms.TuneContext( + mod=mod, + target=_target(), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules="default", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[c1d_0, c1d_1, c1d_2], + expected_decisions=[decision_0, decision_1, decision_2], + ) + + +if __name__ == "__main__": + test_cpu_c1d() diff --git a/tests/python/unittest/test_meta_schedule_space_cuda.py 
b/tests/python/unittest/test_meta_schedule_space_cuda.py index e2c324cfda521..1ead63b9c115b 100644 --- a/tests/python/unittest/test_meta_schedule_space_cuda.py +++ b/tests/python/unittest/test_meta_schedule_space_cuda.py @@ -29,7 +29,7 @@ def _target(): def test_cuda_c1d(): # fmt: off @T.prim_func - def mod_0(inputs: T.Buffer[(1, 256, 64), "float32"], weight: T.Buffer[(3, 64, 128), "float32"], conv1d_nlc: T.Buffer[(1, 128, 128), "float32"]) -> None: + def c1d_0(inputs: T.Buffer[(1, 256, 64), "float32"], weight: T.Buffer[(3, 64, 128), "float32"], conv1d_nlc: T.Buffer[(1, 128, 128), "float32"]) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) # body @@ -106,7 +106,7 @@ def mod_0(inputs: T.Buffer[(1, 256, 64), "float32"], weight: T.Buffer[(3, 64, 12 check_sketches( mod, sketches=actual, - expected_mods=[mod_0], + expected_mods=[c1d_0], expected_decisions=[decision_0], ) From c412450f8ebe0a27b13c51c79c355edc95a893aa Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Fri, 8 Jul 2022 10:09:37 +0100 Subject: [PATCH 072/111] [TVMC] Updates TVMC tutorial with input shape information (#12031) The tutorial is currently broken, probably because of updates to the model, so we now need to pass input shape information. Co-Authored-By: Liam Sturge Co-authored-by: Liam Sturge --- gallery/tutorial/tvmc_command_line_driver.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gallery/tutorial/tvmc_command_line_driver.py b/gallery/tutorial/tvmc_command_line_driver.py index ad5b37190c103..8a60f12a05b28 100644 --- a/gallery/tutorial/tvmc_command_line_driver.py +++ b/gallery/tutorial/tvmc_command_line_driver.py @@ -94,7 +94,7 @@ # # .. code-block:: bash # -# wget https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-v2-7.onnx +# wget https://github.com/onnx/models/raw/652f4e4af7975c8e7a505c4b6e0f8ac72d8260ea/vision/classification/resnet/model/resnet50-v2-7.onnx # ################################################################################ @@ -131,6 +131,7 @@ # # This may take several minutes depending on your machine # tvmc compile \ # --target "llvm" \ +# --input-shapes "data:[1,3,224,224]" \ # --output resnet50-v2-7-tvm.tar \ # resnet50-v2-7.onnx # From f769f4e2cc9f00c1d5cbf0b312dae7bfa2404841 Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Fri, 8 Jul 2022 15:46:18 +0100 Subject: [PATCH 073/111] [microNPU] Test average pool partitioning (#11965) Follow up for #11469. Change-Id: I474b1d43d3abc6b66d35ebcf3ad6fea50becfb97 --- .../contrib/test_ethosu/test_partition.py | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 tests/python/contrib/test_ethosu/test_partition.py diff --git a/tests/python/contrib/test_ethosu/test_partition.py b/tests/python/contrib/test_ethosu/test_partition.py new file mode 100644 index 0000000000000..578485c8aa881 --- /dev/null +++ b/tests/python/contrib/test_ethosu/test_partition.py @@ -0,0 +1,65 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=wrong-import-position + +""" +Tests to check that the NPU partitioning frontend partitions +only supported operations. +""" + +import pytest + +pytest.importorskip("ethosu.vela") + +import tvm +from tvm import relay +from tvm.relay.op.contrib import ethosu + + +@pytest.mark.parametrize( + "count_include_pad,pool_shape,padding", + [ + (True, [2, 2], [0, 0, 0, 0]), + (False, [2, 2], [4, 4, 5, 5]), + (False, [9, 9], [1, 1, 1, 1]), + ], +) +def test_invalid_avg_pool2d(count_include_pad, pool_shape, padding): + """ + Test that unsupported variants of avg_pool2d don't get partitioned. + """ + ifm_shape = [1, 4, 4, 3] + strides = [2, 2] + + def get_graph(): + x = relay.var("x", shape=ifm_shape, dtype="int8") + x = relay.cast(x, dtype="int32") + x = relay.nn.avg_pool2d( + x, + pool_shape, + strides, + padding=padding, + layout="NHWC", + count_include_pad=count_include_pad, + ) + x = relay.cast(x, dtype="int8") + func = relay.Function(relay.analysis.free_vars(x), x) + return tvm.IRModule.from_expr(func) + + mod = relay.transform.InferType()(get_graph()) + partitioned_mod = ethosu.partition_for_ethosu(mod) + assert tvm.ir.structural_equal(mod, partitioned_mod) From 684a8381608f0978ea91539af7c9d3c2f6e85eaa Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Fri, 8 Jul 2022 10:25:56 -0700 Subject: [PATCH 074/111] [TIR] Avoid unnecessary dtype escalation in loop splitting (#12035) This PR introduces a type check that casts loop split decisions (sometimes given as `int64`) back to a smaller datatype when the loop variable's data type is smaller. This issue usually happens when reloading a trace from disk via the JSON database, and it causes the `CompactBufferAllocation` pass to fail.
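For illustration, a minimal sketch of the failure mode this guards against, modeled on the unit test added below; the workload, the variable names, and the factor 8 are illustrative, not taken verbatim from the PR:

    import tvm
    from tvm import te, tir
    from tvm.tir.expr import IntImm

    # Illustrative workload (not from this PR): a single loop whose extent
    # is an ordinary 32-bit integer.
    placeholder_a = te.placeholder((128,), name="A")
    result_b = te.compute((128,), lambda i: placeholder_a[i] + 1.0, name="B")
    sch = tir.Schedule(te.create_prim_func([placeholder_a, result_b]))

    (loop_i,) = sch.get_loops(sch.get_block("B"))
    # A trace reloaded from a JSON database records split factors as int64.
    # Before this fix, the int64 factor would escalate the dtype of the new
    # loop variables, which later broke the CompactBufferAllocation pass.
    sch.split(loop_i, factors=[IntImm(dtype="int64", value=8), None])

After this change, `ConcreteScheduleNode::Split` casts such a factor down to the loop extent's dtype, so the split loop variables keep their original, smaller dtype.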
--- src/tir/schedule/concrete_schedule.cc | 3 +++ tests/python/unittest/test_tir_schedule_split_fuse.py | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/src/tir/schedule/concrete_schedule.cc b/src/tir/schedule/concrete_schedule.cc index c19735025ddc4..35f31ac9165cf 100644 --- a/src/tir/schedule/concrete_schedule.cc +++ b/src/tir/schedule/concrete_schedule.cc @@ -452,6 +452,9 @@ Array ConcreteScheduleNode::Split(const LoopRV& loop_rv, if (is_const_int(factor) && !is_positive_const(factor)) { throw NonPositiveFactorError(state_->mod, factor.as()->value, i); } + if (factor.dtype().bits() > loop->extent.dtype().bits()) { + factor = cast(loop->extent.dtype(), factor); + } factors.push_back(factor); tot_length *= factor; } diff --git a/tests/python/unittest/test_tir_schedule_split_fuse.py b/tests/python/unittest/test_tir_schedule_split_fuse.py index 0bfac4e425b95..9fd678174dc0c 100644 --- a/tests/python/unittest/test_tir_schedule_split_fuse.py +++ b/tests/python/unittest/test_tir_schedule_split_fuse.py @@ -20,6 +20,7 @@ import tvm.testing from tvm import te, tir from tvm.script import tir as T +from tvm.tir.expr import IntImm from tvm.tir.schedule.testing import verify_trace_roundtrip # pylint: disable=no-member,invalid-name,unused-variable @@ -637,5 +638,13 @@ def _create_prim_func(): ) +def test_split_int64_factors(): + sch = tir.Schedule(elementwise_symbolic, debug_mask="all") + block_b = sch.get_block("B") + _, _, k = sch.get_loops(block_b) + sch.split(k, factors=[IntImm(dtype="int64", value=10), None]) + tvm.ir.assert_structural_equal(elementwise_symbolic_split, sch.mod["main"]) + + if __name__ == "__main__": tvm.testing.main() From a81e69a68fbf29231ea2ca1e5a14780e8a83e839 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Fri, 8 Jul 2022 13:10:54 -0700 Subject: [PATCH 075/111] [MetaSchedule][Test] Add unittests for C2D (#12043) --- .../unittest/test_meta_schedule_space_cpu.py | 177 ++++++++++++++++++ .../unittest/test_meta_schedule_space_cuda.py | 92 ++++++++- 2 files changed, 268 insertions(+), 1 deletion(-) diff --git a/tests/python/unittest/test_meta_schedule_space_cpu.py b/tests/python/unittest/test_meta_schedule_space_cpu.py index c4cfc222e42d6..d6bfbde71fecd 100644 --- a/tests/python/unittest/test_meta_schedule_space_cpu.py +++ b/tests/python/unittest/test_meta_schedule_space_cpu.py @@ -175,5 +175,182 @@ def c1d_2(inputs: T.Buffer[(1, 256, 64), "float32"], weight: T.Buffer[(3, 64, 12 ) +def test_cpu_c2d(): + # fmt: off + @T.prim_func + def c2d_0(inputs: T.Buffer[(1, 224, 224, 3), "float32"], weight: T.Buffer[(7, 7, 3, 64), "float32"], conv2d_nhwc: T.Buffer[(1, 112, 112, 64), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + with T.block("root"): + T.reads() + T.writes() + T.block_attr({"meta_schedule.parallel":288, "meta_schedule.unroll_explicit":16, "meta_schedule.vectorize":64}) + PadInput = T.alloc_buffer([1, 230, 230, 3], dtype="float32") + conv2d_nhwc_global = T.alloc_buffer([1, 112, 112, 64], dtype="float32") + for i0_0, i1_0, i2_0, i3_0, i0_1, i1_1, i2_1 in T.grid(1, 7, 4, 2, 1, 1, 28): + for ax0, ax1, ax2, ax3 in T.grid(1, 37, 7, 3): + with T.block("PadInput"): + i0 = T.axis.spatial(1, ax0) + i1 = T.axis.spatial(230, i1_0 * 32 + ax1) + i2 = T.axis.spatial(230, i2_0 * 56 + i2_1 * 2 + ax2) + i3 = T.axis.spatial(3, ax3) + T.reads(inputs[i0, i1 - 3, i2 - 3, i3]) + T.writes(PadInput[i0, i1, i2, i3]) + PadInput[i0, i1, i2, i3] = T.if_then_else(3 <= i1 and i1 < 227 and 3 <= i2 and i2 < 227, inputs[i0, i1 - 
3, i2 - 3, i3], T.float32(0), dtype="float32") + for i3_1 in T.serial(8): + for i4_0, i5_0, i6_0, i0_2, i1_2, i2_2, i3_2, i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3 in T.grid(7, 7, 1, 1, 2, 1, 1, 1, 1, 3, 1, 8, 1, 4): + with T.block("conv2d_nhwc"): + n = T.axis.spatial(1, i0_3 + i0_2 + i0_1 + i0_0) + h = T.axis.spatial(112, ((i1_0 + i1_1) * 2 + i1_2) * 8 + i1_3) + w = T.axis.spatial(112, i2_0 * 28 + i2_1 + i2_2 + i2_3) + co = T.axis.spatial(64, (i3_0 * 8 + i3_1 + i3_2) * 4 + i3_3) + rh = T.axis.reduce(7, i4_0 + i4_1) + rw = T.axis.reduce(7, i5_0 + i5_1) + rc = T.axis.reduce(3, i6_0 * 3 + i6_1) + T.reads(PadInput[n, h * 2 + rh, w * 2 + rw, co // 64 * 3 + rc], weight[rh, rw, rc, co]) + T.writes(conv2d_nhwc_global[n, h, w, co]) + T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"}) + with T.init(): + conv2d_nhwc_global[n, h, w, co] = T.float32(0) + conv2d_nhwc_global[n, h, w, co] = conv2d_nhwc_global[n, h, w, co] + PadInput[n, h * 2 + rh, w * 2 + rw, co // 64 * 3 + rc] * weight[rh, rw, rc, co] + for ax0, ax1, ax2, ax3 in T.grid(1, 16, 1, 4): + with T.block("conv2d_nhwc_global"): + v0 = T.axis.spatial(1, ax0) + v1 = T.axis.spatial(112, i1_0 * 16 + ax1) + v2 = T.axis.spatial(112, i2_0 * 28 + i2_1 + ax2) + v3 = T.axis.spatial(64, i3_0 * 32 + i3_1 * 4 + ax3) + T.reads(conv2d_nhwc_global[v0, v1, v2, v3]) + T.writes(conv2d_nhwc[v0, v1, v2, v3]) + conv2d_nhwc[v0, v1, v2, v3] = conv2d_nhwc_global[v0, v1, v2, v3] + @T.prim_func + def c2d_1(inputs: T.Buffer[(1, 224, 224, 3), "float32"], weight: T.Buffer[(7, 7, 3, 64), "float32"], conv2d_nhwc: T.Buffer[(1, 112, 112, 64), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + with T.block("root"): + T.reads() + T.writes() + T.block_attr({"meta_schedule.parallel":288, "meta_schedule.unroll_explicit":512, "meta_schedule.vectorize":64}) + PadInput = T.alloc_buffer([1, 230, 230, 3], dtype="float32") + conv2d_nhwc_global = T.alloc_buffer([1, 112, 112, 64], dtype="float32") + for i0, i1, i2, i3 in T.grid(1, 230, 230, 3): + with T.block("PadInput"): + i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) + T.reads(inputs[i0_1, i1_1 - 3, i2_1 - 3, i3_1]) + T.writes(PadInput[i0_1, i1_1, i2_1, i3_1]) + PadInput[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(3 <= i1_1 and i1_1 < 227 and 3 <= i2_1 and i2_1 < 227, inputs[i0_1, i1_1 - 3, i2_1 - 3, i3_1], T.float32(0), dtype="float32") + for i0_0, i1_0, i2_0, i3_0 in T.grid(1, 7, 4, 2): + for i0_1_1, i1_1_1, i2_1_1, i3_1_1, i4_0, i5_0, i6_0, i0_2, i1_2, i2_2, i3_2, i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3 in T.grid(1, 1, 28, 8, 7, 7, 1, 1, 2, 1, 1, 1, 1, 3, 1, 8, 1, 4): + with T.block("conv2d_nhwc"): + n = T.axis.spatial(1, i0_3 + i0_2 + i0_1_1 + i0_0) + h = T.axis.spatial(112, ((i1_0 + i1_1_1) * 2 + i1_2) * 8 + i1_3) + w = T.axis.spatial(112, i2_0 * 28 + i2_1_1 + i2_2 + i2_3) + co = T.axis.spatial(64, (i3_0 * 8 + i3_1_1 + i3_2) * 4 + i3_3) + rh = T.axis.reduce(7, i4_0 + i4_1) + rw = T.axis.reduce(7, i5_0 + i5_1) + rc = T.axis.reduce(3, i6_0 * 3 + i6_1) + T.reads(PadInput[n, h * 2 + rh, w * 2 + rw, co // 64 * 3 + rc], weight[rh, rw, rc, co]) + T.writes(conv2d_nhwc_global[n, h, w, co]) + T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"}) + with T.init(): + conv2d_nhwc_global[n, h, w, co] = T.float32(0) + conv2d_nhwc_global[n, h, w, co] = conv2d_nhwc_global[n, h, w, co] + PadInput[n, h * 2 + rh, w * 2 + rw, co // 64 * 3 + rc] * weight[rh, rw, rc, co] + for ax0, ax1, ax2, ax3 in T.grid(1, 16, 28, 32): + with T.block("conv2d_nhwc_global"): + v0 = 
T.axis.spatial(1, ax0) + v1 = T.axis.spatial(112, i1_0 * 16 + ax1) + v2 = T.axis.spatial(112, i2_0 * 28 + ax2) + v3 = T.axis.spatial(64, i3_0 * 32 + ax3) + T.reads(conv2d_nhwc_global[v0, v1, v2, v3]) + T.writes(conv2d_nhwc[v0, v1, v2, v3]) + conv2d_nhwc[v0, v1, v2, v3] = conv2d_nhwc_global[v0, v1, v2, v3] + @T.prim_func + def c2d_2(inputs: T.Buffer[(1, 224, 224, 3), "float32"], weight: T.Buffer[(7, 7, 3, 64), "float32"], conv2d_nhwc: T.Buffer[(1, 112, 112, 64), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + with T.block("root"): + T.reads() + T.writes() + T.block_attr({"meta_schedule.parallel":288, "meta_schedule.unroll_explicit":0, "meta_schedule.vectorize":64}) + PadInput = T.alloc_buffer([1, 230, 230, 3], dtype="float32") + for i0_0, i1_0 in T.grid(1, 7): + for ax0, ax1, ax2, ax3 in T.grid(1, 37, 229, 3): + with T.block("PadInput"): + i0 = T.axis.spatial(1, ax0) + i1 = T.axis.spatial(230, i1_0 * 32 + ax1) + i2 = T.axis.spatial(230, ax2) + i3 = T.axis.spatial(3, ax3) + T.reads(inputs[i0, i1 - 3, i2 - 3, i3]) + T.writes(PadInput[i0, i1, i2, i3]) + PadInput[i0, i1, i2, i3] = T.if_then_else(3 <= i1 and i1 < 227 and 3 <= i2 and i2 < 227, inputs[i0, i1 - 3, i2 - 3, i3], T.float32(0), dtype="float32") + for i2_0, i3_0, i0_1, i1_1, i2_1, i3_1, i4_0, i5_0, i6_0, i0_2, i1_2, i2_2, i3_2, i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3 in T.grid(4, 2, 1, 1, 28, 8, 7, 7, 1, 1, 2, 1, 1, 1, 1, 3, 1, 8, 1, 4): + with T.block("conv2d_nhwc"): + n = T.axis.spatial(1, i0_3 + i0_2 + i0_1 + i0_0) + h = T.axis.spatial(112, ((i1_0 + i1_1) * 2 + i1_2) * 8 + i1_3) + w = T.axis.spatial(112, i2_0 * 28 + i2_1 + i2_2 + i2_3) + co = T.axis.spatial(64, (i3_0 * 8 + i3_1 + i3_2) * 4 + i3_3) + rh = T.axis.reduce(7, i4_0 + i4_1) + rw = T.axis.reduce(7, i5_0 + i5_1) + rc = T.axis.reduce(3, i6_0 * 3 + i6_1) + T.reads(PadInput[n, h * 2 + rh, w * 2 + rw, co // 64 * 3 + rc], weight[rh, rw, rc, co]) + T.writes(conv2d_nhwc[n, h, w, co]) + T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"}) + with T.init(): + conv2d_nhwc[n, h, w, co] = T.float32(0) + conv2d_nhwc[n, h, w, co] = conv2d_nhwc[n, h, w, co] + PadInput[n, h * 2 + rh, w * 2 + rw, co // 64 * 3 + rc] * weight[rh, rw, rc, co] + # fmt: on + + decision_0 = [ + ("SamplePerfectTile", [1, 1, 1, 1]), + ("SamplePerfectTile", [7, 1, 2, 8]), + ("SamplePerfectTile", [4, 28, 1, 1]), + ("SamplePerfectTile", [2, 8, 1, 4]), + ("SamplePerfectTile", [7, 1]), + ("SamplePerfectTile", [7, 1]), + ("SamplePerfectTile", [1, 3]), + ("SampleCategorical", 1), + ("SampleComputeLocation", 6), + ] + decision_1 = [ + ("SamplePerfectTile", [1, 1, 1, 1]), + ("SamplePerfectTile", [7, 1, 2, 8]), + ("SamplePerfectTile", [4, 28, 1, 1]), + ("SamplePerfectTile", [2, 8, 1, 4]), + ("SamplePerfectTile", [7, 1]), + ("SamplePerfectTile", [7, 1]), + ("SamplePerfectTile", [1, 3]), + ("SampleCategorical", 3), + ("SampleComputeLocation", -1), + ] + decision_2 = [ + ("SamplePerfectTile", [1, 1, 1, 1]), + ("SamplePerfectTile", [7, 1, 2, 8]), + ("SamplePerfectTile", [4, 28, 1, 1]), + ("SamplePerfectTile", [2, 8, 1, 4]), + ("SamplePerfectTile", [7, 1]), + ("SamplePerfectTile", [7, 1]), + ("SamplePerfectTile", [1, 3]), + ("SampleCategorical", 0), + ("SampleComputeLocation", 1), + ] + + mod = create_te_workload("C2D", 0) + actual = ms.TuneContext( + mod=mod, + target=_target(), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules="default", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[c2d_0, c2d_1, 
c2d_2], + expected_decisions=[decision_0, decision_1, decision_2], + ) + + if __name__ == "__main__": test_cpu_c1d() + test_cpu_c2d() diff --git a/tests/python/unittest/test_meta_schedule_space_cuda.py b/tests/python/unittest/test_meta_schedule_space_cuda.py index 1ead63b9c115b..472a7ccc13deb 100644 --- a/tests/python/unittest/test_meta_schedule_space_cuda.py +++ b/tests/python/unittest/test_meta_schedule_space_cuda.py @@ -16,7 +16,7 @@ # under the License. """Tests for MetaSchedule search space on CUDA""" from tvm import meta_schedule as ms -from tvm.meta_schedule.testing.space_generation import check_sketches +from tvm.meta_schedule.testing.space_generation import check_sketches, print_sketches from tvm.meta_schedule.testing.te_workload import create_te_workload from tvm.script import tir as T from tvm.target import Target @@ -111,5 +111,95 @@ def c1d_0(inputs: T.Buffer[(1, 256, 64), "float32"], weight: T.Buffer[(3, 64, 12 ) +def test_cuda_c2d(): + # fmt: off + @T.prim_func + def c2d_0(inputs: T.Buffer[(1, 224, 224, 3), "float32"], weight: T.Buffer[(7, 7, 3, 64), "float32"], conv2d_nhwc: T.Buffer[(1, 112, 112, 64), "float32"]) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + with T.block("root"): + T.reads() + T.writes() + T.block_attr({"meta_schedule.unroll_explicit":16}) + conv2d_nhwc_local = T.alloc_buffer([1, 112, 112, 64], dtype="float32", scope="local") + PadInput_shared = T.alloc_buffer([1, 230, 230, 3], dtype="float32", scope="shared") + weight_shared = T.alloc_buffer([7, 7, 3, 64], dtype="float32", scope="shared") + for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(16, thread="blockIdx.x"): + for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(56, thread="vthread.x"): + for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(14, thread="threadIdx.x"): + for i4_0, i5_0, i6_0 in T.grid(1, 1, 1): + for ax0_ax1_ax2_ax3_fused in T.serial(80379): + with T.block("PadInput_shared"): + v0 = T.axis.spatial(1, 0) + v1 = T.axis.spatial(230, ax0_ax1_ax2_ax3_fused % 80379 // 351) + v2 = T.axis.spatial(230, i0_0_i1_0_i2_0_i3_0_fused // 8 * 112 + ax0_ax1_ax2_ax3_fused % 351 // 3) + v3 = T.axis.spatial(3, ax0_ax1_ax2_ax3_fused % 3) + T.reads(inputs[v0, v1 - 3, v2 - 3, v3]) + T.writes(PadInput_shared[v0, v1, v2, v3]) + T.block_attr({"meta_schedule.cooperative_fetch":2}) + PadInput_shared[v0, v1, v2, v3] = T.if_then_else(3 <= v1 and v1 < 227 and 3 <= v2 and v2 < 227, inputs[v0, v1 - 3, v2 - 3, v3], T.float32(0), dtype="float32") + for ax0_ax1_ax2_ax3_fused in T.serial(1176): + with T.block("weight_shared"): + v0 = T.axis.spatial(7, ax0_ax1_ax2_ax3_fused // 168) + v1 = T.axis.spatial(7, ax0_ax1_ax2_ax3_fused % 168 // 24) + v2 = T.axis.spatial(3, ax0_ax1_ax2_ax3_fused % 24 // 8) + v3 = T.axis.spatial(64, i0_0_i1_0_i2_0_i3_0_fused % 8 * 8 + ax0_ax1_ax2_ax3_fused % 8) + T.reads(weight[v0, v1, v2, v3]) + T.writes(weight_shared[v0, v1, v2, v3]) + T.block_attr({"meta_schedule.cooperative_fetch":4}) + weight_shared[v0, v1, v2, v3] = weight[v0, v1, v2, v3] + for i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3, i4_2, i5_2, i6_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 7, 1, 1, 8, 4, 1, 7, 1, 3, 1, 1, 1, 2): + with T.block("conv2d_nhwc"): + n = T.axis.spatial(1, i0_4 + i0_3 + 0 + 0 + 0) + h = T.axis.spatial(112, ((0 + 0) * 14 + i0_2_i1_2_i2_2_i3_2_fused % 14) * 8 + i1_3 + i1_4) + w = T.axis.spatial(112, (i0_0_i1_0_i2_0_i3_0_fused % 16 // 8 * 14 + i0_1_i1_1_i2_1_i3_1_fused % 56 // 4 + 0) * 4 + i2_3 + i2_4) + co = T.axis.spatial(64, (i0_0_i1_0_i2_0_i3_0_fused % 8 * 4 + i0_1_i1_1_i2_1_i3_1_fused % 4 + 0 + 
i3_3) * 2 + i3_4) + rh = T.axis.reduce(7, (i4_0 + i4_1) * 7 + i4_2) + rw = T.axis.reduce(7, i5_0 * 7 + i5_1 + i5_2) + rc = T.axis.reduce(3, (i6_0 + i6_1) * 3 + i6_2) + T.reads(PadInput_shared[n, h * 2 + rh, w * 2 + rw, co // 64 * 3 + rc], weight_shared[rh, rw, rc, co]) + T.writes(conv2d_nhwc_local[n, h, w, co]) + T.block_attr({"meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS"}) + with T.init(): + conv2d_nhwc_local[n, h, w, co] = T.float32(0) + conv2d_nhwc_local[n, h, w, co] = conv2d_nhwc_local[n, h, w, co] + PadInput_shared[n, h * 2 + rh, w * 2 + rw, co // 64 * 3 + rc] * weight_shared[rh, rw, rc, co] + for ax0, ax1, ax2, ax3 in T.grid(1, 8, 4, 2): + with T.block("conv2d_nhwc_local"): + v0 = T.axis.spatial(1, ax0) + v1 = T.axis.spatial(112, i0_2_i1_2_i2_2_i3_2_fused * 8 + ax1) + v2 = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_fused // 8 * 56 + i0_1_i1_1_i2_1_i3_1_fused // 4 * 4 + ax2) + v3 = T.axis.spatial(64, i0_0_i1_0_i2_0_i3_0_fused % 8 * 8 + i0_1_i1_1_i2_1_i3_1_fused % 4 * 2 + ax3) + T.reads(conv2d_nhwc_local[v0, v1, v2, v3]) + T.writes(conv2d_nhwc[v0, v1, v2, v3]) + conv2d_nhwc[v0, v1, v2, v3] = conv2d_nhwc_local[v0, v1, v2, v3] + # fmt: on + decision_0 = [ + ("SamplePerfectTile", [1, 1, 1, 1, 1]), + ("SamplePerfectTile", [1, 1, 14, 8, 1]), + ("SamplePerfectTile", [2, 14, 1, 4, 1]), + ("SamplePerfectTile", [8, 4, 1, 1, 2]), + ("SamplePerfectTile", [1, 1, 7]), + ("SamplePerfectTile", [1, 7, 1]), + ("SamplePerfectTile", [1, 1, 3]), + ("SampleCategorical", 1), + ("SampleCategorical", 3), + ("SampleCategorical", 1), + ] + + mod = create_te_workload("C2D", 0) + actual = ms.TuneContext( + mod=mod, + target=_target(), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules="default", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[c2d_0], + expected_decisions=[decision_0], + ) + + if __name__ == "__main__": test_cuda_c1d() + test_cuda_c2d() From 07672d0b41005b115ea6c8a39eb496e1ad8c38e4 Mon Sep 17 00:00:00 2001 From: Andrey Malyshev Date: Sat, 9 Jul 2022 00:44:18 +0300 Subject: [PATCH 076/111] [Texture] Add memory scope entity into graph JSON/runtime (#11875) This PR is split out of the original PR #11357 Co-authored-by: Chris Sullivan --- src/relay/backend/graph_executor_codegen.cc | 27 ++++++++- src/runtime/graph_executor/graph_executor.cc | 58 +++++++++++++++----- src/runtime/graph_executor/graph_executor.h | 14 ++++- src/target/source/codegen_opencl.cc | 4 +- 4 files changed, 85 insertions(+), 18 deletions(-) diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc index faf9d2899fc3a..c72511775acd6 100644 --- a/src/relay/backend/graph_executor_codegen.cc +++ b/src/relay/backend/graph_executor_codegen.cc @@ -326,6 +326,12 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslatorattrs_["device_index"] = device_types; } + // storage scope + std::vector storage_scope; + for (const auto& virtual_device : storage_info->virtual_devices) { + storage_scope.push_back(std::string(virtual_device->memory_scope)); + } + node->attrs_["storage_scope"] = std::move(storage_scope); auto node_id = nodes_.size(); nodes_.push_back(node); // Tuple return value, flatten as tuple @@ -442,7 +448,6 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslatorattrs.defined()) { // Call is an extern function - std::cout << "call_node: \n" << PrettyPrint(call) << std::endl; const auto* func = call_node->op.as();
ICHECK(func) << "Expected the operator to be a global var, but got " << call_node->op->GetTypeKey(); // getting a relay fn here, not sure why. @@ -539,12 +544,15 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator storage_ids; + std::vector storage_scopes; std::vector device_types; std::vector dltypes; std::vector node_row_ptr{0}; for (auto node : nodes_) { const auto& shape_vec = dmlc::get(node->attrs_["shape"]); const auto& storage_id = dmlc::get>(node->attrs_["storage_id"]); + const auto& storage_scope = + dmlc::get>(node->attrs_["storage_scope"]); const auto& dtype_vec = dmlc::get>(node->attrs_["dtype"]); ICHECK_EQ(node->num_outputs_, shape_vec.size()); @@ -553,12 +561,25 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslatorattrs_.count("device_index")) { const auto& dev_types = dmlc::get>(node->attrs_["device_index"]); device_types.insert(device_types.end(), dev_types.begin(), dev_types.end()); } node_row_ptr.push_back(num_entry); } + + // Check whether any storage scope is non-global; if all scopes are + // global (or empty), it is better not to write scopes to the JSON at all + bool global_only_scope = true; + for (const auto& ss : storage_scopes) { + if (!(ss.empty() || ss == "global")) { + global_only_scope = false; + } + } + if (global_only_scope) { + storage_scopes.clear(); + } writer->BeginObject(); writer->WriteObjectKeyValue("nodes", nodes_); writer->WriteObjectKeyValue("arg_nodes", arg_nodes); @@ -572,6 +593,10 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslatorWriteObjectKeyValue("attrs", attrs); diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc index 8ae98d930f139..78e65f6f2319f 100644 --- a/src/runtime/graph_executor/graph_executor.cc +++ b/src/runtime/graph_executor/graph_executor.cc @@ -42,6 +42,7 @@ #include #include "../file_utils.h" +#include "../texture.h" namespace tvm { namespace runtime { @@ -51,6 +52,7 @@ inline size_t GetDataAlignment(const DLTensor& arr) { if (align < kAllocAlignment) return kAllocAlignment; return align; } +constexpr auto Is2DStorage = IsTextureStorage; } // namespace details /*! @@ -361,24 +363,16 @@ void GraphExecutor::SetupStorage() { // Find the maximum space size. for (size_t i = 0; i < attrs_.shape.size(); ++i) { int storage_id = attrs_.storage_id[i]; + std::string storage_scope = attrs_.storage_scope.empty() ? "" : attrs_.storage_scope[i]; // Use the fallback device if no device index is available.
int device_type = static_cast(devices_[0].device_type); if (!attrs_.device_index.empty()) { device_type = attrs_.device_index[i]; } - size_t size = 1; - for (int64_t sz : attrs_.shape[i]) { - size *= static_cast(sz); - } - ICHECK_GE(storage_id, 0) << "Do not support runtime shape op"; - DLDataType t = vtype[i]; - size_t bits = t.bits * t.lanes; - ICHECK(bits % 8U == 0U || bits == 1U || bits == 4U); - size_t bytes = ((bits + 7U) / 8U) * size; uint32_t sid = static_cast(storage_id); if (sid >= pool_entry.size()) { - pool_entry.resize(sid + 1, {0, -1}); + pool_entry.resize(sid + 1, {-1, {0}, {}}); } else { ICHECK(pool_entry[sid].device_type == -1 || pool_entry[sid].device_type == device_type) << "The same pool entry cannot be assigned to multiple devices"; @@ -395,8 +389,38 @@ void GraphExecutor::SetupStorage() { pool_entry[sid].linked_param = lookup_rv; } pool_entry[sid].param_data_entry = i; - pool_entry[sid].size = std::max(pool_entry[sid].size, bytes); pool_entry[sid].device_type = device_type; + pool_entry[sid].scope = storage_scope; + + DLDataType t = vtype[i]; + if (!details::Is2DStorage(storage_scope)) { + size_t size = 1; + for (int64_t sz : attrs_.shape[i]) { + size *= static_cast(sz); + } + size_t bits = t.bits * t.lanes; + ICHECK(bits % 8U == 0U || bits == 1U || bits == 4U); + int64_t bytes = ((bits + 7U) / 8U) * size; + pool_entry[sid].shape[0] = std::max(pool_entry[sid].shape[0], bytes); + pool_entry[sid].dtype = DLDataType{kDLFloat, 32, 1}; + } else { + if (pool_entry[sid].shape.size() == 1) { + pool_entry[sid].shape.resize(3, 0); + } + size_t axis = runtime::DefaultTextureLayoutSeparator(attrs_.shape[i].size(), storage_scope); + auto shape = ApplyTexture2DFlattening(attrs_.shape[i], attrs_.shape[i].size(), axis); + pool_entry[sid].shape[0] = std::max(pool_entry[sid].shape[0], shape.height); + pool_entry[sid].shape[1] = std::max(pool_entry[sid].shape[1], shape.width); + CHECK(pool_entry[sid].shape[2] == 0 || pool_entry[sid].shape[2] == shape.channel) + << pool_entry[sid].shape[2] << " != " << shape.channel + << ", texture channel length must be consistent within a storage pool"; + pool_entry[sid].shape[2] = shape.channel; + CHECK(pool_entry[sid].dtype.bits == 0 || TypeEqual(pool_entry[sid].dtype, t)) + << DLDataType2String(pool_entry[sid].dtype) << " != " << DLDataType2String(t) + << ", pool entry for 2d texture allocations must be of the same type;" + << " likely a downstream error from the memory planner"; + pool_entry[sid].dtype = t; + } } // Allocate the space. @@ -410,9 +434,15 @@ void GraphExecutor::SetupStorage() { if (pit.linked_param.defined()) { storage_pool_.push_back(pit.linked_param); } else { - std::vector shape; - shape.push_back(static_cast(pit.size + 3) / 4); - storage_pool_.push_back(NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, dev)); + std::vector shape = pit.shape; + if (shape.size() == 1) { + shape[0] = (shape[0] + 3) / 4; + } + Optional mem_scope; + if (!pit.scope.empty()) { + mem_scope = String(pit.scope); + } + storage_pool_.push_back(NDArray::Empty(shape, pit.dtype, dev, mem_scope)); } } diff --git a/src/runtime/graph_executor/graph_executor.h b/src/runtime/graph_executor/graph_executor.h index 2564f5b0d924b..bbe94636b3a14 100644 --- a/src/runtime/graph_executor/graph_executor.h +++ b/src/runtime/graph_executor/graph_executor.h @@ -204,10 +204,12 @@ class TVM_DLL GraphExecutor : public ModuleNode { protected: // Memory pool entry.
struct PoolEntry { - size_t size; int device_type; + std::vector shape; + DLDataType dtype; int param_data_entry; NDArray linked_param; + std::string scope; // PoolEntry(int s, int dev_type, void* pre_linked_param) : // size(s), device_type(dev_type), pre_linked_param(std::move(pre_linked_param)) {} }; @@ -303,6 +305,7 @@ class TVM_DLL GraphExecutor : public ModuleNode { std::vector storage_id; std::vector device_index; std::vector dltype; + std::vector storage_scope; std::vector> shape; // The graph attribute fields. void Load(dmlc::JSONReader* reader) { @@ -328,6 +331,15 @@ class TVM_DLL GraphExecutor : public ModuleNode { reader->Read(&storage_id); ICHECK(!reader->NextArrayItem()); bitmask |= 2; + } else if (key == "storage_scope") { + reader->BeginArray(); + ICHECK(reader->NextArrayItem()); + reader->Read(&type); + ICHECK_EQ(type, "list_str"); + ICHECK(reader->NextArrayItem()); + reader->Read(&storage_scope); + ICHECK(!reader->NextArrayItem()); + bitmask |= 1; } else if (key == "shape") { reader->BeginArray(); ICHECK(reader->NextArrayItem()); diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 5d04d00339fc4..e8d47b720bf6a 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -98,7 +98,7 @@ std::string CodeGenOpenCL::Finish() { "#pragma OPENCL EXTENSION cl_amd_fp16 : enable\n" "#else\n" "#error \"Half precision floating point not supported" - "by OpenCL implementation on your device.\" \n" + " by OpenCL implementation on your device.\" \n" "#endif\n\n"; } @@ -109,7 +109,7 @@ std::string CodeGenOpenCL::Finish() { "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n" "#else\n" "#error \"Double precision floating point not supported" - "by OpenCL implementation on your device.\" \n" + " by OpenCL implementation on your device.\" \n" "#endif\n\n"; } From 6c9356fd18d0be4282acf3d428ce6f72f8e91e52 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Fri, 8 Jul 2022 17:06:28 -0700 Subject: [PATCH 077/111] [MetaSchedule][Test] Add unittests for C3D (#12046) --- .../unittest/test_meta_schedule_space_cpu.py | 198 ++++++++++++++++++ .../unittest/test_meta_schedule_space_cuda.py | 96 +++++++++ 2 files changed, 294 insertions(+) diff --git a/tests/python/unittest/test_meta_schedule_space_cpu.py b/tests/python/unittest/test_meta_schedule_space_cpu.py index d6bfbde71fecd..259f0da07b49c 100644 --- a/tests/python/unittest/test_meta_schedule_space_cpu.py +++ b/tests/python/unittest/test_meta_schedule_space_cpu.py @@ -351,6 +351,204 @@ def c2d_2(inputs: T.Buffer[(1, 224, 224, 3), "float32"], weight: T.Buffer[(7, 7, ) +def test_cpu_c3d(): + # fmt: off + @T.prim_func + def c3d_0(inputs: T.Buffer[(1, 16, 224, 224, 3), "float32"], weight: T.Buffer[(7, 7, 7, 3, 64), "float32"], conv3d_ndhwc: T.Buffer[(1, 8, 112, 112, 64), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + with T.block("root"): + T.reads() + T.writes() + T.block_attr({"meta_schedule.parallel":288, "meta_schedule.unroll_explicit":512, "meta_schedule.vectorize":64}) + PadInput = T.alloc_buffer([1, 22, 230, 230, 3], dtype="float32") + conv3d_ndhwc_global = T.alloc_buffer([1, 8, 112, 112, 64], dtype="float32") + for i0_0, i1_0, i2_0, i3_0, i4_0 in T.grid(1, 2, 4, 1, 2): + for ax0, ax1, ax2, ax3, ax4 in T.grid(1, 13, 61, 229, 3): + with T.block("PadInput"): + i0 = T.axis.spatial(1, ax0) + i1 = T.axis.spatial(22, i1_0 * 8 + ax1) + i2 = T.axis.spatial(230, i2_0 * 56 + ax2) + i3 = T.axis.spatial(230, ax3) + i4 = 
T.axis.spatial(3, ax4) + T.reads(inputs[i0, i1 - 3, i2 - 3, i3 - 3, i4]) + T.writes(PadInput[i0, i1, i2, i3, i4]) + PadInput[i0, i1, i2, i3, i4] = T.if_then_else(3 <= i1 and i1 < 19 and 3 <= i2 and i2 < 227 and 3 <= i3 and i3 < 227, inputs[i0, i1 - 3, i2 - 3, i3 - 3, i4], T.float32(0), dtype="float32") + for i0_1, i1_1, i2_1, i3_1, i4_1 in T.grid(1, 4, 4, 14, 1): + for i5_0, i6_0, i7_0, i8_0, i0_2, i1_2, i2_2, i3_2, i4_2, i5_1, i6_1, i7_1, i8_1, i0_3, i1_3, i2_3, i3_3, i4_3 in T.grid(1, 7, 7, 3, 1, 1, 1, 1, 32, 7, 1, 1, 1, 1, 1, 7, 8, 1): + with T.block("conv3d_ndhwc"): + n = T.axis.spatial(1, i0_3 + i0_2 + i0_1 + i0_0) + d = T.axis.spatial(8, i1_0 * 4 + i1_1 + i1_2 + i1_3) + h = T.axis.spatial(112, (i2_0 * 4 + i2_1 + i2_2) * 7 + i2_3) + w = T.axis.spatial(112, (i3_0 * 14 + i3_1 + i3_2) * 8 + i3_3) + co = T.axis.spatial(64, (i4_0 + i4_1) * 32 + i4_2 + i4_3) + rd = T.axis.reduce(7, i5_0 * 7 + i5_1) + rh = T.axis.reduce(7, i6_0 + i6_1) + rw = T.axis.reduce(7, i7_0 + i7_1) + rc = T.axis.reduce(3, i8_0 + i8_1) + T.reads(PadInput[n, d * 2 + rd, h * 2 + rh, w * 2 + rw, co // 64 * 3 + rc], weight[rd, rh, rw, rc, co]) + T.writes(conv3d_ndhwc_global[n, d, h, w, co]) + T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"}) + with T.init(): + conv3d_ndhwc_global[n, d, h, w, co] = T.float32(0) + conv3d_ndhwc_global[n, d, h, w, co] = conv3d_ndhwc_global[n, d, h, w, co] + PadInput[n, d * 2 + rd, h * 2 + rh, w * 2 + rw, co // 64 * 3 + rc] * weight[rd, rh, rw, rc, co] + for ax0, ax1, ax2, ax3, ax4 in T.grid(1, 1, 7, 8, 32): + with T.block("conv3d_ndhwc_global"): + v0 = T.axis.spatial(1, ax0) + v1 = T.axis.spatial(8, i1_0 * 4 + i1_1 + ax1) + v2 = T.axis.spatial(112, i2_0 * 28 + i2_1 * 7 + ax2) + v3 = T.axis.spatial(112, i3_1 * 8 + ax3) + v4 = T.axis.spatial(64, i4_0 * 32 + ax4) + T.reads(conv3d_ndhwc_global[v0, v1, v2, v3, v4]) + T.writes(conv3d_ndhwc[v0, v1, v2, v3, v4]) + conv3d_ndhwc[v0, v1, v2, v3, v4] = conv3d_ndhwc_global[v0, v1, v2, v3, v4] + @T.prim_func + def c3d_1(inputs: T.Buffer[(1, 16, 224, 224, 3), "float32"], weight: T.Buffer[(7, 7, 7, 3, 64), "float32"], conv3d_ndhwc: T.Buffer[(1, 8, 112, 112, 64), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + with T.block("root"): + T.reads() + T.writes() + T.block_attr({"meta_schedule.parallel":288, "meta_schedule.unroll_explicit":64, "meta_schedule.vectorize":64}) + PadInput = T.alloc_buffer([1, 22, 230, 230, 3], dtype="float32") + conv3d_ndhwc_global = T.alloc_buffer([1, 8, 112, 112, 64], dtype="float32") + for i0_0, i1_0, i2_0, i3_0, i4_0 in T.grid(1, 2, 4, 1, 2): + for i0_1, i1_1, i2_1, i3_1 in T.grid(1, 4, 4, 14): + for ax0, ax1, ax2, ax3, ax4 in T.grid(1, 7, 19, 21, 3): + with T.block("PadInput"): + i0 = T.axis.spatial(1, ax0) + i1 = T.axis.spatial(22, i1_0 * 8 + i1_1 * 2 + ax1) + i2 = T.axis.spatial(230, i2_0 * 56 + i2_1 * 14 + ax2) + i3 = T.axis.spatial(230, i3_1 * 16 + ax3) + i4 = T.axis.spatial(3, ax4) + T.reads(inputs[i0, i1 - 3, i2 - 3, i3 - 3, i4]) + T.writes(PadInput[i0, i1, i2, i3, i4]) + PadInput[i0, i1, i2, i3, i4] = T.if_then_else(3 <= i1 and i1 < 19 and 3 <= i2 and i2 < 227 and 3 <= i3 and i3 < 227, inputs[i0, i1 - 3, i2 - 3, i3 - 3, i4], T.float32(0), dtype="float32") + for i4_1, i5_0, i6_0, i7_0, i8_0, i0_2, i1_2, i2_2, i3_2, i4_2, i5_1, i6_1, i7_1, i8_1, i0_3, i1_3, i2_3, i3_3, i4_3 in T.grid(1, 1, 7, 7, 3, 1, 1, 1, 1, 32, 7, 1, 1, 1, 1, 1, 7, 8, 1): + with T.block("conv3d_ndhwc"): + n = T.axis.spatial(1, i0_3 + i0_2 + i0_1 + i0_0) + d = T.axis.spatial(8, i1_0 
* 4 + i1_1 + i1_2 + i1_3) + h = T.axis.spatial(112, (i2_0 * 4 + i2_1 + i2_2) * 7 + i2_3) + w = T.axis.spatial(112, (i3_0 * 14 + i3_1 + i3_2) * 8 + i3_3) + co = T.axis.spatial(64, (i4_0 + i4_1) * 32 + i4_2 + i4_3) + rd = T.axis.reduce(7, i5_0 * 7 + i5_1) + rh = T.axis.reduce(7, i6_0 + i6_1) + rw = T.axis.reduce(7, i7_0 + i7_1) + rc = T.axis.reduce(3, i8_0 + i8_1) + T.reads(PadInput[n, d * 2 + rd, h * 2 + rh, w * 2 + rw, co // 64 * 3 + rc], weight[rd, rh, rw, rc, co]) + T.writes(conv3d_ndhwc_global[n, d, h, w, co]) + T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"}) + with T.init(): + conv3d_ndhwc_global[n, d, h, w, co] = T.float32(0) + conv3d_ndhwc_global[n, d, h, w, co] = conv3d_ndhwc_global[n, d, h, w, co] + PadInput[n, d * 2 + rd, h * 2 + rh, w * 2 + rw, co // 64 * 3 + rc] * weight[rd, rh, rw, rc, co] + for ax0, ax1, ax2, ax3, ax4 in T.grid(1, 4, 28, 112, 32): + with T.block("conv3d_ndhwc_global"): + v0 = T.axis.spatial(1, ax0) + v1 = T.axis.spatial(8, i1_0 * 4 + ax1) + v2 = T.axis.spatial(112, i2_0 * 28 + ax2) + v3 = T.axis.spatial(112, ax3) + v4 = T.axis.spatial(64, i4_0 * 32 + ax4) + T.reads(conv3d_ndhwc_global[v0, v1, v2, v3, v4]) + T.writes(conv3d_ndhwc[v0, v1, v2, v3, v4]) + conv3d_ndhwc[v0, v1, v2, v3, v4] = conv3d_ndhwc_global[v0, v1, v2, v3, v4] + @T.prim_func + def c3d_2(inputs: T.Buffer[(1, 16, 224, 224, 3), "float32"], weight: T.Buffer[(7, 7, 7, 3, 64), "float32"], conv3d_ndhwc: T.Buffer[(1, 8, 112, 112, 64), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + with T.block("root"): + T.reads() + T.writes() + T.block_attr({"meta_schedule.parallel":288, "meta_schedule.unroll_explicit":16, "meta_schedule.vectorize":64}) + PadInput = T.alloc_buffer([1, 22, 230, 230, 3], dtype="float32") + for i0_0, i1_0, i2_0, i3_0, i4_0, i0_1, i1_1, i2_1, i3_1 in T.grid(1, 2, 4, 1, 2, 1, 4, 4, 14): + for ax0, ax1, ax2, ax3, ax4 in T.grid(1, 7, 19, 21, 3): + with T.block("PadInput"): + i0 = T.axis.spatial(1, ax0) + i1 = T.axis.spatial(22, i1_0 * 8 + i1_1 * 2 + ax1) + i2 = T.axis.spatial(230, i2_0 * 56 + i2_1 * 14 + ax2) + i3 = T.axis.spatial(230, i3_1 * 16 + ax3) + i4 = T.axis.spatial(3, ax4) + T.reads(inputs[i0, i1 - 3, i2 - 3, i3 - 3, i4]) + T.writes(PadInput[i0, i1, i2, i3, i4]) + PadInput[i0, i1, i2, i3, i4] = T.if_then_else(3 <= i1 and i1 < 19 and 3 <= i2 and i2 < 227 and 3 <= i3 and i3 < 227, inputs[i0, i1 - 3, i2 - 3, i3 - 3, i4], T.float32(0), dtype="float32") + for i4_1, i5_0, i6_0, i7_0, i8_0, i0_2, i1_2, i2_2, i3_2, i4_2, i5_1, i6_1, i7_1, i8_1, i0_3, i1_3, i2_3, i3_3, i4_3 in T.grid(1, 1, 7, 7, 3, 1, 1, 1, 1, 32, 7, 1, 1, 1, 1, 1, 7, 8, 1): + with T.block("conv3d_ndhwc"): + n = T.axis.spatial(1, i0_3 + i0_2 + i0_1 + i0_0) + d = T.axis.spatial(8, i1_0 * 4 + i1_1 + i1_2 + i1_3) + h = T.axis.spatial(112, (i2_0 * 4 + i2_1 + i2_2) * 7 + i2_3) + w = T.axis.spatial(112, (i3_0 * 14 + i3_1 + i3_2) * 8 + i3_3) + co = T.axis.spatial(64, (i4_0 + i4_1) * 32 + i4_2 + i4_3) + rd = T.axis.reduce(7, i5_0 * 7 + i5_1) + rh = T.axis.reduce(7, i6_0 + i6_1) + rw = T.axis.reduce(7, i7_0 + i7_1) + rc = T.axis.reduce(3, i8_0 + i8_1) + T.reads(PadInput[n, d * 2 + rd, h * 2 + rh, w * 2 + rw, co // 64 * 3 + rc], weight[rd, rh, rw, rc, co]) + T.writes(conv3d_ndhwc[n, d, h, w, co]) + T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"}) + with T.init(): + conv3d_ndhwc[n, d, h, w, co] = T.float32(0) + conv3d_ndhwc[n, d, h, w, co] = conv3d_ndhwc[n, d, h, w, co] + PadInput[n, d * 2 + rd, h * 2 + rh, w * 2 + rw, co // 64 * 3 + rc] * 
weight[rd, rh, rw, rc, co] + # fmt: on + + decision_0 = [ + ("SamplePerfectTile", [1, 1, 1, 1]), + ("SamplePerfectTile", [2, 4, 1, 1]), + ("SamplePerfectTile", [4, 4, 1, 7]), + ("SamplePerfectTile", [1, 14, 1, 8]), + ("SamplePerfectTile", [2, 1, 32, 1]), + ("SamplePerfectTile", [1, 7]), + ("SamplePerfectTile", [7, 1]), + ("SamplePerfectTile", [7, 1]), + ("SamplePerfectTile", [3, 1]), + ("SampleCategorical", 3), + ("SampleComputeLocation", 4), + ] + decision_1 = [ + ("SamplePerfectTile", [1, 1, 1, 1]), + ("SamplePerfectTile", [2, 4, 1, 1]), + ("SamplePerfectTile", [4, 4, 1, 7]), + ("SamplePerfectTile", [1, 14, 1, 8]), + ("SamplePerfectTile", [2, 1, 32, 1]), + ("SamplePerfectTile", [1, 7]), + ("SamplePerfectTile", [7, 1]), + ("SamplePerfectTile", [7, 1]), + ("SamplePerfectTile", [3, 1]), + ("SampleCategorical", 2), + ("SampleComputeLocation", 8), + ] + decision_2 = [ + ("SamplePerfectTile", [1, 1, 1, 1]), + ("SamplePerfectTile", [2, 4, 1, 1]), + ("SamplePerfectTile", [4, 4, 1, 7]), + ("SamplePerfectTile", [1, 14, 1, 8]), + ("SamplePerfectTile", [2, 1, 32, 1]), + ("SamplePerfectTile", [1, 7]), + ("SamplePerfectTile", [7, 1]), + ("SamplePerfectTile", [7, 1]), + ("SamplePerfectTile", [3, 1]), + ("SampleCategorical", 1), + ("SampleComputeLocation", 8), + ] + + mod = create_te_workload("C3D", 0) + actual = ms.TuneContext( + mod=mod, + target=_target(), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules="default", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[c3d_0, c3d_1, c3d_2], + expected_decisions=[decision_0, decision_1, decision_2], + ) + + if __name__ == "__main__": test_cpu_c1d() test_cpu_c2d() + test_cpu_c3d() diff --git a/tests/python/unittest/test_meta_schedule_space_cuda.py b/tests/python/unittest/test_meta_schedule_space_cuda.py index 472a7ccc13deb..277f74d888d01 100644 --- a/tests/python/unittest/test_meta_schedule_space_cuda.py +++ b/tests/python/unittest/test_meta_schedule_space_cuda.py @@ -200,6 +200,102 @@ def c2d_0(inputs: T.Buffer[(1, 224, 224, 3), "float32"], weight: T.Buffer[(7, 7, ) +def test_cuda_c3d(): + # fmt: off + @T.prim_func + def c3d_0(inputs: T.Buffer[(1, 16, 224, 224, 3), "float32"], weight: T.Buffer[(7, 7, 7, 3, 64), "float32"], conv3d_ndhwc: T.Buffer[(1, 8, 112, 112, 64), "float32"]) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + with T.block("root"): + T.reads() + T.writes() + T.block_attr({"meta_schedule.unroll_explicit":16}) + conv3d_ndhwc_local = T.alloc_buffer([1, 8, 112, 112, 64], dtype="float32", scope="local") + PadInput_shared = T.alloc_buffer([1, 22, 230, 230, 3], dtype="float32", scope="shared") + weight_shared = T.alloc_buffer([7, 7, 7, 3, 64], dtype="float32", scope="shared") + for i0_0_i1_0_i2_0_i3_0_i4_0_fused in T.thread_binding(2, thread="blockIdx.x"): + for i0_1_i1_1_i2_1_i3_1_i4_1_fused in T.thread_binding(8, thread="vthread.x"): + for i0_2_i1_2_i2_2_i3_2_i4_2_fused in T.thread_binding(392, thread="threadIdx.x"): + for i5_0, i6_0, i7_0, i8_0 in T.grid(1, 1, 1, 1): + for ax0_ax1_ax2_ax3_ax4_fused in T.serial(1687959): + with T.block("PadInput_shared"): + v0 = T.axis.spatial(1, 0) + v1 = T.axis.spatial(22, ax0_ax1_ax2_ax3_ax4_fused % 1687959 // 80379) + v2 = T.axis.spatial(230, ax0_ax1_ax2_ax3_ax4_fused % 80379 // 351) + v3 = T.axis.spatial(230, i0_0_i1_0_i2_0_i3_0_i4_0_fused * 112 + ax0_ax1_ax2_ax3_ax4_fused % 351 // 3) + v4 = T.axis.spatial(3, ax0_ax1_ax2_ax3_ax4_fused % 3) + T.reads(inputs[v0, v1 - 3, v2 - 3, v3 - 3, v4]) + T.writes(PadInput_shared[v0, v1, v2, 
v3, v4]) + T.block_attr({"meta_schedule.cooperative_fetch":4}) + PadInput_shared[v0, v1, v2, v3, v4] = T.if_then_else(3 <= v1 and v1 < 19 and 3 <= v2 and v2 < 227 and 3 <= v3 and v3 < 227, inputs[v0, v1 - 3, v2 - 3, v3 - 3, v4], T.float32(0), dtype="float32") + for ax0_ax1_ax2_ax3_ax4_fused in T.serial(65856): + with T.block("weight_shared"): + v0 = T.axis.spatial(7, ax0_ax1_ax2_ax3_ax4_fused // 9408) + v1 = T.axis.spatial(7, ax0_ax1_ax2_ax3_ax4_fused % 9408 // 1344) + v2 = T.axis.spatial(7, ax0_ax1_ax2_ax3_ax4_fused % 1344 // 192) + v3 = T.axis.spatial(3, ax0_ax1_ax2_ax3_ax4_fused % 192 // 64) + v4 = T.axis.spatial(64, ax0_ax1_ax2_ax3_ax4_fused % 64) + T.reads(weight[v0, v1, v2, v3, v4]) + T.writes(weight_shared[v0, v1, v2, v3, v4]) + T.block_attr({"meta_schedule.cooperative_fetch":3}) + weight_shared[v0, v1, v2, v3, v4] = weight[v0, v1, v2, v3, v4] + for i5_1, i6_1, i7_1, i8_1, i0_3, i1_3, i2_3, i3_3, i4_3, i5_2, i6_2, i7_2, i8_2, i0_4, i1_4, i2_4, i3_4, i4_4 in T.grid(7, 7, 1, 3, 1, 2, 2, 1, 32, 1, 1, 7, 1, 1, 1, 2, 4, 1): + with T.block("conv3d_ndhwc"): + n = T.axis.spatial(1, i0_4 + i0_3 + 0 + 0 + 0) + d = T.axis.spatial(8, ((0 + 0) * 4 + i0_2_i1_2_i2_2_i3_2_i4_2_fused % 392 // 98) * 2 + i1_3 + i1_4) + h = T.axis.spatial(112, (((0 * 4 + i0_1_i1_1_i2_1_i3_1_i4_1_fused % 8 // 2) * 7 + i0_2_i1_2_i2_2_i3_2_i4_2_fused % 98 // 14) * 2 + i2_3) * 2 + i2_4) + w = T.axis.spatial(112, ((i0_0_i1_0_i2_0_i3_0_i4_0_fused % 2 * 2 + i0_1_i1_1_i2_1_i3_1_i4_1_fused % 2) * 7 + i0_2_i1_2_i2_2_i3_2_i4_2_fused % 14 // 2 + i3_3) * 4 + i3_4) + co = T.axis.spatial(64, ((0 + 0) * 2 + i0_2_i1_2_i2_2_i3_2_i4_2_fused % 2) * 32 + i4_3 + i4_4) + rd = T.axis.reduce(7, i5_0 * 7 + i5_1 + i5_2) + rh = T.axis.reduce(7, i6_0 * 7 + i6_1 + i6_2) + rw = T.axis.reduce(7, (i7_0 + i7_1) * 7 + i7_2) + rc = T.axis.reduce(3, i8_0 * 3 + i8_1 + i8_2) + T.reads(PadInput_shared[n, d * 2 + rd, h * 2 + rh, w * 2 + rw, co // 64 * 3 + rc], weight_shared[rd, rh, rw, rc, co]) + T.writes(conv3d_ndhwc_local[n, d, h, w, co]) + T.block_attr({"meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS"}) + with T.init(): + conv3d_ndhwc_local[n, d, h, w, co] = T.float32(0) + conv3d_ndhwc_local[n, d, h, w, co] = conv3d_ndhwc_local[n, d, h, w, co] + PadInput_shared[n, d * 2 + rd, h * 2 + rh, w * 2 + rw, co // 64 * 3 + rc] * weight_shared[rd, rh, rw, rc, co] + for ax0, ax1, ax2, ax3, ax4 in T.grid(1, 2, 4, 4, 32): + with T.block("conv3d_ndhwc_local"): + v0 = T.axis.spatial(1, ax0) + v1 = T.axis.spatial(8, i0_2_i1_2_i2_2_i3_2_i4_2_fused // 98 * 2 + ax1) + v2 = T.axis.spatial(112, i0_1_i1_1_i2_1_i3_1_i4_1_fused // 2 * 28 + i0_2_i1_2_i2_2_i3_2_i4_2_fused % 98 // 14 * 4 + ax2) + v3 = T.axis.spatial(112, i0_0_i1_0_i2_0_i3_0_i4_0_fused * 56 + i0_1_i1_1_i2_1_i3_1_i4_1_fused % 2 * 28 + i0_2_i1_2_i2_2_i3_2_i4_2_fused % 14 // 2 * 4 + ax3) + v4 = T.axis.spatial(64, i0_2_i1_2_i2_2_i3_2_i4_2_fused % 2 * 32 + ax4) + T.reads(conv3d_ndhwc_local[v0, v1, v2, v3, v4]) + T.writes(conv3d_ndhwc[v0, v1, v2, v3, v4]) + conv3d_ndhwc[v0, v1, v2, v3, v4] = conv3d_ndhwc_local[v0, v1, v2, v3, v4] + # fmt: on + decision_0 = [ + ("SamplePerfectTile", [1, 1, 1, 1, 1]), + ("SamplePerfectTile", [1, 1, 4, 2, 1]), + ("SamplePerfectTile", [1, 4, 7, 2, 2]), + ("SamplePerfectTile", [2, 2, 7, 1, 4]), + ("SamplePerfectTile", [1, 1, 2, 32, 1]), + ("SamplePerfectTile", [1, 7, 1]), + ("SamplePerfectTile", [1, 7, 1]), + ("SamplePerfectTile", [1, 1, 7]), + ("SamplePerfectTile", [1, 3, 1]), + ("SampleCategorical", 
3), + ("SampleCategorical", 2), + ("SampleCategorical", 1), + ] + mod = create_te_workload("C3D", 0) + actual = ms.TuneContext( + mod=mod, + target=_target(), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules="default", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[c3d_0], + expected_decisions=[decision_0], + ) + + if __name__ == "__main__": test_cuda_c1d() test_cuda_c2d() + test_cuda_c3d() From 00ce86d68b123f3389b0fca1eca72e81f6054443 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Fri, 8 Jul 2022 19:01:54 -0700 Subject: [PATCH 078/111] [MetaSchedule][Test] Add unittests for CAP (#12047) --- .../unittest/test_meta_schedule_space_cpu.py | 194 ++++++++++++++++++ .../unittest/test_meta_schedule_space_cuda.py | 102 +++++++++ 2 files changed, 296 insertions(+) diff --git a/tests/python/unittest/test_meta_schedule_space_cpu.py b/tests/python/unittest/test_meta_schedule_space_cpu.py index 259f0da07b49c..87f61ec32880a 100644 --- a/tests/python/unittest/test_meta_schedule_space_cpu.py +++ b/tests/python/unittest/test_meta_schedule_space_cpu.py @@ -548,7 +548,201 @@ def c3d_2(inputs: T.Buffer[(1, 16, 224, 224, 3), "float32"], weight: T.Buffer[(7 ) +def test_cpu_cap(): + # fmt: off + @T.prim_func + def cap_0(inputs: T.Buffer[(1, 16, 16, 4, 4, 32), "float32"], weight: T.Buffer[(3, 3, 4, 4, 32, 32), "float32"], conv2d_capsule_nhwijc: T.Buffer[(1, 8, 8, 4, 4, 32), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + with T.block("root"): + T.reads() + T.writes() + T.block_attr({"meta_schedule.parallel":288, "meta_schedule.unroll_explicit":0, "meta_schedule.vectorize":64}) + PadInput = T.alloc_buffer([1, 18, 18, 4, 4, 32], dtype="float32") + conv2d_capsule_nhwijc_global = T.alloc_buffer([1, 8, 8, 4, 4, 32], dtype="float32") + for i0_0, i1_0, i2_0, i3_0, i4_0, i5_0, i0_1, i1_1 in T.grid(1, 2, 1, 1, 1, 1, 1, 4): + for ax0, ax1, ax2, ax3, ax4, ax5 in T.grid(1, 3, 17, 4, 4, 32): + with T.block("PadInput"): + i0 = T.axis.spatial(1, ax0) + i1 = T.axis.spatial(18, i1_0 * 8 + i1_1 * 2 + ax1) + i2 = T.axis.spatial(18, ax2) + i3, i4, i5 = T.axis.remap("SSS", [ax3, ax4, ax5]) + T.reads(inputs[i0, i1 - 1, i2 - 1, i3, i4, i5]) + T.writes(PadInput[i0, i1, i2, i3, i4, i5]) + PadInput[i0, i1, i2, i3, i4, i5] = T.if_then_else(1 <= i1 and i1 < 17 and 1 <= i2 and i2 < 17, inputs[i0, i1 - 1, i2 - 1, i3, i4, i5], T.float32(0), dtype="float32") + for i2_1, i3_1, i4_1, i5_1 in T.grid(4, 1, 4, 2): + for i6_0, i7_0, i8_0, i9_0, i0_2, i1_2, i2_2, i3_2, i4_2, i5_2, i6_1, i7_1, i8_1, i9_1, i0_3, i1_3, i2_3, i3_3, i4_3, i5_3 in T.grid(1, 3, 4, 1, 1, 1, 2, 1, 1, 1, 3, 1, 1, 32, 1, 1, 1, 4, 1, 16): + with T.block("conv2d_capsule_nhwijc"): + n = T.axis.spatial(1, i0_3 + i0_2 + i0_1 + i0_0) + h = T.axis.spatial(8, i1_0 * 4 + i1_1 + i1_2 + i1_3) + w = T.axis.spatial(8, (i2_0 * 4 + i2_1) * 2 + i2_2 + i2_3) + cap_i = T.axis.spatial(4, (i3_0 + i3_1 + i3_2) * 4 + i3_3) + cap_j = T.axis.spatial(4, i4_0 * 4 + i4_1 + i4_2 + i4_3) + co = T.axis.spatial(32, (i5_0 * 2 + i5_1 + i5_2) * 16 + i5_3) + rh = T.axis.reduce(3, i6_0 * 3 + i6_1) + rw = T.axis.reduce(3, i7_0 + i7_1) + cap_k = T.axis.reduce(4, i8_0 + i8_1) + rc = T.axis.reduce(32, i9_0 * 32 + i9_1) + T.reads(PadInput[n, h * 2 + rh, w * 2 + rw, cap_i, cap_k, rc], weight[rh, rw, cap_k, cap_j, rc, co]) + T.writes(conv2d_capsule_nhwijc_global[n, h, w, cap_i, cap_j, co]) + T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"}) + with T.init(): + 
conv2d_capsule_nhwijc_global[n, h, w, cap_i, cap_j, co] = T.float32(0) + conv2d_capsule_nhwijc_global[n, h, w, cap_i, cap_j, co] = conv2d_capsule_nhwijc_global[n, h, w, cap_i, cap_j, co] + PadInput[n, h * 2 + rh, w * 2 + rw, cap_i, cap_k, rc] * weight[rh, rw, cap_k, cap_j, rc, co] + for ax0, ax1, ax2, ax3, ax4, ax5 in T.grid(1, 1, 2, 4, 1, 16): + with T.block("conv2d_capsule_nhwijc_global"): + v0 = T.axis.spatial(1, ax0) + v1 = T.axis.spatial(8, i1_0 * 4 + i1_1 + ax1) + v2 = T.axis.spatial(8, i2_1 * 2 + ax2) + v3 = T.axis.spatial(4, ax3) + v4 = T.axis.spatial(4, i4_1 + ax4) + v5 = T.axis.spatial(32, i5_1 * 16 + ax5) + T.reads(conv2d_capsule_nhwijc_global[v0, v1, v2, v3, v4, v5]) + T.writes(conv2d_capsule_nhwijc[v0, v1, v2, v3, v4, v5]) + conv2d_capsule_nhwijc[v0, v1, v2, v3, v4, v5] = conv2d_capsule_nhwijc_global[v0, v1, v2, v3, v4, v5] + @T.prim_func + def cap_1(inputs: T.Buffer[(1, 16, 16, 4, 4, 32), "float32"], weight: T.Buffer[(3, 3, 4, 4, 32, 32), "float32"], conv2d_capsule_nhwijc: T.Buffer[(1, 8, 8, 4, 4, 32), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + with T.block("root"): + T.reads() + T.writes() + T.block_attr({"meta_schedule.parallel":288, "meta_schedule.unroll_explicit":0, "meta_schedule.vectorize":64}) + PadInput = T.alloc_buffer([1, 18, 18, 4, 4, 32], dtype="float32") + conv2d_capsule_nhwijc_global = T.alloc_buffer([1, 8, 8, 4, 4, 32], dtype="float32") + for i0_0, i1_0, i2_0, i3_0, i4_0, i5_0 in T.grid(1, 2, 1, 1, 1, 1): + for i0_1, i1_1, i2_1, i3_1, i4_1, i5_1 in T.grid(1, 4, 4, 1, 4, 2): + for ax0, ax1, ax2, ax3, ax4, ax5 in T.grid(1, 3, 5, 4, 4, 32): + with T.block("PadInput"): + i0 = T.axis.spatial(1, ax0) + i1 = T.axis.spatial(18, i1_0 * 8 + i1_1 * 2 + ax1) + i2 = T.axis.spatial(18, i2_1 * 4 + ax2) + i3, i4, i5 = T.axis.remap("SSS", [ax3, ax4, ax5]) + T.reads(inputs[i0, i1 - 1, i2 - 1, i3, i4, i5]) + T.writes(PadInput[i0, i1, i2, i3, i4, i5]) + PadInput[i0, i1, i2, i3, i4, i5] = T.if_then_else(1 <= i1 and i1 < 17 and 1 <= i2 and i2 < 17, inputs[i0, i1 - 1, i2 - 1, i3, i4, i5], T.float32(0), dtype="float32") + for i6_0, i7_0, i8_0, i9_0, i0_2, i1_2, i2_2, i3_2, i4_2, i5_2, i6_1, i7_1, i8_1, i9_1, i0_3, i1_3, i2_3, i3_3, i4_3, i5_3 in T.grid(1, 3, 4, 1, 1, 1, 2, 1, 1, 1, 3, 1, 1, 32, 1, 1, 1, 4, 1, 16): + with T.block("conv2d_capsule_nhwijc"): + n = T.axis.spatial(1, i0_3 + i0_2 + i0_1 + i0_0) + h = T.axis.spatial(8, i1_0 * 4 + i1_1 + i1_2 + i1_3) + w = T.axis.spatial(8, (i2_0 * 4 + i2_1) * 2 + i2_2 + i2_3) + cap_i = T.axis.spatial(4, (i3_0 + i3_1 + i3_2) * 4 + i3_3) + cap_j = T.axis.spatial(4, i4_0 * 4 + i4_1 + i4_2 + i4_3) + co = T.axis.spatial(32, (i5_0 * 2 + i5_1 + i5_2) * 16 + i5_3) + rh = T.axis.reduce(3, i6_0 * 3 + i6_1) + rw = T.axis.reduce(3, i7_0 + i7_1) + cap_k = T.axis.reduce(4, i8_0 + i8_1) + rc = T.axis.reduce(32, i9_0 * 32 + i9_1) + T.reads(PadInput[n, h * 2 + rh, w * 2 + rw, cap_i, cap_k, rc], weight[rh, rw, cap_k, cap_j, rc, co]) + T.writes(conv2d_capsule_nhwijc_global[n, h, w, cap_i, cap_j, co]) + T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"}) + with T.init(): + conv2d_capsule_nhwijc_global[n, h, w, cap_i, cap_j, co] = T.float32(0) + conv2d_capsule_nhwijc_global[n, h, w, cap_i, cap_j, co] = conv2d_capsule_nhwijc_global[n, h, w, cap_i, cap_j, co] + PadInput[n, h * 2 + rh, w * 2 + rw, cap_i, cap_k, rc] * weight[rh, rw, cap_k, cap_j, rc, co] + for ax0, ax1, ax2, ax3, ax4, ax5 in T.grid(1, 4, 8, 4, 4, 32): + with T.block("conv2d_capsule_nhwijc_global"): + v0 = 
T.axis.spatial(1, ax0) + v1 = T.axis.spatial(8, i1_0 * 4 + ax1) + v2, v3, v4, v5 = T.axis.remap("SSSS", [ax2, ax3, ax4, ax5]) + T.reads(conv2d_capsule_nhwijc_global[v0, v1, v2, v3, v4, v5]) + T.writes(conv2d_capsule_nhwijc[v0, v1, v2, v3, v4, v5]) + conv2d_capsule_nhwijc[v0, v1, v2, v3, v4, v5] = conv2d_capsule_nhwijc_global[v0, v1, v2, v3, v4, v5] + @T.prim_func + def cap_2(inputs: T.Buffer[(1, 16, 16, 4, 4, 32), "float32"], weight: T.Buffer[(3, 3, 4, 4, 32, 32), "float32"], conv2d_capsule_nhwijc: T.Buffer[(1, 8, 8, 4, 4, 32), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + with T.block("root"): + T.reads() + T.writes() + T.block_attr({"meta_schedule.parallel":288, "meta_schedule.unroll_explicit":16, "meta_schedule.vectorize":64}) + PadInput = T.alloc_buffer([1, 18, 18, 4, 4, 32], dtype="float32") + for i0, i1, i2, i3, i4, i5 in T.grid(1, 18, 18, 4, 4, 32): + with T.block("PadInput"): + i0_1, i1_1, i2_1, i3_1, i4_1, i5_1 = T.axis.remap("SSSSSS", [i0, i1, i2, i3, i4, i5]) + T.reads(inputs[i0_1, i1_1 - 1, i2_1 - 1, i3_1, i4_1, i5_1]) + T.writes(PadInput[i0_1, i1_1, i2_1, i3_1, i4_1, i5_1]) + PadInput[i0_1, i1_1, i2_1, i3_1, i4_1, i5_1] = T.if_then_else(1 <= i1_1 and i1_1 < 17 and 1 <= i2_1 and i2_1 < 17, inputs[i0_1, i1_1 - 1, i2_1 - 1, i3_1, i4_1, i5_1], T.float32(0), dtype="float32") + for i0_0, i1_0, i2_0, i3_0, i4_0, i5_0, i0_1_1, i1_1_1, i2_1_1, i3_1_1, i4_1_1, i5_1_1, i6_0, i7_0, i8_0, i9_0, i0_2, i1_2, i2_2, i3_2, i4_2, i5_2, i6_1, i7_1, i8_1, i9_1, i0_3, i1_3, i2_3, i3_3, i4_3, i5_3 in T.grid(1, 2, 1, 1, 1, 1, 1, 4, 4, 1, 4, 2, 1, 3, 4, 1, 1, 1, 2, 1, 1, 1, 3, 1, 1, 32, 1, 1, 1, 4, 1, 16): + with T.block("conv2d_capsule_nhwijc"): + n = T.axis.spatial(1, i0_3 + i0_2 + i0_1_1 + i0_0) + h = T.axis.spatial(8, i1_0 * 4 + i1_1_1 + i1_2 + i1_3) + w = T.axis.spatial(8, (i2_0 * 4 + i2_1_1) * 2 + i2_2 + i2_3) + cap_i = T.axis.spatial(4, (i3_0 + i3_1_1 + i3_2) * 4 + i3_3) + cap_j = T.axis.spatial(4, i4_0 * 4 + i4_1_1 + i4_2 + i4_3) + co = T.axis.spatial(32, (i5_0 * 2 + i5_1_1 + i5_2) * 16 + i5_3) + rh = T.axis.reduce(3, i6_0 * 3 + i6_1) + rw = T.axis.reduce(3, i7_0 + i7_1) + cap_k = T.axis.reduce(4, i8_0 + i8_1) + rc = T.axis.reduce(32, i9_0 * 32 + i9_1) + T.reads(PadInput[n, h * 2 + rh, w * 2 + rw, cap_i, cap_k, rc], weight[rh, rw, cap_k, cap_j, rc, co]) + T.writes(conv2d_capsule_nhwijc[n, h, w, cap_i, cap_j, co]) + T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"}) + with T.init(): + conv2d_capsule_nhwijc[n, h, w, cap_i, cap_j, co] = T.float32(0) + conv2d_capsule_nhwijc[n, h, w, cap_i, cap_j, co] = conv2d_capsule_nhwijc[n, h, w, cap_i, cap_j, co] + PadInput[n, h * 2 + rh, w * 2 + rw, cap_i, cap_k, rc] * weight[rh, rw, cap_k, cap_j, rc, co] + # fmt: on + decision_0 = [ + ("SamplePerfectTile", [1, 1, 1, 1]), + ("SamplePerfectTile", [2, 4, 1, 1]), + ("SamplePerfectTile", [1, 4, 2, 1]), + ("SamplePerfectTile", [1, 1, 1, 4]), + ("SamplePerfectTile", [1, 4, 1, 1]), + ("SamplePerfectTile", [1, 2, 1, 16]), + ("SamplePerfectTile", [1, 3]), + ("SamplePerfectTile", [3, 1]), + ("SamplePerfectTile", [4, 1]), + ("SamplePerfectTile", [1, 32]), + ("SampleCategorical", 0), + ("SampleComputeLocation", 7), + ] + decision_1 = [ + ("SamplePerfectTile", [1, 1, 1, 1]), + ("SamplePerfectTile", [2, 4, 1, 1]), + ("SamplePerfectTile", [1, 4, 2, 1]), + ("SamplePerfectTile", [1, 1, 1, 4]), + ("SamplePerfectTile", [1, 4, 1, 1]), + ("SamplePerfectTile", [1, 2, 1, 16]), + ("SamplePerfectTile", [1, 3]), + ("SamplePerfectTile", [3, 1]), + 
("SamplePerfectTile", [4, 1]), + ("SamplePerfectTile", [1, 32]), + ("SampleCategorical", 0), + ("SampleComputeLocation", 11), + ] + decision_2 = [ + ("SamplePerfectTile", [1, 1, 1, 1]), + ("SamplePerfectTile", [2, 4, 1, 1]), + ("SamplePerfectTile", [1, 4, 2, 1]), + ("SamplePerfectTile", [1, 1, 1, 4]), + ("SamplePerfectTile", [1, 4, 1, 1]), + ("SamplePerfectTile", [1, 2, 1, 16]), + ("SamplePerfectTile", [1, 3]), + ("SamplePerfectTile", [3, 1]), + ("SamplePerfectTile", [4, 1]), + ("SamplePerfectTile", [1, 32]), + ("SampleCategorical", 1), + ("SampleComputeLocation", -1), + ] + mod = create_te_workload("CAP", 0) + actual = ms.TuneContext( + mod=mod, + target=_target(), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules="default", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[cap_0, cap_1, cap_2], + expected_decisions=[decision_0, decision_1, decision_2], + ) + + if __name__ == "__main__": test_cpu_c1d() test_cpu_c2d() test_cpu_c3d() + test_cpu_cap() diff --git a/tests/python/unittest/test_meta_schedule_space_cuda.py b/tests/python/unittest/test_meta_schedule_space_cuda.py index 277f74d888d01..bffb80436cad5 100644 --- a/tests/python/unittest/test_meta_schedule_space_cuda.py +++ b/tests/python/unittest/test_meta_schedule_space_cuda.py @@ -295,7 +295,109 @@ def c3d_0(inputs: T.Buffer[(1, 16, 224, 224, 3), "float32"], weight: T.Buffer[(7 ) +def test_cuda_cap(): + # fmt: off + @T.prim_func + def cap_0(inputs: T.Buffer[(1, 16, 16, 4, 4, 32), "float32"], weight: T.Buffer[(3, 3, 4, 4, 32, 32), "float32"], conv2d_capsule_nhwijc: T.Buffer[(1, 8, 8, 4, 4, 32), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + with T.block("root"): + T.reads() + T.writes() + T.block_attr({"meta_schedule.unroll_explicit":64}) + conv2d_capsule_nhwijc_local = T.alloc_buffer([1, 8, 8, 4, 4, 32], dtype="float32", scope="local") + PadInput_shared = T.alloc_buffer([1, 18, 18, 4, 4, 32], dtype="float32", scope="shared") + weight_shared = T.alloc_buffer([3, 3, 4, 4, 32, 32], dtype="float32", scope="shared") + for i0_0_i1_0_i2_0_i3_0_i4_0_i5_0_fused in T.thread_binding(256, thread="blockIdx.x"): + for i0_1_i1_1_i2_1_i3_1_i4_1_i5_1_fused in T.thread_binding(1, thread="vthread.x"): + for i0_2_i1_2_i2_2_i3_2_i4_2_i5_2_fused in T.thread_binding(4, thread="threadIdx.x"): + for i6_0, i7_0, i8_0, i9_0 in T.grid(3, 3, 2, 8): + for ax0_ax1_ax2_ax3_ax4_ax5_fused in T.serial(48): + with T.block("PadInput_shared"): + v0 = T.axis.spatial(1, 0) + v1 = T.axis.spatial(18, i0_0_i1_0_i2_0_i3_0_i4_0_i5_0_fused // 64 * 4 + i6_0 + ax0_ax1_ax2_ax3_ax4_ax5_fused % 48 // 16) + v2 = T.axis.spatial(18, i0_0_i1_0_i2_0_i3_0_i4_0_i5_0_fused % 64 // 8 * 2 + i7_0 + 0) + v3 = T.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_i4_0_i5_0_fused % 8 // 4 * 2 + ax0_ax1_ax2_ax3_ax4_ax5_fused % 16 // 8) + v4 = T.axis.spatial(4, i8_0 * 2 + ax0_ax1_ax2_ax3_ax4_ax5_fused % 8 // 4) + v5 = T.axis.spatial(32, i9_0 * 4 + ax0_ax1_ax2_ax3_ax4_ax5_fused % 4) + T.reads(inputs[v0, v1 - 1, v2 - 1, v3, v4, v5]) + T.writes(PadInput_shared[v0, v1, v2, v3, v4, v5]) + T.block_attr({"meta_schedule.cooperative_fetch":2}) + PadInput_shared[v0, v1, v2, v3, v4, v5] = T.if_then_else(1 <= v1 and v1 < 17 and 1 <= v2 and v2 < 17, inputs[v0, v1 - 1, v2 - 1, v3, v4, v5], T.float32(0), dtype="float32") + for ax0_ax1_ax2_ax3_ax4_ax5_fused in T.serial(256): + with T.block("weight_shared"): + v0, v1 = T.axis.remap("SS", [i6_0, i7_0]) + v2 = T.axis.spatial(4, i8_0 * 2 + 
ax0_ax1_ax2_ax3_ax4_ax5_fused // 128) + v3 = T.axis.spatial(4, ax0_ax1_ax2_ax3_ax4_ax5_fused % 128 // 32) + v4 = T.axis.spatial(32, i9_0 * 4 + ax0_ax1_ax2_ax3_ax4_ax5_fused % 32 // 8) + v5 = T.axis.spatial(32, i0_0_i1_0_i2_0_i3_0_i4_0_i5_0_fused % 4 * 8 + ax0_ax1_ax2_ax3_ax4_ax5_fused % 8) + T.reads(weight[v0, v1, v2, v3, v4, v5]) + T.writes(weight_shared[v0, v1, v2, v3, v4, v5]) + T.block_attr({"meta_schedule.cooperative_fetch":4}) + weight_shared[v0, v1, v2, v3, v4, v5] = weight[v0, v1, v2, v3, v4, v5] + for i6_1, i7_1, i8_1, i9_1, i0_3, i1_3, i2_3, i3_3, i4_3, i5_3, i6_2, i7_2, i8_2, i9_2, i0_4, i1_4, i2_4, i3_4, i4_4, i5_4 in T.grid(1, 1, 1, 4, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 8): + with T.block("conv2d_capsule_nhwijc"): + n = T.axis.spatial(1, i0_4 + i0_3 + 0 + 0 + 0) + h = T.axis.spatial(8, (i0_0_i1_0_i2_0_i3_0_i4_0_i5_0_fused % 256 // 64 + 0 + 0) * 2 + i1_3 + i1_4) + w = T.axis.spatial(8, i0_0_i1_0_i2_0_i3_0_i4_0_i5_0_fused % 64 // 8 + 0 + 0 + i2_3 + i2_4) + cap_i = T.axis.spatial(4, (i0_0_i1_0_i2_0_i3_0_i4_0_i5_0_fused % 8 // 4 + 0) * 2 + i0_2_i1_2_i2_2_i3_2_i4_2_i5_2_fused % 4 // 2 + i3_3 + i3_4) + cap_j = T.axis.spatial(4, ((0 + 0) * 2 + i0_2_i1_2_i2_2_i3_2_i4_2_i5_2_fused % 2 + i4_3) * 2 + i4_4) + co = T.axis.spatial(32, (i0_0_i1_0_i2_0_i3_0_i4_0_i5_0_fused % 4 + 0 + 0 + i5_3) * 8 + i5_4) + rh = T.axis.reduce(3, i6_0 + i6_1 + i6_2) + rw = T.axis.reduce(3, i7_0 + i7_1 + i7_2) + cap_k = T.axis.reduce(4, (i8_0 + i8_1) * 2 + i8_2) + rc = T.axis.reduce(32, i9_0 * 4 + i9_1 + i9_2) + T.reads(PadInput_shared[n, h * 2 + rh, w * 2 + rw, cap_i, cap_k, rc], weight_shared[rh, rw, cap_k, cap_j, rc, co]) + T.writes(conv2d_capsule_nhwijc_local[n, h, w, cap_i, cap_j, co]) + T.block_attr({"meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS"}) + with T.init(): + conv2d_capsule_nhwijc_local[n, h, w, cap_i, cap_j, co] = T.float32(0) + conv2d_capsule_nhwijc_local[n, h, w, cap_i, cap_j, co] = conv2d_capsule_nhwijc_local[n, h, w, cap_i, cap_j, co] + PadInput_shared[n, h * 2 + rh, w * 2 + rw, cap_i, cap_k, rc] * weight_shared[rh, rw, cap_k, cap_j, rc, co] + for ax0, ax1, ax2, ax3, ax4, ax5 in T.grid(1, 2, 1, 1, 2, 8): + with T.block("conv2d_capsule_nhwijc_local"): + v0 = T.axis.spatial(1, ax0) + v1 = T.axis.spatial(8, i0_0_i1_0_i2_0_i3_0_i4_0_i5_0_fused // 64 * 2 + ax1) + v2 = T.axis.spatial(8, i0_0_i1_0_i2_0_i3_0_i4_0_i5_0_fused % 64 // 8 + ax2) + v3 = T.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_i4_0_i5_0_fused % 8 // 4 * 2 + i0_2_i1_2_i2_2_i3_2_i4_2_i5_2_fused // 2 + ax3) + v4 = T.axis.spatial(4, i0_2_i1_2_i2_2_i3_2_i4_2_i5_2_fused % 2 * 2 + ax4) + v5 = T.axis.spatial(32, i0_0_i1_0_i2_0_i3_0_i4_0_i5_0_fused % 4 * 8 + ax5) + T.reads(conv2d_capsule_nhwijc_local[v0, v1, v2, v3, v4, v5]) + T.writes(conv2d_capsule_nhwijc[v0, v1, v2, v3, v4, v5]) + conv2d_capsule_nhwijc[v0, v1, v2, v3, v4, v5] = conv2d_capsule_nhwijc_local[v0, v1, v2, v3, v4, v5] + # fmt: on + decision_0 = [ + ("SamplePerfectTile", [1, 1, 1, 1, 1]), + ("SamplePerfectTile", [4, 1, 1, 2, 1]), + ("SamplePerfectTile", [8, 1, 1, 1, 1]), + ("SamplePerfectTile", [2, 1, 2, 1, 1]), + ("SamplePerfectTile", [1, 1, 2, 1, 2]), + ("SamplePerfectTile", [4, 1, 1, 1, 8]), + ("SamplePerfectTile", [3, 1, 1]), + ("SamplePerfectTile", [3, 1, 1]), + ("SamplePerfectTile", [2, 1, 2]), + ("SamplePerfectTile", [8, 4, 1]), + ("SampleCategorical", 1), + ("SampleCategorical", 3), + ("SampleCategorical", 2), + ] + mod = create_te_workload("CAP", 0) + actual = ms.TuneContext( 
+ mod=mod, + target=_target(), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules="default", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[cap_0], + expected_decisions=[decision_0], + ) + + if __name__ == "__main__": test_cuda_c1d() test_cuda_c2d() test_cuda_c3d() + test_cuda_cap() From 927620e20fe226578fa0c32b9706d27874791b83 Mon Sep 17 00:00:00 2001 From: Ivy Zhang Date: Mon, 11 Jul 2022 08:55:57 +0800 Subject: [PATCH 079/111] [BYOC-DNNL] support more post-ops (#12002) * support post-op swish * support post-op clip * enhance get_shape and get_dtype in dnnl.py to support EfficientNet * add checks that with_eltwise is in the supported list * fix lint * fix test --- python/tvm/relay/op/contrib/dnnl.py | 22 ++- src/relay/backend/contrib/dnnl/codegen.cc | 9 ++ src/relay/backend/utils.h | 4 + src/runtime/contrib/dnnl/dnnl_json_runtime.cc | 12 +- tests/python/contrib/test_dnnl.py | 126 +++++------------- .../python/relay/test_pass_partition_graph.py | 26 +++- 6 files changed, 95 insertions(+), 104 deletions(-) diff --git a/python/tvm/relay/op/contrib/dnnl.py b/python/tvm/relay/op/contrib/dnnl.py index b3ef478f201db..9b6b45240a50c 100644 --- a/python/tvm/relay/op/contrib/dnnl.py +++ b/python/tvm/relay/op/contrib/dnnl.py @@ -51,6 +51,7 @@ logger = logging.getLogger("DNNL") +supported_post_elts = ["nn.relu", "tanh", "sigmoid", "clip", "gelu", "swish", None] def _register_external_op_helper(op_name, supported=True): @@ -120,6 +121,8 @@ def make_conv_pattern(conv_name, with_bias=True, with_eltwise=None): conv_out : CallPattern Call node sequence. """ + if with_eltwise not in supported_post_elts: + raise ValueError("Unsupported eltwise post-op: %s" % with_eltwise) data = wildcard() weight = wildcard() bias = wildcard() @@ -128,8 +131,11 @@ def make_conv_pattern(conv_name, with_bias=True, with_eltwise=None): conv_out = is_op("add")(conv, bias) else: conv_out = conv - if with_eltwise: - return is_op(with_eltwise)(conv_out) + if with_eltwise == "swish": + sig_out = is_op("sigmoid")(conv_out) + conv_out = is_op("multiply")(conv_out, sig_out) + elif with_eltwise: + conv_out = is_op(with_eltwise)(conv_out) return conv_out @@ -147,6 +153,8 @@ def make_dense_pattern(with_bias=True, with_eltwise=None): dense_out : CallPattern Call node sequence.
""" + if with_eltwise not in supported_post_elts: + raise ValueError("Unsupported eltwise post-op: %s" % with_eltwise) data = wildcard() weight = wildcard() bias = wildcard() @@ -165,6 +173,9 @@ def make_dense_pattern(with_bias=True, with_eltwise=None): added_erf_val = is_op("add")(erf_val, const2) mul_val = is_op("multiply")(dense_out, added_erf_val) dense_out = is_op("multiply")(mul_val, const3) + elif with_eltwise == "swish": + sig_out = is_op("sigmoid")(dense_out) + dense_out = is_op("multiply")(dense_out, sig_out) elif with_eltwise: dense_out = is_op(with_eltwise)(dense_out) return dense_out @@ -191,6 +202,7 @@ def make_dnnl_pattern(op_name, with_bias, with_eltwise): pat_name = "dnnl.deconv" + op_name.split("_")[0][-2::] pat_name += "_bias" if with_bias else "" pat_name += ("_" + with_eltwise.split(".")[-1]) if with_eltwise else "" + pat_name = pat_name.replace("_swish", "_sigmoid_mul") if "conv" in op_name: dnnl_pattern = (pat_name, make_conv_pattern(op_name, with_bias, with_eltwise)) elif op_name == "nn.dense": @@ -282,7 +294,7 @@ def pattern_table(): dnnl_patterns.append(make_qnn_conv2d_pattern()) dnnl_patterns.append(make_qnn_dense_pattern()) - elt_list = ["nn.relu", "tanh", "sigmoid", "gelu", None] + elt_list = ["nn.relu", "tanh", "sigmoid", "clip", "gelu", "swish", None] for with_bias in [True, False]: for elt in elt_list: if not with_bias and not elt: @@ -380,6 +392,8 @@ def get_shape(tensor): if isinstance(tensor, tvm.ir.container.Array): return tensor[-1].shape if isinstance(tensor, relay.expr.Call): + if tensor.op.name == "multiply": + return tensor.type_args[0].shape return tensor.checked_type.shape raise TypeError("Unsupport data type: %s" % type(tensor)) @@ -395,6 +409,8 @@ def get_dtype(tensor): if isinstance(tensor, tvm.ir.container.Array): return tensor[-1].dtype if isinstance(tensor, relay.expr.Call): + if tensor.op.name == "multiply": + return tensor.type_args[0].dtype return tensor.checked_type.dtype raise TypeError("Unsupport data type: %s" % type(tensor)) diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc index 2f47c23a7cf9b..4abfc9d9b136e 100644 --- a/src/relay/backend/contrib/dnnl/codegen.cc +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -470,6 +470,8 @@ class DNNLJSONSerializer : public backend::contrib::JSONSerializer { {"relu", "nn.relu"}, {"tanh", "tanh"}, {"sigmoid", "sigmoid"}, + {"clip", "clip"}, + {"mul", "multiply"}, {"nn.deconv2d", "nn.conv2d_transpose"}, {"nn.deconv3d", "nn.conv3d_transpose"}, }; @@ -566,6 +568,13 @@ class DNNLJSONSerializer : public backend::contrib::JSONSerializer { "kernel", /* op_type_ */ inputs, 1 /* num_outputs_ */); SetCallNodeAttribute(node, call); + // If has post-op `clip`. Assume the last op is clip, add clip's attrs to the pattern attrs. + if (name.find("_clip") != std::string::npos) { + auto clip_call = cn->op.as()->body.as(); + ICHECK(IsOp(clip_call, "clip")); + SetCallNodeAttribute(node, clip_call); + } + // For QNN. 
for (const auto& kvp : extra_attrs) node->SetAttr(kvp.first, kvp.second); return AddNode(node, GetRef(cn)); diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h index d6fae8c72b5e6..57c066131181a 100644 --- a/src/relay/backend/utils.h +++ b/src/relay/backend/utils.h @@ -470,6 +470,10 @@ inline const CallNode* GetRootCall(const CallNode* current_call, int depth, current_call->args[valid_node_idx].as()) { valid_node_idx++; } + while (valid_node_idx < current_call->args.size() && + !(IsOp(current_call->args[valid_node_idx].as(), expected_op_names[depth - 1]))) { + valid_node_idx++; + } const auto* next_call = current_call->args[valid_node_idx].as(); return GetRootCall(next_call, depth - 1, expected_op_names); } diff --git a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc index a46f170fea949..6c0fd64066e56 100644 --- a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc +++ b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc @@ -189,6 +189,7 @@ class DNNLJSONRuntime : public JSONRuntimeBase { std::regex relu_pat(".*_relu.*"); std::regex tanh_pat(".*_tanh.*"); std::regex sigmoid_pat(".*_sigmoid.*"); + std::regex clip_pat(".*_clip.*"); std::regex gelu_pat(".*_gelu.*"); // Parsing post-ops. @@ -199,8 +200,17 @@ class DNNLJSONRuntime : public JSONRuntimeBase { if (std::regex_match(op_name, tanh_pat)) { ops.append_eltwise(1.f, dnnl::algorithm::eltwise_tanh, 0.f, 0.f); } + if (std::regex_match(op_name, clip_pat)) { + float a_min = GetNodeAttr(nodes_[nid], "a_min"); + float a_max = GetNodeAttr(nodes_[nid], "a_max"); + ops.append_eltwise(1.f, dnnl::algorithm::eltwise_clip, a_min, a_max); + } if (std::regex_match(op_name, sigmoid_pat)) { - ops.append_eltwise(1.f, dnnl::algorithm::eltwise_logistic, 0.f, 0.f); + if (op_name.find("_sigmoid_mul") != std::string::npos) { + ops.append_eltwise(1.f, dnnl::algorithm::eltwise_swish, 1.f, 1.f); + } else { + ops.append_eltwise(1.f, dnnl::algorithm::eltwise_logistic, 0.f, 0.f); + } } if (std::regex_match(op_name, gelu_pat)) { ops.append_eltwise(1.f, dnnl::algorithm::eltwise_gelu_erf, 0.f, 0.f); diff --git a/tests/python/contrib/test_dnnl.py b/tests/python/contrib/test_dnnl.py index 078483798c6dd..6c7034741a37d 100755 --- a/tests/python/contrib/test_dnnl.py +++ b/tests/python/contrib/test_dnnl.py @@ -192,7 +192,6 @@ def run_and_verify(mod, input, params, target, run_module, subgraph_num=None, te if use_dnnl: processed_mod = partition_for_dnnl(processed_mod, params, alter_layout) check_dnnl_used(processed_mod) - with tvm.transform.PassContext(opt_level=3): func = relay.create_executor( mode, mod=processed_mod, device=dev, target=target @@ -237,6 +236,23 @@ def run_and_verify_func( ) +def add_activation(activation, out, dic, param_lst): + if activation == "relu": + return relay.nn.relu(out), dic, param_lst + elif activation == "tanh": + return relay.tanh(out), dic, param_lst + elif activation == "sigmoid": + return relay.sigmoid(out), dic, param_lst + elif activation == "clip": + return relay.clip(out, 0.0, 6.0), dic, param_lst + elif activation == "swish": + sig_out = relay.sigmoid(out) + out = relay.multiply(out, sig_out) + return out, dic, param_lst + else: + return out, dic, param_lst + + def get_conv1d( x_shape=((1, 3, 224)), k_shape=(16, 3, 3), @@ -262,15 +278,7 @@ def get_conv1d( ) dic = {"x": x_shape, "kernel": k_shape} param_lst = ["kernel"] - - if activation == "relu": - return relay.nn.relu(out), dic, param_lst - elif activation == "tanh": - return relay.tanh(out), dic, param_lst - elif activation == 
"sigmoid": - return relay.sigmoid(out), dic, param_lst - else: - return out, dic, param_lst + return add_activation(activation, out, dic, param_lst) def get_conv1d_bias(x_shape=(1, 3, 224), k_shape=(10, 3, 3), activation=None, dtype="float32"): @@ -279,15 +287,7 @@ def get_conv1d_bias(x_shape=(1, 3, 224), k_shape=(10, 3, 3), activation=None, dt out = relay.nn.bias_add(conv, bias) dic["bias"] = (k_shape[0],) param_lst += ["bias"] - - if activation == "relu": - return relay.nn.relu(out), dic, param_lst - elif activation == "tanh": - return relay.tanh(out), dic, param_lst - elif activation == "sigmoid": - return relay.sigmoid(out), dic, param_lst - else: - return out, dic, param_lst + return add_activation(activation, out, dic, param_lst) def get_conv1d_bias_bn_relu(x_shape=(1, 3, 224), k_shape=(10, 3, 3), dtype="float32"): @@ -334,15 +334,7 @@ def get_conv2d( ) dic = {"x": x_shape, "kernel": k_shape} param_lst = ["kernel"] - - if activation == "relu": - return relay.nn.relu(out), dic, param_lst - elif activation == "tanh": - return relay.tanh(out), dic, param_lst - elif activation == "sigmoid": - return relay.sigmoid(out), dic, param_lst - else: - return out, dic, param_lst + return add_activation(activation, out, dic, param_lst) def get_conv2d_transpose( @@ -367,15 +359,7 @@ def get_conv2d_transpose( ) dic = {"x": x_shape, "kernel": k_shape} param_lst = ["kernel"] - - if activation == "relu": - return relay.nn.relu(out), dic, param_lst - elif activation == "tanh": - return relay.tanh(out), dic, param_lst - elif activation == "sigmoid": - return relay.sigmoid(out), dic, param_lst - else: - return out, dic, param_lst + return add_activation(activation, out, dic, param_lst) def get_conv2d_weights_const( @@ -412,15 +396,7 @@ def get_conv2d_bias( out = relay.nn.bias_add(conv, bias) dic["bias"] = (k_shape[0],) param_lst += ["bias"] - - if activation == "relu": - return relay.nn.relu(out), dic, param_lst - elif activation == "tanh": - return relay.tanh(out), dic, param_lst - elif activation == "sigmoid": - return relay.sigmoid(out), dic, param_lst - else: - return out, dic, param_lst + return add_activation(activation, out, dic, param_lst) def get_conv2d_transpose_bias( @@ -431,15 +407,7 @@ def get_conv2d_transpose_bias( out = relay.nn.bias_add(conv, bias) dic["bias"] = (k_shape[1],) param_lst += ["bias"] - - if activation == "relu": - return relay.nn.relu(out), dic, param_lst - elif activation == "tanh": - return relay.tanh(out), dic, param_lst - elif activation == "sigmoid": - return relay.sigmoid(out), dic, param_lst - else: - return out, dic, param_lst + return add_activation(activation, out, dic, param_lst) def get_conv2d_bias_bn_relu(x_shape=(1, 32, 8, 8), k_shape=(16, 32, 3, 3), dtype="float32"): @@ -503,15 +471,7 @@ def get_conv3d( ) dic = {"x": x_shape, "kernel": k_shape} param_lst = ["kernel"] - - if activation == "relu": - return relay.nn.relu(out), dic, param_lst - elif activation == "tanh": - return relay.tanh(out), dic, param_lst - elif activation == "sigmoid": - return relay.sigmoid(out), dic, param_lst - else: - return out, dic, param_lst + return add_activation(activation, out, dic, param_lst) def get_conv3d_transpose( @@ -542,15 +502,7 @@ def get_conv3d_transpose( ) dic = {"x": x_shape, "kernel": k_shape} param_lst = ["kernel"] - - if activation == "relu": - return relay.nn.relu(out), dic, param_lst - elif activation == "tanh": - return relay.tanh(out), dic, param_lst - elif activation == "sigmoid": - return relay.sigmoid(out), dic, param_lst - else: - return out, dic, param_lst 
+ return add_activation(activation, out, dic, param_lst) def get_conv3d_bias( @@ -561,15 +513,7 @@ def get_conv3d_bias( out = relay.nn.bias_add(conv, bias) dic["bias"] = (k_shape[0],) param_lst += ["bias"] - - if activation == "relu": - return relay.nn.relu(out), dic, param_lst - elif activation == "tanh": - return relay.tanh(out), dic, param_lst - elif activation == "sigmoid": - return relay.sigmoid(out), dic, param_lst - else: - return out, dic, param_lst + return add_activation(activation, out, dic, param_lst) def get_conv3d_transpose_bias( @@ -580,15 +524,7 @@ def get_conv3d_transpose_bias( out = relay.nn.bias_add(conv, bias) dic["bias"] = (k_shape[1],) param_lst += ["bias"] - - if activation == "relu": - return relay.nn.relu(out), dic, param_lst - elif activation == "tanh": - return relay.tanh(out), dic, param_lst - elif activation == "sigmoid": - return relay.sigmoid(out), dic, param_lst - else: - return out, dic, param_lst + return add_activation(activation, out, dic, param_lst) def gelu_helper(data): @@ -797,7 +733,7 @@ def test_conv2d_weights_const(run_module, dtype="float32"): def test_conv2d_pattern(run_module, dtype="float32"): x_shape = (1, 32, 8, 8) k_shape = (16, 32, 3, 3) - activation_lst = [None, "relu", "tanh", "sigmoid"] + activation_lst = [None, "relu", "tanh", "sigmoid", "clip", "swish"] for a in activation_lst: conv2d, dic, param_lst = get_conv2d(x_shape, k_shape, activation=a, dtype=dtype) conv2d = tvm.IRModule.from_expr(conv2d) @@ -839,7 +775,7 @@ def test_conv2d_transpose(run_module, dtype="float32"): def test_conv2d_transpose_pattern(run_module, dtype="float32"): - activation_lst = [None, "relu", "tanh", "sigmoid"] + activation_lst = [None, "relu", "tanh", "sigmoid", "clip", "swish"] for a in activation_lst: conv2d, dic, param_lst = get_conv2d_transpose(activation=a, dtype=dtype) conv2d = tvm.IRModule.from_expr(conv2d) @@ -872,7 +808,7 @@ def test_conv3d(run_module, dtype="float32"): def test_conv3d_pattern(run_module, dtype="float32"): - activation_lst = [None, "relu", "tanh", "sigmoid"] + activation_lst = [None, "relu", "tanh", "sigmoid", "clip", "swish"] for a in activation_lst: conv3d, dic, param_lst = get_conv3d(activation=a, dtype=dtype) conv3d = tvm.IRModule.from_expr(conv3d) @@ -905,7 +841,7 @@ def test_conv3d_transpose(run_module, dtype="float32"): def test_conv3d_transpose_pattern(run_module, dtype="float32"): - activation_lst = [None, "relu", "tanh", "sigmoid"] + activation_lst = [None, "relu", "tanh", "sigmoid", "clip", "swish"] for a in activation_lst: conv3d, dic, param_lst = get_conv3d_transpose(activation=a, dtype=dtype) conv3d = tvm.IRModule.from_expr(conv3d) diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index 58b41189a0f0c..4b7ac92136e9b 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -919,6 +919,7 @@ def expected(): def test_dnnl_fuse(): dnnl_patterns = get_pattern_table("dnnl") + dnnl_pat_dic = dict(dnnl_patterns) ( conv2d_bias_relu_pat, conv2d_bias_sigmoid_pat, @@ -926,11 +927,26 @@ def test_dnnl_fuse(): conv2d_relu_pat, conv2d_sigmoid_pat, ) = ( - dnnl_patterns[3], - dnnl_patterns[15], - dnnl_patterns[22], - dnnl_patterns[28], - dnnl_patterns[40], + ( + "dnnl.conv2d_bias_relu", + dnnl_pat_dic["dnnl.conv2d_bias_relu"], + ), + ( + "dnnl.conv2d_bias_sigmoid", + dnnl_pat_dic["dnnl.conv2d_bias_sigmoid"], + ), + ( + "dnnl.conv2d_bias", + dnnl_pat_dic["dnnl.conv2d_bias"], + ), + ( + "dnnl.conv2d_relu", + 
dnnl_pat_dic["dnnl.conv2d_relu"], + ), + ( + "dnnl.conv2d_sigmoid", + dnnl_pat_dic["dnnl.conv2d_sigmoid"], + ), ) def get_blocks( From cf15375e20cd1fb2e201e47afda169732ef88eea Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Sun, 10 Jul 2022 20:05:21 -0500 Subject: [PATCH 080/111] Several type mismatch fixes and checks (#12041) * Compute common type for shape elements in BroadcastHelper The corresponding dimensions in the input/output tensors in a broadcast operations may have the same value, but different types (e.g. int32 vs int64). When the broadcast helper tries to unify the dimensions it also needs to compute the common type to hold the dimension. * Cast and simplify both members of `Range` Only the `min` member was type-casted, which could lead to ranges with different types for `min` and `extent`. Move the casts to the argument of Simplify, so that they can be eliminated if they aren't needed. * Type-check iv domain ranges, use cast only if needed in MakeLoopNest In some cases the domain ranges had the `min` and the `extent` values be of different types (e.g. [(int64)0, 32)). This is an error, and it can lead to compilation failures later on. Add a check for equal types here to catch this early. Also, only add the cast operation when the desired type differs from the current one to keep the expressions simpler. * Check that variable and substituted expression have same types Add a check to IRSubstitute to detect when the type of a variable and the type of the expression to replace it with have different types. * Add testcase * [TVMScript] Use void for lambda parameters, allow mismatch in Substitute When the script parser deals with lambdas, it creates Var objects for each parameter. Their actual types are not known at the time, and the properly typed variables are subtituted in the body later. Since the default dtype of a Var is "int32", this could lead to a type mismatch in Substitute. To deal with this scenario, use "void" for newly created Vars in the parser, and add an exception to Substitute to allow replacing void Vars with expressions of any type. * Fix type error in test_reduce_combiner_simplify * Restart CI Co-authored-by: Jiawei Liu --- include/tvm/topi/detail/broadcast.h | 30 ++++++++--- python/tvm/script/parser.py | 5 +- src/te/operation/op_utils.cc | 52 ++++++++++--------- src/te/operation/op_utils.h | 12 ++--- src/te/schedule/bound.cc | 7 +-- src/tir/ir/stmt_functor.cc | 13 ++++- tests/python/relay/test_op_level10.py | 17 ++++++ .../unittest/test_arith_canonical_simplify.py | 2 +- 8 files changed, 93 insertions(+), 45 deletions(-) diff --git a/include/tvm/topi/detail/broadcast.h b/include/tvm/topi/detail/broadcast.h index 5c701825840c8..c861fbb71b2a3 100644 --- a/include/tvm/topi/detail/broadcast.h +++ b/include/tvm/topi/detail/broadcast.h @@ -42,6 +42,12 @@ struct BroadcastHelper { std::deque vars2; }; +static inline DataType CommonType(DataType type1, DataType type2) { + ICHECK(type1.is_scalar() && type2.is_scalar()); + ICHECK(type1.code() == type2.code()); + return DataType(type1.code(), std::max(type1.bits(), type2.bits()), /*lanes=*/1); +} + inline BroadcastHelper BroadcastShape(const tvm::Array& shape1, const tvm::Array& shape2) { BroadcastHelper bh; @@ -49,32 +55,40 @@ inline BroadcastHelper BroadcastShape(const tvm::Array& shape1, int s2_size = shape2.size(); tvm::PrimExpr one(1); int i; + + auto cast_if_needed = [](DataType to_type, PrimExpr expr) { + return to_type != expr.dtype() ? 
cast(to_type, expr) : expr; + }; + for (i = 1; i <= std::min(s1_size, s2_size); ++i) { // TODO(@icemelon9): Need to revisit this part const IntImmNode* static_size1 = shape1[s1_size - i].as(); const IntImmNode* static_size2 = shape2[s2_size - i].as(); - bh.all_vars.push_front(tvm::tir::Var()); + DataType common_type = CommonType(shape1[s1_size - i].dtype(), shape2[s2_size - i].dtype()); + + bh.all_vars.push_front(tvm::tir::Var("dim", common_type)); if (topi::detail::EqualCheck(shape1[s1_size - i], shape2[s2_size - i])) { - bh.common_shape.push_front(shape1[s1_size - i]); + bh.common_shape.push_front(cast_if_needed(common_type, shape1[s1_size - i])); bh.vars1.push_front(bh.all_vars[0]); bh.vars2.push_front(bh.all_vars[0]); } else if (topi::detail::EqualCheck(one, shape1[s1_size - i])) { ICHECK(!topi::detail::EqualCheck(one, shape2[s2_size - i])); - bh.common_shape.push_front(shape2[s2_size - i]); + bh.common_shape.push_front(cast_if_needed(common_type, shape2[s2_size - i])); bh.vars2.push_front(bh.all_vars[0]); } else if (topi::detail::EqualCheck(one, shape2[s2_size - i])) { - bh.common_shape.push_front(shape1[s1_size - i]); + bh.common_shape.push_front(cast_if_needed(common_type, shape1[s1_size - i])); bh.vars1.push_front(bh.all_vars[0]); } else if (!static_size1 && !static_size2) { - bh.common_shape.push_front(max(shape1[s1_size - i], shape2[s2_size - i])); + bh.common_shape.push_front( + cast_if_needed(common_type, max(shape1[s1_size - i], shape2[s2_size - i]))); bh.vars1.push_front(bh.all_vars[0]); bh.vars2.push_front(bh.all_vars[0]); } else if (!static_size1) { - bh.common_shape.push_front(shape2[s2_size - i]); + bh.common_shape.push_front(cast_if_needed(common_type, shape2[s2_size - i])); bh.vars2.push_front(bh.all_vars[0]); bh.vars1.push_front(bh.all_vars[0]); } else if (!static_size2) { - bh.common_shape.push_front(shape1[s1_size - i]); + bh.common_shape.push_front(cast_if_needed(common_type, shape1[s1_size - i])); bh.vars1.push_front(bh.all_vars[0]); bh.vars2.push_front(bh.all_vars[0]); } else { @@ -89,7 +103,7 @@ inline BroadcastHelper BroadcastShape(const tvm::Array& shape1, auto& shape = (s1_size > s2_size) ? shape1 : shape2; auto& vars = (s1_size > s2_size) ? bh.vars1 : bh.vars2; for (; i <= max_size; ++i) { - bh.all_vars.push_front(tvm::tir::Var()); + bh.all_vars.push_front(tvm::tir::Var("v", shape[max_size - 1].dtype())); bh.common_shape.push_front(shape[max_size - i]); vars.push_front(bh.all_vars[0]); } diff --git a/python/tvm/script/parser.py b/python/tvm/script/parser.py index e4bdd12065066..0932e717bbec2 100644 --- a/python/tvm/script/parser.py +++ b/python/tvm/script/parser.py @@ -526,7 +526,10 @@ def transform_Lambda(self, node): # add parameters of the lambda arg_vars = [] for arg in node.params: - arg_var = tvm.te.var(arg.name) + # Use "void" for dtype here. The actual type is not yet known and will be + # determined later. Using void type will allow IRSubstitute to do the + # replacement without flagging a type-mismatch error. + arg_var = tvm.te.var(arg.name, dtype="") arg_vars.append(arg_var) self.context.update_symbol(arg.name, arg_var, node) diff --git a/src/te/operation/op_utils.cc b/src/te/operation/op_utils.cc index fd2a5c89f324f..8644e75ff056d 100644 --- a/src/te/operation/op_utils.cc +++ b/src/te/operation/op_utils.cc @@ -38,18 +38,16 @@ namespace te { using namespace arith; using namespace tir; -DataType LargerDataType(DataType a, DataType b) { return a.bits() > b.bits() ? 
a : b; } - -std::vector > MakeLoopNest(const Stage& stage, - const std::unordered_map& dom_map, - size_t begin_iter_pos, bool new_loop_var, - const std::unordered_set& skip_iter, - std::unordered_map* p_value_map, - bool debug_keep_trivial_loop) { +std::vector> MakeLoopNest(const Stage& stage, + const std::unordered_map& dom_map, + size_t begin_iter_pos, bool new_loop_var, + const std::unordered_set& skip_iter, + std::unordered_map* p_value_map, + bool debug_keep_trivial_loop) { auto leaf_iter_vars = stage->leaf_iter_vars; Stmt no_op = Evaluate(0); // create the loop nest - std::vector > nest; + std::vector> nest; nest.resize(leaf_iter_vars.size() + 1); std::unordered_map& value_map = *p_value_map; @@ -69,6 +67,10 @@ std::vector > MakeLoopNest(const Stage& stage, Range dom = dom_map.at(iv); + ICHECK(iv->var.dtype() == dom->min.dtype() && iv->var.dtype() == dom->extent.dtype()) + << "iter_var type " << iv->var.dtype() << " and domain types (min:" << dom->min.dtype() + << ", extent:" << dom->extent.dtype() << ") should all be the same"; + // This is a hack to ensure that the replacing expression has the same // dtype as the replacing expression. This happens when a thread/block // itervar is bound to another itervar. Because the thread/block itervar @@ -78,7 +80,9 @@ std::vector > MakeLoopNest(const Stage& stage, // bound to (in `bind`) but that would require inplace modification of the // itervar. // XXX: we will get integer overflow if the bound itervar is greater than int32::max. - auto promote_to_bound_dtype = [&iv](PrimExpr e) { return cast(iv->var.dtype(), e); }; + auto promote_to_iv_dtype = [type = iv->var.dtype()](PrimExpr e) { + return type != e.dtype() ? cast(type, e) : e; + }; // initialize the offset and loop_level Var var = bind_iv->var; @@ -125,15 +129,15 @@ std::vector > MakeLoopNest(const Stage& stage, } } if (!debug_keep_trivial_loop && is_one(dom->extent)) { - nest[i + 1].emplace_back(LetStmt(var, promote_to_bound_dtype(dom->min), no_op)); - value_map[iv] = promote_to_bound_dtype(dom->min); + nest[i + 1].emplace_back(LetStmt(var, dom->min, no_op)); + value_map[iv] = dom->min; } else if (is_zero(dom->min)) { - nest[i + 1].emplace_back(For(var, 0, promote_to_bound_dtype(dom->extent), kind, no_op)); - value_map[iv] = promote_to_bound_dtype(var); + nest[i + 1].emplace_back(For(var, 0, dom->extent, kind, no_op)); + value_map[iv] = promote_to_iv_dtype(var); } else { Var idx(bind_iv->var->name_hint + ".idx", iv->var.dtype()); - nest[i + 1].emplace_back(For(idx, 0, promote_to_bound_dtype(dom->extent), kind, no_op)); - PrimExpr new_value = promote_to_bound_dtype(dom->min + idx); + nest[i + 1].emplace_back(For(idx, 0, dom->extent, kind, no_op)); + PrimExpr new_value = dom->min + idx; value_map[iv] = new_value; nest[i + 1].emplace_back(LetStmt(var, new_value, no_op)); } @@ -152,7 +156,7 @@ std::vector > MakeLoopNest(const Stage& stage, ICHECK(is_positive_const(dom->extent)); // annotate the extent of the IterVar nest[i + 1].emplace_back(AttrStmt(bind_iv, tir::attr::virtual_thread, dom->extent, no_op)); - value_map[iv] = promote_to_bound_dtype(var); + value_map[iv] = promote_to_iv_dtype(var); } else if (bind_iv->thread_tag == "pipeline") { // pipeline marker. 
ICHECK(is_zero(dom->min)); @@ -160,7 +164,7 @@ std::vector > MakeLoopNest(const Stage& stage, // annotate the extent of the IterVar nest[i + 1].emplace_back( AttrStmt(bind_iv, tir::attr::pipeline_exec_scope, dom->extent, no_op)); - value_map[iv] = promote_to_bound_dtype(dom->min); + value_map[iv] = dom->min; } else { // Always restrict threaded IterVar to starts from 0. ICHECK(is_zero(dom->min)) << "Itervar " << iv << " must start at zero, but it starts at " @@ -168,28 +172,28 @@ std::vector > MakeLoopNest(const Stage& stage, // annotate the extent of the IterVar nest[i + 1].emplace_back(AttrStmt(bind_iv, tir::attr::thread_extent, dom->extent, no_op)); if (!debug_keep_trivial_loop && is_one(dom->extent)) { - value_map[iv] = promote_to_bound_dtype(dom->min); + value_map[iv] = dom->min; } else if (stage->scope == "") { - value_map[iv] = promote_to_bound_dtype(var); + value_map[iv] = promote_to_iv_dtype(var); } else { runtime::ThreadScope ts = runtime::ThreadScope::Create(bind_iv->thread_tag); runtime::StorageScope ss = runtime::StorageScope::Create(stage->scope); if (static_cast(ss.rank) <= ts.rank) { - value_map[iv] = promote_to_bound_dtype(var); + value_map[iv] = promote_to_iv_dtype(var); } else if (stage->scope == "warp" && ts.rank == 1) { // To determine whether a thread index is inside or outside a warp, we need // to know the thread extent. We leave a warning for now. if (ts.dim_index == 0) { - value_map[iv] = promote_to_bound_dtype(var); + value_map[iv] = promote_to_iv_dtype(var); } else { LOG(WARNING) << "WARNING: threadIdx.y or threadIdx.z accessing warp-scope memory detected. " << "TVM assumes only threadIdx.x indicates threads inside a warp, " << "while threadIdx.y and threadIdx.z indicates different warps."; - value_map[iv] = promote_to_bound_dtype(dom->min); + value_map[iv] = dom->min; } } else { - value_map[iv] = promote_to_bound_dtype(dom->min); + value_map[iv] = dom->min; } } } diff --git a/src/te/operation/op_utils.h b/src/te/operation/op_utils.h index 02f4a860a01d9..f2e5782bf46f9 100644 --- a/src/te/operation/op_utils.h +++ b/src/te/operation/op_utils.h @@ -51,12 +51,12 @@ using tir::MergeNest; * \param p_value_map The result value of each IterVar. * \param debug_keep_trivial_loop Whether keep trivial loops with extent of 1 */ -std::vector > MakeLoopNest(const Stage& stage, - const std::unordered_map& dom_map, - size_t begin_iter_pos, bool new_loop_var, - const std::unordered_set& skip_iter, - std::unordered_map* p_value_map, - bool debug_keep_trivial_loop); +std::vector> MakeLoopNest(const Stage& stage, + const std::unordered_map& dom_map, + size_t begin_iter_pos, bool new_loop_var, + const std::unordered_set& skip_iter, + std::unordered_map* p_value_map, + bool debug_keep_trivial_loop); /*! * \brief Create a nest of if checking the predicates. 
diff --git a/src/te/schedule/bound.cc b/src/te/schedule/bound.cc index 87a175a344378..d8abffd6aa06e 100644 --- a/src/te/schedule/bound.cc +++ b/src/te/schedule/bound.cc @@ -247,10 +247,11 @@ Map InferBound(const Schedule& sch) { } } for (auto it = ret.begin(); it != ret.end(); it++) { + DataType var_type = it->first->var.dtype(); it->second = Range::FromMinExtent( - analyzer.Simplify(it->second->min), - // The range associated with each itervar must have the same dtype as it - cast(it->first->var.dtype(), analyzer.Simplify(it->second->extent))); + // The range associated with each itervar must have the same dtype as the var + analyzer.Simplify(cast(var_type, it->second->min)), + analyzer.Simplify(cast(var_type, it->second->extent))); } return Map(ret.begin(), ret.end()); } diff --git a/src/tir/ir/stmt_functor.cc b/src/tir/ir/stmt_functor.cc index 34bbb4b46ba4e..c0abf953eec2b 100644 --- a/src/tir/ir/stmt_functor.cc +++ b/src/tir/ir/stmt_functor.cc @@ -26,7 +26,7 @@ #include -#include "./functor_common.h" +#include "functor_common.h" namespace tvm { namespace tir { @@ -647,7 +647,16 @@ class IRSubstitute : public StmtExprMutator { PrimExpr VisitExpr_(const VarNode* op) final { Var var = GetRef(op); auto ret = vmap_(var); - if (ret.defined()) return ret.value(); + if (ret.defined()) { + // Allow substitution of void variables with any expression. The TVM script parser + // uses void variables for lambda parameters (since exact types are not known yet). + if (!var.dtype().is_void()) { + PrimExpr ret_ex = Downcast(ret.value()); + ICHECK(ret_ex.dtype() == var.dtype()) << "substituting " << var << ":" << var.dtype() + << " -> " << ret_ex << ":" << ret_ex.dtype(); + } + return ret.value(); + } return std::move(var); } diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index a2104e79762a6..8c30ab27ce18d 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -262,6 +262,23 @@ def test_broadcast_concat_shape_int64(executor_kind): tvm.testing.assert_allclose(op_res.numpy(), ref_res) +def test_broadcast_pool2d_shape_int64(executor_kind): + x_shape = (1, 3, 32, 32) + out_shape = (2, 3, 32, 32) + x = relay.var("data", shape=x_shape, dtype="float32") + broadcast_to = relay.broadcast_to(x, shape=relay.const([2, 3, 32, 32], dtype="int64")) + pool2d = relay.nn.max_pool2d(broadcast_to, pool_size=(3, 3), padding=(1, 1, 1, 1)) + sub = relay.subtract(broadcast_to, pool2d) + + f = relay.Function([x], sub) + x = np.ones(x_shape).astype("float32") + ref_res = np.zeros(out_shape).astype("float32") + + for target, dev in tvm.testing.enabled_targets(): + op_res = relay.create_executor(executor_kind, device=dev, target=target).evaluate(f)(x) + tvm.testing.assert_allclose(op_res.numpy(), ref_res) + + @tvm.testing.uses_gpu def test_broadcast_to_like(executor_kind): shape = (4, 1, 6) diff --git a/tests/python/unittest/test_arith_canonical_simplify.py b/tests/python/unittest/test_arith_canonical_simplify.py index 74c8bcb5fddf8..81a163d0d4314 100644 --- a/tests/python/unittest/test_arith_canonical_simplify.py +++ b/tests/python/unittest/test_arith_canonical_simplify.py @@ -161,7 +161,7 @@ def test_reduce_combiner_simplify(): ) sum_and_prod = comm_reducer( lambda x, y: (x[0] + y[0], x[1] * y[1]), - lambda t0, t1: (tvm.tir.const(0, t0), tvm.tir.const(5, t0) - tvm.tir.const(4, t0)), + lambda t0, t1: (tvm.tir.const(0, t0), tvm.tir.const(5, t1) - tvm.tir.const(4, t1)), ) some_reducer1 = comm_reducer( lambda x, y: ( From 
fbb7b5d1a0d82acb1f581dd2ec362b4dcad2638e Mon Sep 17 00:00:00 2001 From: Jiabei Zhao <41840745+Sunny-Island@users.noreply.github.com> Date: Mon, 11 Jul 2022 17:00:21 +0800 Subject: [PATCH 081/111] Add xgboost version restriction (#12050) Co-authored-by: jiabeizhao --- docs/install/from_source.rst | 4 ++-- python/gen_requirements.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst index 92795da9a753d..4b5810cb20fdd 100644 --- a/docs/install/from_source.rst +++ b/docs/install/from_source.rst @@ -323,7 +323,7 @@ like ``virtualenv``. .. code:: bash - pip3 install --user tornado psutil xgboost cloudpickle + pip3 install --user tornado psutil 'xgboost<1.6.0' cloudpickle Note on M1 macs, you may have trouble installing xgboost / scipy. scipy and xgboost requires some additional dependencies to be installed, including openblas and its dependencies. Use the following commands to install scipy and xgboost with the required dependencies and @@ -339,7 +339,7 @@ configuration. A workaround for this is to do the following commands: pip install scipy --no-use-pep517 - pip install xgboost + pip install 'xgboost<1.6.0' Install Contrib Libraries ------------------------- diff --git a/python/gen_requirements.py b/python/gen_requirements.py index 6cb92921f34be..7e2c3e2186184 100755 --- a/python/gen_requirements.py +++ b/python/gen_requirements.py @@ -277,7 +277,7 @@ ("torch", None), ("torchvision", None), ("tornado", None), - ("xgboost", ">=1.1.0"), # From PR #4953. + ("xgboost", ">=1.1.0,<1.6.0"), # From PR #4953 & Issue #12009 ] ################################################################################ From 2d5072858c9749217256913cad3c14fe52be0367 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Mon, 11 Jul 2022 18:09:58 +0800 Subject: [PATCH 082/111] enable bmm (#12018) --- python/tvm/relay/op/contrib/dnnl.py | 4 +- src/runtime/contrib/dnnl/dnnl_json_runtime.cc | 50 ++++++++++++++++++- tests/python/contrib/test_dnnl.py | 29 +++++++++++ 3 files changed, 81 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/op/contrib/dnnl.py b/python/tvm/relay/op/contrib/dnnl.py index 9b6b45240a50c..05416bb9a390b 100644 --- a/python/tvm/relay/op/contrib/dnnl.py +++ b/python/tvm/relay/op/contrib/dnnl.py @@ -105,6 +105,7 @@ def _func_wrapper(expr): _register_external_op_helper("add") _register_external_op_helper("multiply") _register_external_op_helper("nn.layer_norm") +_register_external_op_helper("nn.batch_matmul") def make_conv_pattern(conv_name, with_bias=True, with_eltwise=None): @@ -563,6 +564,7 @@ def visit_call(self, call): "nn.conv3d_transpose", "nn.dense", "nn.layer_norm", + "nn.batch_matmul", ] ) if isinstance(call.op, tvm.tir.op.Op): @@ -679,7 +681,7 @@ def __init__(self): const_two = is_expr(relay.const(2)) | is_expr(relay.const(2.0)) p1 = is_op("power")(cdiff, const_two) mp1 = is_op("mean")(p1) | is_op("variance")(self.data, mu) - eps = is_expr(relay.const(1e-5)) + eps = is_expr(relay.const(1e-5)) | is_expr(relay.const(1e-6)) added_eps = is_op("add")(mp1, eps) deno = is_op("sqrt")(added_eps) div_out = is_op("divide")(diff, deno) diff --git a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc index 6c0fd64066e56..c6e50eafea86b 100644 --- a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc +++ b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc @@ -269,6 +269,8 @@ class DNNLJSONRuntime : public JSONRuntimeBase { Binary(nid, dnnl::algorithm::binary_mul); } else if ("nn.layer_norm" == 
op_name) { LayerNorm(nid); + } else if ("nn.batch_matmul" == op_name) { + BatchMatMul(nid); } else { LOG(FATAL) << "Unsupported op: " << op_name; } @@ -483,6 +485,52 @@ class DNNLJSONRuntime : public JSONRuntimeBase { {sum_in_tr, DNNL_ARG_DST}); } + void BatchMatMul(const size_t& nid) { + auto node = nodes_[nid]; + + // Setup attributes. + auto src_tr = GetInput(nid, 0); + auto wgh_tr = GetInput(nid, 1); + auto dst_tr = GetOutput(nid, 0); + auto bias_tr = TensorRequisite{}; + + auto attr = ParseAttrs(nid, &bias_tr); + attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + + bool transpose_a = GetNodeAttr(node, "transpose_a"); + bool transpose_b = GetNodeAttr(node, "transpose_b"); + + if (transpose_a) { + src_tr = src_tr.Permute({0, 2, 1}); + } + if (transpose_b) { + wgh_tr = wgh_tr.Permute({0, 2, 1}); + } + + // Assumption that bias is correct and can be squeezed to 1D + bias_tr = bias_tr.Reshape({dst_tr.dims()[1]}); + + // Matmul description. + auto bmm_desc = dnnl::matmul::desc(src_tr.LayoutAny().desc(), wgh_tr.LayoutAny().desc(), + bias_tr.LayoutAny().desc(), dst_tr.LayoutAny().desc()); + + // Enable elementwise post-ops. + auto bmm_prim_desc = dnnl::matmul::primitive_desc(bmm_desc, attr, engine_); + + src_tr = src_tr.RequestLayout(bmm_prim_desc.src_desc()); + wgh_tr = wgh_tr.RequestLayout(bmm_prim_desc.weights_desc()); + dst_tr = dst_tr.RequestLayout(bmm_prim_desc.dst_desc()); + bias_tr = bias_tr.RequestLayout(bmm_prim_desc.bias_desc()); + + auto scratchpad_tr = TensorRequisite::AsIs(bmm_prim_desc.scratchpad_desc()); + + Submit(dnnl::matmul(bmm_prim_desc), {{DNNL_ARG_SRC, src_tr}, + {DNNL_ARG_WEIGHTS, wgh_tr}, + {DNNL_ARG_BIAS, bias_tr}, + {DNNL_ARG_SCRATCHPAD, scratchpad_tr}, + {DNNL_ARG_DST, dst_tr}}); + } + void BatchNorm(const size_t& nid) { auto node = nodes_[nid]; @@ -755,7 +803,6 @@ class DNNLJSONRuntime : public JSONRuntimeBase { TensorRequisite GetOutput(const size_t& nid, const int idx) { if (idx == -1) return {}; // -1 reserved value for empty input. 
- const JSONGraphNode& node = nodes_[nid]; ICHECK_LT(idx, node.GetNumOutput()); @@ -764,6 +811,7 @@ class DNNLJSONRuntime : public JSONRuntimeBase { auto eid = node_row_ptr_[nid] + static_cast(idx); ICHECK(data_entry_[eid] == nullptr); + auto desc = MakePlainDesc(shape, dtype); return TensorRequisite::AsIs(desc, eid).Backward(); diff --git a/tests/python/contrib/test_dnnl.py b/tests/python/contrib/test_dnnl.py index 6c7034741a37d..dfe1b7265de8f 100755 --- a/tests/python/contrib/test_dnnl.py +++ b/tests/python/contrib/test_dnnl.py @@ -556,6 +556,35 @@ def get_dense( return out, dic, param_lst +def get_bmm( + x_shape=(1, 16, 8), k_shape=(1, 4, 8), dtype="float32", transpose_a=False, transpose_b=True +): + x = relay.var("x", shape=(x_shape), dtype=dtype) + kernel = relay.var("kernel", shape=(k_shape), dtype=dtype) + out = relay.nn.batch_matmul( + x, kernel, out_dtype=dtype, transpose_a=transpose_a, transpose_b=transpose_b + ) + dic = {"x": x_shape, "kernel": k_shape} + param_lst = ["kernel"] + return out, dic, param_lst + + +def test_bmm(run_module, dtype="float32"): + x_shape = (1, 2, 4) + k_shape = (1, 3, 4) + + dense, dic, param_lst = get_bmm(x_shape, k_shape, dtype=dtype) + dense = tvm.IRModule.from_expr(dense) + config = dense, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + k_shape_t = (1, 4, 3) + dense, dic, param_lst = get_bmm(x_shape, k_shape_t, dtype=dtype, transpose_b=False) + dense = tvm.IRModule.from_expr(dense) + config = dense, dic, param_lst + run_and_verify_func(config, run_module=run_module, dtype=dtype) + + def get_dense_bias( x_shape=(1, 16), k_shape=(32, 16), From 9ee25eb9f45f27d52fdc308dbd8970e5f095fef6 Mon Sep 17 00:00:00 2001 From: "Kathryn (Jinqi) Chen" <65606304+Kathryn-cat@users.noreply.github.com> Date: Mon, 11 Jul 2022 04:32:13 -0700 Subject: [PATCH 083/111] [MetaSchedule] Added a cost model (#11961) In this PR, I added a cost model based on SegmentSum MLP, which can be used for pre-training or integration with TVM. --- .../meta_schedule/cost_model/cost_model.py | 2 +- .../tvm/meta_schedule/cost_model/mlp_model.py | 1010 +++++++++++++++++ src/meta_schedule/database/json_database.cc | 2 +- 3 files changed, 1012 insertions(+), 2 deletions(-) create mode 100644 python/tvm/meta_schedule/cost_model/mlp_model.py diff --git a/python/tvm/meta_schedule/cost_model/cost_model.py b/python/tvm/meta_schedule/cost_model/cost_model.py index 2fdb9b93494f9..d3b660d837ddd 100644 --- a/python/tvm/meta_schedule/cost_model/cost_model.py +++ b/python/tvm/meta_schedule/cost_model/cost_model.py @@ -190,7 +190,7 @@ def update( raise NotImplementedError def predict(self, context: TuneContext, candidates: List[MeasureCandidate]) -> np.ndarray: - """Update the cost model given running results. + """Predict given the measure candidates. Parameters ---------- diff --git a/python/tvm/meta_schedule/cost_model/mlp_model.py b/python/tvm/meta_schedule/cost_model/mlp_model.py new file mode 100644 index 0000000000000..04ccca0563f90 --- /dev/null +++ b/python/tvm/meta_schedule/cost_model/mlp_model.py @@ -0,0 +1,1010 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# type: ignore[import] +""" +Segment Sum MLP cost model +""" +import glob +import logging +import math +import os +import random +import tempfile +from collections import OrderedDict +from itertools import chain as itertools_chain +from typing import Dict, List, NamedTuple, Tuple + +import numpy as np # type: ignore +import torch # type: ignore +import tvm + +from ...contrib.tar import tar, untar +from ...runtime import NDArray +from ...target import Target +from ..cost_model import PyCostModel +from ..database import JSONDatabase +from ..feature_extractor import FeatureExtractor, PerStoreFeature +from ..runner import RunnerResult +from ..search_strategy import MeasureCandidate +from ..tune_context import TuneContext +from ..utils import derived_object, shash2hex + +logging.basicConfig() +logger = logging.getLogger("mlp_model") # pylint: disable=invalid-name +logger.setLevel(logging.INFO) + +# pylint: disable=no-member,import-outside-toplevel + + +class SegmentSumMLPConfig(NamedTuple): + """SegmentSum MLP model configuration + + Parameters + ---------- + input_dim : int + The input dim for the model. + hidden_dim : int + The hidden dim for the model. + output_dim : int + The output dim for the model. + use_norm : bool + Whether to normalize the segment sum or not. + use_sigmoid : bool + Whether to use sigmoid on the final output or not. + """ + + input_dim: int = 172 + hidden_dim: int = 256 + output_dim: int = 1 + use_norm: bool = False + use_sigmoid: bool = False + + def to_dict(self): # pylint: disable=missing-function-docstring + return { + "input_dim": self.input_dim, + "hidden_dim": self.hidden_dim, + "output_dim": self.output_dim, + "use_norm": self.use_norm, + "use_sigmoid": self.use_sigmoid, + } + + +class TrainerConfig(NamedTuple): + """Trainer configuration + + Parameters + ---------- + batch_size : int + The batch size. + learning_rate : float + The learning rate. + weight_decay : float + The weight decay. + num_epoch_full : int + The number of epochs used in full training. + num_epoch_incremental : int + The number of epochs used in incremental training. + grad_clip_norm: float + The norm of gradient clipping. + train_verbose: int + The verbose frequency for training in batches. + test_interval: int + The testing interval in epochs. + test_split: float + The fraction of data for testing. + frozen: bool + Determine whether to re-train the model or not. 
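+
+    For illustration (a sketch; any field not passed keeps the default above):
+
+    >>> cfg = TrainerConfig(batch_size=256)
+    >>> cfg.to_dict()["batch_size"]
+    256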
+ """ + + batch_size: int = 128 + learning_rate: float = 7e-4 + weight_decay: float = 1e-6 + num_epoch_full: int = 50 + num_epoch_incremental: int = 5 + grad_clip_norm: float = 0.5 + train_verbose: int = 1000 + test_interval: int = 1 + test_split: float = 0.2 + frozen: bool = False + + def to_dict(self): # pylint: disable=missing-function-docstring + return { + "batch_size": self.batch_size, + "learning_rate": self.learning_rate, + "weight_decay": self.weight_decay, + "num_epoch_full": self.num_epoch_full, + "num_epoch_incremental": self.num_epoch_incremental, + "grad_clip_norm": self.grad_clip_norm, + "train_verbose": self.train_verbose, + "test_interval": self.test_interval, + "test_split": self.test_split, + "frozen": self.frozen, + } + + +# pylint: disable=too-few-public-methods +class FeatureGroup: + """Feature group + + Parameters + ---------- + group_hash : str + The hash of the group + features : List[np.ndarray] + The features + costs : List[float] + The costs + min_cost : float + The minimum cost + """ + + group_hash: str + features: List[np.ndarray] + costs: np.ndarray + min_cost: float + + def __init__( + self, + group_hash: str, + features: List[np.ndarray], + costs: np.ndarray, + ) -> None: + self.group_hash = group_hash + self.features = features + self.costs = costs + self.min_cost = np.min(costs) + + def append( # pylint: disable=missing-function-docstring + self, + features: List[np.ndarray], + costs: np.ndarray, + ) -> None: + self.features.extend(features) + self.costs = np.append(self.costs, costs) + self.min_cost = np.min(self.costs) + + +# pylint: disable=too-many-instance-attributes +class SegmentDataLoader: + """Dataloader for Segment Sum MLP model. + + Parameters + ---------- + features : List[np.ndarray] + The features + results : np.ndarray + The measured results, can be None. 
+ batch_size : int + The batch size + shuffle : bool + Whether to shuffle the dataset or not + """ + + def __init__( + self, + features, + results=None, + batch_size=128, + shuffle=True, + ): + self.batch_size = batch_size + self.shuffle = shuffle + self.data_size = len(features) + + # flatten features and store the starting indices + self.segment_sizes = torch.tensor([len(feature) for feature in features], dtype=torch.int32) + self.feature_offsets = ( + torch.cumsum(self.segment_sizes, 0, dtype=torch.int32) - self.segment_sizes + ) + features = torch.cat([torch.tensor(feature) for feature in features]) + norm, _ = features.max(dim=0) + norm[norm == 0] = 1 + self.features = features / norm + self.results = torch.tensor(results) if results is not None else None + self.iter_order = self.pointer = None + + def __len__(self): + return self.data_size + + def __iter__(self): + if self.shuffle: + self.iter_order = torch.randperm(self.data_size) + else: + self.iter_order = torch.arange(self.data_size) + self.pointer = 0 + return self + + def __next__(self): + if self.pointer >= self.data_size: + raise StopIteration + batch_indices = self.iter_order[self.pointer : self.pointer + self.batch_size] + self.pointer += self.batch_size + return self._fetch_indices(batch_indices) + + def _fetch_indices(self, indices): + segment_sizes, feature_offsets = self.segment_sizes[indices], self.feature_offsets[indices] + feature_indices = torch.empty(segment_sizes.sum(), dtype=torch.int32) + idx = 0 + for offset, seg_size in zip(feature_offsets, segment_sizes): + feature_indices[idx : idx + seg_size] = torch.arange(offset, offset + seg_size) + idx += seg_size + features = self.features[feature_indices.long()] + results = None + if self.results is not None: + results = self.results[indices.long()] + return segment_sizes, features, results + + +def lambda_rank_loss( # pylint: disable=too-many-locals + preds: "torch.Tensor", + labels: "torch.Tensor", + k: int = None, + eps: float = 1e-10, + sigma: float = 1.0, +) -> "torch.Tensor": + """ + LambdaLoss: Metric-Driven Loss for Learning-to-Rank + + Parameters + ---------- + preds : Tensor + The predicted runtime for each candidate. + labels : Tensor + The measured runtime for each candidate. + k : int + Loss for top k. + Default is None, which means computing all scores. + eps : float + The minimum value to the denominator and argument of log if they reach 0. + sigma : float + The scaling factor to the input of the sigmoid function. + + Returns + ------- + loss : Tensor + The lambda rank loss. 
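+
+    For example (a sketch; the tensors are illustrative only):
+
+    >>> preds = torch.tensor([0.3, 0.2, 0.1])
+    >>> labels = torch.tensor([3.0, 2.0, 1.0])
+    >>> loss = lambda_rank_loss(preds, labels)  # lower when orders agree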
+ """ + device = preds.device + y_pred, y_true = preds[None, :], labels[None, :] + y_pred_sorted, indices_pred = y_pred.sort(descending=True, dim=-1) + y_true_sorted, _ = y_true.sort(descending=True, dim=-1) + true_sorted_by_preds = torch.gather(y_true, dim=1, index=indices_pred) + true_diffs = true_sorted_by_preds[:, :, None] - true_sorted_by_preds[:, None, :] + padded_pairs_mask = torch.isfinite(true_diffs) & (true_diffs > 0) + ndcg_at_k_mask = torch.zeros( + (y_pred.shape[1], y_pred.shape[1]), dtype=torch.bool, device=device + ) + ndcg_at_k_mask[:k, :k] = 1 + true_sorted_by_preds.clamp_(min=0.0) + y_true_sorted.clamp_(min=0.0) + pos_idxs = torch.arange(1, y_pred.shape[1] + 1).to(device) + D = torch.log2(1.0 + pos_idxs.float())[None, :] # pylint: disable=invalid-name + maxDCGs = torch.sum( # pylint: disable=invalid-name + ((torch.pow(2, y_true_sorted) - 1) / D)[:, :k], dim=-1 + ).clamp(min=eps) + G = (torch.pow(2, true_sorted_by_preds) - 1) / maxDCGs[:, None] # pylint: disable=invalid-name + weights = torch.abs( + torch.pow(D[:, :, None], -1.0) - torch.pow(D[:, None, :], -1.0) + ) * torch.abs(G[:, :, None] - G[:, None, :]) + scores_diffs = (y_pred_sorted[:, :, None] - y_pred_sorted[:, None, :]).clamp(min=-1e8, max=1e8) + scores_diffs[torch.isnan(scores_diffs)] = 0.0 + weighted_probs = (torch.sigmoid(sigma * scores_diffs).clamp(min=eps) ** weights).clamp(min=eps) + losses = torch.log2(weighted_probs) + masked_losses = losses[padded_pairs_mask & ndcg_at_k_mask] + loss = -torch.sum(masked_losses) + return loss + + +def topk_score( + pred_results: "torch.Tensor", + gt_results: "torch.Tensor", + k: int, +) -> float: + """ + Evaluate the top-k score + + Parameters + ---------- + pred_results: Tensor + The raw prediction + gt_results: Tensor + The measured results + k : int + The k in top k score + + Returns + ------- + score : float + The top-k score + """ + k = min(k, len(pred_results)) + topk_indices = torch.topk(pred_results, k, largest=False).indices + score = gt_results.min() / gt_results[topk_indices].min() + return score.item() + + +class SegmentSumMLP(torch.nn.Module): + """Segment Sum MLP model. + + Parameters + ---------- + input_dim : int + The input dim for the model. + hidden_dim : int + The hidden dim for the model. + output_dim : int + The output dim for the model. + use_norm : bool + Whether to normalize the segment sum or not. + use_sigmoid : bool + Whether to use sigmoid on the final output or not. 
+ """ + + input_dim: int + hidden_dim: int + output_dim: int + use_norm: bool + use_sigmoid: bool + + def __init__( # pylint: disable=too-many-arguments + self, + input_dim: int = 172, + hidden_dim: int = 256, + output_dim: int = 1, + use_norm: bool = False, + use_sigmoid: bool = False, + ): + from torch import nn # type: ignore + + super().__init__() + self.encoder = nn.Sequential( + nn.Linear(input_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim), + nn.ReLU(), + ) + self.norm = nn.BatchNorm1d(hidden_dim) if use_norm else nn.Identity() + self.layer0 = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim), + nn.ReLU(), + ) + self.layer1 = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim), + nn.ReLU(), + ) + self.decoder = nn.Linear(hidden_dim, output_dim) + self.sigmoid = nn.Sigmoid() if use_sigmoid else nn.Identity() + + def forward( # pylint: disable=missing-function-docstring + self, + segment_sizes: "torch.Tensor", + features: "torch.Tensor", + ) -> "torch.Tensor": + n_seg = len(segment_sizes) + encoded_features = self.encoder(features) + segment_indices = torch.repeat_interleave( + torch.arange(n_seg, device=features.device), + segment_sizes.long(), + ) + n_dim = encoded_features.shape[1] + segment_sum = torch.scatter_add( + input=torch.zeros((n_seg, n_dim), dtype=encoded_features.dtype, device=features.device), + dim=0, + index=segment_indices.view(-1, 1).expand(-1, n_dim), + src=encoded_features, + ) + out = self.norm(segment_sum) + out = self.layer0(out) + out + out = self.layer1(out) + out + out = self.decoder(out).squeeze() + out = self.sigmoid(out) + return out + + +def extract_features( + context: TuneContext, + candidates: List[MeasureCandidate], + results: List[RunnerResult] = None, + extractor: FeatureExtractor = PerStoreFeature(extract_workload=True), +): + """Extract feature vectors and compute mean costs. + + Parameters + ---------- + context: TuneContext + The tuning context. + candidates: List[MeasureCandidate] + The measure candidates. + results: List[RunnerResult] + The measured results, can be None if used in prediction. + extractor: FeatureExtractor + The feature extractor. + + Returns + ------- + new_features: List[np.ndarray] + The extracted features. + new_mean_costs: np.ndarray + The mean costs. + """ + + def _feature(feature: NDArray) -> np.ndarray: + return feature.numpy().astype("float32") + + def _mean_cost(res: RunnerResult) -> float: + if not res.run_secs: + return 1e10 + return float(np.median([float(s) for s in res.run_secs])) + + new_features = [_feature(x) for x in extractor.extract_from(context, candidates)] + new_mean_costs = ( + np.array([_mean_cost(x) for x in results]).astype("float32") + if results is not None + else None + ) + return new_features, new_mean_costs + + +class State: + """State of the trainer + + Parameters + ---------- + model: SegmentSumMLP + The cost model. + data: Dict[str, FeatureGroup] + The data groups. + data_size: int + The size of all data. + untrained_size: int + The size of the untrained data. 
+ """ + + model: SegmentSumMLP + data: Dict[str, FeatureGroup] + data_size: int + untrained_size: int + + def __init__( + self, + model_config: SegmentSumMLPConfig = SegmentSumMLPConfig(), + extractor: FeatureExtractor = PerStoreFeature(extract_workload=True), + ): + self.model = SegmentSumMLP(**model_config.to_dict()) + self.data = OrderedDict() + self.data_size = 0 + self.untrained_size = 0 + self.extractor = extractor + + def load( # pylint: disable=too-many-locals + self, + path: str, + target: str = "nvidia/nvidia-v100", + ) -> None: + """Load the cached model, cached features, or raw data. + + Parameters + ---------- + path: str + The path to the tar file containing cached model, cached features, + or raw data. + target: str + The target for the tuning context. + """ + with tempfile.TemporaryDirectory() as tmp_dir: + model_path = os.path.join(tmp_dir, "model.pth") + cache_path = os.path.join(tmp_dir, "cached_data.npy") + raw_path = os.path.join(tmp_dir, "raw_data") + untar(path, tmp_dir) + if os.path.exists(model_path): + self.model.load_state_dict(torch.load(model_path)) + if os.path.exists(cache_path): + for group_hash, features, costs in np.load(cache_path, allow_pickle=True): + self.data[group_hash] = FeatureGroup( + group_hash=group_hash, + features=list(features), + costs=costs, + ) + self.data_size += len(costs) + self.untrained_size += len(costs) + elif os.path.exists(raw_path): + from tqdm import tqdm # type: ignore + + model_dirs = glob.glob(os.path.join(raw_path, "*")) + workload_paths = [] + for model_dir in model_dirs: + json_files = glob.glob(os.path.join(model_dir, "*.json")) + for json_file in json_files: + if json_file.endswith("_workload.json"): + workload_paths.append(json_file) + for workload_path in tqdm(workload_paths): + try: + database = JSONDatabase( + path_workload=workload_path, + path_tuning_record=workload_path.replace( + "_workload.json", "_candidates.json" + ), + ) + except tvm._ffi.base.TVMError: # pylint: disable=protected-access + continue + candidates, results = [], [] + tuning_records = database.get_all_tuning_records() + if len(tuning_records) == 0: + continue + for record in tuning_records: + candidates.append(record.as_measure_candidate()) + results.append(RunnerResult(run_secs=record.run_secs, error_msg=None)) + assert len(candidates) == len(results) + context = TuneContext(mod=tuning_records[0].workload.mod, target=Target(target)) + features, mean_costs = extract_features( + context, candidates, results, self.extractor + ) + self.add_to_group(features, mean_costs, shash2hex(context.mod)) + + def save(self, path: str) -> None: + """Cache the model and data. + + Parameters + ---------- + path: str + The path to the cached tar file. + """ + with tempfile.TemporaryDirectory() as tmp_dir: + model_path = os.path.join(tmp_dir, "model.pth") + cache_path = os.path.join(tmp_dir, "cached_data.npy") + torch.save(self.model.state_dict(), model_path) + data = [ + ( + g.group_hash, + g.features, + g.costs, + ) + for g in self.data.values() + ] + np.save( + file=cache_path, + arr=np.array(data, dtype=object), + ) + tar(path, [x for x in [model_path, cache_path] if x is not None]) + logger.info("Saved MLPModel to %s", path) + + def add_to_group( + self, + features: List[np.ndarray], + costs: np.ndarray, + group_hash: str, + ): + """Add features and costs to the data groups with key group_hash. + + Parameters + ---------- + features: List[np.ndarray] + The feature vectors. + costs: np.ndarray + The measured results. 
+ group_hash: str + The structural hash of the candidates. + """ + group = self.data.get(group_hash, None) + if group is None: + group = FeatureGroup( + group_hash=group_hash, + features=features, + costs=costs, + ) + else: + group.append(features, costs) + self.data[group_hash] = group + self.data_size += len(features) + self.untrained_size += len(features) + + +class SegmentSumMLPTrainer: + """The trainer for Segment Sum MLP model. + + Parameters + ---------- + state: State + The state of the trainer. + batch_size : int + The batch size. + learning_rate : float + The learning rate. + weight_decay : float + The weight decay. + num_epoch_full : int + The number of epochs used in full training. + num_epoch_incremental : int + The number of epochs used in incremental training. + grad_clip_norm: float + The norm of gradient clipping. + train_verbose: int + The verbose frequency for training in batches. + test_interval: int + The testing interval in epochs. + test_split: float + The fraction of data for testing. + frozen: bool + Determine whether to re-train the model or not. + optimizer: "torch.optim.adam.Adam" + The optimizer. + scheduler: "torch.optim.lr_scheduler.StepLR" + The scheduler. + """ + + state: State + batch_size: int = 128 + learning_rate: float = 7e-4 + weight_decay: float = 1e-6 + num_epoch_full: int = 50 + num_epoch_incremental: int = 5 + grad_clip_norm: float = 0.5 + train_verbose: int = 1000 + test_interval: int = 1 + test_split: float = 0.2 + frozen: bool = False + optimizer: "torch.optim.adam.Adam"  # type: ignore + scheduler: "torch.optim.lr_scheduler.StepLR"  # type: ignore + + def __init__( + self, + train_config: TrainerConfig = TrainerConfig(), + state: State = State(), + ): + config = train_config.to_dict() + for attr in config: + setattr(self, attr, config[attr]) + self.state = state + self.device = "cuda" if torch.cuda.device_count() else "cpu" + self.optimizer, self.scheduler = None, None + + def train_step( + self, + data: Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor"], + batch: int = 0, + train_loss: float = None, + ) -> float: + """Helper function for training on a single batch. + + Parameters + ---------- + data: Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor"] + A batch of data, should be a tuple of (segment_sizes, features, gt_results). + batch: int = 0 + The current batch number. + train_loss: float = None + The previous averaged training loss, None if it is the first batch. + + Returns + ------- + train_loss: float + The averaged training loss after the current batch. 
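+
+    Called in a loop over a SegmentDataLoader, as in train_full (a sketch;
+    `trainer` and `loader` are assumed names and self.optimizer must
+    already be initialized):
+
+    >>> train_loss = None
+    >>> for batch, data in enumerate(loader):
+    ...     train_loss = trainer.train_step(data, batch, train_loss)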
+ """ + segment_sizes, features, gt_results = ( + data[0].to(self.device), + data[1].to(self.device), + data[2].to(self.device), + ) + self.optimizer.zero_grad() + pred_results = self.state.model(segment_sizes, features) + loss = lambda_rank_loss(pred_results, gt_results) + loss.backward() + torch.nn.utils.clip_grad_norm_(self.state.model.parameters(), self.grad_clip_norm) + self.optimizer.step() + loss = loss.detach().cpu() + train_loss = ( + train_loss * 0.95 + loss.item() * 0.05 if train_loss is not None else loss.item() + ) + segment_sizes, features, gt_results, pred_results = ( + segment_sizes.detach().cpu(), + features.detach().cpu(), + gt_results.detach().cpu(), + pred_results.detach().cpu(), + ) + if batch % self.train_verbose == 0: + logger.info("Batch: %d, train loss: %6f", batch, train_loss) + return train_loss + + def predict_step( + self, + data: Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor"], + ): + """Helper function for predicting (validating) on a single batch. + + Parameters + ---------- + data: Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor"] + A batch of data, should be a tuple of (segment_sizes, features, gt_results). + gt_results can be None if it is used for predicting. + + Returns + ------- + pred_results: np.ndarray + The predicted results for the current batch. + test_loss_batch: float + If used for validation, return the test loss for the current batch. + test_scores_batch: List[float] + If used for validation, return the topk scores for the current batch. + """ + test_loss_batch, test_scores_batch = None, [] + segment_sizes, features = ( + data[0].to(self.device), + data[1].to(self.device), + ) + gt_results = data[2] + pred_results = self.state.model(segment_sizes, features) + segment_sizes, features, pred_results = ( + segment_sizes.detach().cpu(), + features.detach().cpu(), + pred_results.detach().cpu(), + ) + if gt_results is not None: + test_loss_batch = lambda_rank_loss(pred_results, gt_results).item() + for k in [1, 5, 10]: + test_scores_batch.append(topk_score(pred_results, gt_results, k)) + return pred_results.numpy(), test_loss_batch, test_scores_batch + + def train_full(self): # pylint: disable=too-many-locals + """Training on the full dataset.""" + # split into training and testing set + keys = list(self.state.data.keys()) + test_keys = random.sample(keys, k=math.floor(len(keys) * self.test_split)) + train_data = OrderedDict() + test_data = OrderedDict() + for key in keys: + if key in test_keys: + test_data[key] = self.state.data[key] + else: + train_data[key] = self.state.data[key] + train_features = list( + itertools_chain.from_iterable([g.features for g in train_data.values()]) + ) + test_features = list( + itertools_chain.from_iterable([g.features for g in test_data.values()]) + ) + train_results = np.concatenate([g.min_cost / g.costs for g in train_data.values()]) + test_results = np.concatenate([g.min_cost / g.costs for g in test_data.values()]) + train_loader = SegmentDataLoader( + train_features, train_results, batch_size=self.batch_size, shuffle=True + ) + test_loader = SegmentDataLoader( + test_features, test_results, batch_size=self.batch_size, shuffle=False + ) + self.optimizer = torch.optim.Adam( + self.state.model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay + ) + self.scheduler = torch.optim.lr_scheduler.StepLR( + self.optimizer, + step_size=self.num_epoch_full // 10, + gamma=0.8, + verbose=True, + ) + self.state.model = self.state.model.to(self.device) + min_test_loss = 1e10 + logger.info("Training 
size: %d; Testing size: %d", len(train_loader), len(test_loader)) + + model_cache_path = tempfile.NamedTemporaryFile().name # pylint: disable=consider-using-with + for epoch in range(self.num_epoch_full): + logger.info("Epoch: %d", epoch) + # training + self.state.model.train() + train_loss = None + for batch, data in enumerate(train_loader): + train_loss = self.train_step(data, batch, train_loss) + self.scheduler.step() + # testing + if epoch % self.test_interval == 0: + self.state.model.eval() + test_losses, test_scores = [], [] + for data in test_loader: + _, test_loss_batch, test_scores_batch = self.predict_step(data) + test_losses.append(test_loss_batch) + test_scores.append(test_scores_batch) + test_loss = ( + np.array(test_losses[:-1]).mean() if len(test_losses) > 1 else test_losses[0] + ) + logger.info( + "Average test loss: %6f, top1 score: %5f, top5 score: %5f, top10 score: %5f", + test_loss, + np.array(test_scores)[:, 0].mean(), + np.array(test_scores)[:, 1].mean(), + np.array(test_scores)[:, 2].mean(), + ) + if test_loss < min_test_loss: + min_test_loss = test_loss + torch.save(self.state.model.state_dict(), model_cache_path) + self.state.model.to("cpu").load_state_dict(torch.load(model_cache_path)) + self.state.untrained_size = 0 + + def train_incremental( + self, + features: List[np.ndarray], + results: np.ndarray, + ): + """Training on incremental data. + + Parameters + ---------- + features: List[np.ndarray] + The extracted features. + results: np.ndarray + The measured results. + """ + results = np.min(results) / results + loader = SegmentDataLoader(features, results, batch_size=self.batch_size, shuffle=True) + self.optimizer = torch.optim.Adam( + self.state.model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay + ) + self.state.model = self.state.model.to(self.device) + logger.info("Incremental training size: %d", len(loader)) + for epoch in range(self.num_epoch_incremental): + logger.info("Epoch: %d", epoch) + self.state.model.train() + loss = None + for batch, data in enumerate(loader): + loss = self.train_step(data, batch, loss) + self.state.model.to("cpu") + self.state.untrained_size = max(0, self.state.untrained_size - len(loader)) + + def predict_incremental( + self, + features: List[np.ndarray], + results: np.ndarray = None, + ) -> np.ndarray: + """Predicting (validating) on incremental data. + + Parameters + ---------- + features: List[np.ndarray] + The extracted features. + results: np.ndarray + The measured results, can be None if used for predicting. + + Returns + ------- + pred_results: np.ndarray + The predicted results. 
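+
+        Examples
+        --------
+        A minimal usage sketch (illustrative only; ``features`` and ``costs``
+        as produced by ``extract_features``)::
+
+            scores = trainer.predict_incremental(features)          # predict only
+            trainer.predict_incremental(features, results=costs)    # also log loss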
+ """ + if results is not None: + results = np.min(results) / results + loader = SegmentDataLoader(features, results, batch_size=self.batch_size, shuffle=False) + self.state.model = self.state.model.to(self.device).eval() + logger.info("Incremental testing size: %d", len(loader)) + pred_results, losses, scores = [], [], [] + for data in loader: + pred_results_batch, losses_batch, scores_batch = self.predict_step(data) + pred_results.append(pred_results_batch) + losses.append(losses_batch) + scores.append(scores_batch) + pred_results = np.concatenate(pred_results) + if results is not None: + losses = np.array(losses[:-1]).mean() if len(losses) > 1 else losses[0] + logger.info( + "Average test loss: %6f, top1 score: %5f, top5 score: %5f, top10 score: %5f", + losses, + np.array(scores)[:, 0].mean(), + np.array(scores)[:, 1].mean(), + np.array(scores)[:, 2].mean(), + ) + return pred_results + + def update( + self, + features: List[np.ndarray], + costs: np.ndarray, + group_hash: str, + ): + """Update the dataset and re-train the model if not frozen. + + Parameters + ---------- + features: List[np.ndarray] + The extracted features. + costs: np.ndarray + The measured results. + group_hash: str + The hash of the group. + """ + self.state.add_to_group(features, costs, group_hash) + if not self.frozen: + self.predict_incremental(features, costs) + if self.state.untrained_size / self.state.data_size > 0.2: + self.train_full() + else: + self.train_incremental(features, costs) + + +@derived_object +class MLPModel(PyCostModel): + """Segment Sum MLP Model + + Parameters + ---------- + trainer: SegmentSumMLPTrainer + The trainer for the model, handling the training interface. + """ + + trainer: SegmentSumMLPTrainer + + def __init__( + self, + *, + trainer: SegmentSumMLPTrainer = SegmentSumMLPTrainer(), + ): + super().__init__() + self.trainer = trainer + + def load(self, path: str) -> None: + """Load the cost model, cached data or raw data from given file location. + + Parameters + ---------- + path : str + The file path. + """ + self.trainer.state.load(path) + + def save(self, path: str) -> None: + """Save the cost model and data to given file location. + + Parameters + ---------- + path : str + The file path. + """ + self.trainer.state.save(path) + + def update( + self, + context: TuneContext, + candidates: List[MeasureCandidate], + results: List[RunnerResult], + ) -> None: + """Update the dataset, re-train the cost model if not frozen. + + Parameters + ---------- + context : TuneContext, + The tuning context. + candidates : List[MeasureCandidate] + The measure candidates. + results : List[RunnerResult] + The running results of the measure candidates. + """ + features, mean_costs = extract_features( + context, candidates, results, self.trainer.state.extractor + ) + self.trainer.update(features, mean_costs, shash2hex(context.mod)) + + def predict(self, context: TuneContext, candidates: List[MeasureCandidate]) -> np.ndarray: + """Predict given the measure candidates. + + Parameters + ---------- + context : TuneContext, + The tuning context. + candidates : List[MeasureCandidate] + The measure candidates. + + Return + ------ + result : np.ndarray + The predicted normalized score. 
+ """ + features, _ = extract_features(context, candidates, None, self.trainer.state.extractor) + pred_results = self.trainer.predict_incremental(features) + return pred_results diff --git a/src/meta_schedule/database/json_database.cc b/src/meta_schedule/database/json_database.cc index a55ffa8b283af..f8fb64e924077 100644 --- a/src/meta_schedule/database/json_database.cc +++ b/src/meta_schedule/database/json_database.cc @@ -203,7 +203,7 @@ Database Database::JSONDatabase(String path_workload, String path_tuning_record, } catch (std::runtime_error& e) { LOG(FATAL) << "ValueError: Unable to parse TuningRecord, on line " << (task_id + 1) << " of file " << path_tuning_record << ". The workload is:\n" - << (workload.defined() ? tir::AsTVMScript(workload) : "(null)") + << (workload.defined() ? tir::AsTVMScript(workload->mod) : "(null)") << "\nThe JSONObject of TuningRecord is:\n" << json_obj << "\nThe error message is:\n" << e.what(); From c4dc41a0dde6ae3118823736c325811f15994615 Mon Sep 17 00:00:00 2001 From: Rafael Stahl Date: Mon, 11 Jul 2022 19:06:20 +0200 Subject: [PATCH 084/111] [Frontend][TFLite] PreLU alpha can be an expr (#11879) * [Frontend][TFLite] PreLU alpha can be an expr * [Frontend][TFLite] handle both cases of PreLU alpha param --- python/tvm/relay/frontend/tflite.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index d7ec441e0eb40..c8352a9949e87 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -3010,11 +3010,14 @@ def convert_prelu(self, op): input_tensor = input_tensors[0] alpha_tensor = input_tensors[1] - alpha_tensor_type = alpha_tensor.tensor.Type() - alpha_tensor_type_str = self.get_tensor_type_str(alpha_tensor_type) - alpha_expr = self.exp_tab.new_const( - self.get_tensor_value(alpha_tensor), dtype=alpha_tensor_type_str - ) + if self.has_expr(alpha_tensor.tensor_idx): + alpha_expr = self.get_expr(alpha_tensor.tensor_idx) + else: + alpha_tensor_type = alpha_tensor.tensor.Type() + alpha_tensor_type_str = self.get_tensor_type_str(alpha_tensor_type) + alpha_expr = self.exp_tab.new_const( + self.get_tensor_value(alpha_tensor), dtype=alpha_tensor_type_str + ) in_expr = self.get_expr(input_tensor.tensor_idx) data_shape = to_int_list(self.get_tensor_shape(input_tensor)) From 04db26e8d9491580bbbb353a66b8225198c75209 Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Mon, 11 Jul 2022 11:37:11 -0700 Subject: [PATCH 085/111] [microtvm][RVM] Refactor Arduino/Zephyr into one RVM (#12023) --- apps/microtvm/reference-vm/.gitignore | 3 +- apps/microtvm/reference-vm/README.md | 27 +++--- .../reference-vm/{arduino => }/Vagrantfile | 15 ++- apps/microtvm/reference-vm/arduino/.gitignore | 1 - apps/microtvm/reference-vm/arduino/README.md | 46 --------- .../arduino/base-box/base_box_setup.sh | 55 ----------- .../reference-vm/arduino/provision_setup.sh | 42 -------- apps/microtvm/reference-vm/base-box-tool.py | 74 ++++++--------- .../{arduino => }/base-box/.gitignore | 0 .../base-box/Vagrantfile.packer-template | 0 .../base-box/base_box_provision.sh | 16 +++- .../{zephyr => }/base-box/base_box_setup.sh | 0 .../{ => base-box}/base_box_setup_common.sh | 0 .../{arduino => }/base-box/base_box_test.sh | 26 ++--- .../{zephyr => }/provision_setup.sh | 9 +- .../{rebuild-tvm.sh => rebuild_tvm.sh} | 11 +-- .../scripts/reference_vm_build.sh | 13 +-- .../scripts/reference_vm_release.sh | 13 +-- apps/microtvm/reference-vm/zephyr/.gitignore | 1 - 
 apps/microtvm/reference-vm/zephyr/README.md | 30 ------
 apps/microtvm/reference-vm/zephyr/Vagrantfile | 95 -------------------
 .../reference-vm/zephyr/base-box/.gitignore | 4 -
 .../base-box/Vagrantfile.packer-template | 47 ---------
 .../zephyr/base-box/base_box_provision.sh | 37 --------
 .../zephyr/base-box/base_box_test.sh | 33 -------
 tests/lint/check_file_type.py | 6 +-
 26 files changed, 99 insertions(+), 505 deletions(-)
 rename apps/microtvm/reference-vm/{arduino => }/Vagrantfile (87%)
 delete mode 100644 apps/microtvm/reference-vm/arduino/.gitignore
 delete mode 100644 apps/microtvm/reference-vm/arduino/README.md
 delete mode 100644 apps/microtvm/reference-vm/arduino/base-box/base_box_setup.sh
 delete mode 100644 apps/microtvm/reference-vm/arduino/provision_setup.sh
 rename apps/microtvm/reference-vm/{arduino => }/base-box/.gitignore (100%)
 rename apps/microtvm/reference-vm/{arduino => }/base-box/Vagrantfile.packer-template (100%)
 rename apps/microtvm/reference-vm/{arduino => }/base-box/base_box_provision.sh (94%) mode change 100644 => 100755
 rename apps/microtvm/reference-vm/{zephyr => }/base-box/base_box_setup.sh (100%) mode change 100644 => 100755
 rename apps/microtvm/reference-vm/{ => base-box}/base_box_setup_common.sh (100%)
 rename apps/microtvm/reference-vm/{arduino => }/base-box/base_box_test.sh (60%)
 rename apps/microtvm/reference-vm/{zephyr => }/provision_setup.sh (78%) mode change 100644 => 100755
 rename apps/microtvm/reference-vm/{rebuild-tvm.sh => rebuild_tvm.sh} (84%)
 delete mode 100644 apps/microtvm/reference-vm/zephyr/.gitignore
 delete mode 100644 apps/microtvm/reference-vm/zephyr/README.md
 delete mode 100644 apps/microtvm/reference-vm/zephyr/Vagrantfile
 delete mode 100644 apps/microtvm/reference-vm/zephyr/base-box/.gitignore
 delete mode 100644 apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template
 delete mode 100644 apps/microtvm/reference-vm/zephyr/base-box/base_box_provision.sh
 delete mode 100755 apps/microtvm/reference-vm/zephyr/base-box/base_box_test.sh

diff --git a/apps/microtvm/reference-vm/.gitignore b/apps/microtvm/reference-vm/.gitignore
index d918f5e13cc50..187e6d9f34da1 100644
--- a/apps/microtvm/reference-vm/.gitignore
+++ b/apps/microtvm/reference-vm/.gitignore
@@ -1 +1,2 @@
-/release-test
\ No newline at end of file
+/release-test
+/.vagrant
diff --git a/apps/microtvm/reference-vm/README.md b/apps/microtvm/reference-vm/README.md
index a5bb63574ce35..3d419cd364631 100644
--- a/apps/microtvm/reference-vm/README.md
+++ b/apps/microtvm/reference-vm/README.md
@@ -29,9 +29,10 @@ For more information on how to use them, see the
 ## microTVM Developer Information
 Each RTOS or platform (like Zephyr, Arduino, etc) that integrates with microTVM
-can check-in a Reference VM in this directory to help the community collaborate.
-You should use the tools provided here to ensure a uniform release process
-across all platforms. Typically, releases need to be created by TVM committers.
+can check in installation scripts in the Reference VM in this directory to help
+the community collaborate. You should use the tools provided here to ensure a
+uniform release process across all platforms. Typically, releases need to be
+created by TVM committers.
 Generally speaking, it's expected that any integrated platform with a regression
 test checked-in to the tvm repository should also define a reference VM. If you
 want to integrate a new platform, please raise a discussion on
 [the forum](https://discuss.tvm.ai).
-## Reference VMs Organization
+## Reference VM Organization
 
-Reference VMs are organized in this directory as follows:
+The Reference VM is organized in this directory as follows:
 
 ```
 .
 +-- base-box-tool.py - Reference VM build, test, and release tool.
-+-- PLATFORM/ - One or more dirs related to the supported platform(s),
-               like zephyr/ and arduino/. The dir names are the same to
-               be passed as arguments to base-box-tool.py as PLATFORM.
 +-- Vagrantfile - Vagrantfile that end-users will invoke. Should be based
 |                 off a base box which contains dependencies other than the
 |                 TVM python dependencies.
@@ -64,12 +62,12 @@ Reference VMs are organized in this directory as follows:
 1. **Build** the base box for a given platform:
 ```bash
-$ ./base-box-tool.py [--provider=PROVIDER] build PLATFORM
+$ ./base-box-tool.py [--provider=PROVIDER] build
 ```
 
 For example:
 ```bash
-$ ./base-box-tool.py --provider virtualbox build zephyr
+$ ./base-box-tool.py --provider virtualbox build
 ```
 
 2. **Run** release tests for each platform:
@@ -90,7 +88,7 @@ $ ./base-box-tool.py --provider virtualbox build zephyr
 
    This command does the following for the specified provider:
 
-   * Copies all files inside `PLATFORM/` dir except `.vagrant` and `base-box` to
+   * Copies all files inside this dir except `.vagrant` and `base-box` to
     `release-test/`. This is done to avoid reusing any VM the developer may have
     started;
 
@@ -108,7 +106,12 @@ $ ./base-box-tool.py --provider virtualbox build zephyr
 4. If release tests pass, **release** the box:
 
 ```bash
-$ ./base-box-tool.py [--provider=PROVIDER] release --release-version=RELEASE_VER --platform-version=PLATFORM_VER PLATFORM
+$ ./base-box-tool.py [--provider=PROVIDER] release --release-version=RELEASE_VER
 ```
 
 For that step be sure you've logged in to Vagrant Cloud using the `vagrant` tool.
+
+## Versioning
+We use semantic versioning, as recommended by [Vagrant](https://www.vagrantup.com/docs/boxes/versioning). Versions follow the `X.Y.Z` format: we keep the same major version `X` and increase the minor version `Y` when a new box has only minor changes and remains compatible with older versions, and we increase the major version `X` when a new RVM is not compatible with older ones. Updates to the Zephyr SDK or Arduino board SDKs are considered major changes and require incrementing the major version `X`. In this scheme, `Z` is rarely used, but we keep it since Vagrant requires this format.
+
+**Note**: We will release all microTVM RVM boxes under [microtvm](https://app.vagrantup.com/tlcpack/boxes/microtvm) and use box versioning in the Vagrantfile. Previous versions like `microtvm-zephyr`, `microtvm-arduino`, `microtvm-zephyr-2.5`, etc. are deprecated and will be removed in the future.
diff --git a/apps/microtvm/reference-vm/arduino/Vagrantfile b/apps/microtvm/reference-vm/Vagrantfile
similarity index 87%
rename from apps/microtvm/reference-vm/arduino/Vagrantfile
rename to apps/microtvm/reference-vm/Vagrantfile
index ab746c17ee2b3..00465a8b88485 100644
--- a/apps/microtvm/reference-vm/arduino/Vagrantfile
+++ b/apps/microtvm/reference-vm/Vagrantfile
@@ -16,8 +16,8 @@
 # under the License.
 
 Vagrant.configure("2") do |config|
-  config.vm.box = "tlcpack/microtvm-arduino"
-  config.vm.box_version = "2.0.0"
+  config.vm.box = "tlcpack/microtvm"
+  config.vm.box_version = "1.0.0"
 
   if ENV.has_key?("TVM_RVM_NUM_CORES")
     num_cores = ENV["TVM_RVM_NUM_CORES"]
@@ -31,7 +31,7 @@ Vagrant.configure("2") do |config|
     ram_bytes = 2048
   end
 
-  tvm_home = "../../../.."
+  tvm_home = "../../.."
dirs_to_mount = [Pathname.new(Pathname.new(tvm_home).expand_path())] if ENV.has_key?("TVM_PROJECT_DIR") then dirs_to_mount.append(ENV["TVM_PROJECT_DIR"]) @@ -47,10 +47,15 @@ Vagrant.configure("2") do |config| end end - config.vm.provision "shell", path: "provision_setup.sh", env: {"TVM_HOME": dirs_to_mount[0]}, privileged: false + config.vm.provision "shell", + path: "provision_setup.sh", + env: {"TVM_HOME": dirs_to_mount[0], + "TVM_CI_NUM_CORES": num_cores + }, + privileged: false # Enable USB Controller on VirtualBox - vm_name = "microtvm-arduino-#{Time.now.tv_sec}" + vm_name = "microtvm-#{Time.now.tv_sec}" config.vm.provider "virtualbox" do |vb, overrides| vb.name = vm_name vb.cpus = num_cores diff --git a/apps/microtvm/reference-vm/arduino/.gitignore b/apps/microtvm/reference-vm/arduino/.gitignore deleted file mode 100644 index dace7081e3f20..0000000000000 --- a/apps/microtvm/reference-vm/arduino/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/.vagrant diff --git a/apps/microtvm/reference-vm/arduino/README.md b/apps/microtvm/reference-vm/arduino/README.md deleted file mode 100644 index 530da71a58f32..0000000000000 --- a/apps/microtvm/reference-vm/arduino/README.md +++ /dev/null @@ -1,46 +0,0 @@ - - - - - - - - - - - - - - - - - -# microTVM Arduino Reference Virtual Machine - -This directory contains setup files for Arduino virtual machine used for testing -microTVM platforms that are supported by [Arduino](https://www.arduino.cc/). - -## VM Information for Developers -Arduino VM is published under [tlcpack](https://app.vagrantup.com/tlcpack). -Here is a list of different release versions and their tools. - -We use semantic versioning as it is recommended by [Vagrant](https://www.vagrantup.com/docs/boxes/versioning). We use `X.Y.Z` version where we maintain the same major version `X` it has minor changes and newer version is still compatible with older versions and we increase minor version `Y`. However, We increase the major version `X` when new RVM is not compatible with older onces. Changing any Arduino board SDKs is considered a major change and requires increasing `X`. - -## Supported Arduino Boards -This RVM has been tested and is known to work with these boards: -- Adafruit Metro M4 -- Adafruit Pybadge -- Arduino Due -- Arduino Nano 33 BLE -- Arduino Portenta H7 -- Feather S2 -- Raspberry Pi Pico -- Sony Spresense -- Wio Terminal - -However, the RVM *should* work with any Arduino with sufficient memory, provided -its core is installed in `base-box/base_box_provision.sh`. - -Note that this RVM does not work with the Teensy boards, even though they are -supported by microTVM. This is because arduino-cli does not support Teensy -boards (https://github.com/arduino/arduino-cli/issues/700)/). diff --git a/apps/microtvm/reference-vm/arduino/base-box/base_box_setup.sh b/apps/microtvm/reference-vm/arduino/base-box/base_box_setup.sh deleted file mode 100644 index 8ce9a5a0fa287..0000000000000 --- a/apps/microtvm/reference-vm/arduino/base-box/base_box_setup.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash -e -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -e -set -x - -skip_zeroing_disk=0 -if [ -e "$HOME/skip_zeroing_disk" ]; then - echo "NOTE: will not zero disk at the end due to VMWare Fusion bug" - echo "See: https://communities.vmware.com/t5/VMware-Fusion-Discussions/VMWare-Fusion-Pro-11-15-6-16696540-causes-macOS-crash-during/m-p/2284011#M139190" - skip_zeroing_disk=1 -fi - -# Install common configs -~/base_box_setup_common.sh -rm -f ~/base_box_setup_common.sh - -# Poetry -sed -i "/^# If not running interactively,/ i source \$HOME/.poetry/env" ~/.bashrc -sed -i "/^# If not running interactively,/ i\\ " ~/.bashrc - -# TODO do we need this? -echo 'export PATH=$HOME/vagrant/bin:"$PATH"' >> ~/.profile -source ~/.profile -echo PATH=$PATH - -# Clean box for packaging as a base box -sudo apt-get clean -if [ $skip_zeroing_disk -eq 0 ]; then - echo "Zeroing disk..." - EMPTY_FILE="$HOME/EMPTY" - dd if=/dev/zero "of=${EMPTY_FILE}" bs=1M || /bin/true - if [ ! -e "${EMPTY_FILE}" ]; then - echo "failed to zero empty sectors on disk" - exit 2 - fi - rm -f "${EMPTY_FILE}" -else - echo "NOTE: skipping zeroing disk due to command-line argument." -fi diff --git a/apps/microtvm/reference-vm/arduino/provision_setup.sh b/apps/microtvm/reference-vm/arduino/provision_setup.sh deleted file mode 100644 index 1d54db17fae56..0000000000000 --- a/apps/microtvm/reference-vm/arduino/provision_setup.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -e -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -ex - -# TVM -# NOTE: TVM is presumed to be mounted already by Vagrantfile. 
-cd "${TVM_HOME}" - -platform="arduino" -apps/microtvm/reference-vm/rebuild-tvm.sh ${platform} - -# Build poetry -cd apps/microtvm/reference-vm/arduino - -poetry env use 3.7 - -# importers -poetry install -E importer-onnx -poetry install -E importer-tflite -poetry install -E importer-mxnet - -poetry install - -echo "export TVM_LIBRARY_PATH=\"$TVM_HOME\"/build-microtvm-${platform}" >>~/.profile -echo "VENV_PATH=\$((cd \"$TVM_HOME\"/apps/microtvm/reference-vm/arduino && poetry env list --full-path) | sed -E 's/^(.*)[[:space:]]\(Activated\)\$/\1/g')" >>~/.profile -echo "source \$VENV_PATH/bin/activate" >>~/.profile diff --git a/apps/microtvm/reference-vm/base-box-tool.py b/apps/microtvm/reference-vm/base-box-tool.py index 4a1b5aea3fecd..325b9bc0c4c99 100755 --- a/apps/microtvm/reference-vm/base-box-tool.py +++ b/apps/microtvm/reference-vm/base-box-tool.py @@ -18,7 +18,6 @@ import argparse -from ast import arg import copy import json import logging @@ -34,7 +33,7 @@ _LOG = logging.getLogger(__name__) -THIS_DIR = os.path.realpath(os.path.dirname(__file__) or ".") +THIS_DIR = pathlib.Path(os.path.realpath(os.path.dirname(__file__))) # List of vagrant providers supported by this tool ALL_PROVIDERS = ( @@ -52,31 +51,26 @@ # Extra scripts required to execute on provisioning # in [platform]/base-box/base_box_provision.sh -COMMON_SCRIPTS = [ - "apps/microtvm/reference-vm/base_box_setup_common.sh", +EXTRA_SCRIPTS = [ + "apps/microtvm/reference-vm/base-box/base_box_setup_common.sh", "docker/install/ubuntu_install_core.sh", "docker/install/ubuntu_install_python.sh", "docker/utils/apt-install-and-clear.sh", "docker/install/ubuntu1804_install_llvm.sh", + # Zephyr + "docker/install/ubuntu_init_zephyr_project.sh", + "docker/install/ubuntu_install_zephyr_sdk.sh", + "docker/install/ubuntu_install_cmsis.sh", ] -EXTRA_SCRIPTS = { - "arduino": [], - "zephyr": [ - "docker/install/ubuntu_init_zephyr_project.sh", - "docker/install/ubuntu_install_zephyr_sdk.sh", - "docker/install/ubuntu_install_cmsis.sh", - ], -} - PACKER_FILE_NAME = "packer.json" # List of identifying strings for microTVM boards for testing. -with open(pathlib.Path(THIS_DIR) / ".." / "zephyr" / "template_project" / "boards.json") as f: +with open(THIS_DIR / ".." / "zephyr" / "template_project" / "boards.json") as f: zephyr_boards = json.load(f) -with open(pathlib.Path(THIS_DIR) / ".." / "arduino" / "template_project" / "boards.json") as f: +with open(THIS_DIR / ".." 
/ "arduino" / "template_project" / "boards.json") as f: arduino_boards = json.load(f) ALL_MICROTVM_BOARDS = { @@ -232,7 +226,7 @@ def attach_vmware(uuid, vid_hex=None, pid_hex=None, serial=None): } -def generate_packer_config(platform, file_path, providers): +def generate_packer_config(file_path, providers): builders = [] provisioners = [] for provider_name in providers: @@ -253,7 +247,7 @@ def generate_packer_config(platform, file_path, providers): ["git", "rev-parse", "--show-toplevel"], encoding="utf-8" ).strip() - scripts_to_copy = COMMON_SCRIPTS + EXTRA_SCRIPTS[platform] + scripts_to_copy = EXTRA_SCRIPTS for script in scripts_to_copy: script_path = os.path.join(repo_root, script) filename = os.path.basename(script_path) @@ -285,11 +279,9 @@ def generate_packer_config(platform, file_path, providers): def build_command(args): - this_dir = pathlib.Path(THIS_DIR) - base_box_dir = this_dir / args.platform / "base-box" + base_box_dir = THIS_DIR / "base-box" generate_packer_config( - args.platform, os.path.join(base_box_dir, PACKER_FILE_NAME), args.provider or ALL_PROVIDERS, ) @@ -313,9 +305,7 @@ def build_command(args): if box_package_exists: sys.exit("One or more box packages exist (see list above). To rebuild use '--force'") - subprocess.check_call( - packer_args, cwd=os.path.join(THIS_DIR, args.platform, "base-box"), env=env - ) + subprocess.check_call(packer_args, cwd=THIS_DIR / "base-box", env=env) REQUIRED_TEST_CONFIG_KEYS = { @@ -325,10 +315,10 @@ def build_command(args): VM_BOX_RE = re.compile(r'(.*\.vm\.box) = "(.*)"') - +VM_TVM_HOME_RE = re.compile(r'(.*tvm_home) = "(.*)"') # Paths, relative to the platform box directory, which will not be copied to release-test dir. -SKIP_COPY_PATHS = [".vagrant", "base-box"] +SKIP_COPY_PATHS = [".vagrant", "base-box", "scripts"] def do_build_release_test_vm( @@ -365,6 +355,12 @@ def do_build_release_test_vm( if "config.vm.box_version" in line: continue m = VM_BOX_RE.match(line) + tvm_home_m = VM_TVM_HOME_RE.match(line) + + if tvm_home_m: + # Adjust tvm home for testing step + f.write(f'{tvm_home_m.group(1)} = "../../../.."\n') + continue if not m: f.write(line) continue @@ -391,7 +387,7 @@ def do_build_release_test_vm( return True -def do_run_release_test(release_test_dir, platform, provider_name, test_config, test_device_serial): +def do_run_release_test(release_test_dir, provider_name, test_config, test_device_serial): with open( os.path.join(release_test_dir, ".vagrant", "machines", "default", provider_name, "id") ) as f: @@ -405,7 +401,7 @@ def do_run_release_test(release_test_dir, platform, provider_name, test_config, pid_hex=test_config["pid_hex"], serial=test_device_serial, ) - tvm_home = os.path.realpath(os.path.join(THIS_DIR, "..", "..", "..")) + tvm_home = os.path.realpath(THIS_DIR / ".." / ".." / "..") def _quote_cmd(cmd): return " ".join(shlex.quote(a) for a in cmd) @@ -415,7 +411,7 @@ def _quote_cmd(cmd): + " && " + _quote_cmd( [ - f"apps/microtvm/reference-vm/{platform}/base-box/base_box_test.sh", + f"apps/microtvm/reference-vm/base-box/base_box_test.sh", test_config["microtvm_board"], ] ) @@ -424,9 +420,9 @@ def _quote_cmd(cmd): def test_command(args): - user_box_dir = pathlib.Path(THIS_DIR) / args.platform + user_box_dir = THIS_DIR base_box_dir = user_box_dir / "base-box" - boards_file = pathlib.Path(THIS_DIR) / ".." / args.platform / "template_project" / "boards.json" + boards_file = THIS_DIR / ".." 
/ args.platform / "template_project" / "boards.json" with open(boards_file) as f: test_config = json.load(f) @@ -444,7 +440,7 @@ def test_command(args): providers = args.provider - release_test_dir = os.path.join(THIS_DIR, f"release-test-{args.platform}") + release_test_dir = THIS_DIR / f"release-test" if args.skip_build or args.skip_destroy: assert ( @@ -460,7 +456,6 @@ def test_command(args): ) do_run_release_test( release_test_dir, - args.platform, provider_name, microtvm_test_config, args.test_device_serial, @@ -492,7 +487,7 @@ def release_command(args): if args.release_full_name: vm_name = args.release_full_name else: - vm_name = f"tlcpack/microtvm-{args.platform}" + vm_name = "tlcpack/microtvm" if not args.skip_creating_release_version: subprocess.check_call( @@ -518,12 +513,7 @@ def release_command(args): vm_name, args.release_version, provider_name, - os.path.join( - THIS_DIR, - args.platform, - "base-box", - f"output-packer-{provider_name}/package.box", - ), + str(THIS_DIR / "base-box" / f"output-packer-{provider_name}/package.box"), ] ) @@ -550,7 +540,6 @@ def parse_args(): # Options for build subcommand parser_build = subparsers.add_parser("build", help="Build a base box.") parser_build.set_defaults(func=build_command) - parser_build.add_argument("platform", help=platform_help_str, choices=ALL_PLATFORMS) parser_build.add_argument( "--debug-packer", action="store_true", @@ -606,7 +595,6 @@ def parse_args(): # Options for release subcommand parser_release = subparsers.add_parser("release", help="Release base box to cloud.") parser_release.set_defaults(func=release_command) - parser_release.add_argument("platform", help=platform_help_str, choices=ALL_PLATFORMS) parser_release.add_argument( "--release-version", required=True, @@ -634,10 +622,6 @@ def parse_args(): def main(): args = parse_args() - - if os.path.sep in args.platform or not os.path.isdir(os.path.join(THIS_DIR, args.platform)): - sys.exit(f" must be a sub-direcotry of {THIS_DIR}; got {args.platform}") - args.func(args) diff --git a/apps/microtvm/reference-vm/arduino/base-box/.gitignore b/apps/microtvm/reference-vm/base-box/.gitignore similarity index 100% rename from apps/microtvm/reference-vm/arduino/base-box/.gitignore rename to apps/microtvm/reference-vm/base-box/.gitignore diff --git a/apps/microtvm/reference-vm/arduino/base-box/Vagrantfile.packer-template b/apps/microtvm/reference-vm/base-box/Vagrantfile.packer-template similarity index 100% rename from apps/microtvm/reference-vm/arduino/base-box/Vagrantfile.packer-template rename to apps/microtvm/reference-vm/base-box/Vagrantfile.packer-template diff --git a/apps/microtvm/reference-vm/arduino/base-box/base_box_provision.sh b/apps/microtvm/reference-vm/base-box/base_box_provision.sh old mode 100644 new mode 100755 similarity index 94% rename from apps/microtvm/reference-vm/arduino/base-box/base_box_provision.sh rename to apps/microtvm/reference-vm/base-box/base_box_provision.sh index 4d845d7fed0ec..175e4787eb909 --- a/apps/microtvm/reference-vm/arduino/base-box/base_box_provision.sh +++ b/apps/microtvm/reference-vm/base-box/base_box_provision.sh @@ -20,12 +20,23 @@ # virtual machine similar to CI QEMU setup. 
# -set -e set -x source ~/.profile +# Init Zephyr +cd ~ +~/ubuntu_init_zephyr_project.sh ~/zephyr + +# Install CMSIS +cd ~ +~/ubuntu_install_cmsis.sh ~/cmsis + +# Cleanup +rm -f ubuntu_init_zephyr_project.sh ubuntu_install_cmsis.sh + # Init Arduino +source ~/.profile cd ~ sudo apt-get install -y ca-certificates @@ -68,6 +79,3 @@ arduino-cli core install SPRESENSE:spresense@2.5.0 --additional-urls $SPRESENSE_ # The below sed command avoids the bug, and will be removed when no longer needed. PORTENTA_H7_BUGFIX_PATH=~/.arduino15/packages/arduino/hardware/mbed_portenta/3.1.1/cores/arduino/api/Common.h sed -i '3 i #include ' $PORTENTA_H7_BUGFIX_PATH - -# Cleanup -rm -f *.sh diff --git a/apps/microtvm/reference-vm/zephyr/base-box/base_box_setup.sh b/apps/microtvm/reference-vm/base-box/base_box_setup.sh old mode 100644 new mode 100755 similarity index 100% rename from apps/microtvm/reference-vm/zephyr/base-box/base_box_setup.sh rename to apps/microtvm/reference-vm/base-box/base_box_setup.sh diff --git a/apps/microtvm/reference-vm/base_box_setup_common.sh b/apps/microtvm/reference-vm/base-box/base_box_setup_common.sh similarity index 100% rename from apps/microtvm/reference-vm/base_box_setup_common.sh rename to apps/microtvm/reference-vm/base-box/base_box_setup_common.sh diff --git a/apps/microtvm/reference-vm/arduino/base-box/base_box_test.sh b/apps/microtvm/reference-vm/base-box/base_box_test.sh similarity index 60% rename from apps/microtvm/reference-vm/arduino/base-box/base_box_test.sh rename to apps/microtvm/reference-vm/base-box/base_box_test.sh index 5c3d96dfc7df8..a8a55a0f40ae5 100755 --- a/apps/microtvm/reference-vm/arduino/base-box/base_box_test.sh +++ b/apps/microtvm/reference-vm/base-box/base_box_test.sh @@ -16,25 +16,27 @@ # specific language governing permissions and limitations # under the License. # -# Usage: base_box_test.sh -# Execute microTVM Arduino tests. -# -set -e set -x if [ "$#" -lt 1 ]; then - echo "Usage: base_box_test.sh " + echo "Usage: base_box_test.sh " exit -1 fi -board=$1 +platform=$1 +board=$2 -pytest tests/micro/arduino/test_arduino_workflow.py --arduino-board=${board} +if [ "${platform}" == "zephyr" ]; then + pytest tests/micro/zephyr --zephyr-board=${board} +fi -if [ $board == "nano33ble" ]; then - # https://github.com/apache/tvm/issues/8730 - echo "NOTE: skipped test_arduino_rpc_server.py on $board -- known failure" -else - pytest tests/micro/arduino/test_arduino_rpc_server.py --arduino-board=${board} +if [ "${platform}" == "arduino" ]; then + pytest tests/micro/arduino/test_arduino_workflow.py --arduino-board=${board} + if [ $board == "nano33ble" ]; then + # https://github.com/apache/tvm/issues/8730 + echo "NOTE: skipped test_arduino_rpc_server.py on $board -- known failure" + else + pytest tests/micro/arduino/test_arduino_rpc_server.py --arduino-board=${board} + fi fi diff --git a/apps/microtvm/reference-vm/zephyr/provision_setup.sh b/apps/microtvm/reference-vm/provision_setup.sh old mode 100644 new mode 100755 similarity index 78% rename from apps/microtvm/reference-vm/zephyr/provision_setup.sh rename to apps/microtvm/reference-vm/provision_setup.sh index 785055a696583..f6237a82cd8bc --- a/apps/microtvm/reference-vm/zephyr/provision_setup.sh +++ b/apps/microtvm/reference-vm/provision_setup.sh @@ -22,11 +22,10 @@ set -ex # NOTE: TVM is presumed to be mounted already by Vagrantfile. 
cd "${TVM_HOME}" -platform="zephyr" -apps/microtvm/reference-vm/rebuild-tvm.sh ${platform} +apps/microtvm/reference-vm/rebuild_tvm.sh # Build poetry -cd apps/microtvm/reference-vm/zephyr +cd apps/microtvm/reference-vm poetry env use 3.7 @@ -38,8 +37,8 @@ poetry install -E importer-mxnet poetry install poetry run pip3 install -r ${ZEPHYR_BASE}/scripts/requirements.txt -echo "export TVM_LIBRARY_PATH=\"$TVM_HOME\"/build-microtvm-${platform}" >>~/.profile -echo "VENV_PATH=\$((cd \"$TVM_HOME\"/apps/microtvm/reference-vm/zephyr && poetry env list --full-path) | sed -E 's/^(.*)[[:space:]]\(Activated\)\$/\1/g')" >>~/.profile +echo "export TVM_LIBRARY_PATH=\"$TVM_HOME\"/build-microtvm" >>~/.profile +echo "VENV_PATH=\$((cd \"$TVM_HOME\"/apps/microtvm/reference-vm && poetry env list --full-path) | sed -E 's/^(.*)[[:space:]]\(Activated\)\$/\1/g')" >>~/.profile echo "source \$VENV_PATH/bin/activate" >>~/.profile echo "export PATH=\"\${PATH}:\${HOME}/zephyr-sdk/sysroots/x86_64-pokysdk-linux/usr/bin\"" >>~/.profile echo "export CMSIS_PATH=\"\${HOME}/cmsis\"" >>~/.profile diff --git a/apps/microtvm/reference-vm/rebuild-tvm.sh b/apps/microtvm/reference-vm/rebuild_tvm.sh similarity index 84% rename from apps/microtvm/reference-vm/rebuild-tvm.sh rename to apps/microtvm/reference-vm/rebuild_tvm.sh index ae58cb004c9ed..6fdf4fd917f48 100755 --- a/apps/microtvm/reference-vm/rebuild-tvm.sh +++ b/apps/microtvm/reference-vm/rebuild_tvm.sh @@ -16,19 +16,14 @@ # specific language governing permissions and limitations # under the License. # -# "Usage ./apps/microtvm/reference-vm/rebuild-tvm.sh " -# set -e -if [ "$#" -lt 1 -o "$1" == "--help" ]; then - echo "Usage ./apps/microtvm/reference-vm/rebuild-tvm.sh " +if [ "$1" == "--help" ]; then + echo "Usage ./apps/microtvm/reference-vm/rebuild_tvm.sh" exit -1 fi -platform=$1 -shift 1 - # Get number of cores for build if [ -n "${TVM_CI_NUM_CORES}" ]; then num_cores=${TVM_CI_NUM_CORES} @@ -39,7 +34,7 @@ fi cd "$(dirname $0)" cd "$(git rev-parse --show-toplevel)" -BUILD_DIR="build-microtvm-${platform}" +BUILD_DIR="build-microtvm" if [ ! -e "${BUILD_DIR}" ]; then mkdir "${BUILD_DIR}" diff --git a/apps/microtvm/reference-vm/scripts/reference_vm_build.sh b/apps/microtvm/reference-vm/scripts/reference_vm_build.sh index bac31a26cf786..bfbd8aaa26d4c 100755 --- a/apps/microtvm/reference-vm/scripts/reference_vm_build.sh +++ b/apps/microtvm/reference-vm/scripts/reference_vm_build.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash -e # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -16,19 +16,14 @@ # specific language governing permissions and limitations # under the License. 
# -# Usage: apps/microtvm/reference-vm/scripts/reference_vm_build.sh -# -if [ "$#" -lt 1 -o "$1" == "--help" -o "$1" == "-h" ]; then - echo "Usage: apps/microtvm/reference-vm/scripts/reference_vm_build.sh " +if [ "$1" == "--help" -o "$1" == "-h" ]; then + echo "Usage: apps/microtvm/reference-vm/scripts/reference_vm_build.sh" exit -1 fi -PLATFORM=$1 -shift - cd "$(dirname "$0")" source "./utils.sh" || exit 2 cd ${RVM_BASE_PATH} -${BASE_BOX_TOOL} --provider=virtualbox build ${PLATFORM} +${BASE_BOX_TOOL} --provider=virtualbox build diff --git a/apps/microtvm/reference-vm/scripts/reference_vm_release.sh b/apps/microtvm/reference-vm/scripts/reference_vm_release.sh index 8719e2c05a9f0..beb271bd9e752 100755 --- a/apps/microtvm/reference-vm/scripts/reference_vm_release.sh +++ b/apps/microtvm/reference-vm/scripts/reference_vm_release.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash -e # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -16,17 +16,12 @@ # specific language governing permissions and limitations # under the License. # -# Usage: apps/microtvm/reference-vm/scripts/reference_vm_release.sh -# -if [ "$#" -lt 3 -o "$1" == "--help" -o "$1" == "-h" ]; then - echo "Usage: apps/microtvm/reference-vm/scripts/reference_vm_release.sh " +if [ "$#" -lt 2 -o "$1" == "--help" -o "$1" == "-h" ]; then + echo "Usage: apps/microtvm/reference-vm/scripts/reference_vm_release.sh " exit -1 fi -PLATFORM=$1 -shift - RELEASE_NAME=$1 shift @@ -37,7 +32,7 @@ cd "$(dirname "$0")" source "./utils.sh" || exit 2 cd ${RVM_BASE_PATH} -${BASE_BOX_TOOL} --provider=virtualbox release ${PLATFORM} \ +${BASE_BOX_TOOL} --provider=virtualbox release \ --release-full-name=${RELEASE_NAME} \ --release-version=${RELEASE_VERSION} \ --skip-creating-release-version diff --git a/apps/microtvm/reference-vm/zephyr/.gitignore b/apps/microtvm/reference-vm/zephyr/.gitignore deleted file mode 100644 index dace7081e3f20..0000000000000 --- a/apps/microtvm/reference-vm/zephyr/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/.vagrant diff --git a/apps/microtvm/reference-vm/zephyr/README.md b/apps/microtvm/reference-vm/zephyr/README.md deleted file mode 100644 index c5a1654c3ed31..0000000000000 --- a/apps/microtvm/reference-vm/zephyr/README.md +++ /dev/null @@ -1,30 +0,0 @@ - - - - - - - - - - - - - - - - - -# microTVM Zephyr Reference Virtual Machine - -This directory contains setup files for Zephyr virtual machine used for testing microTVM platforms -that are supported by [Zephyr Project](https://zephyrproject.org/). - -## VM Information for Developers -Zephyr VM is published under [tlcpack](https://app.vagrantup.com/tlcpack). -Here is a list of different release versions and their tools. - -**Note**: We will release all microTVM RVM boxes under [microtvm-zephyr](https://app.vagrantup.com/tlcpack/boxes/microtvm-zephyr) and use box versioning in Vagrant file. Previous versions like `microtvm-zephyr-2.5`, `microtvm-zephyr-2.4` are not continued and will be removed in future. - -## Versioning -We use semantic versioning as it is recommended by [Vagrant](https://www.vagrantup.com/docs/boxes/versioning). We use `X.Y.Z` version where we maintain the same major version `X` it has minor changes and newer version is still compatible with older versions and we increase minor version `Y`. However, We increase the major version `X` when new RVM is not compatible with older onces. 
Updating Zephyr SDK is considered a major change and it requires incrementing major version `X`. diff --git a/apps/microtvm/reference-vm/zephyr/Vagrantfile b/apps/microtvm/reference-vm/zephyr/Vagrantfile deleted file mode 100644 index fb02f41d17d8f..0000000000000 --- a/apps/microtvm/reference-vm/zephyr/Vagrantfile +++ /dev/null @@ -1,95 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -Vagrant.configure("2") do |config| - config.vm.box = "tlcpack/microtvm-zephyr" - config.vm.box_version = "2.0.0" - - if ENV.has_key?("TVM_RVM_NUM_CORES") - num_cores = ENV["TVM_RVM_NUM_CORES"] - else - num_cores = 2 - end - - if ENV.has_key?("TVM_RVM_RAM_BYTES") - ram_bytes = ENV["TVM_RVM_RAM_BYTES"] - else - ram_bytes = 2048 - end - - tvm_home = "../../../.." - dirs_to_mount = [Pathname.new(Pathname.new(tvm_home).expand_path())] - if ENV.has_key?("TVM_PROJECT_DIR") then - dirs_to_mount.append(ENV["TVM_PROJECT_DIR"]) - puts "NOTE: also configuring project dir: %s" % [dirs_to_mount[-1]] - end - - git_file = Pathname.new(tvm_home + "/.git") - if git_file.ftype() == "file" then - gitdir_match = Regexp.new('^gitdir: (?.*/.git).*\n$', Regexp::MULTILINE).match(git_file.read()) - if !gitdir_match.nil? 
then - dirs_to_mount.append(Pathname.new(tvm_home).realpath.join(gitdir_match.named_captures["gitdir"])) - puts "NOTE: also configuring git-worktree gitdir: %s" % [dirs_to_mount[-1]] - end - end - - config.vm.provision "shell", - path: "provision_setup.sh", - env: {"TVM_HOME": dirs_to_mount[0], - "TVM_CI_NUM_CORES": num_cores - }, - privileged: false - - # Enable USB Controller on VirtualBox - vm_name = "microtvm-#{Time.now.tv_sec}" - config.vm.provider "virtualbox" do |vb, overrides| - vb.name = vm_name - vb.cpus = num_cores - vb.memory = ram_bytes - vb.customize ["modifyvm", :id, "--usb", "on"] - vb.customize ["modifyvm", :id, "--usbehci", "on"] - vb.customize ["modifyvm", :id, "--usbxhci", "on"] - vb.customize [ "guestproperty", "set", :id, "/VirtualBox/GuestAdd/VBoxService/--timesync-set-threshold", 10000] - dirs_to_mount.each do |d| - overrides.vm.synced_folder d.to_s, d.to_s - end - end - - config.vm.provider "parallels" do |prl, overrides| - prl.name = vm_name - prl.cpus = num_cores - prl.memory = ram_bytes - prl.update_guest_tools = true - prl.customize ["set", :id, "--support-usb30", "on"] - dirs_to_mount.each do |d| - overrides.vm.synced_folder d.to_s, d.to_s, mount_options: ["share", "nosuid", "host_inodes"] - end - end - - config.vm.provider "vmware_desktop" do |vm, overrides| - vm.cpus = num_cores - vm.memory = ram_bytes - vm.vmx["usb_xhci.present"] = "TRUE" - vm.vmx["usb.present"] = "TRUE" - vm.vmx["ehci.present"] = "TRUE" - dirs_to_mount.each do |d| - overrides.vm.synced_folder d.to_s, d.to_s - end - vm.gui = true - end - -end diff --git a/apps/microtvm/reference-vm/zephyr/base-box/.gitignore b/apps/microtvm/reference-vm/zephyr/base-box/.gitignore deleted file mode 100644 index e4406c4f61e2d..0000000000000 --- a/apps/microtvm/reference-vm/zephyr/base-box/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -*.box -.vagrant -/output-packer-* -/packer.json diff --git a/apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template b/apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template deleted file mode 100644 index b43596bb83c17..0000000000000 --- a/apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template +++ /dev/null @@ -1,47 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -Vagrant.configure("2") do |config| - # From hashicorp default template: - # https://github.com/hashicorp/packer/blob/master/builder/vagrant/step_create_vagrantfile.go#L23-L37 - - config.vm.define "source" do |source| - source.vm.box = "{{.SourceBox}}" - config.ssh.insert_key = {{.InsertKey}} - end - - config.vm.define "output" do |output| - output.vm.box = "{{.BoxName}}" - output.vm.box_url = "file://package.box" - config.ssh.insert_key = {{.InsertKey}} - end - - {{ if ne .SyncedFolder "" -}} - config.vm.synced_folder "{{.SyncedFolder}}", "/vagrant" - {{- else -}} - config.vm.synced_folder ".", "/vagrant", disabled: true - {{- end}} - - - {{ if eq .BoxName "microtvm-base-vmware_desktop" -}} - config.vm.provision "shell", inline: "touch ~/skip_zeroing_disk", privileged: false - {{- end}} - - # NOTE: base_box_setup.sh resides in the parent directory (../) because this template is expanded into a - # sub-directory of base-box (output-packer-*). - config.vm.provision "shell", path: "../base_box_setup.sh", privileged: false -end diff --git a/apps/microtvm/reference-vm/zephyr/base-box/base_box_provision.sh b/apps/microtvm/reference-vm/zephyr/base-box/base_box_provision.sh deleted file mode 100644 index 2c55312f36577..0000000000000 --- a/apps/microtvm/reference-vm/zephyr/base-box/base_box_provision.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -e -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -# Using this script we can reuse docker/install scripts to configure the reference -# virtual machine similar to CI QEMU setup. -# - -set -e -set -x - -source ~/.profile - -# Init Zephyr -cd ~ -~/ubuntu_init_zephyr_project.sh ~/zephyr - -# Install CMSIS -cd ~ -~/ubuntu_install_cmsis.sh ~/cmsis - -# Cleanup -rm -f ubuntu_init_zephyr_project.sh ubuntu_install_cmsis.sh diff --git a/apps/microtvm/reference-vm/zephyr/base-box/base_box_test.sh b/apps/microtvm/reference-vm/zephyr/base-box/base_box_test.sh deleted file mode 100755 index 49f86a6ef9ddd..0000000000000 --- a/apps/microtvm/reference-vm/zephyr/base-box/base_box_test.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -e -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. -# -# Usage: base_box_test.sh -# Execute microTVM Zephyr tests. -# - -set -e -set -x - -if [ "$#" -lt 1 ]; then - echo "Usage: base_box_test.sh " - exit -1 -fi - -board=$1 - -pytest tests/micro/zephyr --zephyr-board=${board} diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py index 37b64433b23ee..099ba3c3fa5b2 100644 --- a/tests/lint/check_file_type.py +++ b/tests/lint/check_file_type.py @@ -149,10 +149,8 @@ "apps/microtvm/zephyr/template_project/qemu-hack/qemu-system-riscv64", # microTVM Virtual Machines "apps/microtvm/poetry.lock", - "apps/microtvm/reference-vm/arduino/Vagrantfile", - "apps/microtvm/reference-vm/arduino/base-box/Vagrantfile.packer-template", - "apps/microtvm/reference-vm/zephyr/Vagrantfile", - "apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template", + "apps/microtvm/reference-vm/Vagrantfile", + "apps/microtvm/reference-vm/base-box/Vagrantfile.packer-template", # Hexagon "src/runtime/hexagon/rpc/android_bash.sh.template", } From b4c1cc02eb9c5ef8a680a6fa4f8fb50a321b9539 Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com> Date: Mon, 11 Jul 2022 19:58:47 +0100 Subject: [PATCH 086/111] [CMSIS-NN][Perf] Converted Relay Conv2D into CMSIS-NN Depthwise (#12006) --- apps/microtvm/zephyr_cmsisnn/CMakeLists.txt | 5 + .../backend/contrib/cmsisnn/convolutions.cc | 46 ++++++ .../backend/contrib/cmsisnn/convolutions.h | 60 ++++++++ .../contrib/cmsisnn/generate_constants.cc | 7 +- .../backend/contrib/cmsisnn/relay_to_tir.cc | 13 +- .../contrib/test_cmsisnn/test_conv2d.py | 144 +++++++++++++++++- 6 files changed, 260 insertions(+), 15 deletions(-) create mode 100644 src/relay/backend/contrib/cmsisnn/convolutions.cc create mode 100644 src/relay/backend/contrib/cmsisnn/convolutions.h diff --git a/apps/microtvm/zephyr_cmsisnn/CMakeLists.txt b/apps/microtvm/zephyr_cmsisnn/CMakeLists.txt index b09e1d0642d2a..dd3582f86f7d7 100644 --- a/apps/microtvm/zephyr_cmsisnn/CMakeLists.txt +++ b/apps/microtvm/zephyr_cmsisnn/CMakeLists.txt @@ -53,6 +53,11 @@ set(DATA_FILES ) set(CMSIS_SOURCES ${CMSIS_PATH}/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_s8.c + ${CMSIS_PATH}/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c + ${CMSIS_PATH}/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c + ${CMSIS_PATH}/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c + ${CMSIS_PATH}/CMSIS/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s8.c + ${CMSIS_PATH}/CMSIS/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c ${CMSIS_PATH}/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c ${CMSIS_PATH}/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c ${CMSIS_PATH}/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c diff --git a/src/relay/backend/contrib/cmsisnn/convolutions.cc b/src/relay/backend/contrib/cmsisnn/convolutions.cc new file mode 100644 index 0000000000000..ebac83b812509 --- /dev/null +++ b/src/relay/backend/contrib/cmsisnn/convolutions.cc @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "convolutions.h" + +#include + +#include "../../../qnn/utils.h" +#include "tvm/ir/transform.h" +#include "tvm/relay/attrs/nn.h" + +namespace tvm { +namespace relay { +namespace contrib { +namespace cmsisnn { + +bool IsCMSISNNDepthwise(const Conv2DAttrs* conv2d_attrs, const Array& input_shape, + const Array& kernel_shape) { + std::string kernel_layout = conv2d_attrs->kernel_layout.c_str(); + int kernel_pos_o = kernel_layout.find("O"); + int kernel_pos_i = kernel_layout.find("I"); + int kernel_dim_o_val = qnn::get_const_int(kernel_shape[kernel_pos_o]); + int kernel_dim_i_val = qnn::get_const_int(kernel_shape[kernel_pos_i]); + int64_t out_channels = conv2d_attrs->channels.as()->value; + return (out_channels == kernel_dim_o_val * kernel_dim_i_val); +} + +} // namespace cmsisnn +} // namespace contrib +} // namespace relay +} // namespace tvm diff --git a/src/relay/backend/contrib/cmsisnn/convolutions.h b/src/relay/backend/contrib/cmsisnn/convolutions.h new file mode 100644 index 0000000000000..e635702bf3537 --- /dev/null +++ b/src/relay/backend/contrib/cmsisnn/convolutions.h @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/backend/contrib/cmsisnn/convolutions.h + * \brief CMSIS-NN utility functions for Convolutions + */ + +#ifndef TVM_RELAY_BACKEND_CONTRIB_CMSISNN_CONVOLUTIONS_H_ +#define TVM_RELAY_BACKEND_CONTRIB_CMSISNN_CONVOLUTIONS_H_ + +#include +#include +#include +#include +#include + +#include "../../../op/make_op.h" +#include "../../../qnn/utils.h" +#include "../../../transforms/pattern_utils.h" + +namespace tvm { +namespace relay { +namespace contrib { +namespace cmsisnn { +/*! 
+ * \brief Checks if Relay Conv2D was originally CMSIS-NN compliant Depthwise Convolution + * See: + * https://github.com/apache/tvm/blob/6ed3ab3e33f8eafa4acaf53b7a671831de7587e9/python/tvm/relay/frontend/tflite.py#L2107 + * + * + * \return true if a Conv2D is a Depthwise Convolution based on Conv2D's inputs' shapes and + * attributes + */ + +bool IsCMSISNNDepthwise(const Conv2DAttrs* conv2d_attrs, const Array& input_shape, + const Array& kernel_shape); + +} // namespace cmsisnn +} // namespace contrib +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_BACKEND_CONTRIB_CMSISNN_CONVOLUTIONS_H_ diff --git a/src/relay/backend/contrib/cmsisnn/generate_constants.cc b/src/relay/backend/contrib/cmsisnn/generate_constants.cc index 450bcf26d1b32..297e6b7acea33 100644 --- a/src/relay/backend/contrib/cmsisnn/generate_constants.cc +++ b/src/relay/backend/contrib/cmsisnn/generate_constants.cc @@ -31,6 +31,7 @@ #include "../../../op/make_op.h" #include "../../../qnn/utils.h" #include "../../../transforms/pattern_utils.h" +#include "convolutions.h" namespace tvm { namespace relay { @@ -111,11 +112,7 @@ class GenerateConstantsMutator : public MixedModeMutator { Array input_shape = conv2d_call->args[0]->type_as()->shape; Array kernel_shape = conv2d_call->args[1]->type_as()->shape; - std::string kernel_layout = conv2d_attrs->kernel_layout.c_str(); - int kernel_pos_o = kernel_layout.find("O"); - int groups = conv2d_attrs->groups; - if (groups != qnn::get_const_int(input_shape[3]) || - groups != qnn::get_const_int(kernel_shape[kernel_pos_o])) { + if (!IsCMSISNNDepthwise(conv2d_attrs, input_shape, kernel_shape)) { // Transpose weights: HWIO -> OHWI for Conv2D conv2d_kernel = ConvertKernelLayout(conv2d_call->args[1], conv2d_attrs, &new_conv2d_attrs); } diff --git a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc index 5c99061fa854b..d1d1d20d6e34e 100644 --- a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc +++ b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc @@ -1,4 +1,3 @@ - /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. 
See the NOTICE file @@ -31,6 +30,7 @@ #include "../../../transforms/pattern_utils.h" #include "buffer_size.h" #include "compiler_attrs.h" +#include "convolutions.h" namespace tvm { namespace relay { @@ -173,7 +173,6 @@ class RelayToTIRVisitor : public MixedModeMutator { int32_t dilation_w = qnn::get_const_int(conv2d_attrs->dilation[1]); int32_t dilation_h = qnn::get_const_int(conv2d_attrs->dilation[0]); int32_t out_channels = qnn::get_const_int(conv2d_attrs->channels); - int32_t groups = conv2d_attrs->groups; std::string kernel_layout = conv2d_attrs->kernel_layout.c_str(); int32_t clip_min = std::numeric_limits::min(); int32_t clip_max = std::numeric_limits::max(); @@ -207,11 +206,13 @@ class RelayToTIRVisitor : public MixedModeMutator { int32_t output_c = qnn::get_const_int(output_shape[3]); int32_t depth_multiplier = -1; - int kernel_pos_o = kernel_layout.find("O"); - if (groups == qnn::get_const_int(input_shape[3]) && - groups == qnn::get_const_int(filter_shape[kernel_pos_o])) { + if (IsCMSISNNDepthwise(conv2d_attrs, input_shape, filter_shape)) { + // Refer to TVM frontend to know how depth multiplier and out_channels are related + // https://github.com/apache/tvm/blob/6ed3ab3e33f8eafa4acaf53b7a671831de7587e9/python/tvm/relay/frontend/tflite.py#L2129 int kernel_pos_i = kernel_layout.find("I"); - depth_multiplier = qnn::get_const_int(filter_shape[kernel_pos_i]); + int kernel_pos_o = kernel_layout.find("O"); + int kernel_pos_dm = input_c == 1 ? kernel_pos_o : kernel_pos_i; + depth_multiplier = qnn::get_const_int(filter_shape[kernel_pos_dm]); } scalar_args.push_back(ToArg(depth_multiplier)); diff --git a/tests/python/contrib/test_cmsisnn/test_conv2d.py b/tests/python/contrib/test_cmsisnn/test_conv2d.py index 462eb88347194..0b15c5a2466c5 100644 --- a/tests/python/contrib/test_cmsisnn/test_conv2d.py +++ b/tests/python/contrib/test_cmsisnn/test_conv2d.py @@ -23,8 +23,13 @@ from tvm import relay from tvm.relay.op.contrib import cmsisnn -from tvm.testing.aot import generate_ref_data, AOTTestModel, compile_models, compile_and_run - +from tvm.testing.aot import ( + generate_ref_data, + AOTTestModel, + compile_models, + compile_and_run, + run_and_check, +) from tvm.micro.testing.aot_test_utils import AOT_USMP_CORSTONE300_RUNNER from .utils import ( make_module, @@ -84,13 +89,14 @@ def make_model( ) ) weight_const = relay.const(weight, kernel_dtype) + conv2d_kernel_sc = kernel_scale[0] if out_channels == 1 else kernel_scale conv = relay.qnn.op.conv2d( invar, weight_const, input_zero_point=relay.const(input_zero_point, "int32"), kernel_zero_point=relay.const(kernel_zero_point, "int32"), input_scale=relay.const(input_scale, "float32"), - kernel_scale=relay.const(kernel_scale, "float32"), + kernel_scale=relay.const(conv2d_kernel_sc, "float32"), kernel_size=(kernel_h, kernel_w), data_layout="NHWC", kernel_layout=weight_format, @@ -105,6 +111,7 @@ def make_model( bias_const = relay.const(bias, "int32") last_op = relay.nn.bias_add(conv, bias_const, axis=3) if enable_bias else conv requant_input_sc = [sc * input_scale for sc in kernel_scale] + requant_input_sc = requant_input_sc[0] if out_channels == 1 else requant_input_sc last_op = relay.qnn.op.requantize( last_op, relay.const(requant_input_sc, "float32"), @@ -209,7 +216,7 @@ def test_conv2d_number_primfunc_args( cmsisnn_func = cmsisnn_tir_mod["tvmgen_default_cmsis_nn_main_0"] assert ( len(cmsisnn_func.params) == expected_num_params - ), "Generated unexpected number of function arguments" + ), "Generated unexpected number of function arguments." 
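+
+# A sketch (illustrative only) of the shape test the C++ side uses to decide a
+# QNN conv2d is CMSIS-NN depthwise (IsCMSISNNDepthwise in
+# src/relay/backend/contrib/cmsisnn/convolutions.cc); the same condition also
+# selects which kernel dimension supplies the depth multiplier in
+# relay_to_tir.cc (dim O when the input has one channel, else dim I).
+def _is_cmsisnn_depthwise_sketch(kernel_layout, kernel_shape, out_channels):
+    dim_o = kernel_shape[kernel_layout.index("O")]
+    dim_i = kernel_shape[kernel_layout.index("I")]
+    # Depthwise iff out_channels == dim_O * dim_I, e.g. an HWIO kernel of
+    # shape (3, 3, 1, 3) with out_channels == 3, as in the depthwise test below.
+    return out_channels == dim_o * dim_i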
@tvm.testing.requires_cmsisnn @@ -540,6 +547,135 @@ def test_depthwise_int8( ) +@tvm.testing.requires_cmsisnn +@pytest.mark.parametrize("padding", ["SAME", "VALID"]) +@pytest.mark.parametrize("strides, dilation", [((1, 1), (1, 1))]) +@pytest.mark.parametrize("relu_type", ["RELU", "NONE"]) +@pytest.mark.parametrize("depth_multiplier", [1, 3]) +@pytest.mark.parametrize( + "input_zero_point, input_scale, kernel_scale", + [ + ( + 10, + 0.0128, + [0.11, 0.22], + ), + ( + -64, + 1, + [1, 0.0256, 1.37], + ), + ], +) +def test_relay_conv2d_cmsisnn_depthwise_int8( + padding, + strides, + dilation, + relu_type, + input_zero_point, + input_scale, + kernel_scale, + depth_multiplier, +): + """Tests QNN Depthwise int8 op via CMSIS-NN""" + interface_api = "c" + use_unpacked_api = True + test_runner = AOT_USMP_CORSTONE300_RUNNER + + dtype = "int8" + in_min, in_max = get_range_for_dtype_str(dtype) + + ifm_shape = (1, 24, 24, 1) + groups = ifm_shape[3] + weight_format = "HWIO" + (kernel_h, kernel_w) = (3, 3) + kernel_shape = (kernel_h, kernel_w, ifm_shape[3], depth_multiplier) + out_channels = ifm_shape[3] * depth_multiplier + enable_bias = True + ks_len = len(kernel_scale) + kernel_zero_point = 0 + kernel_scale = [kernel_scale[i % ks_len] for i in range(out_channels)] + + output_scale, output_zero_point = get_conv2d_qnn_params( + kernel_shape, + input_scale, + input_zero_point, + kernel_scale, + kernel_zero_point, + dtype, + dtype, + dtype, + True, + ) + + model, params = make_model( + ifm_shape, + kernel_shape, + input_zero_point, + input_scale, + kernel_zero_point, + kernel_scale, + output_zero_point, + output_scale, + padding, + strides, + dilation, + groups, + dtype, + dtype, + out_channels, + weight_format, + enable_bias, + relu_type, + ) + orig_mod = make_module(model) + cmsisnn_mod = cmsisnn.partition_for_cmsisnn(orig_mod, params) + + # validate pattern matching + assert_partitioned_function(orig_mod, cmsisnn_mod) + + # generate reference output + rng = np.random.default_rng(12345) + inputs = {"input": rng.integers(in_min, high=in_max, size=ifm_shape, dtype=dtype)} + output_list = generate_ref_data(orig_mod["main"], inputs, params) + + # validate presence of depthwise convolution + compiled_models = compile_models( + AOTTestModel( + module=cmsisnn_mod, + inputs=inputs, + outputs=output_list, + params=params, + output_tolerance=1, + ), + interface_api, + use_unpacked_api, + pass_config=test_runner.pass_config, + ) + + cmsisnn_tir_mod = None + for target, mod in compiled_models[0].executor_factory.lowered_ir_mods.items(): + if target.kind.name == "cmsis-nn": + cmsisnn_tir_mod = mod + + cmsisnn_func = cmsisnn_tir_mod["tvmgen_default_cmsis_nn_main_0"] + call_extern = None + if isinstance(cmsisnn_func.body, tvm.tir.stmt.Evaluate): + call_extern = cmsisnn_func.body.value + else: + call_extern = cmsisnn_func.body.body.value + assert ( + call_extern.args[0].value == "arm_depthwise_conv_wrapper_s8" + ), "Relay Conv2D should be mapped to CMSIS-NN Depthwise Convolution." 
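+    # The isinstance check above is a best-effort way of reaching the
+    # call_extern node whether or not the lowered PrimFunc wraps it in extra
+    # statements. run_and_check below then executes the compiled models on the
+    # Corstone-300 test runner and compares against the reference output_list
+    # within output_tolerance.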
+ + # validate the output + run_and_check( + models=compiled_models, + runner=test_runner, + interface_api=interface_api, + ) + + def parameterize_for_invalid_model(test): """Generates non int8 inputs""" in_dtype = ["uint8", "int8"] From 5be8e0a3deccc8f68afb8c26230e0caf1b002de9 Mon Sep 17 00:00:00 2001 From: Mark Shields <87091372+mbs-octoml@users.noreply.github.com> Date: Mon, 11 Jul 2022 13:57:55 -0700 Subject: [PATCH 087/111] [Collage] SubGraphs (#11981) * [Collage] SubGraphs See https://github.com/apache/tvm-rfcs/blob/main/rfcs/0062-collage.md. Collage works in units of 'sub-graphs', which are potential partitions of the overall Relay model. This PR introduces SubGraph (an arbitrary partitioning, without any implication about how it is to be represented), it's companion SubSubGraph (implying a representation as a function), and some supporting odds 'n ends. * - make Integer <-> size_t conversion explicit - make 'Compiler' name explicit * - fix namespace ambiguity * - review comments --- CMakeLists.txt | 1 + src/relay/collage/README.md | 26 + src/relay/collage/dataflow_graph.cc | 48 + src/relay/collage/dataflow_graph.h | 77 ++ src/relay/collage/index_set.cc | 231 ++++ src/relay/collage/index_set.h | 128 +++ src/relay/collage/sub_graph.cc | 1034 ++++++++++++++++++ src/relay/collage/sub_graph.h | 452 ++++++++ src/relay/collage/utils.cc | 139 +++ src/relay/collage/utils.h | 86 ++ tests/python/relay/collage/test_sub_graph.py | 387 +++++++ 11 files changed, 2609 insertions(+) create mode 100644 src/relay/collage/README.md create mode 100644 src/relay/collage/dataflow_graph.cc create mode 100644 src/relay/collage/dataflow_graph.h create mode 100644 src/relay/collage/index_set.cc create mode 100644 src/relay/collage/index_set.h create mode 100644 src/relay/collage/sub_graph.cc create mode 100644 src/relay/collage/sub_graph.h create mode 100644 src/relay/collage/utils.cc create mode 100644 src/relay/collage/utils.h create mode 100644 tests/python/relay/collage/test_sub_graph.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 46de8f5d07fa0..8dc03ee0f40e1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -296,6 +296,7 @@ tvm_file_glob(GLOB_RECURSE RELAY_OP_SRCS ) tvm_file_glob(GLOB_RECURSE RELAY_PASS_SRCS src/relay/analysis/*.cc + src/relay/collage/*.cc src/relay/transforms/*.cc src/relay/quantize/*.cc ) diff --git a/src/relay/collage/README.md b/src/relay/collage/README.md new file mode 100644 index 0000000000000..dc56496092cc0 --- /dev/null +++ b/src/relay/collage/README.md @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + +The `CollagePartition` pass for finding optimal partitionings of Relay models. + +See the [RFC](https://github.com/mbs-octoml/mbs-tvm-rfcs/blob/mbs-rfcs-collage/rfcs/xxxx-collage.md). + +Based on: +> *Collage: Automated Integration of Deep Learning Backends* +> Byungsoo Jeon, Sunghyun Park, Peiyuan Liao, Sheng Xu, Tianqi Chen, Zhihao Jia + +CAUTION: This is a prototype, do not use in prod. diff --git a/src/relay/collage/dataflow_graph.cc b/src/relay/collage/dataflow_graph.cc new file mode 100644 index 0000000000000..b4e19a73f04d3 --- /dev/null +++ b/src/relay/collage/dataflow_graph.cc @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/dataflow_graph.cc + * \brief A representation of the dataflow for an overall Relay expression. + */ + +#include "./dataflow_graph.h" + +namespace tvm { +namespace relay { +namespace collage { + +DataflowGraph::DataflowGraph(Expr expr) : expr_(std::move(expr)) { + indexed_graph_ = CreateIndexedGraph(expr_); + downstream_map_.reserve(indexed_graph_->size()); + for (PostDfsIndex index = 0; index < indexed_graph_->size(); ++index) { + const Node* node = indexed_graph_->index_to_node(index); + std::unordered_set downstream_nodes; + node->AccumulateDownstreamNodes(&downstream_nodes); + IndexSet index_set(indexed_graph_->size()); + for (const Node* downstream_node : downstream_nodes) { + index_set.Add(downstream_node->index_); + } + downstream_map_.emplace_back(std::move(index_set)); + } +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/dataflow_graph.h b/src/relay/collage/dataflow_graph.h new file mode 100644 index 0000000000000..c3c22381a8892 --- /dev/null +++ b/src/relay/collage/dataflow_graph.h @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/dataflow_graph.h + * \brief A representation of the dataflow for an overall Relay expression. + */ +#ifndef TVM_RELAY_COLLAGE_DATAFLOW_GRAPH_H_ +#define TVM_RELAY_COLLAGE_DATAFLOW_GRAPH_H_ + +#include + +#include +#include + +#include "../ir/indexed_graph.h" +#include "./index_set.h" + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief Represents the dataflow of an overall Relay expression. 
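+ *
+ * On construction also computes, for every node, the IndexSet of nodes
+ * downstream of it. This turns the repeated reachability queries made during
+ * partitioning (e.g. rejecting unions which would introduce a cycle) into
+ * cheap IndexSet operations instead of fresh graph traversals.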
+ */ +class DataflowGraph { + public: + using Node = IndexedGraph::Node; + + explicit DataflowGraph(Expr expr); + + size_t size() const { return indexed_graph_->size(); } + const Node* index_to_node(PostDfsIndex index) const { + return indexed_graph_->index_to_node(index); + } + const Node* item_to_node(const Expr& expr) const { return indexed_graph_->item_to_node(expr); } + const Node* item_to_node(const ExprNode* expr_node) const { + return indexed_graph_->item_to_node(expr_node); + } + const Expr& expr() const { return expr_; } + const IndexedGraph& indexed_graph() const { return *indexed_graph_; } + + const IndexSet& downstream_of(PostDfsIndex index) const { + ICHECK_LT(index, indexed_graph_->size()); + return downstream_map_[index]; + } + + private: + /*! \brief The overall expression. */ + Expr expr_; + /*! \brief The indexed graph which captures the main dataflow. */ + std::unique_ptr> indexed_graph_; + /*! \brief Map from a node's PostDfsIndex to the set of its downstream dataflow node indexes. */ + std::vector downstream_map_; +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_DATAFLOW_GRAPH_H_ diff --git a/src/relay/collage/index_set.cc b/src/relay/collage/index_set.cc new file mode 100644 index 0000000000000..55bec80820a47 --- /dev/null +++ b/src/relay/collage/index_set.cc @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/index_set.cc + * \brief Efficient representation of a set of post-dfs indexes. 
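+ *
+ * The set is a plain std::vector<bool> over the whole post-dfs index space,
+ * so each operation below is a linear scan over the bits (hence the TODO
+ * about moving to one-word-at-a-time operations).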
+ */ + +#include "./index_set.h" + +namespace tvm { +namespace relay { +namespace collage { + +// TODO(mbs): These should operate one-word-at-a-time + +IndexSet::IndexSet(size_t size, const std::vector& indexes) : bitvec_(size, false) { + for (size_t index : indexes) { + ICHECK_LT(index, bitvec_.size()); + ICHECK(!bitvec_[index]); + bitvec_[index] = true; + } +} + +IndexSet IndexSet::operator&(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + std::vector result(bitvec_.size(), false); + for (size_t index = 0; index < bitvec_.size(); ++index) { + result[index] = bitvec_[index] && that.bitvec_[index]; + } + return IndexSet(result); +} + +IndexSet IndexSet::operator|(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + std::vector result(bitvec_.size(), false); + for (size_t index = 0; index < bitvec_.size(); ++index) { + result[index] = bitvec_[index] || that.bitvec_[index]; + } + return IndexSet(result); +} + +IndexSet IndexSet::operator-(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + std::vector result(bitvec_.size()); + for (size_t index = 0; index < bitvec_.size(); ++index) { + result[index] = bitvec_[index] && !that.bitvec_[index]; + } + return IndexSet(result); +} + +bool IndexSet::AreDisjoint(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + for (size_t index = 0; index < bitvec_.size(); index++) { + if (bitvec_[index] && that.bitvec_[index]) { + return false; + } + } + return true; +} + +bool IndexSet::IsSubset(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + for (size_t index = 0; index < bitvec_.size(); index++) { + if (bitvec_[index] && !that.bitvec_[index]) { + return false; + } + } + return true; +} + +bool IndexSet::Intersects(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + for (size_t index = 0; index < bitvec_.size(); index++) { + if (bitvec_[index] && that.bitvec_[index]) { + return true; + } + } + return false; +} + +IndexSet IndexSet::Subst(size_t new_size, const IndexSubst& subst) const { + std::vector result(new_size, false); + for (PostDfsIndex index = 0; index < bitvec_.size(); ++index) { + if (!bitvec_[index]) { + continue; + } + auto itr = subst.find(index); + ICHECK(itr != subst.end()); + PostDfsIndex new_index = itr->second; + ICHECK(new_index < new_size); + ICHECK(!result[new_index]); + result[new_index] = true; + } + return IndexSet(result); +} + +size_t IndexSet::PopCount() const { + size_t n = 0; + for (size_t index = 0; index < bitvec_.size(); index++) { + if (bitvec_[index]) { + ++n; + } + } + return n; +} + +bool IndexSet::IsZero() const { + for (size_t index = 0; index < bitvec_.size(); index++) { + if (bitvec_[index]) { + return false; + } + } + return true; +} + +size_t IndexSet::FirstInsideIndex() const { + for (size_t index = 0; index < bitvec_.size(); index++) { + if (bitvec_[index]) { + return index; + } + } + return bitvec_.size(); +} + +size_t IndexSet::LastInsideIndex() const { + for (size_t i = bitvec_.size(); i > 0; i--) { + const size_t index = i - 1; + if (bitvec_[index]) { + return index; + } + } + return bitvec_.size(); +} + +size_t IndexSet::NextIndex(size_t index) const { + ICHECK_LT(index, bitvec_.size()); + for (index++; index < bitvec_.size(); index++) { + if (bitvec_[index]) { + return index; + } + } + return bitvec_.size(); +} + +size_t IndexSet::FirstOutsideIndex() const { + for (size_t index = 0; index < bitvec_.size(); index++) { + if 
(!bitvec_[index]) { + return index; + } + } + return bitvec_.size(); +} + +bool IndexSet::operator==(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + return bitvec_ == that.bitvec_; +} + +bool IndexSet::operator!=(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + return bitvec_ != that.bitvec_; +} + +bool IndexSet::operator<(const IndexSet& that) const { + ICHECK_EQ(bitvec_.size(), that.bitvec_.size()); + for (size_t index = 0; index < bitvec_.size(); index++) { + if (bitvec_[index] && !that.bitvec_[index]) { + return true; + } + if (!bitvec_[index] && that.bitvec_[index]) { + return false; + } + } + return false; +} + +size_t IndexSet::hash() const { + std::hash> h; + return h(bitvec_); +} + +std::string IndexSet::ToString() const { + std::ostringstream os; + os << "{"; + bool first = true; + for (size_t start = 0; start < bitvec_.size(); /*no-op*/) { + if (!bitvec_[start]) { + ++start; + continue; + } + size_t end; + for (end = start + 1; end < bitvec_.size() && bitvec_[end]; ++end) { + /*no-op*/ + } + if (first) { + first = false; + } else { + os << ","; + } + os << start; + if (end > start + 2) { + os << ".." << (end - 1); + start = end; + } else { + ++start; + } + } + os << "}"; + return os.str(); +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/index_set.h b/src/relay/collage/index_set.h new file mode 100644 index 0000000000000..f24b695cc76c9 --- /dev/null +++ b/src/relay/collage/index_set.h @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/index_set.h + * \brief Efficient representation of a set of post-dfs indexes. 
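+ *
+ * A sketch of typical usage (illustrative only, not taken from the tests):
+ *
+ * \code
+ * IndexSet lhs(10, {1, 2, 3});   // sets over a universe of 10 post-dfs indexes
+ * IndexSet rhs(10, {3, 4});
+ * ICHECK(!lhs.AreDisjoint(rhs)); // both contain index 3
+ * IndexSet unioned = lhs | rhs;
+ * for (PostDfsIndex index : unioned) {
+ *   // Visits 1, 2, 3, 4 in ascending order.
+ * }
+ * \endcode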
+ */ + +#ifndef TVM_RELAY_COLLAGE_INDEX_SET_H_ +#define TVM_RELAY_COLLAGE_INDEX_SET_H_ + +#include +#include +#include +#include + +#include "../ir/dataflow_matcher_impl.h" +#include "../ir/indexed_graph.h" + +namespace tvm { +namespace relay { +namespace collage { + +using IndexSubst = std::unordered_map; + +class IndexSet { + public: + IndexSet() = default; + explicit IndexSet(size_t size) : bitvec_(size, false) {} + IndexSet(size_t size, const std::vector& indexes); + + IndexSet operator&(const IndexSet& that) const; + IndexSet operator|(const IndexSet& that) const; + IndexSet operator-(const IndexSet& that) const; + bool AreDisjoint(const IndexSet& that) const; + bool IsSubset(const IndexSet& that) const; + bool Intersects(const IndexSet& that) const; + + bool operator[](size_t index) const { + ICHECK_LT(index, bitvec_.size()); + return bitvec_[index]; + } + + IndexSet& Add(size_t index) { + ICHECK_LT(index, bitvec_.size()); + bitvec_[index] = true; + return *this; + } + + IndexSet Subst(size_t new_size, const IndexSubst& subst) const; + + size_t end_index() const { return bitvec_.size(); } + size_t PopCount() const; + bool IsZero() const; + size_t FirstInsideIndex() const; + size_t LastInsideIndex() const; + size_t NextIndex(size_t index) const; + size_t FirstOutsideIndex() const; + bool operator==(const IndexSet& that) const; + bool operator!=(const IndexSet& that) const; + bool operator<(const IndexSet& that) const; + size_t hash() const; + std::string ToString() const; + + struct IndexSetIterator { + const IndexSet* set; + size_t i; + + size_t operator*() const { + ICHECK_LT(i, set->end_index()); + return i; + } + + const IndexSetIterator& operator++() { + ICHECK_LT(i, set->end_index()); + i = set->NextIndex(i); + return *this; + } + + bool operator==(const IndexSetIterator& that) const { + ICHECK(set == that.set); + return i == that.i; + } + + bool operator!=(const IndexSetIterator& that) const { + ICHECK(set == that.set); + return i != that.i; + } + }; + + IndexSetIterator begin() const { return IndexSetIterator{this, FirstInsideIndex()}; } + IndexSetIterator end() const { return IndexSetIterator{this, end_index()}; } + + private: + explicit IndexSet(std::vector bitvec) : bitvec_(std::move(bitvec)) {} + + std::vector bitvec_; +}; + +struct IndexSetEqual { + bool operator()(const IndexSet& left, const IndexSet& right) const { return left == right; } +}; + +struct IndexSetHash { + size_t operator()(const IndexSet& set) const { return set.hash(); } +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_INDEX_SET_H_ diff --git a/src/relay/collage/sub_graph.cc b/src/relay/collage/sub_graph.cc new file mode 100644 index 0000000000000..63edc8c079fbc --- /dev/null +++ b/src/relay/collage/sub_graph.cc @@ -0,0 +1,1034 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/sub_graph.cc + * \brief Represents a sub-graph of an overall Relay expression. + */ + +#include "./sub_graph.h" + +#include + +#include "../../support/scalars.h" +#include "../transforms/pass_utils.h" +#include "./utils.h" + +namespace tvm { +namespace relay { +namespace collage { + +namespace { + +class Extractor; + +/*! + * \brief Helper class for rewriting expressions to replace a sub-graph according to the + * given extractor. + */ +class Rewriter : public ExprMutator { + public: + explicit Rewriter(const Extractor* extractor) : extractor_(extractor) {} + + Expr VisitExpr(const Expr& expr) final; + + private: + /*! \brief Already prepared extractor which will guide the rewrite. */ + const Extractor* extractor_; +}; + +/*! \brief Helper class for extracting matched sub-graphs from the overall expression. */ +class Extractor : public ExprMutator { + public: + Extractor(const DataflowGraph* dataflow_graph, const SubGraphNode* sub_graph, + FunctionAttrsMap opt_attrs) + : dataflow_graph_(dataflow_graph), sub_graph_(sub_graph), opt_attrs_(std::move(opt_attrs)) { + ICHECK_EQ(dataflow_graph_->size(), sub_graph_->overall_size()); + } + + const DataflowGraph& dataflow_graph() const { return *dataflow_graph_; } + + /*! + * \brief Collect the parameters and output expressions for the function representing + * the sub-graph. + */ + void Extract() { + ICHECK(!sub_graph_->IsEmpty()); + VLOG(2) << "Extracting " << sub_graph_->ToString(); + const bool for_function = opt_attrs_.defined(); + + // In reverse dataflow order... + for (PostDfsIndex i = dataflow_graph_->size(); i > 0; --i) { + PostDfsIndex index = i - 1; + if (!sub_graph_->inside_[index]) { + // Node is outside sub-graph. + continue; + } + VLOG(2) << "index " << index; + auto node = dataflow_graph_->index_to_node(index); + if (sub_graph_->exit_[node->index_] || node->is_external_ || memo_.count(node->ref()) == 0) { + // This sub-expression is: + // - inside the sub-graph and needed outside the sub-graph. So it must contribute to an + // output (even if we've already visited it while constructing an output from a + // downstream sub-expression). + // - not yet visited, in which case it must still be considered an 'output' so it will + // be evaluated for any possible side effects. + Expr output = VisitExpr(GetRef(node->node_ref_)); + VLOG(2) << "index " << index << " added as output:\n" + << PrettyPrint(output) << "\nat " << outputs_.size(); + expr_to_output_index_.emplace(node->node_ref_, outputs_.size()); + outputs_.emplace_back(std::move(output)); + output_types_.emplace_back(node->node_ref_->checked_type()); + } + } + ICHECK(!outputs_.empty()); + + // Reverse the outputs so as to preserve the original evaluation order. + std::reverse(outputs_.begin(), outputs_.end()); + std::reverse(output_types_.begin(), output_types_.end()); + for (auto& kv : expr_to_output_index_) { + kv.second = static_cast(outputs_.size()) - 1 - kv.second; + } + + // Build a 'body' expression to represent the extracted sub-graph. If we have multiple + // outputs we'll place them in a tuple. + Type body_type; + Expr body; + if (outputs_.size() > 1) { + body_type = TupleType(output_types_); + body = Tuple(outputs_); + body->checked_type_ = body_type; + } else { + body_type = output_types_.front(); + body = outputs_.front(); + } + + // Re-express all the nested sub-graphs in terms of the body. 
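+    // The nested sub-graphs are recorded against post-dfs indexes of the
+    // original dataflow graph, whereas the body just built has its own
+    // indexing. memo_ knows where every inside node ended up, so MakeIndexSubst
+    // can translate each nested sub-graph into the body's index space first.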
+ DataflowGraph body_dataflow_graph(body); + std::vector nested_sub_graphs; + IndexSubst subst = MakeIndexSubst(body_dataflow_graph); + for (const auto& nested_sub_graph : sub_graph_->nested_sub_graphs_) { + nested_sub_graphs.emplace_back(nested_sub_graph.Subst(body_dataflow_graph, subst)); + } + + // Sweep backwards through the body, rewriting to account for each nested sub-graph. + body = NestedSubGraph::ParallelRewrite(body_dataflow_graph, body, std::move(nested_sub_graphs)); + + if (for_function) { + // Rewrite so all input nodes are now conveyed via call arguments to a new function. + Array arg_types; + arg_types.reserve(params_.size()); + for (const auto& param : params_) { + arg_types.push_back(param->checked_type()); + } + extracted_ = Function(std::move(params_), std::move(body), body_type, + /*ty_params=*/{}, DictAttrs(opt_attrs_)); + extracted_->checked_type_ = + FuncType(std::move(arg_types), body_type, /*type_params=*/{}, /*type_constraints=*/{}); + body = Call(extracted_, std::move(args_)); + body->checked_type_ = body_type; + } else { + // Don't do anything with the inputs. + extracted_ = body; + } + + // Setup the output substitution. + for (const auto& kv : expr_to_output_index_) { + Expr expr; + if (outputs_.size() == 1) { + expr = body; + } else if (for_function) { + expr = TupleGetItem(body, kv.second); + expr->checked_type_ = output_types_[kv.second]; + } else { + const auto* tuple_node = body.as(); + ICHECK(tuple_node); + expr = tuple_node->fields[kv.second]; + } + VLOG(2) << "output " << dataflow_graph_->item_to_node(kv.first)->index_ << " is at index " + << kv.second << " (of " << outputs_.size() << " outputs)"; + output_substitution_.emplace(kv.first, std::move(expr)); + } + } + + ////// Following members are valid only after Extract() has returned. + + /*! + * \brief Returns the expression representing the extracted sub-graph. If opt_attrs_ is + * defined then will be a function. + */ + Expr extracted() const { return extracted_; } + + /*! + * \brief Returns the substitution to apply to all expression nodes in the overall expression + * so as to replace references to outputs of the sub-graph with their rewritten form. + */ + const std::unordered_map& output_substitution() const { + return output_substitution_; + } + + private: + /*! + * \brief Returns a map from original index to new index for each node inside the sub-graph. Only + * valid after \p Extract has made its backwards dataflow sweep. + */ + IndexSubst MakeIndexSubst(const DataflowGraph& new_dataflow_graph) const { + VLOG(2) << "building extractor substitution"; + IndexSubst subst; + for (PostDfsIndex index : sub_graph_->inside_) { + auto orig_node = dataflow_graph_->index_to_node(index); + ICHECK_EQ(orig_node->index_, index); + auto itr = memo_.find(orig_node->ref()); + ICHECK(itr != memo_.end()); + auto new_node = new_dataflow_graph.item_to_node(itr->second); + VLOG(2) << orig_node->index_ << " |-> " << new_node->index_; + subst.emplace(orig_node->index_, new_node->index_); + } + return subst; + } + + /*! \brief Returns true if \p expr is inside the sub-graph. */ + bool inside(const Expr& expr) { + return sub_graph_->inside_[dataflow_graph_->item_to_node(expr)->index_]; + } + + /*! + * \brief Returns the variable uniquely representing \p expr, which should be + * an input node (ie outside the sub-graph but feeding into a node inside the sub-graph). + * + * It is valid for: + * - An expression outside the sub-graph to be used multiple times inside the sub-graph. 
+ * - An expression outside the sub-graph to be used both inside and outside the sub-graph. + */ + Var VarFor(const Expr& expr) { + ICHECK(!inside(expr)); + ICHECK(opt_attrs_.defined()); + auto itr = expr_to_param_.find(expr.get()); + if (itr != expr_to_param_.end()) { + return itr->second; + } + auto fresh_var = Var("FunctionVar_" + std::to_string(params_.size()), expr->checked_type()); + fresh_var->checked_type_ = expr->checked_type(); + params_.push_back(fresh_var); + args_.push_back(expr); + expr_to_param_.emplace(expr.get(), fresh_var); + return fresh_var; + } + + /*! + * \brief If \p expr is inside the sub-graph then return it's rewritten form. + * If \p expr is outside the sub-graph then it must correspond to an input node. + * - If opt_attrs_ is defined return the variable to represent it. + * - Otherwise just return the expression directly. + * + * Should be called only on inputs to nodes which are inside the sub-graph. + */ + Expr VisitExpr(const Expr& expr) final { + if (inside(expr)) { + return ExprMutator::VisitExpr(expr); + } else if (CanInline(expr)) { + // Implicitly include inlinable input sub-expressions. + return expr; + } else if (opt_attrs_.defined()) { + // Map to a function parameter. + return VarFor(expr); + } else { + // Stop rewriting. + return expr; + } + } + + Expr VisitExpr_(const FunctionNode* function_node) override { + if (function_node->HasNonzeroAttr(attr::kPrimitive)) { + return GetRef(function_node); + } + return ExprMutator::VisitExpr_(function_node); + } + + //// Context fields, passed in constructor. + + /*! \brief The dataflow graph corresponding to the overall expression. */ + const DataflowGraph* dataflow_graph_; + /*! \brief The sub-graph of the above we are extracting. */ + const SubGraphNode* sub_graph_; + /*! \brief Optional attributes if the sub-graph should be extracted as a function. */ + FunctionAttrsMap opt_attrs_; + + //// Result fields, available after Extract() called. + + /*! + * \brief The extracted expression. If opt_attrs_ is defined this will be a function. + */ + Expr extracted_; + /*! + * \brief Map from output nodes to corresponding expressions. If the sub-graph has more than + * one exit node then each entry will be a tuple projection. + */ + std::unordered_map output_substitution_; + + //// Accumulator fields, built as we visit expressions. + + /*! \brief (If opt_attrs_ is defined) Parameters representing input expression nodes. */ + Array params_; + /*! + * \brief (If opt_attrs_ is defined) The input expression nodes for each of the above params_. + */ + Array args_; + /*! + * \brief (If opt_attrs_ is defined) Map from existing input expression nodes to the parameters + * in params_ which now representing them. + */ + std::unordered_map expr_to_param_; + /*! + * \brief Accumulated new expressions which represent the exit nodes of the rewritten sub-graph. + * It is possible to have multiple outputs. It is possible one output also contributes to other + * outputs (ie the output is a 'tap'). + */ + std::vector outputs_; + /*! \brief (If opt_attrs_ is defined) Types of original expressions corresponding to outputs_. */ + std::vector output_types_; + /*! + * \brief Map from existing exit expression nodes to the index in outputs_ which should + * represent them in the rewritten overall expression. 
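+   * When there is more than one output these indexes select the TupleGetItem
+   * projections (or, when not extracting a function, the tuple fields)
+   * recorded in output_substitution_.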
+ */ + std::unordered_map expr_to_output_index_; +}; + +Expr Rewriter::VisitExpr(const Expr& expr) { + auto itr = extractor_->output_substitution().find(expr.get()); + if (itr == extractor_->output_substitution().end()) { + return ExprMutator::VisitExpr(expr); + } else { + return itr->second; + } +} + +} // namespace + +std::pair SubExprKindAndLabel(const Expr& sub_expr) { + class Visitor : public ExprFunctor(const Expr&)> { + private: + std::pair VisitExpr_(const CallNode* call_node) final { + if (const auto* op_node = call_node->op.as()) { + auto op = GetRef(op_node); + static auto fpattern = Op::GetAttrMap("TOpPattern"); + if (fpattern.count(op) == 0) { + VLOG(1) << "no TOpPattern known for " << op->name << ", considering opaque"; + return {kOpaque, op->name}; + } else if (IsDynamic(call_node->checked_type()) && IsDataDependent(call_node)) { + VLOG(1) << "call has dynamic shape which is data-dependent, considering opaque"; + return {kOpaque, op->name}; + } else { + OpPatternKind kind = static_cast(fpattern[op]); + VLOG(2) << "TOpPattern for " << op->name << " is " << KindToString(kind); + return {kind, op->name}; + } + } else if (const auto* function_node = call_node->op.as()) { + Optional opt_i = + function_node->GetAttr("TOpPattern", Optional()); + if (opt_i.defined()) { + OpPatternKind kind = static_cast(opt_i.value()->value); + VLOG(1) << "TOpPattern for function is " << KindToString(kind); + return {kind, "call_prim"}; + } else { + VLOG(1) << "calling function without TOpPattern, considering opaque"; + return {kOpaque, "call_fun"}; + } + } else { + VLOG(1) << "unsupported call, considering opaque"; + return {kOpaque, "call_any"}; + } + } + + std::pair VisitExpr_(const ConstantNode* constant_node) final { + VLOG(2) << "TOpPattern for constant is " << KindToString(kElemWise); + if (support::IsSimpleScalar(constant_node)) { + return {kElemWise, "scalar"}; + } else { + return {kElemWise, "const"}; + } + } + + std::pair VisitExpr_(const TupleNode* tuple_node) final { + const auto* tuple_type_node = tuple_node->checked_type().as(); + ICHECK(tuple_type_node != nullptr); + if (std::all_of(tuple_type_node->fields.begin(), tuple_type_node->fields.end(), + [](const Type& type) { return type.as() != nullptr; })) { + VLOG(2) << "TOpPattern for tuple is " << KindToString(kInjective); + return {kInjective, "tuple"}; + } else { + VLOG(1) << "tuple contains non-tensors, considering opaque"; + return {kOpaque, "tuple"}; + } + } + + std::pair VisitExpr_( + const TupleGetItemNode* tuple_get_item_node) final { + const auto* tuple_type_node = tuple_get_item_node->tuple->checked_type().as(); + ICHECK(tuple_type_node != nullptr); + if (std::all_of(tuple_type_node->fields.begin(), tuple_type_node->fields.end(), + [](const Type& type) { return type.as() != nullptr; })) { + VLOG(2) << "TOpPattern for tuple projection is " << KindToString(kInjective); + return {kInjective, "proj"}; + } else { + VLOG(1) << "tuple being projected contains non-tensors, considering opaque"; + return {kOpaque, "proj"}; + } + } + + // TODO(mbs): We implement the following mostly so we have a lightweight way of describing + // the current sub-expression. If partitioning is ever extended beyond the usual call/tuple/proj + // sub-language we should revise the returned operator kinds to match. 
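+    // Everything below is reported as kOpaque: such nodes never improve a
+    // candidate's fusibility on their own, and the short label mostly serves
+    // to give sub-graphs readable names in logs.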
+ + std::pair VisitExpr_(const VarNode* var_node) final { + return {kOpaque, "%" + var_node->name_hint()}; + } + std::pair VisitExpr_(const GlobalVarNode* global_var_node) final { + return {kOpaque, "@" + global_var_node->name_hint}; + } + std::pair VisitExpr_(const OpNode* op_node) final { + return {kOpaque, "`" + op_node->name}; + } + std::pair VisitExpr_(const FunctionNode* function_node) final { + return {kOpaque, "fn"}; + } + std::pair VisitExpr_(const LetNode* let_node) final { + return {kOpaque, "let"}; + } + std::pair VisitExpr_(const IfNode* if_node) final { + return {kOpaque, "if"}; + } + std::pair VisitExpr_(const RefCreateNode* ref_create_node) final { + return {kOpaque, "ref"}; + } + std::pair VisitExpr_(const RefReadNode* op) final { + return {kOpaque, "ref_read"}; + } + std::pair VisitExpr_(const RefWriteNode* op) final { + return {kOpaque, "ref_write"}; + } + std::pair VisitExpr_(const ConstructorNode* op) final { + return {kOpaque, "`" + op->name_hint}; + } + std::pair VisitExpr_(const MatchNode* op) final { + return {kOpaque, "match"}; + } + }; + return Visitor().VisitExpr(sub_expr); +} + +std::pair SubGraphKindAndLabel(const DataflowGraph& dataflow_graph, + const IndexSet& inside) { + std::ostringstream os; + bool first = true; + OpPatternKind max_kind = kElemWise; + for (PostDfsIndex index : inside) { + OpPatternKind sub_kind; + std::string sub_label; + std::tie(sub_kind, sub_label) = SubExprKindAndLabel(dataflow_graph.index_to_node(index)->ref()); + if (!sub_label.empty()) { + if (first) { + first = false; + } else { + os << "+"; + } + os << sub_label; + } + max_kind = CombineKinds(max_kind, sub_kind); + } + return {max_kind, os.str()}; +} + +IndexSet MatcherToIndexSet(const DFPatternMatcher& matcher) { + IndexSet result(matcher.size()); + for (const auto& kv : matcher.memo()) { + for (const auto& matched_sub_expr : kv.second) { + if (CanInline(matched_sub_expr)) { + // Trivial sub-expressions can just be included in the extracted function body + // when we construct it and don't need to be considered part of the sub-graph. + continue; + } + if (kv.first.as()) { + // Don't consider the expressions matched by a wildcard to be part of the sub-graph. 
+ continue; + } + result.Add(matcher.expr_to_node(matched_sub_expr)->index_); + } + } + return result; +} + +std::string SubGraphConfig::ToString() const { + std::ostringstream os; + os << "{max_exits=" << max_exits; + os << ", allow_taps=" << allow_taps; + os << ", max_depth=" << max_depth; + os << "}"; + return os.str(); +} + +TVM_REGISTER_NODE_TYPE(NestedSubGraphNode); + +void NestedSubGraphNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +SubGraph NestedSubGraphNode::sub_graph() const { return Downcast(sub_graph_obj_); } + +bool NestedSubGraphNode::operator==(const NestedSubGraphNode& that) const { + return *sub_graph().get() == *that.sub_graph().get(); +} + +bool NestedSubGraphNode::operator<(const NestedSubGraphNode& that) const { + return *sub_graph().get() < *that.sub_graph().get(); +} + +size_t NestedSubGraphNode::hash() const { + size_t h = StructuralHash()(attrs_); + h ^= sub_graph()->hash() + 0x9e3779b9 + (h << 6) + (h >> 2); + return h; +} + +std::string NestedSubGraphNode::ToString() const { + std::ostringstream os; + os << "{sub_graph=" << sub_graph()->ToString(); + os << ", attrs=" << PrettyPrint(attrs_); + os << "}"; + return os.str(); +} + +Function NestedSubGraphNode::Extract(const DataflowGraph& dataflow_graph) const { + Extractor extractor(&dataflow_graph, sub_graph().get(), attrs_); + extractor.Extract(); + return Downcast(extractor.extracted()); +} + +Expr NestedSubGraphNode::Rewrite(const DataflowGraph& dataflow_graph, const Expr& expr) const { + Extractor extractor(&dataflow_graph, sub_graph().get(), attrs_); + extractor.Extract(); + Rewriter rewriter(&extractor); + return rewriter.VisitExpr(expr); +} + +NestedSubGraph::NestedSubGraph(SubGraph sub_graph, FunctionAttrsMap attrs) { + auto data = runtime::make_object(); + data->sub_graph_obj_ = std::move(sub_graph); + data->attrs_ = std::move(attrs); + data_ = std::move(data); +} + +NestedSubGraph NestedSubGraph::Subst( + const DataflowGraph& new_dataflow_graph, + const std::unordered_map& subst) const { + return NestedSubGraph(get()->sub_graph().Subst(new_dataflow_graph, subst), get()->attrs_); +} + +bool NestedSubGraph::TriviallyUnionable(const NestedSubGraph& that) const { + if (get()->attrs_.size() != that->attrs_.size()) { + return false; + } + for (const auto& kv : get()->attrs_) { + if (kv.first == "Composite") { + // Even if all the attributes agree we don't consider "Composite" functions to + // ever be unionable. + // TODO(mbs): Find a cleaner way to do this. + return false; + } + auto itr = that->attrs_.find(kv.first); + if (itr == that->attrs_.end()) { + return false; + } + if (!StructuralEqual()(kv.second, (*itr).second)) { + return false; + } + } + return true; +} + +NestedSubGraph NestedSubGraph::DisjointUnion(const DataflowGraph& dataflow_graph, + const NestedSubGraph& that) const { + ICHECK(TriviallyUnionable(that)); + return NestedSubGraph(get()->sub_graph().DisjointUnion(dataflow_graph, that->sub_graph()), + get()->attrs_); +} + +/*static*/ +Expr NestedSubGraph::ParallelRewrite(const DataflowGraph& dataflow_graph, const Expr& expr, + std::vector nested_sub_graphs) { + // IMPORTANT: See the corresponding comment in SubGraph::ParallelRewrite. 
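+  // In brief: sorting by last_inside_index_ descending visits the nested
+  // sub-graphs in reverse dataflow order, so each Rewrite only touches
+  // expressions downstream of those still to be rewritten and the recorded
+  // post-dfs indexes remain valid throughout the sweep.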
std::sort(nested_sub_graphs.begin(), nested_sub_graphs.end(),
+            [](const NestedSubGraph& left, const NestedSubGraph& right) {
+              return left->sub_graph()->last_inside_index_ > right->sub_graph()->last_inside_index_;
+            });
+
+  Expr result = expr;
+  for (const auto& nested_sub_graph : nested_sub_graphs) {
+    result = nested_sub_graph->Rewrite(dataflow_graph, result);
+  }
+  return result;
+}
+
+TVM_REGISTER_NODE_TYPE(SubGraphNode);
+
+void SubGraphNode::VisitAttrs(AttrVisitor* v) {
+  // TODO(mbs)
+}
+
+IndexSet SubGraphNode::Downstream(const DataflowGraph& dataflow_graph) const {
+  IndexSet downstream(dataflow_graph.size());
+  for (PostDfsIndex exit_index : exit_) {
+    downstream = downstream | dataflow_graph.downstream_of(exit_index);
+  }
+  return downstream;
+}
+
+bool SubGraphNode::IsValid(const DataflowGraph& dataflow_graph,
+                           const SubGraphConfig& config) const {
+  // Check we don't have too many exit nodes.
+  if (config.max_exits > 0 && exit_.PopCount() > config.max_exits) {
+    VLOG(1) << "Subgraph " << ToString() << " is invalid: " << exit_.PopCount()
+            << " exits exceeds maximum " << config.max_exits;
+    return false;
+  }
+
+  // Check the maximum path depth is within the limit.
+  if (config.max_depth > 0 && depth_ > config.max_depth) {
+    VLOG(1) << "Subgraph " << ToString() << " is invalid: maximum depth " << depth_
+            << " exceeds limit " << config.max_depth;
+    return false;
+  }
+
+  // All inside nodes must be in the same basic block.
+  const DataflowGraph::Node* basic_block = nullptr;
+  for (PostDfsIndex index : inside_) {
+    auto node = dataflow_graph.index_to_node(index);
+    if (basic_block == nullptr) {
+      basic_block = node->basic_block_;
+    }
+    if (node->basic_block_ != basic_block) {
+      VLOG(1) << "Subgraph " << ToString() << " is invalid: nodes are from different basic blocks";
+      return false;
+    }
+  }
+
+  // The nested sub-graphs must be subsets and non-overlapping.
+  IndexSet union_inside(dataflow_graph.size());
+  for (const auto& nested_sub_graph : nested_sub_graphs_) {
+    if (!nested_sub_graph->sub_graph()->inside_.AreDisjoint(union_inside)) {
+      VLOG(1) << "Subgraph " << ToString() << " is invalid: nested sub-graphs overlap";
+      return false;
+    }
+    if (!nested_sub_graph->sub_graph()->inside_.IsSubset(inside_)) {
+      VLOG(1) << "Subgraph " << ToString()
+              << " is invalid: nested sub-graph is not subset of overall sub-graph";
+      return false;
+    }
+    // Accumulate the union so the disjointness check sees all earlier nested sub-graphs.
+    union_inside = union_inside | nested_sub_graph->sub_graph()->inside_;
+  }
+
+  if (!config.allow_taps) {
+    // Exit nodes cannot also contribute to inside nodes.
+    for (PostDfsIndex index : exit_) {
+      auto node = dataflow_graph.index_to_node(index);
+      if (AnyOutputInside(node)) {
+        VLOG(1) << "Subgraph " << ToString()
+                << " is invalid: inner node is 'tapped' and also contributes to output, but taps "
+                   "are disabled";
+        return false;
+      }
+    }
+  }
+
+  // Check no output would end up feeding into any entry node.
+  for (PostDfsIndex output_index : output_) {
+    if (dataflow_graph.downstream_of(output_index).Intersects(entry_)) {
+      VLOG(1) << "Subgraph " << ToString() << " is invalid: output node " << output_index
+              << " feeds back into this sub-graph";
+      return false;
+    }
+  }
+
+  // Looks legit!
+  return true;
+}
+
+Function SubGraphNode::ExtractAsFunction(const DataflowGraph& dataflow_graph) const {
+  NestedSubGraph nested_sub_graph(GetRef<SubGraph>(this), FunctionAttrsMap());
+  return nested_sub_graph->Extract(dataflow_graph);
+}
+
+Expr SubGraphNode::Rewrite(const DataflowGraph& dataflow_graph, const Expr& expr) const {
+  if (nested_sub_graphs_.empty()) {
+    // Nothing to rewrite.
return expr;
+  }
+  Extractor extractor(&dataflow_graph, this, NullValue<FunctionAttrsMap>());
+  extractor.Extract();
+  Rewriter rewriter(&extractor);
+  return rewriter.VisitExpr(expr);
+}
+
+std::string SubGraphNode::ToString() const {
+  std::ostringstream os;
+  os << "{inside=" << inside_.ToString();
+  os << ", entry=" << entry_.ToString();
+  os << ", exit=" << exit_.ToString();
+  os << ", input=" << input_.ToString();
+  os << ", output=" << output_.ToString();
+  os << ", depth=" << depth_;
+  os << ", kind=" << KindToString(kind_);
+  if (!label_.empty()) {
+    os << ", label=" << label_;
+  }
+  for (const auto& nested_sub_graph : nested_sub_graphs_) {
+    os << ", nested_sub_graph=" << nested_sub_graph->ToString();
+  }
+  os << "}";
+  return os.str();
+}
+
+bool SubGraphNode::operator==(const SubGraphNode& that) const {
+  ICHECK_EQ(inside_.end_index(), that.inside_.end_index());
+  if (inside_ != that.inside_) {
+    return false;
+  }
+  if (nested_sub_graphs_.size() != that.nested_sub_graphs_.size()) {
+    return false;
+  }
+  for (size_t i = 0; i < nested_sub_graphs_.size(); ++i) {
+    if (*nested_sub_graphs_[i].get() != *that.nested_sub_graphs_[i].get()) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool SubGraphNode::operator<(const SubGraphNode& that) const {
+  if (first_inside_index_ < that.first_inside_index_) {
+    return true;
+  }
+  if (that.first_inside_index_ < first_inside_index_) {
+    return false;
+  }
+  return inside_ < that.inside_;
+}
+
+size_t SubGraphNode::hash() const {
+  size_t h = inside_.hash();
+  for (const auto& nested_sub_graph : nested_sub_graphs_) {
+    h ^= nested_sub_graph->hash() + 0x9e3779b9 + (h << 6) + (h >> 2);
+  }
+  return h;
+}
+
+void SubGraphNode::Init(const DataflowGraph& dataflow_graph) {
+  for (PostDfsIndex index = 0; index < inside_.end_index(); ++index) {
+    auto node = dataflow_graph.index_to_node(index);
+    if (inside_[index]) {
+      if (AnyInputOutside(node)) {
+        entry_.Add(index);
+      }
+      if (AnyOutputOutside(node) || node->is_external_) {
+        exit_.Add(index);
+      }
+    } else {
+      if (AnyInputInside(node)) {
+        output_.Add(index);
+      }
+      if (AnyOutputInside(node) && !CanInline(node->ref())) {
+        input_.Add(index);
+      }
+    }
+  }
+  depth_ = Depth(dataflow_graph);
+}
+
+size_t SubGraphNode::Depth(const DataflowGraph& dataflow_graph) const {
+  std::unordered_map<const DataflowGraph::Node*, size_t> max_depths;
+  std::vector<const DataflowGraph::Node*> stack;
+  size_t max_depth = 0;
+  // All the entry nodes have max depth 0.
+  for (PostDfsIndex index : entry_) {
+    auto node = dataflow_graph.index_to_node(index);
+    max_depths.emplace(node, 0);
+    stack.push_back(node);
+  }
+  while (!stack.empty()) {
+    const DataflowGraph::Node* node = stack.back();
+    stack.pop_back();
+    size_t next_depth = max_depths[node] + 1;
+    if (exit_[node->index_]) {
+      // If this node is external then it will have no outputs but we still wish to consider
+      // the path to the implied output as requiring one more step.
+      // Otherwise we're accounting for reaching one of the external outputs below.
+      max_depth = std::max(max_depth, next_depth);
+    }
+    for (const DataflowGraph::Node* output_node : node->outputs_) {
+      if (!inside_[output_node->index_]) {
+        continue;
+      }
+      if (max_depths.count(output_node) == 0) {
+        max_depths.emplace(output_node, next_depth);
+        stack.push_back(output_node);
+      } else if (next_depth > max_depths[output_node]) {
+        // We found a deeper path to an already expanded node. We'll expand again.
+        max_depths[output_node] = next_depth;
+        stack.push_back(output_node);
+      }
+    }
+  }
+  return max_depth;
+}
+
+/*!
\brief Returns true if any (input/output) of node is (outside/inside) the sub-graph. */ +bool SubGraphNode::AnyInputOutside(const DataflowGraph::Node* node) const { + return std::any_of(node->inputs_.begin(), node->inputs_.end(), + [this](const DataflowGraph::Node* sub_node) { + return !inside_[sub_node->index_] && !CanInline(sub_node->ref()); + }); +} + +bool SubGraphNode::AnyInputInside(const DataflowGraph::Node* node) const { + return std::any_of( + node->inputs_.begin(), node->inputs_.end(), + [this](const DataflowGraph::Node* sub_node) { return inside_[sub_node->index_]; }); +} + +bool SubGraphNode::AnyOutputOutside(const DataflowGraph::Node* node) const { + return std::any_of( + node->outputs_.begin(), node->outputs_.end(), + [this](const DataflowGraph::Node* sub_node) { return !inside_[sub_node->index_]; }); +} + +bool SubGraphNode::AnyOutputInside(const DataflowGraph::Node* node) const { + return std::any_of( + node->outputs_.begin(), node->outputs_.end(), + [this](const DataflowGraph::Node* sub_node) { return inside_[sub_node->index_]; }); +} + +SubGraph::SubGraph(const DataflowGraph& dataflow_graph, IndexSet inside, OpPatternKind kind, + String label, std::vector nested_sub_graphs) { + std::sort(nested_sub_graphs.begin(), nested_sub_graphs.end(), + [](const NestedSubGraph& left, const NestedSubGraph& right) { + return *left.get() < *right.get(); + }); + auto node = runtime::make_object(); + node->inside_ = std::move(inside); + node->first_inside_index_ = node->inside_.FirstInsideIndex(); + node->last_inside_index_ = node->inside_.LastInsideIndex(); + node->entry_ = IndexSet(node->inside_.end_index()); + node->exit_ = IndexSet(node->inside_.end_index()); + node->input_ = IndexSet(node->inside_.end_index()); + node->output_ = IndexSet(node->inside_.end_index()); + node->kind_ = kind; + node->label_ = std::move(label); + node->nested_sub_graphs_ = nested_sub_graphs; + node->Init(dataflow_graph); + data_ = std::move(node); +} + +SubGraph::SubGraph(const DataflowGraph& dataflow_graph) + : SubGraph(dataflow_graph, IndexSet(dataflow_graph.size())) {} + +bool SubGraph::AreDisjoint(const SubGraph& that) const { + return get()->inside_.AreDisjoint(that->inside_); +} + +namespace { +/*! \brief Returns true if an output of \p left not in \p right ultimately flows into \p right. */ +bool FlowsInto(const DataflowGraph& dataflow_graph, const SubGraph& left, const SubGraph& right) { + for (PostDfsIndex output_index : left->output_) { + if (!right->inside_[output_index] && + dataflow_graph.downstream_of(output_index).Intersects(right->entry_)) { + return true; + } + } + return false; +} +} // namespace + +bool SubGraph::AreTouching(const DataflowGraph& dataflow_graph, const SubGraph& that) const { + if (!get()->inside_.AreDisjoint(that->inside_)) { + // Easy rejection. + return false; + } + if (!get()->output_.Intersects(that->entry_)) { + // Not touching. + return false; + } + if (FlowsInto(dataflow_graph, *this, that) || FlowsInto(dataflow_graph, that, *this)) { + // Unioning would create a cycle. 
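+    // (FlowsInto found an output of one sub-graph which escapes to a node
+    // outside the other yet ultimately feeds back into its entry, so the
+    // union could not be evaluated as a single unit.)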
+ return false; + } + return true; +} + +bool SubGraph::AreSelfContained(const SubGraph& that) const { + return get()->output_.IsSubset(that->entry_) && that->input_.IsSubset(get()->exit_); +} + +SubGraph SubGraph::DisjointUnion(const DataflowGraph& dataflow_graph, const SubGraph& that) const { + ICHECK(AreDisjoint(that)); + IndexSet inside = get()->inside_ | that->inside_; + std::vector nested_sub_graphs; + for (const auto& nested_sub_graph : get()->nested_sub_graphs_) { + nested_sub_graphs.push_back(nested_sub_graph); + } + for (const auto& nested_sub_graph : that->nested_sub_graphs_) { + auto existing_itr = std::find_if(nested_sub_graphs.begin(), nested_sub_graphs.end(), + [&nested_sub_graph](const NestedSubGraph& existing) { + return existing.TriviallyUnionable(nested_sub_graph); + }); + if (existing_itr != nested_sub_graphs.end()) { + *existing_itr = existing_itr->DisjointUnion(dataflow_graph, nested_sub_graph); + } else { + nested_sub_graphs.push_back(nested_sub_graph); + } + } + return SubGraph(dataflow_graph, std::move(inside), CombineKinds(get()->kind_, that->kind_), + UnionLabels(get()->label_, that->label_), std::move(nested_sub_graphs)); +} + +SubGraph SubGraph::WithAttrs(const DataflowGraph& dataflow_graph, FunctionAttrsMap attrs) const { + std::vector nested_sub_graphs; + nested_sub_graphs.push_back(NestedSubGraph(*this, attrs)); + return SubGraph(dataflow_graph, get()->inside_, get()->kind_, get()->label_, + std::move(nested_sub_graphs)); +} + +SubGraph SubGraph::Subst(const DataflowGraph& new_dataflow_graph, const IndexSubst& subst) const { + IndexSet new_inside = get()->inside_.Subst(new_dataflow_graph.size(), subst); + std::vector new_nested_sub_graphs; + for (const auto& nested_sub_graph : get()->nested_sub_graphs_) { + new_nested_sub_graphs.push_back(nested_sub_graph.Subst(new_dataflow_graph, subst)); + } + return SubGraph(new_dataflow_graph, std::move(new_inside), get()->kind_, get()->label_, + std::move(new_nested_sub_graphs)); +} + +/*static*/ +Expr SubGraph::ParallelRewrite(const DataflowGraph& dataflow_graph, + std::vector sub_graphs) { + // IMPORTANT: + // - All the sub-graphs will be w.r.t. the dataflow graph for the original expression. + // Each time we call Rewrite on one of those graphs the result expression will be rewritten + // from the final output back to the inputs. The inputs will then be shared with the original + // expression. Thus it is safe to iteratively rewrite all the sub-graphs without redoing the + // dataflow_graph and substituting indexes provided we work in reverse dataflow order. + // - We rely on the dataflow_graph expression reference holding the original expression alive + // so that the dataflow_graph will never contain dangling pointers (even though as per above + // we'll never dereference them). + std::sort(sub_graphs.begin(), sub_graphs.end(), [](const SubGraph& left, const SubGraph& right) { + return left->last_inside_index_ > right->last_inside_index_; + }); + Expr result = dataflow_graph.expr(); + for (const auto& sub_graph : sub_graphs) { + result = sub_graph->Rewrite(dataflow_graph, result); + } + return result; +} + +/*! + * \brief A pass which partitions (the unique) global function in the module according to the + * post-dfs indexes in \p indexes. The partitioning must respect the configuration with \p max_exits + * and \p allow_taps. + * + * Each index is also paired with a label. 
+ * A non-empty label denotes the index should also be
+ * included in a nested sub-graph which will be extracted as a function with the label as its
+ * "Composite" attribute. An empty label denotes the index should go into the overall partitioned
+ * "Compiler" function. In this way we can simulate the usual partitioning needed by external
+ * codegen integrations.
+ *
+ * This function is intended to support \p SubGraph unit tests and is not used by the regular
+ * compilation flow.
+ */
+transform::Pass PartitionForTesting(Integer max_exits, Bool allow_taps, String compiler,
+                                    Array<Integer> indexes, Array<String> labels) {
+  auto pass_func = [=](Function function, IRModule mod, transform::PassContext ctxt) {
+    ICHECK(max_exits.defined() && max_exits->value >= 0);
+    ICHECK(allow_taps.defined());
+    ICHECK(indexes.size() == labels.size());
+    VLOG(1) << "Partitioning:" << std::endl << PrettyPrint(function);
+    DataflowGraph dataflow_graph(function);
+    VLOG(1) << "Dataflow graph is:" << std::endl << dataflow_graph.indexed_graph().ToString();
+
+    // Collect the 'inside' indexes and any nested sub-graph indexes and labels.
+    std::vector<PostDfsIndex> node_indexes;
+    std::unordered_map<String, std::vector<PostDfsIndex>> nested_sub_graph_indexes;
+    node_indexes.reserve(indexes.size());
+    for (size_t i = 0; i < indexes.size(); ++i) {
+      const Integer& index = indexes[i];
+      ICHECK_GE(index->value, 0);
+      ICHECK_LT(index->value, dataflow_graph.size());
+      auto index_int = static_cast<PostDfsIndex>(index->value);
+      node_indexes.push_back(index_int);
+      const String& label = labels[i];
+      if (!label.empty()) {
+        nested_sub_graph_indexes[label].push_back(index_int);
+      }
+    }
+
+    // Build the nested sub-graphs representing the "Composite" functions (if any).
+    std::vector<NestedSubGraph> nested_sub_graphs;
+    for (const auto& kv : nested_sub_graph_indexes) {
+      FunctionAttrsMap composite_attrs;
+      composite_attrs.Set("Composite", kv.first);
+      nested_sub_graphs.emplace_back(
+          SubGraph(dataflow_graph, IndexSet(dataflow_graph.size(), kv.second)), composite_attrs);
+    }
+
+    // Build the overall sub-graph, which will include any "Composite" functions as
+    // well as any nodes without a label.
+    IndexSet inside(dataflow_graph.size(), node_indexes);
+    OpPatternKind kind;
+    String label;
+    std::tie(kind, label) = SubGraphKindAndLabel(dataflow_graph, inside);
+    SubGraph sub_graph(dataflow_graph, inside, kind, label, std::move(nested_sub_graphs));
+
+    // Push the overall sub-graph into the final "Compiler" function.
+    FunctionAttrsMap compiler_attrs;
+    compiler_attrs.Set("Compiler", compiler);
+    NestedSubGraph overall_nested_sub_graph(sub_graph, compiler_attrs);
+    SubGraph overall_sub_graph(dataflow_graph, inside, kind, label, {overall_nested_sub_graph});
+
+    // Check the sub-graph is valid.
+    SubGraphConfig config;
+    config.max_exits = static_cast<size_t>(max_exits->value);
+    config.allow_taps = allow_taps;
+    if (overall_sub_graph->IsValid(dataflow_graph, config)) {
+      VLOG(1) << "Sub-graph " << overall_sub_graph->ToString() << " is considered valid";
+    } else {
+      VLOG(1) << "Sub-graph " << overall_sub_graph->ToString()
+              << " is NOT considered valid, not partitioning";
+      return function;
+    }
+
+    // Do the partitioning.
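+    // (Rewrite works from the overall function result back towards its parameters, so any
+    // untouched sub-expressions remain shared with the original function.)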
+    Function result = Downcast<Function>(overall_sub_graph->Rewrite(dataflow_graph, function));
+    VLOG(1) << "Extracted as:" << std::endl << PrettyPrint(result);
+
+    return result;
+  };
+  return transform::CreateFunctionPass(pass_func, /*opt_level=*/0, "PartitionForTesting", {});
+}
+
+TVM_REGISTER_GLOBAL("relay.collage.PartitionForTesting").set_body_typed(PartitionForTesting);
+
+}  // namespace collage
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/collage/sub_graph.h b/src/relay/collage/sub_graph.h
new file mode 100644
index 0000000000000..f7d4354d5483c
--- /dev/null
+++ b/src/relay/collage/sub_graph.h
@@ -0,0 +1,452 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/collage/sub_graph.h
+ * \brief Represents a sub-graph of an overall Relay expression.
+ */
+
+#ifndef TVM_RELAY_COLLAGE_SUB_GRAPH_H_
+#define TVM_RELAY_COLLAGE_SUB_GRAPH_H_
+
+#include <tvm/ir/transform.h>
+#include <tvm/relay/dataflow_pattern.h>
+
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "../ir/dataflow_matcher_impl.h"
+#include "../ir/indexed_graph.h"
+#include "./dataflow_graph.h"
+#include "./index_set.h"
+
+namespace tvm {
+namespace relay {
+namespace collage {
+
+/*! \brief Returns operator pattern kind as single-letter string. */
+std::string KindToString(OpPatternKind kind);
+
+/*!
+ * \brief Returns a kind and label for the single \p sub_expr, ignoring its nested sub expressions.
+ */
+std::pair<OpPatternKind, std::string> SubExprKindAndLabel(const Expr& sub_expr);
+
+/*!
+ * \brief Returns a kind and label for all the nodes in \p inside.
+ */
+std::pair<OpPatternKind, std::string> SubGraphKindAndLabel(const DataflowGraph& dataflow_graph,
+                                                           const IndexSet& inside);
+
+/*!
+ * \brief Returns the index set representing all the sub-expression matched by \p matcher.
+ */
+IndexSet MatcherToIndexSet(const DFPatternMatcher& matcher);
+
+/*!
+ * \brief Configuration controlling which sub-graphs are considered valid.
+ */
+struct SubGraphConfig {
+  /*! \brief Maximum number of exit nodes in the sub-graph, or zero if no limit. */
+  size_t max_exits = 0;
+  /*!
+   * \brief Whether a node inside the sub-graph may flow to nodes both inside and outside
+   * the sub-graph (which we call a 'tap'). Note that it is still possible to have multiple outputs
+   * even with this flag false.
+   */
+  bool allow_taps = false;
+  /*!
+   * \brief Maximum allowed sub-graph depth, or zero if no limit.
+   */
+  size_t max_depth = 0;
+
+  std::string ToString() const;
+};
+
+class SubGraph;
+using FunctionAttrsMap = Map<String, ObjectRef>;
+
+/*!
+ * \brief A nested sub-graph is a sub-graph which is to be nested inside a function as part of some
+ * enclosing sub-graph.
+ *
+ * Extraction yields a function with input nodes replaced by parameters and exit nodes in the
+ * function result.
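+ * (One parameter is bound per sub-graph input, and multiple exit nodes are returned as a tuple.)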
+ * Rewriting replaces the sub-graph with a call to that function, and all
+ * outputs with (projections from) the call result.
+ *
+ * (Note that it's tempting to move attrs_ into \p SubGraphNode and thus avoid this class.
+ * However we found the implementation was easier to understand in this form since it makes
+ * the result of \p Extract unambiguous.)
+ */
+class NestedSubGraphNode : public Object {
+ public:
+  /*! \brief The nested sub-graph. */
+  ObjectRef /* actually SubGraph */ sub_graph_obj_;
+  /*! \brief Attributes (possibly empty) to attach to the extracted function. */
+  FunctionAttrsMap attrs_;
+
+  void VisitAttrs(AttrVisitor* v);
+
+  SubGraph sub_graph() const;
+
+  bool operator==(const NestedSubGraphNode& that) const;
+  bool operator!=(const NestedSubGraphNode& that) const { return !(*this == that); }
+  bool operator<(const NestedSubGraphNode& that) const;
+  size_t hash() const;
+
+  std::string ToString() const;
+
+  /*!
+   * \brief Returns the function representing this nested sub-graph within the overall expression
+   * represented by \p dataflow_graph:
+   *  - All sub-graph inputs become parameters.
+   *  - All sub-graph outputs become function results (either directly or as a field in a tuple).
+   *  - The function has attrs_ for attributes (which may be empty).
+   *  - The function body accounts for any rewrites implied by the nested sub-graph.
+   */
+  Function Extract(const DataflowGraph& dataflow_graph) const;
+
+  /*!
+   * \brief Returns \p expr rewritten to encode the partitioning implied by this nested sub-graph.
+   *
+   * It is valid for \p expr to not be the same as \p dataflow_graph.expr(), however all nodes
+   * inside this nested sub-graph must correspond to nodes shared between \p dataflow_graph.expr()
+   * and \p expr. See \p SubGraph::ParallelRewrite below.
+   */
+  Expr Rewrite(const DataflowGraph& dataflow_graph, const Expr& expr) const;
+
+  static constexpr const char* _type_key = "relay.collage.NestedSubGraph";
+  TVM_DECLARE_FINAL_OBJECT_INFO(NestedSubGraphNode, Object);
+};
+
+class NestedSubGraph : public ObjectRef {
+ public:
+  NestedSubGraph(SubGraph sub_graph, FunctionAttrsMap attrs);
+
+  /*!
+   * \brief Returns copy of this nested sub-graph with all indexes substituted according to
+   * \p subst, whose range is w.r.t. \p new_dataflow_graph.
+   */
+  NestedSubGraph Subst(const DataflowGraph& new_dataflow_graph,
+                       const std::unordered_map<PostDfsIndex, PostDfsIndex>& subst) const;
+
+  /*!
+   * \brief Returns true if this can be safely unioned with \p that.
+   */
+  bool TriviallyUnionable(const NestedSubGraph& that) const;
+
+  /*!
+   * \brief Returns the disjoint union of this and \p that nested sub-graphs, which must agree on
+   * their attributes.
+   */
+  NestedSubGraph DisjointUnion(const DataflowGraph& dataflow_graph,
+                               const NestedSubGraph& that) const;
+
+  /*!
+   * \brief Returns \p expr rewritten according to all the given nested sub-graphs. The
+   * nested sub-graphs can be given in any order, but must be disjoint.
+   *
+   * It is valid for \p expr to not be the same as \p dataflow_graph.expr(), however all nodes
+   * inside the nested sub-graphs must correspond to nodes shared between \p dataflow_graph.expr()
+   * and \p expr. See \p SubGraph::ParallelRewrite below.
+   */
+  static Expr ParallelRewrite(const DataflowGraph& dataflow_graph, const Expr& expr,
+                              std::vector<NestedSubGraph> nested_sub_graphs);
+
+  TVM_DEFINE_OBJECT_REF_METHODS(NestedSubGraph, ObjectRef, NestedSubGraphNode);
+};
+
+using NestedSubGraphs = Array<NestedSubGraph>;
+
+/*!
+ * \brief A compact representation of a sub-graph within an (implied) overall Relay expression.
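+ * At its core a sub-graph is just the set of 'inside' post-dfs indexes, together with a few
+ * index sets derived from it and cached for convenience.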
+ *
+ * Sub-graphs can be used to represent partitions/kernels/composite functions without having to
+ * pay the cost of constructing or rewriting any expressions. We also allow 'extracting' a
+ * function to use for measuring a partition/kernel's latency independently from 'rewriting'
+ * the overall Relay expression since only a tiny subset of candidate partitions will end up being
+ * needed after Collage has completed its search.
+ *
+ * We expect O(thousands) of sub-graphs to be in flight while processing a given model, so we are
+ * mindful of space overhead.
+ *
+ * A sub-graph classifies every dataflow node of the overall expression as either 'inside' or
+ * 'outside' the sub-graph. Obviously not all such divisions make sense, for example it is not
+ * valid for an inside node to feed into another inside node via outside nodes. We provide the
+ * \p IsValid method to check for validity, and \p SubGraphConfig to control which validity rules
+ * apply (such as maximum depth).
+ *
+ * We generally work with the \p DataflowGraph representation of the overall Relay expression
+ * rather than the expression itself. We use the post-dfs visit index to uniquely refer to
+ * expression nodes.
+ *
+ * As well as 'inside' and 'outside' we have four other flavors of dataflow nodes, all uniquely
+ * determined from the 'inside' nodes:
+ *  - 'entry' nodes are those inside with at least one dataflow input outside.
+ *  - 'exit' nodes are those inside with at least one dataflow output outside, or which
+ *    are considered 'external' in the underlying dataflow graph (eg because they represent
+ *    the result of the overall function).
+ *  - 'input' nodes are those outside with at least one dataflow output inside.
+ *  - 'output' nodes are those outside with at least one dataflow input inside.
+ * Index sets for these are cached with the sub-graph for performance.
+ *
+ * It is valid to have multiple entry nodes (we can bind a parameter for each). It may be valid to
+ * have multiple exit nodes (we can build a tuple of all such). It may be valid to have exit nodes
+ * which also contribute to other inside nodes (ie represent a 'tap' on an intermediate result).
+ *
+ * Sub-graphs are closed under:
+ *  - Disjoint union.
+ *  - Wrapping by a function with given attributes (see \p NestedSubGraph above). This can be used
+ *    to encode "Composite" functions, or to represent a candidate kernel within a "Primitive"
+ *    function. (By combining 'wrapping' with 'union' we can encode, eg, 'this sub-graph should
+ *    be placed inside a primitive function which itself may have calls to composite functions'.)
+ *  - Substitution, which allows a sub-graph w.r.t. one dataflow graph to be transformed to
+ *    match some other (typically smaller) dataflow graph.
+ *
+ * See the subclasses of \p PartitionRule for how sub-graphs are built and combined during Collage
+ * search.
+ *
+ * To support some of the \p OpPatternKind-based fusion rule processing we give sub-graphs
+ * a kind, which is generally the maximum of the kinds of all the operator calls appearing
+ * inside it. We also give sub-graphs a (not necessarily unique) label to help debugging
+ * and guide the selection of global symbol names.
+ */
+class SubGraphNode : public Object {
+ public:
+  /*!
+   * \brief Which sub-expressions are inside the sub-graph (using their post-dfs indexes w.r.t.
+   * the implied DataflowGraph).
+   */
+  IndexSet inside_;
+
+  /*!
+   * \brief Index of first and last inside nodes.
+   *
+   * Cached for performance, uniquely determined by inside_.
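+   * (Eg \p ParallelRewrite below relies on sorting sub-graphs by their last inside index.)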
+ */ + PostDfsIndex first_inside_index_ = 0; + PostDfsIndex last_inside_index_ = 0; + + /*! + * \brief Which sub-expressions are entry/exit/input/output for this sub-graph. + * + * Cached for performance, uniquely determined by inside_. + */ + IndexSet entry_; + IndexSet exit_; + IndexSet input_; + IndexSet output_; + + /*! + * \brief Maximum depth of any dataflow path from an entry to an output sub-expression. + * + * Cached for performance, uniquely determined by inside_. + */ + size_t depth_ = 0; + + /*! + * \brief The \p OpPatternKind summarizing the input/output behavior of the sub-graph. + * + * A sub-graph consisting of a single Relay expression node is given kind: + * - For Call to a Relay operator, the "TOpPattern" attribute of that operator (provided the + * call does not involve data-dependent dynamic shapes). + * - For Call to Relay Function, the "TOpPattern" attribute of the function (provided it has + * that attribute) + * - For Constants, \p kElemWise. + * - For Tuple and tuple projections, \p kInjective (provided all tuple fields are of tensor + * type) + * - All other nodes \p kOpaque. + * Sub-graphs with more than one node have the maximum of the kind of each node. + * + * Cached for performance, uniquely determined by inside_. + */ + OpPatternKind kind_ = kOpaque; + + /*! + * \brief A label for the sub-graph. Not guaranteed to be unique, but is a human-readable summary + * of the sub-graph which can help with debugging and guide the selection of global symbol names. + */ + String label_; + + /*! + * \brief Nested sub-graphs of this sub-graph which must be represented by functions. These must + * be disjoint, but it's ok for this sub-graph to have nodes not inside any nested sub-graph. + */ + NestedSubGraphs nested_sub_graphs_; + + void VisitAttrs(AttrVisitor* v); + + // TODO(mbs): 'Anchor nodes' and rules for unioning them. + // In FuseOps it's just the unique kEWiseFusable node, if any. + // I'd like to allow writing vertical fusion rules, eg if two candidates are directly + // connected and have nn.conv2d anchors allow their join. + // I'd also like to allow horizontal fusion rules, eg if two candidates are not directly + // connected but could be joined without producing invalid (eg cyclic) and have nn.conv2d anchors + // then do so. Come back to this. + + /*! \brief Number of nodes in overall dataflow graph. */ + size_t overall_size() const { return inside_.end_index(); } + + bool IsEmpty() const { return inside_.IsZero(); } + + /*! \brief Number of nodes in sub-graph. */ + size_t Size() const { return inside_.PopCount(); } + + /*! + * \brief Returns the dataflow nodes downstream of all exit nodes. + */ + IndexSet Downstream(const DataflowGraph& dataflow_graph) const; + + /*! + * \brief Returns true if this sub-graph is valid. Ie: + * - no output of the sub-graph can flow to any input of the sub-graph (otherwise we'd end up + * with a dataflow cycle when we partition). + * - all inputs and outputs of the sub-graph are in the same scope, ie not separated by + * control flow (otherwise there'd be no consistent program point at which to eval the + * partitioned function). + * - no more than config.max_outputs outputs are required. + * - if config.allow_taps is false, no inside node has outputs to nodes both inside and + * outside the sub-graph. + */ + bool IsValid(const DataflowGraph& dataflow_graph, const SubGraphConfig& config) const; + + /*! + * \brief Returns this sub-graph extracted as a stand-alone function. 
+   */
+  Function ExtractAsFunction(const DataflowGraph& dataflow_graph) const;
+
+  /*!
+   * \brief Returns \p expr rewritten to encode the partitioning implied by this sub-graph.
+   *
+   * It is valid for \p expr to not be the same as \p dataflow_graph.expr(), however all nodes
+   * inside this sub-graph must correspond to nodes shared between \p dataflow_graph.expr() and
+   * \p expr. See \p SubGraph::ParallelRewrite below.
+   */
+  Expr Rewrite(const DataflowGraph& dataflow_graph, const Expr& expr) const;
+
+  std::string ToString() const;
+
+  bool operator==(const SubGraphNode& that) const;
+  bool operator!=(const SubGraphNode& that) const { return !(*this == that); }
+  bool operator<(const SubGraphNode& that) const;
+  size_t hash() const;
+
+ private:
+  /*! \brief Initialize the entry/exit/input/output sets given the inside and \p dataflow_graph. */
+  void Init(const DataflowGraph& dataflow_graph);
+
+  /*! \brief Calculates and returns the maximum path depth. */
+  size_t Depth(const DataflowGraph& dataflow_graph) const;
+
+  /*! \brief Returns true if any (input/output) of node is (outside/inside) the sub-graph. */
+  bool AnyInputOutside(const DataflowGraph::Node* node) const;
+  bool AnyInputInside(const DataflowGraph::Node* node) const;
+  bool AnyOutputOutside(const DataflowGraph::Node* node) const;
+  bool AnyOutputInside(const DataflowGraph::Node* node) const;
+
+ public:
+  static constexpr const char* _type_key = "relay.collage.SubGraph";
+  TVM_DECLARE_FINAL_OBJECT_INFO(SubGraphNode, Object);
+
+  friend class SubGraph;
+};
+
+class SubGraph : public ObjectRef {
+ public:
+  /*! \brief Primitive constructor. The following constructors are generally more convenient. */
+  SubGraph(const DataflowGraph& dataflow_graph, IndexSet inside, OpPatternKind kind = kOpaque,
+           String label = {}, std::vector<NestedSubGraph> nested_sub_graphs = {});
+
+  /*! \brief Constructs the empty sub-graph for \p dataflow_graph. */
+  explicit SubGraph(const DataflowGraph& dataflow_graph);
+
+  /*! \brief Returns true if this and that are disjoint. */
+  bool AreDisjoint(const SubGraph& that) const;
+
+  /*!
+   * \brief Returns true if:
+   *  - \p this and \p that are disjoint, and
+   *  - an output node of \p this coincides with an entry node of \p that, and
+   *  - \p this and \p that are not obviously invalid after \p DisjointUnion
+   *    (eg because such a sub-graph would produce a cycle).
+   * Note however that the \p DisjointUnion may not necessarily be valid even with the above
+   * checks.
+   */
+  bool AreTouching(const DataflowGraph& dataflow_graph, const SubGraph& that) const;
+
+  /*!
+   * \brief Returns true if:
+   *  - all the outputs of \p this are entries for \p that, and
+   *  - all the inputs of \p that are exits for \p this.
+   */
+  bool AreSelfContained(const SubGraph& that) const;
+
+  /*!
+   * \brief Returns disjoint union of this and \p that sub-graphs. The result may not be valid.
+   */
+  SubGraph DisjointUnion(const DataflowGraph& dataflow_graph, const SubGraph& that) const;
+
+  /*!
+   * \brief Returns copy of this sub-graph with all nodes placed inside a nested sub-graph with
+   * given attributes.
+   */
+  SubGraph WithAttrs(const DataflowGraph& dataflow_graph, FunctionAttrsMap attrs) const;
+
+  /*!
+   * \brief Returns copy of this sub-graph with all indexes substituted according to \p subst,
+   * whose range is w.r.t. \p new_dataflow_graph.
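+   * (See the 'Substitution' bullet in the class comment above.)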
+ */ + SubGraph Subst(const DataflowGraph& new_dataflow_graph, + const std::unordered_map& subst) const; + + /*! + * \brief Returns the root expression of \p dataflow_graph rewritten according to all the + * given sub-graphs. The sub-graphs can be given in any order, but must be disjoint. + */ + static Expr ParallelRewrite(const DataflowGraph& dataflow_graph, + std::vector sub_graphs); + + TVM_DEFINE_OBJECT_REF_METHODS(SubGraph, ObjectRef, SubGraphNode); +}; + +struct SubGraphEqual { + bool operator()(const SubGraph& left, const SubGraph& right) const { + return *left.get() == *right.get(); + } +}; + +struct SubGraphHash { + size_t operator()(const SubGraph& sub_graph) const { return sub_graph->hash(); } +}; + +/*! + * \brief Pass to partition every global function according to the post-dfs indexes + * given in an array. Visible for testing from Python only, would never make sense to use + * as a generic pass! + */ +tvm::transform::Pass PartitionOnIndexesForTesting(Array indexes); + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_SUB_GRAPH_H_ diff --git a/src/relay/collage/utils.cc b/src/relay/collage/utils.cc new file mode 100644 index 0000000000000..03af980e8c1d3 --- /dev/null +++ b/src/relay/collage/utils.cc @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/utils.cc + * \brief Misc helpers. + */ + +#include "./utils.h" + +#include "../../support/scalars.h" +#include "../op/memory/device_copy.h" + +namespace tvm { +namespace relay { +namespace collage { + +String GetSpecName(const Target& target) { + if (TargetKind::GetAttrMap(tvm::attr::kIsExternalCodegen).get(target->kind, Bool(false))) { + return target->kind->name; + } else { + return std::string(kTVMSpecNamePrefix) + target->kind->name; + } +} + +String UnionLabels(String left, String right) { + if (left.empty()) { + return right; + } + if (right.empty()) { + return left; + } + return left + "+" + right; +} + +String NestLabels(String left, String right) { + if (left.empty()) { + return right; + } + if (right.empty()) { + return left; + } + if (right.size() > left.size()) { + std::string right_str = right; + if (right_str.substr(0, left.size()) == left) { + return right; + } + } + return left + "." 
+  return left + "." + right;
+}
+
+std::string KindToString(OpPatternKind kind) {
+  switch (kind) {
+    case kElemWise:
+      return "E";
+    case kBroadcast:
+      return "B";
+    case kInjective:
+      return "I";
+    case kCommReduce:
+      return "R";
+    case kOutEWiseFusable:
+      return "A";
+    case kTuple:
+      return "T";
+    case kOpaque:
+      return "O";
+  }
+  return "?";
+}
+
+OpPatternKind CombineKinds(OpPatternKind left, OpPatternKind right) {
+  return std::max(left, right);
+}
+
+bool CanInline(const Expr& expr) {
+  if (expr.as<OpNode>() || expr.as<ConstructorNode>() || expr.as<VarNode>()) {
+    return true;
+  }
+  if (const auto* constant_node = expr.as<ConstantNode>()) {
+    return support::IsSimpleScalar(constant_node);
+  }
+  return false;
+}
+
+bool IsSpecialOp(const OpNode* op_node) {
+  auto op = GetRef<Op>(op_node);
+  static auto fnoncomputational = Op::GetAttrMap<TNonComputational>("TNonComputational");
+  if (fnoncomputational.count(op) && fnoncomputational[op]) {
+    // Operator has been marked as non-computational.
+    return true;
+  }
+  // TODO(mbs): This is incomplete.
+  static auto shape_of_op_ = Op::Get("shape_of");
+  static auto vm_shape_of_op_ = Op::Get("vm.shape_of");
+  if (op == DeviceCopyOp() || op == shape_of_op_ || op == vm_shape_of_op_) {
+    // Operator is compiled away by the VM compilation flow.
+    return true;
+  }
+  return false;
+}
+
+bool MustBeLowered(const Expr& expr) {
+  if (const auto* call_node = expr.as<CallNode>()) {
+    if (const auto* function_node = call_node->op.as<FunctionNode>()) {
+      if (function_node->HasNonzeroAttr(attr::kPrimitive)) {
+        // We've already committed to this call being to one or more operators which must be
+        // lowered.
+        return true;
+      }
+    } else if (const auto* op_node = call_node->op.as<OpNode>()) {
+      if (!IsSpecialOp(op_node)) {
+        // The VM compilation path won't rewrite this call.
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+}  // namespace collage
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/collage/utils.h b/src/relay/collage/utils.h
new file mode 100644
index 0000000000000..4c0493cdd675c
--- /dev/null
+++ b/src/relay/collage/utils.h
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/collage/utils.h
+ * \brief Misc helpers.
+ */
+
+#ifndef TVM_RELAY_COLLAGE_UTILS_H_
+#define TVM_RELAY_COLLAGE_UTILS_H_
+
+#include <tvm/relay/expr.h>
+#include <tvm/relay/function.h>
+#include <tvm/relay/op_attr_types.h>
+#include <tvm/target/target.h>
+
+#include <string>
+
+namespace tvm {
+namespace relay {
+namespace collage {
+
+/*!
+ * \brief Distinguished partition spec names.
+ */
+constexpr const char* kTVMSpecNamePrefix = "tvm_";
+constexpr const char* kHostSpecName = "host";
+
+/*!
+ * \brief Returns the partition spec name to use for \p target. For external codegen targets the
+ * spec name is just the target kind name. For TVM native targets the spec name is of the form
+ * "tvm_<kind_name>".
+ */
+String GetSpecName(const Target& target);
+
+/*! \brief Returns \p "<left>+<right>". 
*/ +String UnionLabels(String left, String right); + +/*! \brief Returns \p ".". */ +String NestLabels(String outer, String inner); + +/*! \brief Returns abbreviation for \p kind. */ +std::string KindToString(OpPatternKind kind); + +/*! \brief Returns maximum of \p left and \p right. */ +OpPatternKind CombineKinds(OpPatternKind left, OpPatternKind right); + +/*! + * \brief Returns true if \p expr can be safely inlined in body of function extracted + * from sub-graph, even if \p expr was not technically matched by the pattern which produced + * the sub-graph. + */ +bool CanInline(const Expr& expr); + +/*! + * \brief Returns true if \p op_node can be directly handled by the VM. + */ +bool IsSpecialOp(const OpNode* op_node); + +/*! + * \brief Return true if the Relay expression node given by \p expr cannot be evaluated by + * the VM and must end up in a kernel. + */ +bool MustBeLowered(const Expr& expr); + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_UTILS_H_ diff --git a/tests/python/relay/collage/test_sub_graph.py b/tests/python/relay/collage/test_sub_graph.py new file mode 100644 index 0000000000000..de2d974bf934a --- /dev/null +++ b/tests/python/relay/collage/test_sub_graph.py @@ -0,0 +1,387 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import tvm +import logging +import tvm.testing + +logging.basicConfig(level=logging.INFO) + +partition_for_testing = tvm._ffi.get_global_func("relay.collage.PartitionForTesting") + + +def print_with_indexes(mod): + mod = tvm.relay.transform.CapturePostDfsIndexInSpans()(mod) + print(mod) + + +def run(in_mod, expected_mod, max_outputs, allow_taps, compiler, map): + expected_mod = tvm.relay.transform.InferType()(expected_mod) + + in_mod = tvm.relay.transform.InferType()(in_mod) + in_mod = tvm.relay.transform.CapturePostDfsIndexInSpans()(in_mod) + + indexes = [i for l, iss in map.items() for i in iss] + labels = [l for l, iss in map.items() for i in iss] + actual_mod = partition_for_testing(max_outputs, allow_taps, compiler, indexes, labels)(in_mod) + + if not tvm.ir.structural_equal(actual_mod, expected_mod, True): + # Print everything in full so we can see what's going on when things fail. + print("Input module:") + print(in_mod) + print("Expected module:") + print(expected_mod) + print("Actual module:") + print(actual_mod) + # Assert again so as to see the actual disagreeing sub-expressions. 
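+        # (map_free_vars=True mirrors the third argument passed to structural_equal above.)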
+ tvm.ir.assert_structural_equal(actual_mod, expected_mod, map_free_vars=True) + + +def test_single_op(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], + %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) { + %0 = add(%a, %b); + %1 = add(%c, %d); // node 7 + subtract(%0, %1) + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], + %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) { + %0 = add(%a, %b); + %1 = (fn(%x, %y, Compiler="foo") { add(%x, %y) })(%c, %d); + subtract(%0, %1) + } + """ + ) + + run(input(), expected(), 1, False, "foo", {"": [7]}) + + +def test_multi_output(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], + %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) { + %0 = add(%a, %b); // node 6 + %1 = add(%c, %d); // node 7 + subtract(%0, %1) + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], + %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) { + %0 = (fn(%w, %x, %y, %z, Compiler="foo") { (add(%y, %z), add(%w, %x)) })(%c, %d, %a, %b); + %1 = %0.0; + %2 = %0.1; + subtract(%1, %2) + } + """ + ) + + # No rewrite since 2 outputs + run(input(), input(), 1, False, "foo", {"": [6, 7]}) + # Rewrite + run(input(), expected(), 2, False, "foo", {"": [6, 7]}) + + +def test_classic_conv2d_add_relu(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32], + %c: Tensor[(5, 2, 28, 28), float32], %d: Tensor[(5, 2, 28, 28), float32]) { + %0 = nn.conv2d(%a, %b); // node 8 + %1 = add(%0, %c); // node 9 + %2 = nn.relu(%1); // node 10 + subtract(%2, %d) + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32], + %c: Tensor[(5, 2, 28, 28), float32], %d: Tensor[(5, 2, 28, 28), float32]) { + %2 = (fn(%x, %y, %z, Compiler="foo") { + %0 = nn.conv2d(%x, %y); + %1 = add(%0, %z); + nn.relu(%1) + })(%a, %b, %c); + subtract(%2, %d) + } + """ + ) + + run(input(), expected(), 1, False, "foo", {"": [8, 9, 10]}) + + +def test_diamond_single_output(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]) { + %0 = nn.conv2d(%a, %b, padding=[0, 0, 0, 0]); // node 5 + %1 = nn.relu(%0); // node 6 + %2 = nn.relu(%1); // node 7 + %3 = nn.leaky_relu(%0, alpha=0f); // node 9 + add(%2, %3) // node 10 + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]) { + (fn (%x: Tensor[(5, 3, 32, 32), float32], %y: Tensor[(2, 3, 5, 5), float32], Compiler="foo") { + %0 = nn.conv2d(%x, %y, padding=[0, 0, 0, 0]); + %1 = nn.relu(%0); + %2 = nn.relu(%1); + %3 = nn.leaky_relu(%0, alpha=0f); + add(%2, %3) + })(%a, %b) + } + """ + ) + + run(input(), expected(), 1, False, "foo", {"": [5, 6, 7, 9, 10]}) + + +def test_diamond_multi_output(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: 
Tensor[(2, 3, 5, 5), float32]) { + %0 = nn.conv2d(%a, %b, padding=[0, 0, 0, 0]); // node 5 + %1 = nn.relu(%0); // node 6 + %2 = nn.relu(%1); // node 7 + %3 = nn.leaky_relu(%0, alpha=0f); // node 9 + add(%2, %3) + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]) { + %4 = (fn (%x: Tensor[(5, 3, 32, 32), float32], %y: Tensor[(2, 3, 5, 5), float32], Compiler="foo") { + %0 = nn.conv2d(%x, %y, padding=[0, 0, 0, 0]); + %1 = nn.relu(%0); + %2 = nn.relu(%1); + %3 = nn.leaky_relu(%0, alpha=0f); + (%2, %3) + })(%a, %b); + %5 = %4.0; + %6 = %4.1; + add(%5, %6) + } + """ + ) + + run(input(), expected(), 2, False, "foo", {"": [5, 6, 7, 9]}) + + +def test_with_tap(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]) { + %0 = nn.conv2d(%a, %b, padding=[0, 0, 0, 0]); // node 5 + %1 = nn.relu(%0); // node 6 + add(%1, %0) + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 3, 32, 32), float32], %b: Tensor[(2, 3, 5, 5), float32]) { + %2 = (fn (%x, %y, Compiler="foo") { + %0 = nn.conv2d(%x, %y, padding=[0, 0, 0, 0]); + %1 = nn.relu(%0); + (%0, %1) + })(%a, %b); + %3 = %2.1; + %4 = %2.0; + add(%3, %4) + } + """ + ) + + # No rewrite since has tap + run(input(), input(), 2, False, "foo", {"": [5, 6]}) + # Rewrite + run(input(), expected(), 2, True, "foo", {"": [5, 6]}) + + +def test_no_cycles(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32]) { + %0 = add(%a, %b); // node 3 + %1 = add(%0, %b); + add(%1, %b) // node 5 + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32]) { + (fn(%x, %y, Compiler="foo") { + %0 = add(%x, %y); + %1 = add(%0, %y); + add(%1, %y) + })(%a, %b) + } + """ + ) + + # No rewrite since would create cycle + run(input(), input(), 2, False, "foo", {"": [3, 5]}) + # No cycle + run(input(), expected(), 2, False, "foo", {"": [3, 4, 5]}) + + +def test_labels_direct_connection(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32]) { + %0 = nn.relu(%a); // node 3 + %1 = nn.relu(%0); // node 4 + %2 = nn.relu(%1); // node 5 + %3 = nn.relu(%1); // node 6 + %4 = add(%2, %3); // node 7 + %5 = nn.relu(%4); // node 8 + %6 = nn.relu(%4); // node 9 + %7 = add(%5, %6); // node 10 + nn.relu(%7) // node 11 + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32]) { + (fn(%aa: Tensor[(5, 7), float32], Compiler="foo") { + %0 = nn.relu(%aa); + %4 = (fn(%y, Composite="a") { + %1 = nn.relu(%y); + %2 = nn.relu(%1); + %3 = nn.relu(%1); + add(%2, %3) + })(%0); + %7 = (fn(%z, Composite="b") { + %5 = nn.relu(%z); + %6 = nn.relu(%z); + add(%5, %6) + })(%4); + nn.relu(%7) + })(%a) + } + """ + ) + + run(input(), expected(), 1, False, "foo", {"": [3, 11], "a": [4, 5, 6, 7], "b": [8, 9, 10]}) + + +def test_labels_nested_tap(): + def input(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32]) { + %0 = nn.relu(%a); // node 3 + %1 = nn.relu(%0); // node 4 + %2 = nn.relu(%1); // node 5 + %3 = nn.relu(%1); // node 6 + %4 = add(%2, %3); // node 7 + %5 = 
nn.relu(%4); // node 8 + %6 = nn.relu(%4); // node 9 + %7 = add(%5, %6); // node 10 + add(%2, %7) // node 11 + } + """ + ) + + def expected(): + return tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a: Tensor[(5, 7), float32]) { + %0 = nn.relu(%a); + %9 = (fn(%x: Tensor[(5, 7), float32], Compiler="foo") { + %5 = (fn(%y, Composite="a") { + %1 = nn.relu(%y); + %2 = nn.relu(%1); + %3 = nn.relu(%1); + %4 = add(%2, %3); + (%2, %4) + })(%x); + %8 = (fn(%z, Composite="b") { + %6 = nn.relu(%z); + %7 = nn.relu(%z); + add(%6, %7) + })(%5.1); + (%5.0, %8) + })(%0); + add(%9.0, %9.1) + } + """ + ) + + run(input(), expected(), 2, True, "foo", {"a": [4, 5, 6, 7], "b": [8, 9, 10]}) + + +if __name__ == "__main__": + tvm.testing.main() From ae72e7e65384c392a110f703676ba88b18b47c1a Mon Sep 17 00:00:00 2001 From: Everton Constantino Date: Mon, 11 Jul 2022 18:03:14 -0300 Subject: [PATCH 088/111] Fix node.func to node.funcs on parser.py (#12053) --- python/tvm/script/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/script/parser.py b/python/tvm/script/parser.py index 0932e717bbec2..7f5b3e86f3137 100644 --- a/python/tvm/script/parser.py +++ b/python/tvm/script/parser.py @@ -361,7 +361,7 @@ def B(...): """ if len(node.funcs) == 1: return self.transform(next(iter(node.funcs.values()))) - elif len(node.func) == 0: + elif len(node.funcs) == 0: self.report_error( "You must supply at least one class or function definition", node.span ) From d27f853ed5f8b99f57d9aa04fa060920e1422429 Mon Sep 17 00:00:00 2001 From: alter-xp Date: Tue, 12 Jul 2022 05:32:13 +0800 Subject: [PATCH 089/111] [ci][docker] fix the path of custom toolchain in ci_qemu for csinn2 (#11905) --- docker/Dockerfile.ci_qemu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile.ci_qemu b/docker/Dockerfile.ci_qemu index eda64f1bc5901..63089f3d65f25 100644 --- a/docker/Dockerfile.ci_qemu +++ b/docker/Dockerfile.ci_qemu @@ -116,5 +116,5 @@ RUN bash /install/ubuntu_download_csinn2_compute_lib.sh # Update PATH ENV PATH /opt/arm/gcc-arm-none-eabi/bin:/opt/arm/FVP_Corstone_SSE-300/models/Linux64_GCC-6.4:$PATH -ENV PATH /opt/csi-nn2/csi-nn2/tools/gcc-toolchain/bin:$PATH -ENV PATH /opt/csi-nn2/csi-nn2/tools/qemu/bin:$PATH +ENV PATH /opt/csi-nn2/tools/gcc-toolchain/bin:$PATH +ENV PATH /opt/csi-nn2/tools/qemu/bin:$PATH From 7baf4be2d473274b14cf5fa0428ba0cc18ec589a Mon Sep 17 00:00:00 2001 From: Dmitriy Smirnov Date: Mon, 11 Jul 2022 22:37:20 +0100 Subject: [PATCH 090/111] [relay] Changed 'name' field to 'registry_name' for Executor and Runtime (#10466) * [relay] Changed Executor and Runtime 'name' field to 'registry_name' Changed 'name' field to 'registry_name' for Executor and Runtime python wrappers as it clashed with tvm object attribute 'name' which made the latter inaccessible from Python Change-Id: I917755753549edfe1d3090ca9ca4512de552c4bd changed name to registry_name Change-Id: I9feb5b33b7b6f6f8421902e5721167f585cc4193 * more fixed unit tests Change-Id: Ie2e96297fda119e1b726b196a59deae95b263a07 * typo fixed Change-Id: Id579c50ab58dfb25fa18436265e0701ebbd9d554 * renamed registry_name to flag_registry_name Change-Id: Iabbd81069959f05c073f9dbc8d10fb31dd05f7a3 * bugfix --- .../how_to/work_with_microtvm/micro_train.py | 2 +- python/tvm/driver/tvmc/registry.py | 22 +++++++++++-------- python/tvm/relay/backend/executor.py | 2 +- python/tvm/relay/backend/runtime.py | 2 +- python/tvm/relay/build_module.py | 6 ++--- src/relay/backend/executor.cc | 1 + 
 tests/python/relay/test_build_module.py      | 13 +++++++----
 tests/python/relay/test_executor.py          |  6 ++---
 .../test_micro_model_library_format.py       |  4 ++--
 9 files changed, 34 insertions(+), 24 deletions(-)

diff --git a/gallery/how_to/work_with_microtvm/micro_train.py b/gallery/how_to/work_with_microtvm/micro_train.py
index b1c835d4102a2..f75c0b05eb1d9 100644
--- a/gallery/how_to/work_with_microtvm/micro_train.py
+++ b/gallery/how_to/work_with_microtvm/micro_train.py
@@ -606,7 +606,7 @@ def representative_dataset():
 assert os.path.isfile(f"{FOLDER}/models/project.zip")
 
 # Assert MLF file was correctly generated
-assert str(mod.executor) == "aot"
+assert mod.executor.name == "aot"
 
 # Remove the temporary folder we generated at the beginning
 shutil.rmtree(FOLDER)
diff --git a/python/tvm/driver/tvmc/registry.py b/python/tvm/driver/tvmc/registry.py
index 334aa1b61be80..c2e74eb1935ec 100644
--- a/python/tvm/driver/tvmc/registry.py
+++ b/python/tvm/driver/tvmc/registry.py
@@ -28,25 +28,29 @@
 
 
 def _generate_registry_option_args(parser, registry, name):
-    target_group = parser.add_argument_group(f"{registry.name} {name}")
+    target_group = parser.add_argument_group(f"{registry.flag_registry_name} {name}")
     for option_name, option_type in registry.list_registered_options(name).items():
         if option_type in INTERNAL_TO_NATIVE_TYPE:
             target_group.add_argument(
-                f"--{registry.name}-{name}-{option_name}",
+                f"--{registry.flag_registry_name}-{name}-{option_name}",
                 type=INTERNAL_TO_NATIVE_TYPE[option_type],
-                help=f"{registry.name.title()} {name} {option_name}{INTERNAL_TO_HELP[option_type]}",
+                help=(
+                    f"{registry.flag_registry_name.title()} "
+                    + f"{name} {option_name}{INTERNAL_TO_HELP[option_type]}"
+                ),
             )
 
 
 def generate_registry_args(parser, registry, default=None):
     """Walks through the given registry and generates arguments for each of the available options"""
     parser.add_argument(
-        f"--{registry.name}",
-        help=f"{registry.name.title()} to compile the model with",
+        f"--{registry.flag_registry_name}",
+        help=f"{registry.flag_registry_name.title()} to compile the model with",
         required=False,
        default=default,
     )
     names = registry.list_registered()
+
     for name in names:
         _generate_registry_option_args(parser, registry, name)
 
@@ -55,7 +59,7 @@ def _reconstruct_registry_options(args, registry, name):
     options = {}
     for option, option_type in registry.list_registered_options(name).items():
         if option_type in INTERNAL_TO_NATIVE_TYPE:
-            var_name = f"{registry.name}_{name}_{option.replace('-', '_')}"
+            var_name = f"{registry.flag_registry_name}_{name}_{option.replace('-', '_')}"
             option_value = getattr(args, var_name)
             if option_value is not None:
                 options[option] = option_value
@@ -65,12 +69,12 @@ def _reconstruct_registry_options(args, registry, name):
 def reconstruct_registry_entity(args, registry):
     """Reconstructs an entity from arguments generated from a registry"""
     possible_names = registry.list_registered()
-    name = getattr(args, registry.name)
+    name = getattr(args, registry.flag_registry_name)
     if name is None:
         return None
 
     if name not in possible_names:
-        raise TVMCException(f'{registry.name.title()} "{name}" is not defined')
+        raise TVMCException(f'{registry.flag_registry_name.title()} "{name}" is not defined')
 
     reconstructed = {
         possible_name: _reconstruct_registry_options(args, registry, possible_name)
@@ -81,7 +85,7 @@ def reconstruct_registry_entity(args, registry):
         if possible_name != name and reconstructed[possible_name]:
             first_option = list(reconstructed[possible_name])[0]
             raise TVMCException(
-                f"Passed 
--{registry.name}-{possible_name}-{first_option} " + f"Passed --{registry.flag_registry_name}-{possible_name}-{first_option} " f"but did not specify {possible_name} executor" ) diff --git a/python/tvm/relay/backend/executor.py b/python/tvm/relay/backend/executor.py index 9164d6a75ea3c..ac5e5bf1f8293 100644 --- a/python/tvm/relay/backend/executor.py +++ b/python/tvm/relay/backend/executor.py @@ -27,7 +27,7 @@ class Executor(Object): """Executor configuration""" - name = "executor" + flag_registry_name = "executor" def __init__(self, name, options=None) -> None: if options is None: diff --git a/python/tvm/relay/backend/runtime.py b/python/tvm/relay/backend/runtime.py index f2fd69a0f5477..b93c8076e698e 100644 --- a/python/tvm/relay/backend/runtime.py +++ b/python/tvm/relay/backend/runtime.py @@ -27,7 +27,7 @@ class Runtime(Object): """Runtime configuration""" - name = "runtime" + flag_registry_name = "runtime" def __init__(self, name, options=None) -> None: if options is None: diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 32ad6c70794c7..f3de1a0856927 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -173,7 +173,7 @@ def build( # Get artifacts mod = self.get_module() params = self.get_params() - executor_config = self.get_graph_json() if str(executor) == "graph" else None + executor_config = self.get_graph_json() if executor.name == "graph" else None return executor_config, mod, params @@ -450,7 +450,7 @@ def build( lowered_ir_mods = bld_mod.get_irmodule() executor_codegen_metadata = bld_mod.get_executor_codegen_metadata() - if str(executor) == "aot": + if executor.name == "aot": executor_factory = _executor_factory.AOTExecutorFactoryModule( ir_mod, lowered_ir_mods, @@ -464,7 +464,7 @@ def build( executor_codegen_metadata, devices, ) - elif str(executor) == "graph": + elif executor.name == "graph": executor_factory = _executor_factory.GraphExecutorFactoryModule( ir_mod, raw_targets, diff --git a/src/relay/backend/executor.cc b/src/relay/backend/executor.cc index bb9706ba86f9b..1d6caecb87ba4 100644 --- a/src/relay/backend/executor.cc +++ b/src/relay/backend/executor.cc @@ -34,6 +34,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& obj, ReprPrinter* p) { const Executor& executor = Downcast(obj); p->stream << executor->name; + p->stream << executor->attrs; }); /********** Registry-related code **********/ diff --git a/tests/python/relay/test_build_module.py b/tests/python/relay/test_build_module.py index 757e5c1d8af80..d51cfd29dc976 100644 --- a/tests/python/relay/test_build_module.py +++ b/tests/python/relay/test_build_module.py @@ -40,24 +40,29 @@ ], [ Target("c -executor=aot -unpacked-api=1"), - Executor("aot", {"unpacked-api": True}), + Executor("aot", {"unpacked-api": 1}), None, ], [Target("c -executor=aot -link-params=1"), Executor("aot"), None], - [Target("c -link-params=1"), Executor("graph", {"link-params": True}), None], + [Target("c -link-params=1"), Executor("graph", {"link-params": 1}), None], [ Target( "c -executor=aot -link-params=1 -interface-api=c" " -unpacked-api=1 -runtime=c -system-lib" ), - Executor("aot", {"unpacked-api": True, "interface-api": "c"}), + Executor("aot", {"unpacked-api": 1, "interface-api": "c"}), Runtime("crt", {"system-lib": True}), ], ], ) def test_deprecated_target_parameters(target, executor, runtime): actual_executor, actual_runtime = _reconstruct_from_deprecated_options(target) - assert executor == actual_executor + + assert (executor is None 
and actual_executor is None) or (executor.name == actual_executor.name) + # sort as TVM Map cannot guarantee round-trip order. + assert (executor is None and actual_executor is None) or ( + sorted(executor.attrs.items()) == sorted(actual_executor.attrs.items()) + ) assert runtime == actual_runtime diff --git a/tests/python/relay/test_executor.py b/tests/python/relay/test_executor.py index 866339cb89fed..d703ef1f3d9ae 100644 --- a/tests/python/relay/test_executor.py +++ b/tests/python/relay/test_executor.py @@ -23,12 +23,12 @@ def test_create_executor(): executor = Executor("aot") - assert str(executor) == "aot" + assert executor.name == "aot" def test_create_executor_with_options(): executor = Executor("aot", {"interface-api": "c"}) - assert str(executor) == "aot" + assert executor.name == "aot" assert executor["interface-api"] == "c" @@ -66,7 +66,7 @@ def test_list_executors(): assert "aot" in Executor.list_registered() -@pytest.mark.parametrize("executor", [Executor("aot"), "aot"]) +@pytest.mark.parametrize("executor", [Executor("aot").name, "aot"]) def test_list_executor_options(executor): aot_options = Executor.list_registered_options(executor) assert "interface-api" in aot_options diff --git a/tests/python/unittest/test_micro_model_library_format.py b/tests/python/unittest/test_micro_model_library_format.py index 7be5037478b13..9b957e617a139 100644 --- a/tests/python/unittest/test_micro_model_library_format.py +++ b/tests/python/unittest/test_micro_model_library_format.py @@ -197,7 +197,7 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[ ) assert (datetime.datetime.now() - export_datetime) < datetime.timedelta(seconds=60 * 5) assert metadata["modules"][module_name]["target"] == [str(target)] - if str(executor) == "graph": + if executor.name == "graph": assert metadata["modules"][module_name]["memory"]["sids"] == [ {"storage_id": 0, "size_bytes": 2, "input_binding": "a"}, {"storage_id": 1, "size_bytes": 8, "input_binding": "b"}, @@ -228,7 +228,7 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[ os.path.join(extract_dir, "codegen", "host", "include", "tvmgen_add.h") ) - if str(executor) == "graph": + if executor.name == "graph": validate_graph_json(extract_dir, factory) with open(os.path.join(extract_dir, "src", f"{module_name}.relay")) as relay_f: From f3359d241f959ac9841c5275f7559c588e2d29a2 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Mon, 11 Jul 2022 20:38:07 -0500 Subject: [PATCH 091/111] [LLVM] Fix build errors in CodeGenCPU::AddDebugInformation (#12054) This code is guarded by TVM_LLVM_VERSION >= 50 and < 70, so the errors were not detected in local tests or in CI. --- src/target/llvm/codegen_cpu.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc index e8647545e5f86..b19dc216c8930 100644 --- a/src/target/llvm/codegen_cpu.cc +++ b/src/target/llvm/codegen_cpu.cc @@ -203,11 +203,12 @@ void CodeGenCPU::AddDebugInformation(PrimFunc f_tir, llvm::Function* f_llvm) { ICHECK(f_llvm->getReturnType() == t_void_ || f_llvm->getReturnType() == t_int_) << "Unexpected return type"; auto ret_type_tir = f_llvm->getReturnType() == t_int_ ? 
DataType::Int(32) : DataType::Void(); - llvm::DIType* returnTy = GetDebugType(ret_type_tir, f_llvm->getReturnType()); + llvm::DIType* returnTy = + GetDebugType(GetTypeFromRuntimeDataType(ret_type_tir), f_llvm->getReturnType()); paramTys.push_back(returnTy); for (size_t i = 0; i < f_llvm->arg_size(); ++i) { paramTys.push_back( - GetDebugType(GetType(f_tir->args[i]), f_llvm->getFunctionType()->getParamType(i))); + GetDebugType(GetType(f_tir->params[i]), f_llvm->getFunctionType()->getParamType(i))); } auto* DIFunctionTy = dbg_info_->di_builder_->createSubroutineType( dbg_info_->di_builder_->getOrCreateTypeArray(paramTys)); @@ -240,7 +241,7 @@ void CodeGenCPU::AddDebugInformation(PrimFunc f_tir, llvm::Function* f_llvm) { std::string paramName = "arg" + std::to_string(i + 1); auto param = dbg_info_->di_builder_->createParameterVariable( DIFunction, paramName, i + 1, dbg_info_->file_, 0, - GetDebugType(GetType(f_tir->args[i]), f_llvm->getFunctionType()->getParamType(i)), + GetDebugType(GetType(f_tir->params[i]), f_llvm->getFunctionType()->getParamType(i)), /*alwaysPreserve=*/true); auto* store = builder.CreateStore(f_llvm->arg_begin() + i, paramAlloca); dbg_info_->di_builder_->insertDeclare(paramAlloca, param, From fc419df32f052e21f614c8940699c10a2d696689 Mon Sep 17 00:00:00 2001 From: Mark Shields <87091372+mbs-octoml@users.noreply.github.com> Date: Mon, 11 Jul 2022 18:39:05 -0700 Subject: [PATCH 092/111] [AOT][BUG] Only include extra headers if the constants array is needed. (#12061) --- src/relay/backend/contrib/codegen_c/codegen.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/relay/backend/contrib/codegen_c/codegen.cc b/src/relay/backend/contrib/codegen_c/codegen.cc index 41f0a0a06408d..dee3f939c50a6 100644 --- a/src/relay/backend/contrib/codegen_c/codegen.cc +++ b/src/relay/backend/contrib/codegen_c/codegen.cc @@ -66,9 +66,6 @@ class CodegenC : public backend::MemoizedExprTranslator>, pu * \return The emitted code. */ std::string JIT(const std::vector& out) override { - if (!ext_func_args_.empty()) { - *needs_extra_headers_ = true; - } // Write function macros for (auto decl : func_decl_) { code_stream_ << decl << "\n"; @@ -109,6 +106,9 @@ class CodegenC : public backend::MemoizedExprTranslator>, pu } std::vector VisitExpr_(const ConstantNode* cn) override { + // Remember we'll need some extra headers to support the runtime constants array. + *needs_extra_headers_ = true; + std::ostringstream decl_stream; std::ostringstream buf_stream; @@ -215,7 +215,10 @@ class CodegenC : public backend::MemoizedExprTranslator>, pu std::unordered_map* const_name_to_constant_; /*! \brief The accumulated constant names, in the order they were generated. */ Array* const_names_; - /*! \brief Set to true if the ndarray and packed function headers are required. */ + /*! + * \brief Set to true if the ndarray and packed function headers are required to declare and + * manage the constants array. + */ bool* needs_extra_headers_; /*! \brief Name of the global function currently being compiled. 
*/ std::string ext_func_id_; From fbf80bb3869f5f5046e3a1a5bb335e21f2f9deae Mon Sep 17 00:00:00 2001 From: Nicola Lancellotti Date: Tue, 12 Jul 2022 17:23:44 +0100 Subject: [PATCH 093/111] [microNPU] Add MergeConstants pass (#12029) * [microNPU] Add MergeConstants pass Change-Id: I1ff51d8147fba8c66d442a370b9f058e9b2758d8 * Fix errors and warnings Change-Id: I29f68f83a73fa00ca34ed0ab2321c53c6b761137 * Address comments Change-Id: Iad59107d5abdec6b079c6fd4ab48c6bffbb5e0bb * Fix lint error Change-Id: Ie5caf506337de01e169d6f422e4682eefbd93241 --- .../backend/contrib/ethosu/tir/compiler.py | 4 + .../backend/contrib/ethosu/tir/passes.py | 35 + src/tir/contrib/ethosu/passes.cc | 643 +++++++++++++++++- .../test_ethosu/cascader/test_integration.py | 10 +- .../test_ethosu/test_encode_constants.py | 244 +++---- .../test_ethosu/test_merge_constants.py | 561 +++++++++++++++ .../contrib/test_ethosu/test_networks.py | 14 +- .../test_ethosu/test_remove_concatenates.py | 3 - .../test_ethosu/test_replace_conv2d.py | 24 - .../contrib/test_ethosu/test_replace_copy.py | 37 +- .../contrib/test_ethosu/test_scheduler.py | 24 +- 11 files changed, 1336 insertions(+), 263 deletions(-) create mode 100644 tests/python/contrib/test_ethosu/test_merge_constants.py diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py index 0fd82378c3008..85c6df4c7d0cc 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py @@ -90,6 +90,10 @@ def lower_ethosu(sch, args, const_dict, name="main"): mod = tvm.tir.transform.RemoveNoOp()(mod) mod, const_dict = ethosu_passes.EncodeConstants(const_dict)(mod) mod = ethosu_passes.HoistAllocates()(mod) + # MergeConstant pass currently does not support striped schedules. + # It requires further investigation. + if not util.is_striping_enabled(): + mod, const_dict = ethosu_passes.MergeConstants(const_dict)(mod) mod = ethosu_passes.CopyComputeReordering()(mod) # When striping is enabled and if storage_rewrite is not run diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/passes.py b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py index 76726132e05de..c0b017e703ce9 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir/passes.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py @@ -938,3 +938,38 @@ def CopyComputeReordering(max_copy_movements: Optional[int] = None) -> tvm.IRMod The new module with copy and compute nodes reordered. """ return _ffi_api.CopyComputeReordering(max_copy_movements) + + +def MergeConstants(const_dict): + """ + This pass looks for the constants used by each compute operator + and merges them into a single buffer. + Constants written to a buffer with local scope are not merged. + """ + + def _merge_constants(mod): + nonlocal const_dict + try: + mod["main"] + except: + raise tvm.TVMError( + "Expected a single primitive function called 'main'. " + "Please run the MergeConstants pass in conjunction with the LowerToTIR() pass." 
+        new_const_dict = {}
+        for param in const_dict.keys():
+            new_const_dict[tvm.tir.IntImm("int64", param)] = tvm.nd.array(const_dict[param])
+        mod["main"] = mod["main"].with_attr("ethos-u.const_dict", new_const_dict)
+
+        mod = _ffi_api.MergeConstants()(mod)
+        const_dict = mod["main"].attrs["ethos-u.const_dict"]
+        mod = _ffi_api.RemoveConstDictAttribute()(mod)
+
+        new_const_dict = {}
+        for param in const_dict.keys():
+            new_const_dict[int(param)] = const_dict[param].numpy()
+
+        return mod, new_const_dict
+
+    return _merge_constants
diff --git a/src/tir/contrib/ethosu/passes.cc b/src/tir/contrib/ethosu/passes.cc
index 609d986dbb84f..b662e9dfd0255 100644
--- a/src/tir/contrib/ethosu/passes.cc
+++ b/src/tir/contrib/ethosu/passes.cc
@@ -24,10 +24,13 @@
  */
 #include
 #include
+#include
 #include
 #include
 #include
+#include
+#include
 
 namespace tvm {
 
@@ -42,6 +45,62 @@ namespace tir {
 namespace contrib {
 namespace ethosu {
 
+namespace {
+
+/*! Returns the arguments of the given statement */
+Array<PrimExpr> GetStmtArgs(const Stmt& stmt) {
+  auto attr{stmt.as<AttrStmtNode>()};
+  Stmt eval_stmt{attr ? attr->body : stmt};
+  auto eval{eval_stmt.as<EvaluateNode>()};
+  ICHECK(eval) << "Expected statement to be an evaluate node, but was " << eval_stmt->GetTypeKey();
+  auto call{eval->value.as<CallNode>()};
+  ICHECK(call) << "Expected expression to be a call node, but was " << eval->value->GetTypeKey();
+  return call->args;
+}
+
+enum class StmtType { global_copy, local_copy, compute };
+
+/*! Returns the type of the given statement */
+StmtType GetStmtType(const Stmt& stmt) {
+  Array<PrimExpr> args{GetStmtArgs(stmt)};
+  if (args[0].as<StringImmNode>()->value == "ethosu_copy") {
+    if (args[3].as<BufferLoadNode>()->buffer.scope() == "global") {
+      return StmtType::global_copy;
+    } else {
+      return StmtType::local_copy;
+    }
+  }
+  return StmtType::compute;
+}
+
+/*! Returns the buffer read by the given copy statement */
+Buffer GetCopyReadBuffer(const Stmt& stmt) {
+  Array<PrimExpr> args{GetStmtArgs(stmt)};
+  return args[1].as<BufferLoadNode>()->buffer;
+}
+
+/*! Returns the buffer written by the given copy statement */
+Buffer GetCopyWriteBuffer(const Stmt& stmt) {
+  Array<PrimExpr> args{GetStmtArgs(stmt)};
+  return args[3].as<BufferLoadNode>()->buffer;
+}
+
+/*! Returns the length of the given copy statement */
+int64_t GetCopyLength(const Stmt& stmt) {
+  Array<PrimExpr> args{GetStmtArgs(stmt)};
+  return args[2].as<IntImmNode>()->value;
+}
+
+/*! Returns the cycles of the given statement */
+int64_t GetStmtCycles(const Stmt& stmt) {
+  auto attr{stmt.as<AttrStmtNode>()};
+  if (attr && attr->attr_key == "pragma_compute_cycles_hint") {
+    int64_t cycles{Downcast<Integer>(attr->value)->value};
+    return cycles;
+  }
+  return 0;
+}
+
+}  // namespace
+
 /*!
  * \brief This mutator moves allocates to the top of the body of the main
  * function.
@@ -154,9 +213,9 @@ class CopyComputeReorderingMutator : public StmtExprMutator {
 
     // Each copy statement to a buffer with global scope is moved up
     // at most `_max_copy_movements` times.
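     // For example (illustrative sequence only), with _max_copy_movements = 2
     //   [compute_0, compute_1, global_copy_0, compute_2]
     // becomes
     //   [global_copy_0, compute_0, compute_1, compute_2]
     // because the copy may hop over at most the two compute statements
     // immediately preceding it.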
     for (size_t index = 0; index < new_seq.size(); ++index) {
-      if (stmt_is_global_copy(new_seq[index])) {
+      if (GetStmtType(new_seq[index]) == StmtType::global_copy) {
         int lower = std::max(0, static_cast<int>(index) - _max_copy_movements);
-        for (int i = index; i > lower && !stmt_is_copy(new_seq[i - 1]); --i) {
+        for (int i = index; i > lower && (GetStmtType(new_seq[i - 1]) == StmtType::compute); --i) {
           std::swap(new_seq[i - 1], new_seq[i]);
         }
       }
@@ -167,32 +226,6 @@ class CopyComputeReorderingMutator : public StmtExprMutator {
     return Stmt{seq_stmt_node};
   }
 
-  tvm::runtime::Array<PrimExpr> get_stmt_args(const Stmt& stmt) {
-    Stmt eval_stmt = stmt;
-    if (const auto* attr_stmt = eval_stmt.as<AttrStmtNode>()) {
-      eval_stmt = attr_stmt->body;
-    }
-
-    auto eval_node{eval_stmt.as<EvaluateNode>()};
-    ICHECK(eval_node) << "Expected statement to be an evaluate node, but was "
-                      << eval_stmt->GetTypeKey();
-    auto call_node{eval_node->value.as<CallNode>()};
-    ICHECK(call_node) << "Expected expression to be a call node, but was "
-                      << eval_node->value->GetTypeKey();
-    return call_node->args;
-  }
-
-  bool stmt_is_copy(const Stmt& stmt) {
-    auto args{get_stmt_args(stmt)};
-    return args[0].as<StringImmNode>()->value == "ethosu_copy";
-  }
-
-  bool stmt_is_global_copy(const Stmt& stmt) {
-    auto args{get_stmt_args(stmt)};
-    return args[0].as<StringImmNode>()->value == "ethosu_copy" &&
-           args[3].as<BufferLoadNode>()->buffer.scope() == "global";
-  }
-
   /*! The maximum number of movements allowed for a copy. */
   int _max_copy_movements;
 };
@@ -223,6 +256,560 @@ tvm::transform::Pass CopyComputeReordering(Optional<Integer> max_copy_movements)
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.CopyComputeReordering")
    .set_body_typed(CopyComputeReordering);
 
+/*!
+ * \brief This mutator removes all allocates.
+ */
+class RemoveAllocatesMutator : public StmtExprMutator {
+ public:
+  PrimFunc operator()(PrimFunc main_func) {
+    auto prim_func_node{main_func.CopyOnWrite()};
+    prim_func_node->body = this->VisitStmt(main_func->body);
+    return GetRef<PrimFunc>(prim_func_node);
+  }
+
+ private:
+  Stmt VisitStmt_(const AllocateNode* op) override { return VisitStmt(op->body); }
+};
+
+/*!
+ * \brief This extractor collects information used by the MergeConstantsMutator
+ */
+class MergeConstantsInfoExtractor : public StmtExprVisitor {
+ public:
+  class Info {
+   public:
+    /*! A stack to store allocates as they are visited. */
+    std::vector<Allocate> allocates{};
+
+    /*! A list that contains in the i-th position the write buffer of the i-th statement
+     * if that statement is a copy to a buffer with global scope */
+    std::vector<Optional<Buffer>> copy_write_buffers{};
+
+    /*! Maps a copy's write buffer to an index representing the
+     * new buffer and an offset in that buffer */
+    std::unordered_map<const BufferNode*, std::pair<int, int>>
+        old_to_new_write_buffer{};
+
+    /*! Maps an index representing a new buffer to the length of that buffer */
+    std::unordered_map<int, int> new_buffers_length{};
+
+    /*! Maps an index representing a new buffer to the cycles needed to copy that buffer */
+    std::unordered_map<int, int64_t> cycless{};
+  };
+
+  Info operator()(PrimFunc main_func) {
+    this->VisitStmt(main_func->body);
+    return std::move(_info);
+  }
+
+ private:
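+  // Illustrative example: for a body such as
+  //   copy(w0 -> b0); copy(w1 -> b1); compute(b0, b1)
+  // the extractor records copy_write_buffers = [b0, b1, None] and maps both
+  // b0 and b1 to new-buffer index 2 (the compute statement), b0 at offset 0
+  // and b1 at offset len(w0), with new_buffers_length[2] = len(w0) + len(w1).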
+  /*! The information collected by this extractor */
+  Info _info{};
+
+  void VisitStmt_(const AllocateNode* op) override {
+    _info.allocates.push_back(GetRef<Allocate>(op));
+    VisitStmt(op->body);
+  }
+
+  void VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      StmtExprVisitor::VisitStmt_(op);
+      return;
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    for (size_t i = 0; i < seq_stmt.size(); ++i) {
+      Stmt stmt{seq_stmt[i]};
+      switch (GetStmtType(stmt)) {
+        case StmtType::global_copy: {
+          Buffer write_buffer{GetCopyWriteBuffer(stmt)};
+          _info.copy_write_buffers.push_back(write_buffer);
+          _info.old_to_new_write_buffer[write_buffer.as<BufferNode>()] = std::make_pair(-1, -1);
+          break;
+        }
+        case StmtType::local_copy: {
+          _info.copy_write_buffers.push_back(Optional<Buffer>{});
+          break;
+        }
+        case StmtType::compute: {
+          _info.copy_write_buffers.push_back(Optional<Buffer>{});
+          std::vector<Buffer> buffers{GetCopiedBuffersUsedByStmt(stmt)};
+          if (buffers.empty()) {
+            continue;
+          }
+          _info.new_buffers_length[i] = 0;
+          for (Buffer buffer : buffers) {
+            for (int j = static_cast<int>(i) - 1; j >= 0; --j) {
+              if (_info.copy_write_buffers[j] == buffer) {
+                _info.old_to_new_write_buffer[buffer.as<BufferNode>()] =
+                    std::make_pair(i, _info.new_buffers_length[i]);
+                _info.new_buffers_length[i] += GetCopyLength(seq_stmt[j]);
+                _info.cycless[i] += GetStmtCycles(seq_stmt[j]);
+                break;
+              }
+            }
+          }
+          break;
+        }
+      }
+    }
+  }
+
+  /*! Get all buffers written by copies and used by a given statement */
+  std::vector<Buffer> GetCopiedBuffersUsedByStmt(const Stmt& stmt) {
+    std::vector<Buffer> buffers{};
+    for (PrimExpr arg : GetStmtArgs(stmt)) {
+      if (auto buffer_load = arg.as<BufferLoadNode>()) {
+        Buffer buffer{buffer_load->buffer};
+        // Check if the buffer has already been added
+        if (std::find(buffers.begin(), buffers.end(), buffer) == buffers.end()) {
+          // Check if the buffer is copied
+          if (_info.old_to_new_write_buffer.count(buffer.as<BufferNode>())) {
+            buffers.push_back(buffer);
+          }
+        }
+      }
+    }
+    return buffers;
+  }
+};
+
+/*!
+ * \brief This mutator looks for the constants used by each compute operator
+ * and merges them into a single buffer.
+ * Constants written to a buffer with local scope are not merged.
+ */
+class MergeConstantsMutator : public StmtExprMutator {
+ public:
+  explicit MergeConstantsMutator(MergeConstantsInfoExtractor::Info info) : _info{std::move(info)} {}
+
+  PrimFunc operator()(PrimFunc main_func, const Map<IntImm, runtime::NDArray>& const_dict) {
+    // Rewrite
+    Stmt new_body = RewritePrimFuncBody(main_func->body);
+    std::unordered_set<const VarNode*> params_to_delete{};
+    Map<Var, Buffer> new_buffer_map{MakeNewBufferMap(main_func->buffer_map, &params_to_delete)};
+    Array<Var> new_params{MakeNewParams(main_func->params, params_to_delete)};
+
+    // Make the new const dict
+    Array<Array<IntImm>> args_to_merge{GetArgsToMerge(main_func->buffer_map, main_func->params)};
+    Array<Array<IntImm>> buffers_to_merge{
+        GetArgsToMergeWithoutArgsNotInConstDict(args_to_merge, const_dict)};
+    Map<IntImm, runtime::NDArray> new_const_dict{MakeNewConstDict(buffers_to_merge, const_dict)};
+
+    // Make the new prim func
+    auto prim_func_node{main_func.CopyOnWrite()};
+    prim_func_node->body = std::move(new_body);
+    prim_func_node->buffer_map = std::move(new_buffer_map);
+    prim_func_node->params = std::move(new_params);
+    prim_func_node->preflattened_buffer_map = {};
+    PrimFunc f{GetRef<PrimFunc>(prim_func_node)};
+
+    // Add the new const dict as an attribute
+    f = WithAttr(std::move(f), "ethos-u.const_dict", new_const_dict);
+
+    return f;
+  }
+
+ private:
+  /*! The information collected by the MergeConstantsInfoExtractor */
+  MergeConstantsInfoExtractor::Info _info;
+
+  /*!
Maps an index representing a new buffer to the new buffer */ + std::unordered_map new_buffers{}; + + /*! Maps a copy's read buffer to the new copy's read buffer */ + std::unordered_map old_to_new_read_buffers{}; + + /*! Maps an index representing a new buffer to the list of buffers to be merged in the new buffer + */ + std::unordered_map> buffers_to_merge{}; + + /*! A set of buffers to delete */ + std::unordered_set buffers_to_delete{}; + + Stmt RewritePrimFuncBody(Stmt body) { + std::unordered_map var_to_allocate{}; + + // Rewrite old allocates + std::unordered_set buffer_vars{GetVarsForWrittenCopyBuffers()}; + for (auto it{_info.allocates.rbegin()}; it != _info.allocates.rend(); ++it) { + Allocate alloc{*it}; + var_to_allocate[alloc->buffer_var.get()] = alloc; + if (buffer_vars.count(alloc->buffer_var.as()) == 0) { + body = Allocate(alloc->buffer_var, alloc->dtype, alloc->extents, alloc->condition, body, + alloc->annotations, alloc->span); + } + } + + // Rewrite new allocates + for (auto it{_info.copy_write_buffers.rbegin()}; it != _info.copy_write_buffers.rend(); ++it) { + if (Optional buffer_opt = *it) { + Buffer old_write_buffer{buffer_opt.value()}; + int new_buffer_index{ + _info.old_to_new_write_buffer[old_write_buffer.as()].first}; + + // Check if the allocate has already been created + if (new_buffers.count(new_buffer_index) == 0) { + BufferNode* new_buffer{old_write_buffer.CopyOnWrite()}; + new_buffer->shape = {_info.new_buffers_length[new_buffer_index]}; + + new_buffers[new_buffer_index] = GetRef(new_buffer); + + Allocate old_allocate{var_to_allocate[old_write_buffer->data.get()]}; + body = Allocate(new_buffer->data, new_buffer->dtype, new_buffer->shape, tir::const_true(), + body, old_allocate->annotations, old_allocate->span); + } + } + } + + // Rewrite operators + return this->VisitStmt(body); + } + + Stmt VisitStmt_(const AllocateNode* op) override { + auto allocate{CopyOnWrite(op)}; + allocate->body = this->VisitStmt(op->body); + return Stmt(allocate); + } + + Stmt VisitStmt_(const SeqStmtNode* op) override { + if (op->size() <= 1) { + return StmtExprMutator::VisitStmt_(op); + } + + Array new_seq{}; + SeqStmt seq_stmt{GetRef(op)}; + for (size_t i{0}; i < seq_stmt.size(); ++i) { + Stmt stmt{seq_stmt[i]}; + + switch (GetStmtType(stmt)) { + case StmtType::global_copy: { + Buffer old_write_buffer{_info.copy_write_buffers[i].value()}; + std::pair pair{ + _info.old_to_new_write_buffer[old_write_buffer.as()]}; + int new_buffer_index{pair.first}; + int new_buffer_offset{pair.second}; + UpdateBuffersToMergeAndDelete(stmt, new_buffer_index, new_buffer_offset); + + if (!IsCopyToBeDeleted(new_buffer_offset)) { + Optional cycless{GetMergedCycles(new_buffer_index)}; + new_seq.push_back(MakeNewStmt( + stmt, MakeNewCopyArgs(stmt, old_write_buffer, new_buffer_index), cycless)); + } + break; + } + case StmtType::local_copy: { + new_seq.push_back(stmt); + break; + } + case StmtType::compute: { + new_seq.push_back(MakeNewStmt(stmt, MakeNewComputeArgs(stmt))); + break; + } + } + } + return SeqStmt(new_seq, op->span); + } + + /*! Returns the variables of the buffers written by copies */ + std::unordered_set GetVarsForWrittenCopyBuffers() { + std::unordered_set buffer_vars{}; + std::transform(_info.old_to_new_write_buffer.begin(), _info.old_to_new_write_buffer.end(), + std::inserter(buffer_vars, buffer_vars.begin()), + [](std::pair> pair) -> const VarNode* { + return pair.first->data.as(); + }); + return buffer_vars; + } + + /*! 
Returns the cycles of the new buffer at the given index */ + Optional GetMergedCycles(int new_buffer_index) { + auto it = _info.cycless.find(new_buffer_index); + if (it != _info.cycless.end()) { + return Integer(it->second); + } + return Optional{}; + } + + /*! Returns true if a copy must be deleted, false otherwise */ + bool IsCopyToBeDeleted(int new_buffer_offset) { return new_buffer_offset > 0; } + + Array MakeNewCopyArgs(const Stmt& stmt, const Buffer& old_write_buffer, + int new_buffer_index) { + Array args{GetStmtArgs(stmt)}; + int new_length{_info.new_buffers_length[new_buffer_index]}; + + Array new_args{}; + for (size_t i = 0; i < args.size(); ++i) { + switch (i) { + case 1: /* read_address */ { + auto buffer_load = args[1].as(); + Buffer buffer{buffer_load->buffer}; + Buffer new_buffer{buffer->data, + buffer->dtype, + {new_length}, + buffer->strides, + buffer->elem_offset, + buffer->name, + buffer->data_alignment, + buffer->offset_factor, + buffer->buffer_type, + buffer->axis_separators, + buffer->span}; + old_to_new_read_buffers[buffer.as()] = new_buffer; + new_args.push_back(BufferLoad(new_buffer, buffer_load->indices, buffer_load->span)); + break; + } + case 2: /* length */ { + new_args.push_back(new_length); + break; + } + case 3: /* write_address */ { + new_args.push_back(MakeNewBufferLoad(old_write_buffer, 0, true).value()); + break; + } + default: + new_args.push_back(args[i]); + break; + } + } + return new_args; + } + + Array MakeNewComputeArgs(const Stmt& stmt) { + Array args{GetStmtArgs(stmt)}; + Array new_args{}; + for (size_t i = 0; i < args.size(); ++i) { + if (auto buffer_load = args[i].as()) { + BufferLoad new_buffer_load{ + MakeNewBufferLoad(buffer_load->buffer, buffer_load->indices[0], false) + .value_or(GetRef(buffer_load))}; + new_args.push_back(new_buffer_load); + } else { + new_args.push_back(args[i]); + } + } + return new_args; + } + + Stmt MakeNewStmt(const Stmt& stmt, const Array& new_args, + Optional cycless = Optional{}) { + auto attr{stmt.as()}; + Stmt eval_stmt{attr ? attr->body : stmt}; + auto eval{eval_stmt.as()}; + ICHECK(eval) << "Expected statement to be an evaluate node, but was " + << eval_stmt->GetTypeKey(); + auto call{eval->value.as()}; + ICHECK(call) << "Expected expression to be a call node, but was " << eval->value->GetTypeKey(); + + Call new_call{call->dtype, call->op, new_args, call->span}; + Evaluate new_eval{new_call, eval->span}; + + if (attr) { + ICHECK(attr->attr_key == "pragma_compute_cycles_hint"); + PrimExpr value = cycless.value_or(attr->value); + return AttrStmt{attr->node, attr->attr_key, value, new_eval, attr->span}; + } else { + return std::move(new_eval); + } + } + + Optional MakeNewBufferLoad(const Buffer& write_buffer, const PrimExpr& old_index, + bool only_old_index) { + auto it = _info.old_to_new_write_buffer.find(write_buffer.as()); + if (it != _info.old_to_new_write_buffer.end()) { + std::pair pair{it->second}; + int new_buffer_index{pair.first}; + PrimExpr new_index{only_old_index ? 
old_index : (pair.second + old_index)};
+      return BufferLoad{new_buffers[new_buffer_index], {new_index}};
+    }
+    return Optional<BufferLoad>{};
+  }
+
+  Map<Var, Buffer> MakeNewBufferMap(const Map<Var, Buffer>& buffer_map,
+                                    std::unordered_set<const VarNode*>* params_to_delete) {
+    Map<Var, Buffer> new_buffer_map{};
+    for (std::pair<Var, Buffer> pair : buffer_map) {
+      Var var{pair.first};
+      Buffer buffer{pair.second};
+
+      if (buffers_to_delete.count(buffer.as<BufferNode>()) == 1) {
+        params_to_delete->insert(var.as<VarNode>());
+      } else if (old_to_new_read_buffers.count(buffer.as<BufferNode>()) == 1) {
+        new_buffer_map.Set(var, old_to_new_read_buffers[buffer.as<BufferNode>()]);
+      } else {
+        new_buffer_map.Set(var, buffer);
+      }
+    }
+    return new_buffer_map;
+  }
+
+  Array<Var> MakeNewParams(const Array<Var>& params,
+                           const std::unordered_set<const VarNode*>& params_to_delete) {
+    std::vector<Var> new_params{};
+    for (Var var : params) {
+      if (params_to_delete.count(var.as<VarNode>()) == 0) {
+        new_params.push_back(var);
+      }
+    }
+    return new_params;
+  }
+
+  void UpdateBuffersToMergeAndDelete(const Stmt& stmt, int new_buffer_index,
+                                     int new_buffer_offset) {
+    Array<PrimExpr> args{GetStmtArgs(stmt)};
+    Buffer read_buffer{GetCopyReadBuffer(stmt)};
+
+    if (buffers_to_merge.count(new_buffer_index) == 0) {
+      buffers_to_merge[new_buffer_index] = std::vector<Buffer>{read_buffer};
+    } else {
+      buffers_to_merge[new_buffer_index].push_back(read_buffer);
+    }
+
+    if (new_buffer_offset > 0) {
+      buffers_to_delete.insert(read_buffer.as<BufferNode>());
+    }
+  }
+
+  /*! Returns an array whose elements are the indices of the function arguments to be merged.
+   * Example: if a function has four arguments and the second and the third ones must
+   * be merged then the array is: [[0], [1, 2], [3]] */
+  Array<Array<IntImm>> GetArgsToMerge(const Map<Var, Buffer>& buffer_map,
+                                      const Array<Var>& params) {
+    std::unordered_map<const BufferNode*, Var> buffer_to_var{};
+    for (std::pair<Var, Buffer> var_buffer : buffer_map) {
+      buffer_to_var[var_buffer.second.as<BufferNode>()] = var_buffer.first;
+    }
+
+    std::unordered_map<const VarNode*, int> var_to_index{};
+    for (int i = 0; i < static_cast<int>(params.size()); ++i) {
+      var_to_index[params[i].as<VarNode>()] = i;
+    }
+
+    std::vector<Array<IntImm>> vector{};
+    for (std::pair<int, std::vector<Buffer>> index_vector : buffers_to_merge) {
+      std::vector<IntImm> indices{};
+      for (Buffer buffer : index_vector.second) {
+        const VarNode* var{buffer_to_var[buffer.as<BufferNode>()].as<VarNode>()};
+        IntImm index{DataType::Int(64), var_to_index[var]};
+        var_to_index.erase(var);
+        auto it = std::find_if(indices.begin(), indices.end(),
+                               [&](IntImm value) { return value->value == index->value; });
+        if (it == indices.end()) {
+          indices.push_back(index);
+        }
+      }
+      vector.push_back(Array<IntImm>{indices});
+    }
+
+    for (std::pair<const VarNode*, int> var_index : var_to_index) {
+      vector.push_back(Array<IntImm>{IntImm(DataType::Int(64), var_index.second)});
+    }
+    std::sort(vector.begin(), vector.end(),
+              [](Array<IntImm> a, Array<IntImm> b) { return a[0]->value < b[0]->value; });
+    return vector;
+  }
+
+  Array<Array<IntImm>> GetArgsToMergeWithoutArgsNotInConstDict(
+      const Array<Array<IntImm>>& args_to_merge,
+      const Map<IntImm, runtime::NDArray>& const_dict) {
+    Array<Array<IntImm>> new_args_to_merge{};
+    for (Array<IntImm> args : args_to_merge) {
+      IntImm key{args[0]};
+      auto it = std::find_if(const_dict.begin(), const_dict.end(),
+                             [&](std::pair<IntImm, runtime::NDArray> pair) {
+                               return pair.first->value == key->value;
+                             });
+      if (it != const_dict.end()) {
+        new_args_to_merge.push_back(args);
+      }
+    }
+    return new_args_to_merge;
+  }
+
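+  // Illustrative example: merging [[0], [1, 2]] where argument 1 holds 128
+  // bytes and argument 2 holds 32 bytes yields a const dict with two entries,
+  // the second being a single 160-byte NDArray that carries argument 1's
+  // bytes at offset 0 and argument 2's bytes at offset 128.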
+  Map<IntImm, runtime::NDArray> MakeNewConstDict(const Array<Array<IntImm>>& args_to_merge,
+                                                 Map<IntImm, runtime::NDArray> const_dict) {
+    Map<IntImm, runtime::NDArray> new_const_dict{};
+    if (args_to_merge.size() == 0) {
+      return new_const_dict;
+    }
+
+    int64_t key = args_to_merge[0][0]->value;
+    for (Array<IntImm> args : args_to_merge) {
+      int64_t size = 0;
+      for (IntImm arg : args) {
+        auto it = std::find_if(const_dict.begin(), const_dict.end(),
+                               [&](auto pair) { return pair.first->value == arg->value; });
+        runtime::NDArray arg_constant{(*it).second};
+        size += runtime::GetDataSize(*arg_constant.operator->());
+      }
+
+      runtime::NDArray constant = runtime::NDArray::Empty({size}, DataType::UInt(8), {kDLCPU, 0});
+
+      size_t offset = 0;
+      for (IntImm arg : args) {
+        auto it = std::find_if(const_dict.begin(), const_dict.end(),
+                               [&](auto pair) { return pair.first->value == arg->value; });
+        runtime::NDArray arg_constant{(*it).second};
+        size_t nbytes = runtime::GetDataSize(*arg_constant.operator->());
+        arg_constant.CopyToBytes(static_cast<uint8_t*>(constant->data) + offset, nbytes);
+        offset += nbytes;
+      }
+      new_const_dict.Set(IntImm(DataType::Int(64), key), constant);
+      key += 1;
+    }
+    return new_const_dict;
+  }
+};
+
+/*!
+ * \brief This pass looks for the constants used by each compute operator
+ * and merges them into a single buffer.
+ * Constants written to a buffer with local scope are not merged.
+ * \return tvm::transform::Pass
+ */
+tvm::transform::Pass MergeConstants() {
+  auto pass_func = [=](PrimFunc f, IRModule mod, tvm::transform::PassContext ctx) {
+    ICHECK(mod->GetGlobalVars().size() == 1 && mod->ContainGlobalVar("main"))
+        << "Expected a single primitive function called 'main'. Please run the "
+           "MergeConstants pass in conjunction with the LowerToTIR() pass.";
+    Optional<Map<IntImm, runtime::NDArray>> const_dict{
+        f->attrs.GetAttr("ethos-u.const_dict", Optional<Map<IntImm, runtime::NDArray>>{})};
+    ICHECK(const_dict) << "Expected an ethos-u.const_dict attribute";
+
+    MergeConstantsInfoExtractor::Info info{MergeConstantsInfoExtractor()(f)};
+    f = RemoveAllocatesMutator()(f);
+    return MergeConstantsMutator(info)(f, const_dict.value());
+  };
+  return tvm::tir::transform::CreatePrimFuncPass(pass_func, 0, "tir.contrib.ethos-u.MergeConstants",
+                                                 {});
+}
+
+TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.MergeConstants").set_body_typed(MergeConstants);
+
+/*!
+ * \brief This pass removes the ethos-u.const_dict attribute
+ * \return tvm::transform::Pass
+ */
+class RemoveConstDictAttributeMutator : public StmtExprMutator {
+ public:
+  RemoveConstDictAttributeMutator() {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    return WithoutAttr(std::move(main_func), "ethos-u.const_dict");
+  }
+};
+
+tvm::transform::Pass RemoveConstDictAttribute() {
+  auto pass_func = [=](PrimFunc f, IRModule mod, tvm::transform::PassContext ctx) {
+    return RemoveConstDictAttributeMutator()(f);
+  };
+  return tvm::tir::transform::CreatePrimFuncPass(
+      pass_func, 0, "tir.contrib.ethos-u.RemoveConstDictAttribute", {});
+}
+
+TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.RemoveConstDictAttribute")
+    .set_body_typed(RemoveConstDictAttribute);
+
 }  // namespace ethosu
 }  // namespace contrib
 }  // namespace tir
diff --git a/tests/python/contrib/test_ethosu/cascader/test_integration.py b/tests/python/contrib/test_ethosu/cascader/test_integration.py
index 8e1f020861d5f..14cc8fbc61cfc 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_integration.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_integration.py
@@ -109,9 +109,8 @@ def test_single_conv_compute_cycles_hint():
     for single convolution.
     """
     primfunc = _compile_model(_create_single_conv2d())
-    ops = primfunc.body.body.body.seq
-
-    compute_cycles_hints = [2304, 640, 320]
+    ops = primfunc.body.body.seq
+    compute_cycles_hints = [2944, 320]
     for op, compute_cycle_hint in zip(ops, compute_cycles_hints):
         assert op.attr_key == "pragma_compute_cycles_hint"
         assert op.value == compute_cycle_hint
@@ -123,9 +122,8 @@ def test_double_conv_compute_cycles_hint():
     for double convolution.
""" primfunc = _compile_model(_create_double_conv2d()) - ops = primfunc.body.body.body.body.body.body.seq - - compute_cycles_hints = [2304, 640, 768, 640, 320, 240] + ops = primfunc.body.body.body.body.seq + compute_cycles_hints = [2944, 1408, 320, 240] for op, compute_cycle_hint in zip(ops, compute_cycles_hints): assert op.attr_key == "pragma_compute_cycles_hint" assert op.value == compute_cycle_hint diff --git a/tests/python/contrib/test_ethosu/test_encode_constants.py b/tests/python/contrib/test_ethosu/test_encode_constants.py index 15b719f33c3f9..fd9f373739e16 100644 --- a/tests/python/contrib/test_ethosu/test_encode_constants.py +++ b/tests/python/contrib/test_ethosu/test_encode_constants.py @@ -37,34 +37,23 @@ class WeightStreamOnlyU55: def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer1 = T.buffer_decl([128], "uint8") - buffer2 = T.buffer_decl([32], "uint8") - buffer3 = T.buffer_decl([112], "uint8") - buffer4 = T.buffer_decl([32], "uint8") - buffer5 = T.buffer_decl([112], "uint8") - buffer6 = T.buffer_decl([32], "uint8") - buffer7 = T.buffer_decl([112], "uint8") + buffer1 = T.buffer_decl([160], "uint8") + buffer3 = T.buffer_decl([144], "uint8") + buffer5 = T.buffer_decl([144], "uint8") + buffer7 = T.buffer_decl([144], "uint8") buffer8 = T.buffer_decl([32], "uint8") - T.preflattened_buffer(placeholder, [1, 16, 16, 32], "int8", data=placeholder.data) - T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], "int8", data=ethosu_write.data) # body - p1 = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True}) - p2 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) - p3 = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin":True}) - p4 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) - buffer9 = T.buffer_decl([112], "uint8", data=p1.data) - T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 128, p1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 32, p2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 112, p3[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 32, p4[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, T.int8(-1), T.int8(-1), 12, p2[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 112, buffer9[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 32, p2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, T.int8(-1), T.int8(-1), 12, p4[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 112, p3[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 32, p4[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", 
"int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, buffer9[0], 112, T.int8(-1), T.int8(-1), 12, p2[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, T.int8(-1), T.int8(-1), 12, p4[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + p1 = T.allocate([160], "uint8", "global", annotations={"disable_lower_builtin":True}) + p2 = T.allocate([144], "uint8", "global", annotations={"disable_lower_builtin":True}) + buffer9 = T.buffer_decl([144], "uint8", data=p1.data) + T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 160, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 144, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, T.int8(-1), T.int8(-1), 12, p1[128], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 144, buffer9[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 112, T.int8(-1), T.int8(-1), 12, p2[112], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 144, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, buffer9[0], 112, T.int8(-1), T.int8(-1), 12, buffer9[112], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 112, T.int8(-1), T.int8(-1), 12, p2[112], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None @@ -75,34 +64,22 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # buffer definition - T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data) - T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data) - buffer_encoded_1 = T.buffer_decl([160], dtype="uint8") - buffer_encoded_1_1 = T.buffer_decl([32], dtype="uint8") - buffer_encoded_2_1 = T.buffer_decl([160], dtype="uint8") - 
buffer_encoded_3_1 = T.buffer_decl([32], dtype="uint8") - buffer_encoded_4_1 = T.buffer_decl([176], dtype="uint8") - buffer_encoded_5_1 = T.buffer_decl([32], dtype="uint8") - buffer_encoded_6_1 = T.buffer_decl([160], dtype="uint8") - buffer_encoded_7_1 = T.buffer_decl([32], dtype="uint8") + buffer_encoded_1 = T.buffer_decl([192], dtype="uint8") + buffer_encoded_2_1 = T.buffer_decl([192], dtype="uint8") + buffer_encoded_4_1 = T.buffer_decl([208], dtype="uint8") + buffer_encoded_6_1 = T.buffer_decl([192], dtype="uint8") # body - placeholder_global = T.allocate([176], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_d_global = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_global_2 = T.allocate([160], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_d_global_2 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_global_1 = T.buffer_decl([160], dtype="uint8", data=placeholder_global.data) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 160, placeholder_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1_1[0], 32, placeholder_d_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_2_1[0], 160, placeholder_global_2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_3_1[0], 32, placeholder_d_global_2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_1[0], 80, placeholder_global_1[80], 80, 12, placeholder_d_global[0], 16, placeholder_d_global[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_4_1[0], 176, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_5_1[0], 32, placeholder_d_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 80, placeholder_global_2[80], 80, 12, placeholder_d_global_2[0], 16, placeholder_d_global_2[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_6_1[0], 160, placeholder_global_2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_7_1[0], 32, placeholder_d_global_2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 96, placeholder_global[96], 80, 12, placeholder_d_global[0], 16, placeholder_d_global[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 80, 
placeholder_global_2[80], 80, 12, placeholder_d_global_2[0], 16, placeholder_d_global_2[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + p1 = T.allocate([208], "uint8", "global", annotations={"disable_lower_builtin":True}) + p2 = T.allocate([192], "uint8", "global", annotations={"disable_lower_builtin":True}) + p3 = T.buffer_decl([192], dtype="uint8", data=p1.data) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 192, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_2_1[0], 192, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 80, p3[80], 80, 12, p3[160], 16, p3[176], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_4_1[0], 208, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 80, p2[80], 80, 12, p2[160], 16, p2[176], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_6_1[0], 192, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 96, p1[96], 80, 12, p1[176], 16, p1[192], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 80, p2[80], 80, 12, p2[160], 16, p2[176], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None # fmt: on @@ -113,12 +90,12 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), ( "ethos-u55-128", WeightStreamOnlyU55, - [128, 32, 112, 32, 112, 32, 112, 32], + [160, 144, 144, 144], ), ( "ethos-u65-512", WeightStreamOnlyU65, - [160, 32, 160, 32, 176, 32, 160, 32], + [192, 192, 208, 192], ), ], ) @@ -160,7 +137,7 @@ def _get_func(): tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True) test_const_size = [value.size for value in list(consts.values())] - assert reference_const_sizes == test_const_size + assert reference_const_sizes.sort() == test_const_size.sort() # fmt: off @@ -170,21 +147,14 @@ class RereadWeightsU55: def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer1 = T.buffer_decl([304], "uint8") - buffer2 = T.buffer_decl([80], "uint8") - T.preflattened_buffer(placeholder, [1, 16, 16, 32], "int8", data=placeholder.data) - T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], "int8", data=ethosu_write.data) + buffer1 = T.buffer_decl([384], "uint8") # body - p1 = T.allocate([304], "uint8", "global", 
annotations={"disable_lower_builtin":True}) - p2 = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True}) - p3 = T.allocate([304], "uint8", "global", annotations={"disable_lower_builtin":True}) - p4 = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True}) - T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 304, p1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 304, p3[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p4[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 304, T.int8(-1), T.int8(-1), 12, p2[0], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[256], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[64], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 304, T.int8(-1), T.int8(-1), 12, p4[0], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + p1 = T.allocate([384], "uint8", "global", annotations={"disable_lower_builtin":True}) + p2 = T.allocate([384], "uint8", "global", annotations={"disable_lower_builtin":True}) + T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 384, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 384, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 304, T.int8(-1), T.int8(-1), 12, p1[304], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[256], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[64], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 304, T.int8(-1), T.int8(-1), 12, p2[304], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None @@ -195,21 +165,14 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # buffer definition - T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data) - T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data) - placeholder_encoded_1 = T.buffer_decl([368], "uint8") - placeholder_encoded_1_2 = T.buffer_decl([96], "uint8") + placeholder_encoded_1 = T.buffer_decl([464], "uint8") # body - placeholder_global = T.allocate([368], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_d_global = T.allocate([96], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_global_1 = T.allocate([368], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_d_global_1 = T.allocate([96], "uint8", "global", 
annotations={"disable_lower_builtin":True}) - T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 368, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1_2[0], 96, placeholder_d_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 368, placeholder_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1_2[0], 96, placeholder_d_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 192, placeholder_global[192], 176, 12, placeholder_d_global[0], 48, placeholder_d_global[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[256], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[64], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_1[0], 192, placeholder_global_1[192], 176, 12, placeholder_d_global_1[0], 48, placeholder_d_global_1[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + p1 = T.allocate([464], "uint8", "global", annotations={"disable_lower_builtin":True}) + p2 = T.allocate([464], "uint8", "global", annotations={"disable_lower_builtin":True}) + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 464, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 464, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 192, p1[192], 176, 12, p1[368], 48, p1[416], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[256], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[64], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 192, p2[192], 176, 12, p2[368], 48, p2[416], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None # fmt: on @@ -221,12 +184,12 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), ( "ethos-u55-128", RereadWeightsU55, - [304, 80], + [384], ), ( "ethos-u65-512", RereadWeightsU65, - [368, 96], + [464], ), ], ) @@ -268,7 +231,7 @@ def _get_func(): tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True) test_const_size = [value.size for value in list(consts.values())] - assert reference_const_sizes == test_const_size + assert reference_const_sizes.sort() == test_const_size.sort() # fmt: off @@ -282,8 +245,6 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), buffer_1 = T.buffer_decl([160], "uint8") buffer_2 = T.buffer_decl([160], "uint8") buffer_3 = T.buffer_decl([80], "uint8") - T.preflattened_buffer(placeholder, [1, 16, 16, 32], "int8", data=placeholder.data) - T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], "int8", data=ethosu_write.data) # body ethosu_write_1 = T.allocate([4096], "int8", "global", 
annotations={"disable_lower_builtin":True}) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer[0], 592, T.int8(-1), T.int8(-1), 12, buffer_1[0], 160, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -302,8 +263,6 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), placeholder_encoded_1 = T.buffer_decl([160], dtype="uint8") placeholder_encoded_2 = T.buffer_decl([208], dtype="uint8") placeholder_encoded_3 = T.buffer_decl([96], dtype="uint8") - T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data) - T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data) # body ethosu_write_2 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_encoded[0], 304, placeholder_encoded[304], 304, 12, placeholder_encoded_1[0], 80, placeholder_encoded_1[80], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -364,87 +323,64 @@ def _get_func(): tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True) test_const_size = [value.size for value in list(consts.values())] - assert reference_const_sizes == test_const_size + assert reference_const_sizes.sort() == test_const_size.sort() # fmt: off @tvm.script.ir_module class MixedReadU55: @T.prim_func - def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None: + def main(placeholder: T.Buffer[(8192,), "int8"], buffer_encoded: T.Buffer[(112,), "uint8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer1 = T.buffer_decl([80], "uint8") - buffer2 = T.buffer_decl([32], "uint8") - buffer3 = T.buffer_decl([80], "uint8") - buffer4 = T.buffer_decl([32], "uint8") - buffer5 = T.buffer_decl([80], "uint8") - buffer6 = T.buffer_decl([32], "uint8") - buffer7 = T.buffer_decl([80], "uint8") - buffer8 = T.buffer_decl([32], "uint8") + buffer1 = T.buffer_decl([112], "uint8") + buffer3 = T.buffer_decl([112], "uint8") + buffer5 = T.buffer_decl([112], "uint8") buffer9 = T.buffer_decl([592], "uint8") buffer10 = T.buffer_decl([160], "uint8") - T.preflattened_buffer(placeholder, [1, 16, 16, 32], "int8", data=placeholder.data) - T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], "int8", data=ethosu_write.data) + buffer11 = T.buffer_decl([2048], "int8") # body - p1 = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True}) - p2 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) + p1 = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin":True}) p3 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) - p4 = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True}) - p5 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) - T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 80, p1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", 
buffer2[0], 32, p2[0], dtype="handle")) + p2 = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin":True}) + T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 112, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer9[0], 592, T.int8(-1), T.int8(-1), 12, buffer10[0], 160, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 80, p4[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 32, p5[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 80, T.int8(-1), T.int8(-1), 12, p2[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 80, p1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 32, p2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p4[0], 80, T.int8(-1), T.int8(-1), 12, p5[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 80, p4[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 32, p5[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 80, T.int8(-1), T.int8(-1), 12, p2[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p4[0], 80, T.int8(-1), T.int8(-1), 12, p5[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 112, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, buffer11[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 80, T.int8(-1), T.int8(-1), 12, p1[80], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 112, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, buffer11[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 80, T.int8(-1), T.int8(-1), 12, p2[80], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 
0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded[0], 112, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, buffer11[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 80, T.int8(-1), T.int8(-1), 12, p1[80], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, buffer11[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 80, T.int8(-1), T.int8(-1), 12, p2[80], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None @tvm.script.ir_module class MixedReadU65: @T.prim_func - def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None: + def main(placeholder: T.Buffer[(8192,), "int8"], buffer_encoded: T.Buffer[(128,), "uint8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], "int8", data=ethosu_write.data) - T.preflattened_buffer(placeholder, [1, 16, 16, 32], "int8", data=placeholder.data) + # buffer definition - buffer_encoded_1 = T.buffer_decl([96], dtype="uint8") - buffer_encoded_1_2 = T.buffer_decl([32], dtype="uint8") - placeholder_encoded_1 = T.buffer_decl([608], dtype="uint8") - placeholder_encoded_1_2 = T.buffer_decl([160], dtype="uint8") - buffer_encoded_2_1 = T.buffer_decl([96], dtype="uint8") - buffer_encoded_3_1 = T.buffer_decl([32], dtype="uint8") - buffer_encoded_4_1 = T.buffer_decl([96], dtype="uint8") - buffer_encoded_5_1 = T.buffer_decl([32], dtype="uint8") - buffer_encoded_6_1 = T.buffer_decl([96], dtype="uint8") - buffer_encoded_7_1 = T.buffer_decl([32], dtype="uint8") - placeholder_global = T.allocate([96], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_d_global = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) - ethosu_write_2 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) - placeholder_global_2 = T.allocate([96], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_d_global_2 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 96, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1_2[0], 32, placeholder_d_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_encoded_1[0], 304, placeholder_encoded_1[304], 304, 12, placeholder_encoded_1_2[0], 80, placeholder_encoded_1_2[80], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_2_1[0], 96, placeholder_global_2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_3_1[0], 32, placeholder_d_global_2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, 
ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 48, placeholder_global[48], 48, 12, placeholder_d_global[0], 16, placeholder_d_global[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_4_1[0], 96, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_5_1[0], 32, placeholder_d_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 48, placeholder_global_2[48], 48, 12, placeholder_d_global_2[0], 16, placeholder_d_global_2[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_6_1[0], 96, placeholder_global_2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_7_1[0], 32, placeholder_d_global_2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 48, placeholder_global[48], 48, 12, placeholder_d_global[0], 16, placeholder_d_global[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 48, placeholder_global_2[48], 48, 12, placeholder_d_global_2[0], 16, placeholder_d_global_2[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + buffer1 = T.buffer_decl([128], dtype="uint8") + buffer2 = T.buffer_decl([128], dtype="uint8") + buffer3 = T.buffer_decl([128], dtype="uint8") + buffer4 = T.buffer_decl([608], dtype="uint8") + buffer5 = T.buffer_decl([160], dtype="uint8") + buffer6 = T.buffer_decl([2048], dtype="int8") + p1 = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True}) + p2 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) + p3 = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True}) + T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 128, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer4[0], 304, buffer4[304], 304, 12, buffer5[0], 80, buffer5[80], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, buffer6[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, p1[48], 48, 12, p1[96], 16, p1[112], 16, 0, 0, 
0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 128, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, buffer6[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 48, p3[48], 48, 12, p3[96], 16, p3[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded[0], 128, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, buffer6[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, p1[48], 48, 12, p1[96], 16, p1[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, buffer6[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 48, p3[48], 48, 12, p3[96], 16, p3[112], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None # fmt: on @@ -455,12 +391,12 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), ( "ethos-u55-128", MixedReadU55, - [592, 160, 80, 32, 80, 32, 80, 32, 80, 32], + [592, 160, 112, 112, 112, 112], ), ( "ethos-u65-512", MixedReadU65, - [608, 160, 96, 32, 96, 32, 96, 32, 96, 32], + [608, 160, 128, 128, 128, 128], ), ], ) @@ -512,7 +448,7 @@ def _get_func(): tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True) test_const_size = [value.size for value in list(consts.values())] - assert reference_const_sizes == test_const_size + assert reference_const_sizes.sort() == test_const_size.sort() def test_constant_as_input(): @@ -543,7 +479,7 @@ def get_graph(): # Check tile address for the scalar constant input hasn't been # overwritten. - extern_calls = tir_mod["main"].body.body.body.body.body + extern_calls = tir_mod["main"].body.body.body.body binary_elementwise = extern_calls[-1].value args = binary_elementwise.args diff --git a/tests/python/contrib/test_ethosu/test_merge_constants.py b/tests/python/contrib/test_ethosu/test_merge_constants.py new file mode 100644 index 0000000000000..caf09abdb020c --- /dev/null +++ b/tests/python/contrib/test_ethosu/test_merge_constants.py @@ -0,0 +1,561 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import pytest + +pytest.importorskip("ethosu.vela") + +import tvm +from tvm.script import tir as T +from tvm.relay.backend.contrib.ethosu.tir.passes import MergeConstants +import numpy as np + + +def check_const_dictionaries(const_dict, new_const_dict): + assert list(const_dict) == list(new_const_dict) + for key, value in const_dict.items(): + new_value = new_const_dict[key] + assert len(value) == len(new_value) + for i in range(len(value)): + assert value[i] == new_value[i] + + +def test_only_one_operator(): + # fmt: off + @tvm.script.ir_module + class InputModule: + @T.prim_func + def main(buffer2: T.Buffer[(128,), "uint8"], buffer3: T.Buffer[(32,), "uint8"]) -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([8192], "int8") + buffer10 = T.buffer_decl([2048], "int8") + # body + p1 = T.allocate([128], "uint8", "global") + p4 = T.allocate([32], "uint8", "global") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + + + @tvm.script.ir_module + class ReferenceModule: + @T.prim_func + def main(buffer2: T.Buffer[(160,), "uint8"]) -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([8192], "int8") + buffer10 = T.buffer_decl([2048], "int8") + # body + p4 = T.allocate([160], "uint8", "global") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 160, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p4[0], 128, 12, p4[128], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + # fmt: on + const_dict = { + 0: np.array([0, 10], dtype=np.uint8), + 1: np.array([1, 11], dtype=np.uint8), + } + new_const_dict = {0: np.concatenate((const_dict[0], const_dict[1]))} + test_mod, const_dict = MergeConstants(const_dict)(InputModule) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + check_const_dictionaries(const_dict, new_const_dict) + + +def test_all_operators_with_weights(): + # fmt: off + @tvm.script.ir_module + class InputModule: + @T.prim_func + def main(buffer2: T.Buffer[(128,), "uint8"], buffer3: T.Buffer[(32,), "uint8"], buffer4: T.Buffer[(112,), "uint8"], buffer5: T.Buffer[(32,), "uint8"], buffer6: T.Buffer[(112,), "uint8"], buffer7: T.Buffer[(32,), "uint8"], buffer8: T.Buffer[(112,), "uint8"], buffer9: T.Buffer[(32,), "uint8"]) -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([8192], "int8") + buffer10 = T.buffer_decl([2048], "int8") + # body + p1 = T.allocate([128], "uint8", "global") + p2 = T.allocate([112], "uint8", "global") + p3 = T.allocate([112], "uint8", "global") + p4 = T.allocate([32], "uint8", "global") + p5 = T.allocate([32], "uint8", "global") + p6 = 
T.allocate([32], "uint8", "global") + p7 = T.allocate([112], "uint8", "global") + p8 = T.allocate([3], "uint8", "global") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 112, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 32, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 112, 12, p5[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 112, p7[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer9[0], 32, p8[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, 12, p6[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p7[0], 112, 12, p8[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + + + @tvm.script.ir_module + class ReferenceModule: + @T.prim_func + def main(buffer2: T.Buffer[(160,), "uint8"], buffer4: T.Buffer[(144,), "uint8"], buffer6: T.Buffer[(144,), "uint8"], buffer8: T.Buffer[(144,), "uint8"]) -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([8192], "int8") + buffer10 = T.buffer_decl([2048], "int8") + # body + p4 = T.allocate([160], "uint8", "global") + p7 = T.allocate([144], "uint8", "global") + p10 = T.allocate([144], "uint8", "global") + p11 = T.allocate([144], "uint8", "global") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 160, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 144, p7[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p4[0], 128, 12, p4[128], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 144, p10[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[2], 0, 0, 0, 
T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p7[0], 112, 12, p7[112], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 144, p11[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p10[0], 112, 12, p10[112], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p11[0], 112, 12, p11[112], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + # fmt: on + + const_dict = { + 0: np.array([0], dtype=np.uint8), + 1: np.array([1], dtype=np.uint8), + 2: np.array([2], dtype=np.uint8), + 3: np.array([3], dtype=np.uint8), + 4: np.array([4], dtype=np.uint8), + 5: np.array([5], dtype=np.uint8), + 6: np.array([6], dtype=np.uint8), + 7: np.array([7], dtype=np.uint8), + } + new_const_dict = { + 0: np.concatenate((const_dict[0], const_dict[1])), + 1: np.concatenate((const_dict[2], const_dict[3])), + 2: np.concatenate((const_dict[4], const_dict[5])), + 3: np.concatenate((const_dict[6], const_dict[7])), + } + test_mod, const_dict = MergeConstants(const_dict)(InputModule) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + check_const_dictionaries(const_dict, new_const_dict) + + +def test_operators_with_and_without_weights(): + # fmt: off + @tvm.script.ir_module + class InputModule: + @T.prim_func + def main(buffer2: T.Buffer[(80,), "uint8"], buffer3: T.Buffer[(64,), "uint8"]) -> None: + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer0 = T.buffer_decl([390336], "int8") + buffer1 = T.buffer_decl([97156], "int8") + buffer6 = T.buffer_decl([390336], "int8") + # body + p2 = T.allocate([80], "uint8", "global") + p3 = T.allocate([64], "uint8", "global") + T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, buffer0[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 2, 214, 0, 114, buffer0[0], 0, 0, 0, T.float32(0.00392157), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 5, 214, 0, 114, buffer6[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, 3, 1, 1, 1, 1, 2, p2[0], 80, 0, p3[0], 64, 0, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + + + @tvm.script.ir_module + class ReferenceModule: + @T.prim_func + def main(buffer2: T.Buffer[(144,), "uint8"]) -> None: + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer0 = T.buffer_decl([390336], "int8") + buffer1 = T.buffer_decl([97156], "int8") + buffer6 = T.buffer_decl([390336], "int8") + # body + p3 = T.allocate([144], "uint8", "global") + T.evaluate(T.call_extern("ethosu_pooling", "int8", 
214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, buffer0[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 144, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 2, 214, 0, 114, buffer0[0], 0, 0, 0, T.float32(0.00392157), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 5, 214, 0, 114, buffer6[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, 3, 1, 1, 1, 1, 2, p3[0], 80, 0, p3[80], 64, 0, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + # fmt: on + + const_dict = { + 0: np.array([0], dtype=np.uint8), + 1: np.array([1], dtype=np.uint8), + } + new_const_dict = {0: np.concatenate((const_dict[0], const_dict[1]))} + test_mod, const_dict = MergeConstants(const_dict)(InputModule) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + check_const_dictionaries(const_dict, new_const_dict) + + +def test_copy_to_buffer_with_local_scope(): + # fmt: off + @tvm.script.ir_module + class InputModule: + @T.prim_func + def main(buffer1: T.Buffer[(64,), "uint8"], + buffer2: T.Buffer[(48,), "uint8"], + buffer3: T.Buffer[(256,), "uint8"], + buffer4: T.Buffer[(256,), "uint8"], + buffer5: T.Buffer[(16,), "uint8"], + buffer6: T.Buffer[(48,), "uint8"], + buffer7: T.Buffer[(256,), "uint8"], + buffer8: T.Buffer[(64,), "uint8"], + buffer9: T.Buffer[(256,), "int8"], + ) -> None: + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + # body + p1 = T.allocate([48], "uint8", "global") + p2 = T.allocate([48], "uint8", "global") + p3 = T.allocate([256], "int8", "local") + p5 = T.allocate([16], "uint8", "global") + p6 = T.allocate([48], "uint8", "global") + p7 = T.allocate([256], "int8", "local") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 48, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 48, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle")) # Local + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 16, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 48, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 4, 4, 4, 0, 4, buffer1[0], 0, 0, 0, T.float32(0.00392081), -128, "NHWC", 16, 4, 1, "int8", 4, 4, 4, 4, 0, 4, buffer9[0], 0, 0, 0, T.float32(0.00839574), -128, "NHCWB16", 64, 16, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, 0, p2[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 256, p7[0], dtype="handle")) # Local + T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 4, 4, 4, 4, 0, 4, buffer9[0], 0, 0, 0, T.float32(0.0078125), 0, "NHCWB16", 64, 16, 1, "int8", 4, 4, 4, 4, 0, 4, buffer8[0], 0, 0, 0, T.float32(0.00372155), -128, "NHWC", 16, 4, 1, 1, 1, 1, 1, 1, 1, p5[0], 16, 0, p6[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + + + @tvm.script.ir_module + class ReferenceModule: + @T.prim_func + def main(buffer1: T.Buffer[(64,), "uint8"], + buffer2: T.Buffer[(96,), "uint8"], + buffer4: T.Buffer[(256,), "uint8"], + buffer5: T.Buffer[(64,), "uint8"], + buffer7: T.Buffer[(256,), "uint8"], + buffer8: T.Buffer[(64,), "uint8"], + buffer9: T.Buffer[(256,), "int8"], + ) -> None: + 
T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + # body + p1 = T.allocate([96], "uint8", "global") + p2 = T.allocate([64], "uint8", "global") + p3 = T.allocate([256], "int8", "local") + p7 = T.allocate([256], "int8", "local") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 96, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle")) # Local + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 64, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 4, 4, 4, 0, 4, buffer1[0], 0, 0, 0, T.float32(0.00392081), -128, "NHWC", 16, 4, 1, "int8", 4, 4, 4, 4, 0, 4, buffer9[0], 0, 0, 0, T.float32(0.00839574), -128, "NHCWB16", 64, 16, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, 0, p1[48], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 256, p7[0], dtype="handle")) # Local + T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 4, 4, 4, 4, 0, 4, buffer9[0], 0, 0, 0, T.float32(0.0078125), 0, "NHCWB16", 64, 16, 1, "int8", 4, 4, 4, 4, 0, 4, buffer8[0], 0, 0, 0, T.float32(0.00372155), -128, "NHWC", 16, 4, 1, 1, 1, 1, 1, 1, 1, p2[0], 16, 0, p2[16], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + # fmt: on + + const_dict = { + 1: np.array([1], dtype=np.uint8), + 2: np.array([2], dtype=np.uint8), + 3: np.array([3], dtype=np.uint8), + 4: np.array([4], dtype=np.uint8), + 5: np.array([5], dtype=np.uint8), + 6: np.array([6], dtype=np.uint8), + } + new_const_dict = { + 1: np.concatenate((const_dict[1], const_dict[2])), + 2: const_dict[3], + 3: np.concatenate((const_dict[4], const_dict[5])), + 4: const_dict[6], + } + test_mod, const_dict = MergeConstants(const_dict)(InputModule) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + check_const_dictionaries(const_dict, new_const_dict) + + +def test_no_copies(): + # fmt: off + @tvm.script.ir_module + class InputModule: + @T.prim_func + def main() -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + placeholder = T.buffer_decl([20], "int8") + ethosu_write = T.buffer_decl([16], "int8") + # body + ethosu_write_4 = T.allocate([16], "int8", "global") + T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 1, 4, 4, 1, 0, 4, placeholder[0], 0, 0, 0, T.float32(0.00783747), -128, "NHWC", 1, 4, 1, "int8", 1, 4, 1, 1, 0, 4, placeholder[16], 0, 0, 0, T.float32(0.00783747), -128, "NHWC", 1, 1, 1, "int8", 1, 4, 4, 1, 0, 4, ethosu_write_4[0], 0, 0, 0, T.float32(0.00783747), -128, "NHWC", 1, 4, 1, "MAX", 0, "CLIP", -128, 127, "TFL", 1, 4, 4, dtype="handle")) + T.evaluate(T.call_extern("ethosu_identity", "int8", 1, 4, 4, 1, 0, 4, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "int8", 1, 4, 4, 1, 0, 4, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + + @tvm.script.ir_module + class ReferenceModule: + @T.prim_func + def main() -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + placeholder = T.buffer_decl([20], "int8") + ethosu_write = T.buffer_decl([16], "int8") + # body + ethosu_write_4 = T.allocate([16], "int8", "global") + T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 1, 4, 4, 1, 0, 4, placeholder[0], 0, 0, 0, 
T.float32(0.00783747), -128, "NHWC", 1, 4, 1, "int8", 1, 4, 1, 1, 0, 4, placeholder[16], 0, 0, 0, T.float32(0.00783747), -128, "NHWC", 1, 1, 1, "int8", 1, 4, 4, 1, 0, 4, ethosu_write_4[0], 0, 0, 0, T.float32(0.00783747), -128, "NHWC", 1, 4, 1, "MAX", 0, "CLIP", -128, 127, "TFL", 1, 4, 4, dtype="handle")) + T.evaluate(T.call_extern("ethosu_identity", "int8", 1, 4, 4, 1, 0, 4, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "int8", 1, 4, 4, 1, 0, 4, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + # fmt: on + + const_dict = {} + new_const_dict = {} + test_mod, const_dict = MergeConstants(const_dict)(InputModule) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + check_const_dictionaries(const_dict, new_const_dict) + + +def test_copies_to_the_same_buffer(): + # fmt: off + @tvm.script.ir_module + class InputModule: + @T.prim_func + def main(buffer2: T.Buffer[(128,), "uint8"], buffer3: T.Buffer[(32,), "uint8"]) -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([8192], "int8") + buffer10 = T.buffer_decl([2048], "int8") + # body + p1 = T.allocate([128], "uint8", "global") + p4 = T.allocate([32], "uint8", "global") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + + + @tvm.script.ir_module + class ReferenceModule: + @T.prim_func + def main(buffer2: T.Buffer[(160,), "uint8"]) -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([8192], "int8") + buffer10 = T.buffer_decl([2048], "int8") + # body + p5 = T.allocate([160], "uint8", "global") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 160, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p5[0], 128, 12, p5[128], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 160, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p5[0], 128, 
12, p5[128], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + # fmt: on + + const_dict = { + 0: np.array([0], dtype=np.uint8), + 1: np.array([1], dtype=np.uint8), + } + new_const_dict = {0: np.concatenate((const_dict[0], const_dict[1]))} + test_mod, const_dict = MergeConstants(const_dict)(InputModule) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + check_const_dictionaries(const_dict, new_const_dict) + + +def test_read_from_the_same_buffer(): + # fmt: off + @tvm.script.ir_module + class InputModule: + @T.prim_func + def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(368,), "uint8"], buffer2: T.Buffer[(96,), "uint8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + # buffer definition + T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data) + T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data) + # body + p1 = T.allocate([368], "uint8", "global") + p2 = T.allocate([96], "uint8", "global") + T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 368, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 96, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 192, p1[192], 176, 12, p2[0], 48, p2[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + __tvm_meta__ = None + + + @tvm.script.ir_module + class ReferenceModule: + @T.prim_func + def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(464,), "uint8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + # body + p1 = T.allocate([464], "uint8", "global") + T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 464, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 192, p1[192], 176, 12, p1[368], 48, p1[416], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + __tvm_meta__ = None + # fmt: on + + const_dict = { + 1: np.array([1], dtype=np.uint8), + 2: np.array([2], dtype=np.uint8), + } + new_const_dict = {1: np.concatenate((const_dict[1], const_dict[2]))} + test_mod, const_dict = MergeConstants(const_dict)(InputModule) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + check_const_dictionaries(const_dict, new_const_dict) + + +def test_cycle_count(): + # fmt: off + @tvm.script.ir_module + class InputModule: + @T.prim_func + def main(buffer2: T.Buffer[(128,), "uint8"], buffer3: T.Buffer[(32,), "uint8"], buffer4: T.Buffer[(112,), "uint8"], buffer5: T.Buffer[(32,), "uint8"], buffer6: T.Buffer[(112,), "uint8"], buffer7: T.Buffer[(32,), "uint8"], buffer8: T.Buffer[(112,), "uint8"], buffer9: T.Buffer[(32,), "uint8"]) -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + v1a = T.var("int32") + v1b = 
T.var("int32") + v1c = T.var("int32") + v2a = T.var("int32") + v2b = T.var("int32") + v2c = T.var("int32") + v3a = T.var("int32") + v3b = T.var("int32") + v3c = T.var("int32") + v4a = T.var("int32") + v4b = T.var("int32") + v4c = T.var("int32") + buffer1 = T.buffer_decl([8192], "int8") + buffer10 = T.buffer_decl([2048], "int8") + # body + p1 = T.allocate([128], "uint8", "global") + p2 = T.allocate([112], "uint8", "global") + p3 = T.allocate([112], "uint8", "global") + p4 = T.allocate([32], "uint8", "global") + p5 = T.allocate([32], "uint8", "global") + p6 = T.allocate([32], "uint8", "global") + p7 = T.allocate([112], "uint8", "global") + p8 = T.allocate([3], "uint8", "global") + with T.attr(T.iter_var(v1a, None, "DataPar", ""), "pragma_compute_cycles_hint", 100): + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle")) + with T.attr(T.iter_var(v1b, None, "DataPar", ""), "pragma_compute_cycles_hint", 101): + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle")) + with T.attr(T.iter_var(v2a, None, "DataPar", ""), "pragma_compute_cycles_hint", 102): + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], dtype="handle")) + with T.attr(T.iter_var(v2b, None, "DataPar", ""), "pragma_compute_cycles_hint", 103): + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p5[0], dtype="handle")) + with T.attr(T.iter_var(v1c, None, "DataPar", ""), "pragma_compute_cycles_hint", 300): + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + with T.attr(T.iter_var(v3a, None, "DataPar", ""), "pragma_compute_cycles_hint", 104): + T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 112, p3[0], dtype="handle")) + with T.attr(T.iter_var(v3b, None, "DataPar", ""), "pragma_compute_cycles_hint", 105): + T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 32, p6[0], dtype="handle")) + with T.attr(T.iter_var(v2c, None, "DataPar", ""), "pragma_compute_cycles_hint", 301): + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 112, 12, p5[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + with T.attr(T.iter_var(v4a, None, "DataPar", ""), "pragma_compute_cycles_hint", 106): + T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 112, p7[0], dtype="handle")) + with T.attr(T.iter_var(v4b, None, "DataPar", ""), "pragma_compute_cycles_hint", 107): + T.evaluate(T.call_extern("ethosu_copy", buffer9[0], 32, p8[0], dtype="handle")) + with T.attr(T.iter_var(v3c, None, "DataPar", ""), "pragma_compute_cycles_hint", 302): + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, 12, p6[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + with T.attr(T.iter_var(v4c, None, "DataPar", ""), "pragma_compute_cycles_hint", 303): + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 
1, "int8", 16, 16, 2, 16, 0, 16, buffer10[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p7[0], 112, 12, p8[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + + + @tvm.script.ir_module + class ReferenceModule: + @T.prim_func + def main(buffer2: T.Buffer[(160,), "uint8"], buffer4: T.Buffer[(144,), "uint8"], buffer6: T.Buffer[(144,), "uint8"], buffer8: T.Buffer[(144,), "uint8"]) -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + v1a = T.var("int32") + v1c = T.var("int32") + v2a = T.var("int32") + v2c = T.var("int32") + v3a = T.var("int32") + v3c = T.var("int32") + v4a = T.var("int32") + v4c = T.var("int32") + buffer1 = T.buffer_decl([8192], "int8") + buffer10 = T.buffer_decl([2048], "int8") + # body + p4 = T.allocate([160], "uint8", "global") + p7 = T.allocate([144], "uint8", "global") + p10 = T.allocate([144], "uint8", "global") + p11 = T.allocate([144], "uint8", "global") + with T.attr(T.iter_var(v1a, None, "DataPar", ""), "pragma_compute_cycles_hint", 201): + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 160, p4[0], dtype="handle")) + with T.attr(T.iter_var(v2a, None, "DataPar", ""), "pragma_compute_cycles_hint", 205): + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 144, p7[0], dtype="handle")) + with T.attr(T.iter_var(v1c, None, "DataPar", ""), "pragma_compute_cycles_hint", 300): + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p4[0], 128, 12, p4[128], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + with T.attr(T.iter_var(v3a, None, "DataPar", ""), "pragma_compute_cycles_hint", 209): + T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 144, p10[0], dtype="handle")) + with T.attr(T.iter_var(v2c, None, "DataPar", ""), "pragma_compute_cycles_hint", 301): + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p7[0], 112, 12, p7[112], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + with T.attr(T.iter_var(v4a, None, "DataPar", ""), "pragma_compute_cycles_hint", 213): + T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 144, p11[0], dtype="handle")) + with T.attr(T.iter_var(v3c, None, "DataPar", ""), "pragma_compute_cycles_hint", 302): + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p10[0], 112, 12, p10[112], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + with T.attr(T.iter_var(v4c, None, "DataPar", ""), "pragma_compute_cycles_hint", 303): + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p11[0], 112, 12, p11[112], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + # fmt: on + + const_dict = { + 0: np.array([0], dtype=np.uint8), + 1: np.array([1], dtype=np.uint8), + 2: np.array([2], 
dtype=np.uint8), + 3: np.array([3], dtype=np.uint8), + 4: np.array([4], dtype=np.uint8), + 5: np.array([5], dtype=np.uint8), + 6: np.array([6], dtype=np.uint8), + 7: np.array([7], dtype=np.uint8), + } + new_const_dict = { + 0: np.concatenate((const_dict[0], const_dict[1])), + 1: np.concatenate((const_dict[2], const_dict[3])), + 2: np.concatenate((const_dict[4], const_dict[5])), + 3: np.concatenate((const_dict[6], const_dict[7])), + } + test_mod, const_dict = MergeConstants(const_dict)(InputModule) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + check_const_dictionaries(const_dict, new_const_dict) + + +def test_multiple_prim_funcs(): + # fmt: off + @tvm.script.ir_module + class InputModule: + @T.prim_func + def main(): + T.evaluate(0) + + @T.prim_func + def abc(): + T.evaluate(0) + # fmt: on + + err_rgx = ( + r"Expected a single primitive function called 'main'. " + r"Please run the MergeConstants pass in conjunction with the LowerToTIR\(\) pass." + ) + with pytest.raises(tvm.TVMError, match=err_rgx): + MergeConstants({})(InputModule) + + +def test_no_main_prim_func(): + # fmt: off + @tvm.script.ir_module + class InputModule: + @T.prim_func + def abs(): + T.evaluate(0) + # fmt: on + + err_rgx = ( + r"Expected a single primitive function called 'main'. " + r"Please run the MergeConstants pass in conjunction with the LowerToTIR\(\) pass." + ) + with pytest.raises(tvm.TVMError, match=err_rgx): + MergeConstants({})(InputModule) diff --git a/tests/python/contrib/test_ethosu/test_networks.py b/tests/python/contrib/test_ethosu/test_networks.py index 075565cd92a6c..02643f6c1ded1 100644 --- a/tests/python/contrib/test_ethosu/test_networks.py +++ b/tests/python/contrib/test_ethosu/test_networks.py @@ -44,13 +44,13 @@ @pytest.mark.parametrize( "accel_type, model_url, workspace_size", [ - ("ethos-u65-256", MOBILENET_V1_URL, 1892704), - ("ethos-u65-256", MOBILENET_V2_URL, 2257984), - ("ethos-u55-256", MOBILENET_V1_URL, 1892704), - ("ethos-u55-256", MOBILENET_V2_URL, 2257984), - ("ethos-u55-128", MOBILENET_V2_URL, 2257984), - ("ethos-u55-64", MOBILENET_V2_URL, 2257984), - ("ethos-u55-32", MOBILENET_V2_URL, 2258000), + ("ethos-u65-256", MOBILENET_V1_URL, 1793376), + ("ethos-u65-256", MOBILENET_V2_URL, 2218160), + ("ethos-u55-256", MOBILENET_V1_URL, 1793376), + ("ethos-u55-256", MOBILENET_V2_URL, 2218160), + ("ethos-u55-128", MOBILENET_V2_URL, 2218160), + ("ethos-u55-64", MOBILENET_V2_URL, 2218160), + ("ethos-u55-32", MOBILENET_V2_URL, 2218160), ], ) def test_networks_without_usmp(accel_type, model_url, workspace_size): diff --git a/tests/python/contrib/test_ethosu/test_remove_concatenates.py b/tests/python/contrib/test_ethosu/test_remove_concatenates.py index cc996e59412ce..d2c759a0ae4dc 100644 --- a/tests/python/contrib/test_ethosu/test_remove_concatenates.py +++ b/tests/python/contrib/test_ethosu/test_remove_concatenates.py @@ -41,9 +41,6 @@ def main(placeholder: T.Buffer[(1536,), "int8"], placeholder_1: T.Buffer[(1280,) buffer_5 = T.buffer_decl([160], "uint8") buffer_6 = T.buffer_decl([2992], "uint8") buffer_7 = T.buffer_decl([160], "uint8") - T.preflattened_buffer(placeholder, [1, 8, 12, 16], "int8", data=placeholder.data) - T.preflattened_buffer(placeholder_1, [1, 8, 10, 16], "int8", data=placeholder_1.data) - T.preflattened_buffer(T_concat, [1, 8, 32, 16], "int8", data=T_concat.data) # body T_concat_1 = T.allocate([2816], "int8", "global", annotations={"disable_lower_builtin":True}) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 10, 16, 8, 0, 
10, placeholder_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 160, 16, 1, "int8", 8, 10, 16, 8, 0, 10, T_concat_1[192], 0, 0, 0, T.float32(0.25), 14, "NHWC", 352, 16, 1, 3, 3, 1, 1, 1, 1, buffer[0], 2992, T.int8(-1), T.int8(-1), 12, buffer_1[0], 160, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) diff --git a/tests/python/contrib/test_ethosu/test_replace_conv2d.py b/tests/python/contrib/test_ethosu/test_replace_conv2d.py index 63f9fc44c7788..46a3c5a15bf5b 100644 --- a/tests/python/contrib/test_ethosu/test_replace_conv2d.py +++ b/tests/python/contrib/test_ethosu/test_replace_conv2d.py @@ -373,8 +373,6 @@ def main(placeholder_5: T.Buffer[(192,), "int8"], ethosu_write_1: T.Buffer[(512, buffer_1 = T.buffer_decl([80], "uint8") buffer_2 = T.buffer_decl([320], "uint8") buffer_3 = T.buffer_decl([160], "uint8") - T.preflattened_buffer(placeholder_5, [1, 8, 8, 3], 'int8', data=placeholder_5.data) - T.preflattened_buffer(ethosu_write_1, [1, 8, 8, 8], 'int8', data=ethosu_write_1.data) # body ethosu_write_2 = T.allocate([1024], "int8", "global", annotations={"disable_lower_builtin": True}) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 4, 3, 8, 0, 4, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 8, 4, 32, 8, 0, 4, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 32, 1, 1, 1, 1, 1, 1, 1, buffer_3[0], 160, T.int8(-1), T.int8(-1), 12, buffer_2[0], 320, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -394,8 +392,6 @@ def main(placeholder_5: T.Buffer[(192,), "int8"], ethosu_write_1: T.Buffer[(512, buffer_1 = T.buffer_decl([320], "uint8") buffer_2 = T.buffer_decl([1312], "uint8") buffer_3 = T.buffer_decl([2608], "uint8") - T.preflattened_buffer(placeholder_5, [1, 8, 8, 3], 'int8', data=placeholder_5.data) - T.preflattened_buffer(ethosu_write_1, [1, 8, 8, 8], 'int8', data=ethosu_write_1.data) # body ethosu_write_2 = T.allocate([1536], "int8", "global", annotations={"disable_lower_builtin": True}) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 5, 8, 32, 5, 0, 8, ethosu_write_2[256], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 32, 1, 3, 3, 1, 1, 1, 1, buffer_2[0], 1312, T.int8(-1), T.int8(-1), 12, buffer_1[0], 320, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -415,8 +411,6 @@ def main(placeholder_5: T.Buffer[(768,), "int8"], ethosu_write_1: T.Buffer[(640, buffer_1 = T.buffer_decl([80], "uint8") buffer_2 = T.buffer_decl([320], "uint8") buffer_3 = T.buffer_decl([880], "uint8") - T.preflattened_buffer(placeholder_5, [1, 16, 16, 3], 'int8', data=placeholder_5.data) - T.preflattened_buffer(ethosu_write_1, [1, 20, 4, 8], 'int8', data=ethosu_write_1.data) # body ethosu_write_2 = T.allocate([2560], "int8", "global", annotations={"disable_lower_builtin": True}) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 16, 3, 8, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 48, 3, 1, "int8", 8, 8, 32, 8, 0, 8, ethosu_write_2[512], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 32, 1, 2, 3, 2, 1, 2, 1, buffer_3[0], 880, T.int8(-1), T.int8(-1), 12, buffer_2[0], 320, T.int8(-1), T.int8(-1), 2, 1, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -438,8 +432,6 @@ def main(placeholder_5: T.Buffer[(1024,), "int8"], ethosu_write_1: T.Buffer[(204 buffer_1 = T.buffer_decl([352], "uint8") buffer_2 = T.buffer_decl([272], "uint8") buffer_3 = 
T.buffer_decl([11040], "uint8") - T.preflattened_buffer(placeholder_5, [1, 8, 1, 8, 16], 'int8', data=placeholder_5.data) - T.preflattened_buffer(ethosu_write_1, [1, 8, 2, 8, 16], 'int8', data=ethosu_write_1.data) # body ethosu_write_2 = T.allocate([2304], "int8", "global", annotations={"disable_lower_builtin": True}) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 128, 16, 1, "int8", 5, 8, 35, 5, 0, 8, ethosu_write_2[384], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 384, 16, 128, 3, 3, 1, 1, 1, 1, buffer[0], 1456, T.int8(-1), T.int8(-1), 12, buffer_1[0], 352, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -459,8 +451,6 @@ def main(placeholder: T.Buffer[(192,), "int8"], ethosu_write: T.Buffer[(8192,), buffer_1 = T.buffer_decl([320], "uint8") buffer_2 = T.buffer_decl([304], "uint8") buffer_3 = T.buffer_decl([80], "uint8") - T.preflattened_buffer(placeholder, [1, 8, 8, 3], 'int8', data=placeholder.data) - T.preflattened_buffer(ethosu_write, [1, 32, 32, 8], 'int8', data=ethosu_write.data) # body ethosu_write_1 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 8, 3, 4, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 8, 16, 32, 8, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 512, 32, 1, 1, 1, 1, 1, 1, 1, buffer[0], 160, T.int8(-1), T.int8(-1), 12, buffer_1[0], 320, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "ZEROS", 0, 0, 0, dtype="handle")) @@ -480,8 +470,6 @@ def main(placeholder: T.Buffer[(1024,), "int8"], ethosu_write: T.Buffer[(32768,) buffer_1 = T.buffer_decl([352], "uint8") buffer_2 = T.buffer_decl([11040], "uint8") buffer_3 = T.buffer_decl([272], "uint8") - T.preflattened_buffer(placeholder, [1, 8, 1, 8, 16], 'int8', data=placeholder.data) - T.preflattened_buffer(ethosu_write, [1, 32, 2, 32, 16], 'int8', data=ethosu_write.data) # body ethosu_write_1 = T.allocate([12288], "int8", "global", annotations={"disable_lower_builtin":True}) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 8, 3, 8, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 128, 16, 1, "int8", 16, 16, 35, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 768, 16, 256, 3, 3, 1, 1, 1, 1, buffer[0], 1456, T.int8(-1), T.int8(-1), 12, buffer_1[0], 352, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NEAREST", 0, 0, 0, dtype="handle")) @@ -641,8 +629,6 @@ def main(placeholder_3: T.Buffer[(960,), "int8"], ethosu_write_1: T.Buffer[(1024 T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) buffer = T.buffer_decl([848], "uint8") buffer_1 = T.buffer_decl([160], "uint8") - T.preflattened_buffer(placeholder_3, [1, 10, 12, 8], 'int8', data=placeholder_3.data) - T.preflattened_buffer(ethosu_write_1, [1, 8, 8, 16], 'int8', data=ethosu_write_1.data) # body T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 8, 4, 8, 0, 8, placeholder_3[120], 0, 0, 0, T.float32(0.5), 10, "NHWC", 96, 8, 1, "int8", 8, 8, 16, 8, 0, 8, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 16, 1, 3, 3, 1, 1, 1, 1, buffer[0], 848, T.int8(-1), T.int8(-1), 12, buffer_1[0], 160, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None @@ -656,8 +642,6 @@ def main(placeholder_3: T.Buffer[(315,), "int8"], ethosu_write_1: T.Buffer[(240, 
T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) buffer = T.buffer_decl([160], "uint8") buffer_1 = T.buffer_decl([656], "uint8") - T.preflattened_buffer(placeholder_3, [1, 7, 9, 5], 'int8', data=placeholder_3.data) - T.preflattened_buffer(ethosu_write_1, [1, 3, 5, 16], 'int8', data=ethosu_write_1.data) # body T.evaluate(T.call_extern("ethosu_conv2d", "int8", 3, 5, 3, 3, 0, 5, placeholder_3[146], 0, 0, 0, T.float32(0.5), 10, "NHWC", 45, 5, 1, "int8", 3, 5, 16, 3, 0, 5, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 80, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 656, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None @@ -700,8 +684,6 @@ def main(placeholder_3: T.Buffer[(192,), "int8"], ethosu_write_1: T.Buffer[(768, T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) buffer = T.buffer_decl([160], "uint8") buffer_1 = T.buffer_decl([848], "uint8") - T.preflattened_buffer(placeholder_3, [4, 6, 8, 1], 'int8', data=placeholder_3.data) - T.preflattened_buffer(ethosu_write_1, [1, 8, 6, 16], 'int8', data=ethosu_write_1.data) # body T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 848, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, placeholder_3[72], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, ethosu_write_1[384], 0, 0, 0, T.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 848, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -716,8 +698,6 @@ def main(placeholder_3: T.Buffer[(192,), "int8"], ethosu_write_1: T.Buffer[(768, T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) buffer = T.buffer_decl([160], "uint8") buffer_1 = T.buffer_decl([848], "uint8") - T.preflattened_buffer(placeholder_3, [1, 24, 8], 'int8', data=placeholder_3.data) - T.preflattened_buffer(ethosu_write_1, [1, 8, 6, 16], 'int8', data=ethosu_write_1.data) # body T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 848, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, placeholder_3[72], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, ethosu_write_1[384], 0, 0, 0, T.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 848, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -732,8 +712,6 @@ def main(placeholder_3: T.Buffer[(192,), "int8"], ethosu_write_1: T.Buffer[(768, T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) buffer = T.buffer_decl([160], "uint8") buffer_1 = T.buffer_decl([848], 
"uint8") - T.preflattened_buffer(placeholder_3, [192, 1], 'int8', data=placeholder_3.data) - T.preflattened_buffer(ethosu_write_1, [1, 8, 6, 16], 'int8', data=ethosu_write_1.data) # body T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 848, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, placeholder_3[72], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, ethosu_write_1[384], 0, 0, 0, T.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 848, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -748,8 +726,6 @@ def main(placeholder_3: T.Buffer[(192,), "int8"], ethosu_write_1: T.Buffer[(768, T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) buffer = T.buffer_decl([160], "uint8") buffer_1 = T.buffer_decl([848], "uint8") - T.preflattened_buffer(placeholder_3, [192], 'int8', data=placeholder_3.data) - T.preflattened_buffer(ethosu_write_1, [1, 8, 6, 16], 'int8', data=ethosu_write_1.data) # body T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 848, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, placeholder_3[72], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, ethosu_write_1[384], 0, 0, 0, T.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 848, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) diff --git a/tests/python/contrib/test_ethosu/test_replace_copy.py b/tests/python/contrib/test_ethosu/test_replace_copy.py index 932df71d24029..6b97b38d80e6d 100644 --- a/tests/python/contrib/test_ethosu/test_replace_copy.py +++ b/tests/python/contrib/test_ethosu/test_replace_copy.py @@ -34,16 +34,11 @@ class ReferenceModule: def main(placeholder_3: T.Buffer[(8192,), "int8"], ethosu_write_1: T.Buffer[(2048,), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([80], "uint8") - buffer_1 = T.buffer_decl([304], "uint8") - T.preflattened_buffer(placeholder_3, [1, 16, 16, 32], dtype="int8", data=placeholder_3.data) - T.preflattened_buffer(ethosu_write_1, [1, 16, 16, 8], dtype="int8", data=ethosu_write_1.data) + buffer_1 = T.buffer_decl([384], "uint8") # body - placeholder_global = T.allocate([304], "uint8", "global", annotations={"disable_lower_builtin": True}) - placeholder_d_global = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin": True}) - T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 304, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer[0], 80, placeholder_d_global[0], dtype="handle")) - 
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 8, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 304, T.int8(-1), T.int8(-1), 12, placeholder_d_global[0], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + placeholder_global = T.allocate([384], "uint8", "global", annotations={"disable_lower_builtin": True}) + T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 384, placeholder_global[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 8, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 304, T.int8(-1), T.int8(-1), 12, placeholder_global[304], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None # fmt: on @@ -80,23 +75,15 @@ class WeightStream: def main(placeholder_5: T.Buffer[(8192,), "int8"], ethosu_write_1: T.Buffer[(4096,), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([416], "uint8") - buffer_1 = T.buffer_decl([112], "uint8") - buffer_2 = T.buffer_decl([272], "uint8") - buffer_3 = T.buffer_decl([64], "uint8") - T.preflattened_buffer(placeholder_5, [1, 16, 16, 32], dtype="int8", data=placeholder_5.data) - T.preflattened_buffer(ethosu_write_1, [1, 16, 16, 16], dtype="int8", data=ethosu_write_1.data) + buffer = T.buffer_decl([528], "uint8") + buffer_2 = T.buffer_decl([336], "uint8") # body - placeholder_global_unrolled_iter_0 = T.allocate([416], "uint8", "global", annotations={"disable_lower_builtin": True}) - placeholder_d_global_unrolled_iter_0 = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin": True}) - placeholder_global_unrolled_iter_1 = T.allocate([272], "uint8", "global", annotations={"disable_lower_builtin": True}) - placeholder_d_global_unrolled_iter_1 = T.allocate([64], "uint8", "global", annotations={"disable_lower_builtin": True}) - T.evaluate(T.call_extern("ethosu_copy", buffer[0], 416, placeholder_global_unrolled_iter_0[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 112, placeholder_d_global_unrolled_iter_0[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_2[0], 272, placeholder_global_unrolled_iter_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_3[0], 64, placeholder_d_global_unrolled_iter_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 10, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_global_unrolled_iter_0[0], 416, T.int8(-1), T.int8(-1), 12, placeholder_d_global_unrolled_iter_0[0], 112, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 6, 16, 0, 16, ethosu_write_1[10], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_global_unrolled_iter_1[0], 272, T.int8(-1), 
T.int8(-1), 12, placeholder_d_global_unrolled_iter_1[0], 64, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + placeholder_d_global = T.allocate([528], "uint8", "global", annotations={"disable_lower_builtin": True}) + placeholder_d_global_1 = T.allocate([336], "uint8", "global", annotations={"disable_lower_builtin": True}) + T.evaluate(T.call_extern("ethosu_copy", buffer[0], 528, placeholder_d_global[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_2[0], 336, placeholder_d_global_1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 10, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_d_global[0], 416, T.int8(-1), T.int8(-1), 12, placeholder_d_global[416], 112, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 6, 16, 0, 16, ethosu_write_1[10], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_d_global_1[0], 272, T.int8(-1), T.int8(-1), 12, placeholder_d_global_1[272], 64, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None # fmt: on diff --git a/tests/python/contrib/test_ethosu/test_scheduler.py b/tests/python/contrib/test_ethosu/test_scheduler.py index 4baea26e591ef..ba050de2b4731 100644 --- a/tests/python/contrib/test_ethosu/test_scheduler.py +++ b/tests/python/contrib/test_ethosu/test_scheduler.py @@ -182,24 +182,16 @@ class DiamondGraphTir: @T.prim_func def main(placeholder: T.Buffer[(301056,), "int8"], ethosu_write: T.Buffer[(75264,), "int8"]) -> None: T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - T.preflattened_buffer(placeholder, [1, 56, 56, 96], dtype='int8', data=placeholder.data) - T.preflattened_buffer(ethosu_write, [1, 56, 56, 24], dtype='int8', data=ethosu_write.data) - buffer1 = T.buffer_decl([2608], "uint8") - buffer2 = T.buffer_decl([240], "uint8") - buffer3 = T.buffer_decl([736], "uint8") - buffer4 = T.buffer_decl([240], "uint8") - p1 = T.allocate([2608], "uint8", "global", annotations={"disable_lower_builtin":True}) - p2 = T.allocate([240], "uint8", "global", annotations={"disable_lower_builtin":True}) - p3 = T.allocate([736], "uint8", "global", annotations={"disable_lower_builtin":True}) - p4 = T.allocate([240], "uint8", "global", annotations={"disable_lower_builtin":True}) + buffer1 = T.buffer_decl([2848], "uint8") + buffer3 = T.buffer_decl([976], "uint8") + p1 = T.allocate([2848], "uint8", "global", annotations={"disable_lower_builtin":True}) + p2 = T.allocate([976], "uint8", "global", annotations={"disable_lower_builtin":True}) p5 = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin":True}) p6 = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin":True}) - T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 2608, p1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 240, p2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 736, p3[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 240, p4[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 96, 
56, 0, 56, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 5376, 96, 1, "int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, p1[0], 2608, T.int8(-1), T.int8(-1), 12, p2[0], 240, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, p6[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, p3[0], 736, T.int8(-1), T.int8(-1), 12, p4[0], 240, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 2848, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 976, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 96, 56, 0, 56, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 5376, 96, 1, "int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, p1[0], 2608, T.int8(-1), T.int8(-1), 12, p1[2608], 240, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, p6[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, p2[0], 736, T.int8(-1), T.int8(-1), 12, p2[736], 240, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0,T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, p6[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "ADD", 0, "NONE", 0, 0, "TFL", 0, 0, 0, dtype="handle")) __tvm_meta__ = None # fmt: on From c1706a933e93737f66e6589941f878e922d38197 Mon Sep 17 00:00:00 2001 From: Mark Shields <87091372+mbs-octoml@users.noreply.github.com> Date: Tue, 12 Jul 2022 09:46:33 -0700 Subject: [PATCH 094/111] [Collage] PartitionRule (though without CombinePartitionRule) (#11993) * [Collage] PartitionRule (though without CombinePartitionRule) See https://github.com/apache/tvm-rfcs/blob/main/rfcs/0062-collage.md. (Special thanks to Matthew Barrett for authoring partition_rule_test.cc and suggesting a PR partitioning strategy.) Collage uses a small 'combinator library' of PartitionRule to describe how candidate partitions can be extracted from a model for measurement and comparison. This introduces most of that machinery; however, we defer the all-important 'CombinePartitionRule' for the next PR. Thus the rules at this stage can only express the sorts of DFPattern-based rules we find in most BYOC integrations, and cannot describe rules more traditionally associated with operator fusion. Based on #11981.
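For example, a BYOC-style strategy can be assembled by composition (an
illustrative sketch only, using the constructors introduced in
partition_rule.h; the "my_byoc" spec name and the conv2d pattern are
hypothetical):

  PrimitivePartitionRule(
      "my_byoc",
      UnionPartitionRule(
          "", {CompositePartitionRule(
                   "conv2d",
                   DFPatternPartitionRule(
                       "", IsOp("nn.conv2d")({IsWildcard(), IsWildcard()})))}));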
* - Backport improvements to partition_rule_test.cc * - Oops --- src/relay/collage/candidate_partition.cc | 258 ++++++++++++ src/relay/collage/candidate_partition.h | 180 +++++++++ src/relay/collage/candidate_set.cc | 76 ++++ src/relay/collage/candidate_set.h | 99 +++++ src/relay/collage/cost.cc | 45 +++ src/relay/collage/cost.h | 103 +++++ src/relay/collage/partition_rule.cc | 372 ++++++++++++++++++ src/relay/collage/partition_rule.h | 355 +++++++++++++++++ src/relay/collage/partition_spec.cc | 87 ++++ src/relay/collage/partition_spec.h | 120 ++++++ .../cpp/relay/collage/partition_rule_test.cc | 303 ++++++++++++++ 11 files changed, 1998 insertions(+) create mode 100644 src/relay/collage/candidate_partition.cc create mode 100644 src/relay/collage/candidate_partition.h create mode 100644 src/relay/collage/candidate_set.cc create mode 100644 src/relay/collage/candidate_set.h create mode 100644 src/relay/collage/cost.cc create mode 100644 src/relay/collage/cost.h create mode 100644 src/relay/collage/partition_rule.cc create mode 100644 src/relay/collage/partition_rule.h create mode 100644 src/relay/collage/partition_spec.cc create mode 100644 src/relay/collage/partition_spec.h create mode 100644 tests/cpp/relay/collage/partition_rule_test.cc diff --git a/src/relay/collage/candidate_partition.cc b/src/relay/collage/candidate_partition.cc new file mode 100644 index 0000000000000..9cccdf96d5a49 --- /dev/null +++ b/src/relay/collage/candidate_partition.cc @@ -0,0 +1,258 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/candidate_partition.cc + * \brief A potential partition in the Collage search.
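+ *
+ * Construction sketch (constructor as declared in candidate_partition.h; the cost
+ * argument defaults to Cost::Unknown() until the candidate has been estimated):
+ * \code
+ * CandidatePartition candidate("some_rule", sub_graph, spec);
+ * \endcode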
+ */ + +#include "./candidate_partition.h" + +#include + +#include "./candidate_set.h" +#include "./partition_rule.h" +#include "./partition_spec.h" +#include "./utils.h" + +namespace tvm { +namespace relay { +namespace collage { + +TVM_REGISTER_NODE_TYPE(CandidatePartitionNode); + +void CandidatePartitionNode::VisitAttrs(AttrVisitor* v) { + v->Visit("rule_name", &rule_name_); + v->Visit("sub_graph", &sub_graph_); + v->Visit("spec", &spec_); + // TODO(mbs): cost_ +} + +PartitionSpec CandidatePartitionNode::partition_spec() const { + return Downcast(spec_); +} + +std::string CandidatePartitionNode::partition_spec_name() const { + return Downcast(spec_)->spec_name_; +} + +Target CandidatePartitionNode::target() const { return Downcast(spec_)->target_; } + +std::string CandidatePartitionNode::ToSummary(const DataflowGraph& dataflow_graph) const { + std::ostringstream os; + os << sub_graph_->label_; + os << " | ("; + bool first = true; + for (PostDfsIndex index : sub_graph_->input_) { + Expr sub_expr = dataflow_graph.index_to_node(index)->ref(); + if (CanInline(sub_expr)) { + continue; + } + if (first) { + first = false; + } else { + os << ", "; + } + os << PrettyPrint(sub_expr->checked_type()); + } + os << ") -> ("; + first = true; + for (PostDfsIndex index : sub_graph_->exit_) { + Expr sub_expr = dataflow_graph.index_to_node(index)->ref(); + if (CanInline(sub_expr)) { + continue; + } + if (first) { + first = false; + } else { + os << ", "; + } + os << PrettyPrint(sub_expr->checked_type()); + } + os << ") | "; + os << sub_graph_->inside_.ToString(); + os << " | "; + os << partition_spec_name(); + os << " | "; + os << cost_.ToString(); + return os.str(); +} + +std::string CandidatePartitionNode::ToString() const { + std::ostringstream os; + os << "{rule_name=" << rule_name_; + os << ",sub_graph=" << sub_graph_->ToString(); + os << ",spec_name=" << partition_spec_name(); + if (!cost_.is_unknown()) { + os << ",cost=" << cost_.ToString(); + } + os << "}"; + return os.str(); +} + +CandidatePartition::CandidatePartition(String rule_name, SubGraph sub_graph, + ObjectRef /* actually PartitionSpec */ spec, Cost cost) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + node->sub_graph_ = std::move(sub_graph); + node->spec_ = std::move(spec); + node->cost_ = cost; + data_ = std::move(node); +} + +CandidatePartition WithRuleName(CandidatePartition candidate, String rule_name) { + if (rule_name == candidate->rule_name_) { + return candidate; + } + auto* node = candidate.CopyOnWrite(); + node->rule_name_ = std::move(rule_name); + return GetRef(node); +} + +CandidatePartition WithSubGraph(CandidatePartition candidate, SubGraph sub_graph) { + if (sub_graph == candidate->sub_graph_) { + return candidate; + } + auto* node = candidate.CopyOnWrite(); + node->sub_graph_ = std::move(sub_graph); + return GetRef(node); +} + +bool CandidatePartition::operator<(const CandidatePartition& that) const { + // Order lexicographically on sub-graphs. + if (*get()->sub_graph_.get() < *that->sub_graph_.get()) { + return true; + } + if (*that->sub_graph_.get() < *get()->sub_graph_.get()) { + return false; + } + // Break ties by rule name. 
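+ // (Distinct rules can match identical sub-graphs, in which case the rule-name
+ // comparison below keeps the overall candidate ordering deterministic.)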
+ return get()->rule_name_ < that->rule_name_; +} + +bool CandidatePartition::AreTouching(const DataflowGraph& dataflow_graph, + const CandidatePartition& that) const { + return get()->spec_ == that->spec_ && + get()->sub_graph_.AreTouching(dataflow_graph, that->sub_graph_); +} + +CandidatePartition CandidatePartition::DisjointUnion(const DataflowGraph& dataflow_graph, + const CandidatePartition& that) const { + ICHECK_EQ(get()->spec_, that->spec_); + return CandidatePartition(UnionLabels(get()->rule_name_, that->rule_name_), + get()->sub_graph_.DisjointUnion(dataflow_graph, that->sub_graph_), + get()->spec_, get()->cost_ + that->cost_); +} + +/*static*/ +CandidatePartition CandidatePartition::DisjointUnion(const DataflowGraph& dataflow_graph, + std::vector candidates) { + ICHECK_GT(candidates.size(), 1); + CandidatePartition result = candidates.front(); + for (size_t i = 1; i < candidates.size(); ++i) { + result = result.DisjointUnion(dataflow_graph, candidates[i]); + } + return result; +} + +/*static*/ +Expr CandidatePartition::ParallelRewrite(const DataflowGraph& dataflow_graph, + const std::vector& candidates) { + std::vector sub_graphs; + sub_graphs.reserve(candidates.size()); + for (const auto& candidate : candidates) { + sub_graphs.emplace_back(candidate->sub_graph_); + } + return SubGraph::ParallelRewrite(dataflow_graph, sub_graphs); +} + +/*static*/ +std::vector CandidatePartition::MaxCoalesce( + const DataflowGraph& dataflow_graph, std::vector candidates) { + VLOG(1) << "Running MaxCoalesce over " << candidates.size() << " candidates"; + // This is an eager version of using the simple (kOpaque, kOpaque) combiner. + + // Switch to set representation. + CandidateSet result_set(std::move(candidates)); + + // Until fixed point... + size_t num_rounds = 0; + while (result_set.PrepareForNextRound()) { + VLOG_CONTEXT << "round " << ++num_rounds; + VLOG(1) << "checking " << result_set.size() << " candidates (" << result_set.first_new_index() + << " existing)"; + IndexSet removed_this_round(result_set.size()); // over candidate indexes! + + // Build map from post-dfs indices to the indices of candidates with corresponding entry node. + // NOTE: the index set is over candidate indices not post-dfs indices! + std::vector entry_map(dataflow_graph.size(), IndexSet(result_set.size())); + for (size_t i = 0; i < result_set.size(); ++i) { + CandidatePartition candidate = result_set.at(i); + for (PostDfsIndex entry_index : candidate->sub_graph_->entry_) { + entry_map[entry_index].Add(i); + } + } + + for (size_t i = 0; i < result_set.size(); ++i) { + if (removed_this_round[i]) { + // Already merged. + continue; + } + CandidatePartition upstream = result_set.at(i); + // Narrow our search to just those candidates which could touch. + IndexSet possible_downstream(result_set.size()); // over candidate indexes! + for (PostDfsIndex output_index : upstream->sub_graph_->output_) { + possible_downstream = possible_downstream | entry_map[output_index]; + } + for (size_t j : possible_downstream) { + if (removed_this_round[j]) { + // Already merged. + continue; + } + if (i == j) { + // Ignore self. 
+ continue; + } + CandidatePartition downstream = result_set.at(j); + if (!upstream.AreTouching(dataflow_graph, downstream)) { + continue; + } + CandidatePartition new_candidate = upstream.DisjointUnion(dataflow_graph, downstream); + VLOG(2) << "Merging upstream candidate " << upstream->ToString() + << " and downstream candidate " << downstream->ToString() << " to yield " + << new_candidate->ToString(); + result_set.Add(dataflow_graph, new_candidate); + result_set.Remove(upstream); + removed_this_round.Add(i); + result_set.Remove(downstream); + removed_this_round.Add(j); + } + } + } + + // Restore canonical order. + result_set.sort(); + + VLOG(1) << "MaxCoalesce produced " << result_set.size() << " candidates"; + return result_set.MovedCurrentCandidates(); +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/candidate_partition.h b/src/relay/collage/candidate_partition.h new file mode 100644 index 0000000000000..1265087f475f0 --- /dev/null +++ b/src/relay/collage/candidate_partition.h @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/candidate_partition.h + * \brief A potential partition in the Collage search. + */ + +#ifndef TVM_RELAY_COLLAGE_CANDIDATE_PARTITION_H_ +#define TVM_RELAY_COLLAGE_CANDIDATE_PARTITION_H_ + +#include +#include + +#include +#include +#include + +#include "./cost.h" +#include "./sub_graph.h" + +namespace tvm { +namespace relay { +namespace collage { + +class PartitionSpec; + +/*! + * \brief A candidate partition w.r.t. the overall Relay model. + * + * We represent the partition as a sub-graph. This means not only can we represent the scope + * of Relay sub-expressions intended for a particular partition (or kernel), but we can also + * represent various conventions for encoding how the operators within the partition should be + * tagged for downstream processing. + */ +class CandidatePartitionNode : public Object { + public: + CandidatePartitionNode() = default; + + /*! + * \brief Combination of all the partition rule names which produced this candidate. + * For debugging and explainability. + */ + String rule_name_; + + /*! + * \brief The sub-graph of the overall expression matched by the partition rule. + */ + SubGraph sub_graph_; + + /*! + * \brief The partition specification which produced this candidate. + */ + ObjectRef /* actually PartitionSpec */ spec_; + + /*! + * \brief The (cached) cost of the partition. + * + * Initially Cost::Unknown, calculated and cached by EstimateCost. + */ + mutable Cost cost_ = Cost::Unknown(); + + void VisitAttrs(AttrVisitor* v); + + /*! + * \brief Returns the partition specification which produced this candidate. + */ + PartitionSpec partition_spec() const; + + /*!
+ * \brief Returns the name of the partition specification which produced this candidate. + */ + std::string partition_spec_name() const; + + /*! + * \brief Returns the target of the partition specification which produced this candidate. + */ + Target target() const; + + /*! + * \brief Returns a brief description of the candidate suitable for debugging output. + */ + std::string ToSummary(const DataflowGraph& dataflow_graph) const; + + std::string ToString() const; + + static constexpr const char* _type_key = "relay.collage.CandidatePartition"; + TVM_DECLARE_FINAL_OBJECT_INFO(CandidatePartitionNode, Object); +}; + +class CandidatePartition : public ObjectRef { + public: + CandidatePartition(String rule_name, SubGraph sub_graph, + ObjectRef /* actually PartitionSpec */ spec, Cost cost = Cost::Unknown()); + + bool operator<(const CandidatePartition& that) const; + + /*! + * \brief Returns true if this and \p that candidate are disjoint, have the same (or no) target, + * and touch. This does not imply the \p DisjointUnion of this and that will be valid. For + * example, the result may be too deep or have too many outputs. + */ + bool AreTouching(const DataflowGraph& dataflow_graph, const CandidatePartition& that) const; + + /*! + * \brief Returns the disjoint union of this and \p that. + */ + CandidatePartition DisjointUnion(const DataflowGraph& dataflow_graph, + const CandidatePartition& that) const; + + /*! + * \brief Returns the disjoint union of all \p candidates. + */ + static CandidatePartition DisjointUnion(const DataflowGraph& dataflow_graph, + std::vector candidates); + + /*! + * \brief Returns the root expression of \p dataflow_graph rewritten to apply all the partitions + * implied by \p candidates. The candidates can be in any order but must be disjoint. + */ + static Expr ParallelRewrite(const DataflowGraph& dataflow_graph, + const std::vector& candidates); + + /*! + * Eagerly merge all touching candidates for the same target. The candidates must be disjoint + * and have their Targets filled in. This is typically called on the optimal list of candidate + * partitions found by the Collage search in order to remove unnecessary partition boundaries. + * Ideally the search would never produce such candidates; however, to keep the search space + * manageable, Collage may only consider candidate partitions up to a particular depth.
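+ *
+ * Usage sketch (illustrative only; 'best' is the std::vector of candidates
+ * chosen by the search):
+ * \code
+ * best = CandidatePartition::MaxCoalesce(dataflow_graph, std::move(best));
+ * Expr rewritten = CandidatePartition::ParallelRewrite(dataflow_graph, best);
+ * \endcode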
+ */ + static std::vector MaxCoalesce(const DataflowGraph& dataflow_graph, + std::vector candidates); + + TVM_DEFINE_OBJECT_REF_METHODS(CandidatePartition, ObjectRef, CandidatePartitionNode); + TVM_DEFINE_OBJECT_REF_COW_METHOD(CandidatePartitionNode); +}; + +CandidatePartition WithRuleName(CandidatePartition candidate, String rule_name); +CandidatePartition WithTarget(CandidatePartition candidate, Target target); +CandidatePartition WithSubGraph(CandidatePartition candidate, SubGraph sub_graph); + +struct CandidatePartitionHash { + size_t operator()(const CandidatePartition& candidate) const { + return candidate->sub_graph_->hash(); + } +}; + +struct CandidatePartitionEquals { + bool operator()(const CandidatePartition& left, const CandidatePartition& right) const { + return *left->sub_graph_.get() == *right->sub_graph_.get(); + } +}; + +struct CandidatePartitionCompare { + bool operator()(const CandidatePartition& left, const CandidatePartition& right) const { + return *left->sub_graph_.get() < *right->sub_graph_.get(); + } +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_CANDIDATE_PARTITION_H_ diff --git a/src/relay/collage/candidate_set.cc b/src/relay/collage/candidate_set.cc new file mode 100644 index 0000000000000..2c2a7eaf8d540 --- /dev/null +++ b/src/relay/collage/candidate_set.cc @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/candidate_set.cc + * \brief Collects a set of candidate partitions. 
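+ *
+ * Intended round-based usage (an illustrative sketch; see candidate_set.h for the API):
+ * \code
+ * CandidateSet working(std::move(initial_candidates));
+ * while (working.PrepareForNextRound()) {
+ *   for (size_t i = 0; i < working.size(); ++i) {
+ *     // ...may call working.Add(dataflow_graph, merged) and working.Remove(working.at(i))...
+ *   }
+ * }
+ * \endcode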
+ */ + +#include "./candidate_set.h" + +namespace tvm { +namespace relay { +namespace collage { + +CandidateSet::CandidateSet(std::vector candidates_to_add) + : candidates_to_add_(std::move(candidates_to_add)) { + for (const auto& candidate : candidates_to_add_) { + seen_.emplace(candidate); + } +} + +void CandidateSet::Add(const DataflowGraph& dataflow_graph, + const CandidatePartition& new_candidate) { + VLOG(2) << "adding " << new_candidate->ToString(); + if (seen_.count(new_candidate)) { + VLOG(2) << "already seen candidate, ignoring"; + return; + } + seen_.emplace(new_candidate); + candidates_to_add_.emplace_back(new_candidate); +} + +void CandidateSet::Remove(const CandidatePartition& old_candidate) { + ICHECK(seen_.count(old_candidate)); + VLOG(2) << "removing " << old_candidate->ToString(); + candidates_to_remove_.emplace_back(old_candidate); +} + +bool CandidateSet::PrepareForNextRound() { + size_t init_size = current_candidates_.size(); + for (const auto& candidate_to_remove : candidates_to_remove_) { + current_candidates_.erase( + std::remove(current_candidates_.begin(), current_candidates_.end(), candidate_to_remove), + current_candidates_.end()); + } + size_t num_removed = init_size - current_candidates_.size(); + candidates_to_remove_.clear(); + first_new_index_ = current_candidates_.size(); + for (const auto& new_candidate : candidates_to_add_) { + current_candidates_.push_back(new_candidate); + } + size_t num_added = candidates_to_add_.size(); + candidates_to_add_.clear(); + VLOG(1) << "removed " << num_removed << " and added " << num_added << " candidates"; + return num_removed + num_added > 0; +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/candidate_set.h b/src/relay/collage/candidate_set.h new file mode 100644 index 0000000000000..4cb2c40e9500e --- /dev/null +++ b/src/relay/collage/candidate_set.h @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/candidate_set.h + * \brief Collects a set of candidate partitions. + */ + +#ifndef TVM_RELAY_COLLAGE_CANDIDATE_SET_H_ +#define TVM_RELAY_COLLAGE_CANDIDATE_SET_H_ + +#include +#include +#include +#include + +#include "./candidate_partition.h" +#include "./dataflow_graph.h" + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief Holds a vector of current candidates and the additions/removals to apply to them. + */ +struct CandidateSet { + CandidateSet() = default; + + explicit CandidateSet(std::vector candidates_to_add); + + /*! + * \brief Schedule \p new_candidate for addition before the next round (unless it is not valid). + */ + void Add(const DataflowGraph& dataflow_graph, const CandidatePartition& new_candidate); + + /*! 
\brief Schedule \p old_candidate for removal before the next round. */ + void Remove(const CandidatePartition& old_candidate); + + /*! + * \brief Update \p current_candidates and \p first_new_index. Return false if no + * new candidates were added, in which case we have reached a fixed point. + */ + bool PrepareForNextRound(); + + size_t size() const { return current_candidates_.size(); } + + CandidatePartition operator[](size_t i) const { + ICHECK_LT(i, current_candidates_.size()); + return current_candidates_[i]; + } + CandidatePartition at(size_t i) const { return (*this)[i]; } + + size_t first_new_index() const { return first_new_index_; } + + void sort() { std::sort(current_candidates_.begin(), current_candidates_.end()); } + + std::vector MovedCurrentCandidates() { + return std::move(current_candidates_); + } + + private: + /*! + * \brief Index of first candidate in current_candidates added in last round. This can be used to + * avoid considering candidates or candidate combinations which have already been considered in an + * earlier round. + */ + size_t first_new_index_ = 0; + /*! \brief Candidates gathered in previous rounds. */ + std::vector current_candidates_; + /*! \brief New candidates gathered in the current round. */ + std::vector candidates_to_add_; + /*! \brief Existing candidates to remove before starting the next round. */ + std::vector candidates_to_remove_; + /*! \brief Which candidates have been seen so far and should not be added again. */ + std::unordered_set seen_; +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_CANDIDATE_SET_H_ diff --git a/src/relay/collage/cost.cc b/src/relay/collage/cost.cc new file mode 100644 index 0000000000000..ae2eb8600ebd0 --- /dev/null +++ b/src/relay/collage/cost.cc @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/cost.cc + * \brief Represents the estimated cost of a candidate partition. + */ + +#include "./cost.h" + +namespace tvm { +namespace relay { +namespace collage { + +std::string Cost::ToString() const { + if (is_invalid()) { + return "invalid"; + } else if (is_unknown()) { + return "unknown"; + } else if (value_ == 0.0) { + return "0"; + } else { + return std::to_string(value_ * 1e6) + "us"; + } +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/cost.h b/src/relay/collage/cost.h new file mode 100644 index 0000000000000..8ae276d22078f --- /dev/null +++ b/src/relay/collage/cost.h @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/cost.h + * \brief Represents the estimated cost of a candidate partition. + */ +#ifndef TVM_RELAY_COLLAGE_COST_H_ +#define TVM_RELAY_COLLAGE_COST_H_ + +#include + +#include +#include +#include + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief The assumed cost for a candidate partition. Generally average execution time in seconds. + * However other cost functions are possible, for example to introduce a penalty for high memory + * use, etc. + */ +class Cost { + public: + Cost() = delete; + + static Cost Zero() { return Cost(0.0); } + + /*! + * \brief Returns the distinguished 'invalid' cost signaling a candidate partition is not + * supported by the intended target, for example because the sub-graph has an unsupported operator + * or the intermediate memory required exceeds some system limit. + */ + static Cost Invalid() { return Cost(std::numeric_limits::infinity()); } + + bool is_invalid() const { return std::isinf(value_) && value_ > 0.0; } + + /*! + * \brief Returns the distinguished 'unknown' cost, signaling fixed priorities should be used to + * choose the best partitions. This can be used to disable tuning and fallback to fixed rules, + * much as TVM will use an un-tuned kernel if no tuning records are available. + */ + static Cost Unknown() { return Cost(std::numeric_limits::quiet_NaN()); } + + bool is_unknown() const { return std::isnan(value_); } + + /*! \brief Returns cost with given finite, non-negative value. */ + static Cost Value(double value) { + ICHECK(!std::isnan(value) && !std::isinf(value) && value >= 0.0); + return Cost(value); + } + + bool is_value() const { return !std::isnan(value_) && !std::isinf(value_); } + + /*! \brief Return true if the less-than relation is defined for this and that. */ + bool are_comparable(Cost that) const { return !std::isnan(value_) && !std::isnan(that.value_); } + + /*! \brief Returns sum of this and that. */ + Cost operator+(Cost that) const { return Cost(value_ + that.value_); } + + /*! \brief Returns difference of this and that. */ + Cost operator-(Cost that) const { return Cost(value_ - that.value_); } + + /*! \brief Returns true if this is cheaper than that, assuming they are comparable. */ + bool operator<(Cost that) const { return value_ < that.value_; } + + std::string ToString() const; + + private: + explicit Cost(double value) : value_(value) {} + + /*! + * \brief Non-negative value or: + * - +inf if candidate partition is not feasible. + * - NaN if candidate partition has an unknown cost (priority may be used to break ties). 
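+ *
+ * Illustrative examples of the resulting algebra (sketch, not exhaustive):
+ * \code
+ * Cost c = Cost::Value(1.0e-6) + Cost::Value(2.0e-6);  // ~3us
+ * ICHECK((c + Cost::Invalid()).is_invalid());          // +inf absorbs
+ * ICHECK(!c.are_comparable(Cost::Unknown()));          // NaN is incomparable
+ * \endcode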
+ */ + double value_ = 0.0; +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_COST_H_ diff --git a/src/relay/collage/partition_rule.cc b/src/relay/collage/partition_rule.cc new file mode 100644 index 0000000000000..1cedbfc9d72c4 --- /dev/null +++ b/src/relay/collage/partition_rule.cc @@ -0,0 +1,372 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/partition_rule.cc + * \brief Compositional partitioning rules. + */ + +#include "./partition_rule.h" + +#include + +#include "./partition_rule.h" +#include "./partition_spec.h" +#include "./utils.h" + +namespace tvm { +namespace relay { +namespace collage { + +TVM_REGISTER_NODE_TYPE(PartitionRuleNode); + +void PartitionRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +std::vector PartitionRuleNode::AllCandidates( + const DataflowGraph& dataflow_graph, const PartitionSpec& spec) const { + ICHECK(false) << "PartitionRuleNode::AllCandidates should be overridden in sub-class"; + return {}; +} + +std::string PartitionRuleNode::ToString() const { return ToDoc().str(); } + +Doc PartitionRuleNode::ToDoc() const { + Doc doc; + doc << GetTypeKey() << "(" << Doc::NewLine(2); + std::vector body_items; + AppendBodyItems(&body_items); + doc << Doc::Indent(2, Doc::Concat(body_items, Doc::NewLine())) << Doc::NewLine(); + doc << ")"; + return doc; +} + +void PartitionRuleNode::AppendBodyItems(std::vector* body_items) const { + body_items->emplace_back(); + body_items->back() << "rule_name=" << Doc::StrLiteral(rule_name_); +} + +PartitionRule::PartitionRule(String rule_name) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + data_ = std::move(node); +} + +bool DefaultPatternPredicate(const Expr& matched_sub_expr) { return true; } + +TVM_REGISTER_NODE_TYPE(DFPatternPartitionRuleNode); + +void DFPatternPartitionRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +std::vector DFPatternPartitionRuleNode::AllCandidates( + const DataflowGraph& dataflow_graph, const PartitionSpec& spec) const { + VLOG(1) << "running DFPatternPartitionRule(" << rule_name_ << ")"; + std::vector result; + DFPatternMatcher matcher(&dataflow_graph.indexed_graph()); + for (PostDfsIndex index = 0; index < dataflow_graph.size(); ++index) { + Expr sub_expr = dataflow_graph.index_to_node(index)->ref(); + if (!matcher.Match(pattern_, sub_expr)) { + continue; + } + if (!predicate_(sub_expr)) { + VLOG(1) << "DFPatternPartitionRule(" << rule_name_ << ") has failing predicate"; + continue; + } + IndexSet inside = MatcherToIndexSet(matcher); + OpPatternKind kind; + String label; + std::tie(kind, label) = SubGraphKindAndLabel(dataflow_graph, inside); + SubGraph sub_graph(dataflow_graph, std::move(inside), 
kind, std::move(label)); + String rule_name = rule_name_.empty() ? sub_graph->label_ : rule_name_; + CandidatePartition candidate(std::move(rule_name), std::move(sub_graph), spec); + VLOG(2) << "DFPatternPartitionRule(" << rule_name_ << ") yields " << candidate->ToString(); + result.emplace_back(std::move(candidate)); + } + VLOG(1) << "DFPatternPartitionRule(" << rule_name_ << ") produced " << result.size() + << " candidates"; + return result; +} + +void DFPatternPartitionRuleNode::AppendBodyItems(std::vector* body_items) const { + PartitionRuleNode::AppendBodyItems(body_items); + body_items->emplace_back(); + body_items->back() << "pattern=" << PrettyPrint(pattern_); +} + +DFPatternPartitionRule::DFPatternPartitionRule(String rule_name, DFPattern pattern, + TPatternPredicate predicate) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + node->pattern_ = std::move(pattern); + node->predicate_ = std::move(predicate); + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(CompositePartitionRuleNode); + +void CompositePartitionRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +std::vector CompositePartitionRuleNode::AllCandidates( + const DataflowGraph& dataflow_graph, const PartitionSpec& spec) const { + std::vector candidates = sub_rule_->AllCandidates(dataflow_graph, spec); + VLOG(1) << "running CompositePartitionRule(" << rule_name_ << ") over " << candidates.size() + << " sub-candidates"; + std::vector result; + FunctionAttrsMap attrs; + attrs.Set(attr::kComposite, rule_name_); + for (auto& candidate : candidates) { + String rule_name = NestLabels(rule_name_, candidate->rule_name_); + SubGraph sub_graph = candidate->sub_graph_.WithAttrs(dataflow_graph, attrs); + CandidatePartition new_candidate = WithSubGraph( + WithRuleName(std::move(candidate), std::move(rule_name)), std::move(sub_graph)); + VLOG(2) << "CompositePartitionRule(" << rule_name_ << ") yields " << new_candidate->ToString(); + result.emplace_back(std::move(new_candidate)); + } + VLOG(1) << "CompositePartitionRule(" << rule_name_ << ") produced " << result.size() + << " candidates"; + return result; +} + +void CompositePartitionRuleNode::AppendBodyItems(std::vector* body_items) const { + PartitionRuleNode::AppendBodyItems(body_items); + body_items->emplace_back(); + body_items->back() << "sub_rule=" << sub_rule_->ToDoc(); +} + +CompositePartitionRule::CompositePartitionRule(String rule_name, PartitionRule sub_rule) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + node->sub_rule_ = std::move(sub_rule); + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(PrimitivePartitionRuleNode); + +void PrimitivePartitionRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +std::vector PrimitivePartitionRuleNode::AllCandidates( + const DataflowGraph& dataflow_graph, const PartitionSpec& spec) const { + std::vector candidates = sub_rule_->AllCandidates(dataflow_graph, spec); + VLOG(1) << "running PrimitivePartitionRule(" << rule_name_ << ") over " << candidates.size() + << " sub-candidates"; + std::vector result; + FunctionAttrsMap attrs; + attrs.Set(attr::kPrimitive, Integer(1)); + if (spec->target_.IsExternalCodegen()) { + // The spec name will be the target kind name which is 1:1 with the "Compiler" attribute name. 
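+ // For example, a spec named "tensorrt" (hypothetical) yields functions carrying
+ // both Primitive=1 and Compiler="tensorrt".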
+ attrs.Set(attr::kCompiler, spec->spec_name_); + } + for (auto& candidate : candidates) { + String rule_name = NestLabels(rule_name_, candidate->rule_name_); + SubGraph sub_graph = candidate->sub_graph_.WithAttrs(dataflow_graph, attrs); + CandidatePartition new_candidate = WithSubGraph( + WithRuleName(std::move(candidate), std::move(rule_name)), std::move(sub_graph)); + VLOG(2) << "PrimitivePartitionRule(" << rule_name_ << ") yields " << new_candidate->ToString(); + result.emplace_back(std::move(new_candidate)); + } + VLOG(1) << "PrimitivePartitionRule(" << rule_name_ << ") produced " << result.size() + << " candidates"; + return result; +} + +void PrimitivePartitionRuleNode::AppendBodyItems(std::vector* body_items) const { + PartitionRuleNode::AppendBodyItems(body_items); + body_items->emplace_back(); + body_items->back() << "sub_rule=" << sub_rule_->ToDoc(); +} + +PrimitivePartitionRule::PrimitivePartitionRule(String rule_name, PartitionRule sub_rule) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + node->sub_rule_ = std::move(sub_rule); + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(UnionPartitionRuleNode); + +void UnionPartitionRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +std::vector UnionPartitionRuleNode::AllCandidates( + const DataflowGraph& dataflow_graph, const PartitionSpec& spec) const { + std::vector result; + for (const auto& sub_rule : sub_rules_) { + std::vector candidates = sub_rule->AllCandidates(dataflow_graph, spec); + for (auto& candidate : candidates) { + String rule_name = NestLabels(rule_name_, candidate->rule_name_); + CandidatePartition new_candidate = WithRuleName(std::move(candidate), std::move(rule_name)); + VLOG(2) << "UnionPartitionRule(" << rule_name_ << ") yields " << new_candidate->ToString(); + result.emplace_back(std::move(new_candidate)); + } + } + VLOG(1) << "UnionPartitionRule(" << rule_name_ << ") produced " << result.size() << " candidates"; + return result; +} + +void UnionPartitionRuleNode::AppendBodyItems(std::vector* body_items) const { + PartitionRuleNode::AppendBodyItems(body_items); + for (const auto& sub_rule : sub_rules_) { + body_items->emplace_back(); + body_items->back() << "sub_rule=" << sub_rule->ToDoc(); + } +} + +UnionPartitionRule::UnionPartitionRule(String rule_name, Array sub_rules) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + node->sub_rules_ = std::move(sub_rules); + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(OpCallByKindPartitionRuleNode); + +void OpCallByKindPartitionRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +std::vector OpCallByKindPartitionRuleNode::AllCandidates( + const DataflowGraph& dataflow_graph, const PartitionSpec& spec) const { + VLOG(1) << "running OpCallByKindPartitionRule(" << rule_name_ << ")"; + std::vector result; + for (PostDfsIndex index = 0; index < dataflow_graph.size(); ++index) { + auto node = dataflow_graph.index_to_node(index); + Expr sub_expr = node->ref(); + if (sub_expr->IsInstance()) { + OpPatternKind kind; + String label; + std::tie(kind, label) = SubExprKindAndLabel(sub_expr); + if (kind <= kOutEWiseFusable) { + IndexSet inside(dataflow_graph.size(), {index}); + SubGraph sub_graph(dataflow_graph, std::move(inside), kind, std::move(label)); + String rule_name = NestLabels(rule_name_, sub_graph->label_); + CandidatePartition candidate(std::move(rule_name), std::move(sub_graph), spec); + VLOG(2) << "OpCallByKindPartitionRule(" << rule_name_ << ") yields " + << 
candidate->ToString(); + result.emplace_back(std::move(candidate)); + } + } + } + VLOG(1) << "OpCallByKindPartitionRule(" << rule_name_ << ") produced " << result.size() + << " candidates"; + return result; +} + +void OpCallByKindPartitionRuleNode::AppendBodyItems(std::vector* body_items) const { + PartitionRuleNode::AppendBodyItems(body_items); +} + +OpCallByKindPartitionRule::OpCallByKindPartitionRule(String rule_name) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(OnlyValidPartitionRuleNode); + +void OnlyValidPartitionRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +std::vector OnlyValidPartitionRuleNode::AllCandidates( + const DataflowGraph& dataflow_graph, const PartitionSpec& spec) const { + std::vector candidates = sub_rule_->AllCandidates(dataflow_graph, spec); + VLOG(1) << "running OnlyValidPartitionRule(" << rule_name_ << ") over " << candidates.size() + << " sub-candidates"; + std::vector result; + for (auto& candidate : candidates) { + if (!candidate->sub_graph_->IsValid(dataflow_graph, config_)) { + VLOG(2) << "Ignoring invalid candidate " << candidate->ToString(); + continue; + } + String rule_name = NestLabels(rule_name_, candidate->rule_name_); + CandidatePartition new_candidate = WithRuleName(std::move(candidate), std::move(rule_name)); + VLOG(2) << "OnlyValidPartitionRule(" << rule_name_ << ") yields " << new_candidate->ToString(); + result.emplace_back(std::move(new_candidate)); + } + VLOG(1) << "OnlyValidPartitionRule(" << rule_name_ << ") produced " << result.size() + << " candidates"; + return result; +} + +void OnlyValidPartitionRuleNode::AppendBodyItems(std::vector* body_items) const { + PartitionRuleNode::AppendBodyItems(body_items); + body_items->emplace_back(); + body_items->back() << "sub_rule=" << sub_rule_->ToDoc(); + body_items->emplace_back(); + body_items->back() << "config=" << config_.ToString(); +} + +OnlyValidPartitionRule::OnlyValidPartitionRule(String rule_name, PartitionRule sub_rule, + const SubGraphConfig& config) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + node->sub_rule_ = std::move(sub_rule); + node->config_ = config; + data_ = std::move(node); +} + +TVM_REGISTER_NODE_TYPE(HostPartitionRuleNode); + +void HostPartitionRuleNode::VisitAttrs(AttrVisitor* v) { + // TODO(mbs) +} + +std::vector HostPartitionRuleNode::AllCandidates( + const DataflowGraph& dataflow_graph, const PartitionSpec& spec) const { + VLOG(1) << "running HostPartitionRule(" << rule_name_ << ")"; + std::vector result; + for (PostDfsIndex index = 0; index < dataflow_graph.size(); ++index) { + if (MustBeLowered(dataflow_graph.index_to_node(index)->ref())) { + continue; + } + IndexSet inside(dataflow_graph.size(), {index}); + OpPatternKind kind; + String label; + std::tie(kind, label) = SubGraphKindAndLabel(dataflow_graph, inside); + SubGraph sub_graph(dataflow_graph, std::move(inside), kind, label); + String rule_name = NestLabels(rule_name_, sub_graph->label_); + // We'll use a zero cost for the candidate since we'll never want to actually estimate the cost + // of this 'partition'.
+ CandidatePartition candidate(std::move(rule_name), std::move(sub_graph), spec, Cost::Zero()); + VLOG(2) << "HostPartitionRule(" << rule_name_ << ") yields " << candidate->ToString(); + result.push_back(candidate); + } + VLOG(1) << "HostPartitionRule(" << rule_name_ << ") produced " << result.size() << " candidates"; + return result; +} + +void HostPartitionRuleNode::AppendBodyItems(std::vector* body_items) const {} + +HostPartitionRule::HostPartitionRule(String rule_name) { + auto node = runtime::make_object(); + node->rule_name_ = std::move(rule_name); + data_ = std::move(node); +} + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/partition_rule.h b/src/relay/collage/partition_rule.h new file mode 100644 index 0000000000000..13f5c0b01d318 --- /dev/null +++ b/src/relay/collage/partition_rule.h @@ -0,0 +1,355 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/partition_rule.h + * \brief Compositional partitioning rules. + */ + +#ifndef TVM_RELAY_COLLAGE_PARTITION_RULE_H_ +#define TVM_RELAY_COLLAGE_PARTITION_RULE_H_ + +#include +#include + +#include +#include + +#include "../../printer/doc.h" +#include "./candidate_partition.h" +#include "./sub_graph.h" + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief Type of function to check if a matched sub-expression should be accepted by a rule. This + * can be used to, eg, reject operators of unsupported shape or dtype, or otherwise implement rules + * which are difficult to express in the dataflow pattern language directly. + */ +using TPatternPredicate = TypedPackedFunc; + +/*! + * \brief The default pattern predicate. Always returns true. + */ +bool DefaultPatternPredicate(const Expr& matched_sub_expr); + +/*! + * \brief Base class of all partition rules. + * + * A \p PartitionRule describes how to find a set of \p CandidatePartitions for a \p DataflowGraph. + * The candidates are allowed to overlap, and ultimately it is the job of the Collage searcher to + * find a selection of candidates which covers the whole Relay expression without overlap. Partition + * rules are paired with their \p Target and other 'top level' configuration in a \p PartitionSpec. + * + * We provide a set of 'base' partition rules which produce candidates from the dataflow graph + * directly. We also provide a set of 'combinator' partition rules which can produce new candidates + * from the results of an arbitrary sub-rule or sub-rules. By mixing these base and combinator + * rules we can express a wide variety of partition strategies and encoding conventions. + * + * There may be many thousands of candidates in flight during the Collage search. 
We take care to + * defer constructing or rewriting Relay expressions until absolutely necessary. We only pay for + * extracting a function to represent a candidate when we need to measure its cost. And we only + * pay for rewriting the overall Relay expression to commit to a partitioning when the Collage + * search has completed. + * + * The base rules implemented so far: + * - \p DFPatternPartitionRule: Given a \p DFPattern and expression predicate, produces a candidate + * for every sub-graph matched by the pattern and predicate. Unlike the \p PatternRewriter, + * candidates are free to overlap. Used to bring BYOC patterns into the Collage framework. + * - \p OpCallByKindPartitionRule: Uses the "TOpPattern" attribute provided for every Relay + * operator to produce a candidate for every call to a 'fusable Relay operator'. Used to + * look ahead to how TVM will fuse sub-graphs. + * + * The combinator rules implemented so far: + * - \p CompositePartitionRule: Indicates all candidates matched by the sub-rule should be wrapped + * by a "Composite" function. The "Composite" name is taken from the rule name. Used to indicate + * Relay operators (or groups of Relay operators) should be mapped to target-specific operators, + * both for BYOC and TVM external library integrations. + * - \p PrimitivePartitionRule: Indicates all candidates matched by the sub-rule should be wrapped + * by a "Primitive" function, possibly with an additional "Compiler" attribute. Used to + * delineate a partition (or kernel). + * - \p UnionPartitionRule: Simply unions all the candidates from all sub-rules together. Used to + * combine individual \p DFPatternPartitionRules. + * - \p OnlyValidPartitionRule: Given a \p SubGraphConfig, ignores candidates with 'invalid' + * sub-graphs. Used to limit the maximum candidate depth, the number of independent outputs, + * and whether intermediate 'taps' are allowed. + * - \p HostPartitionRule: Produces candidates for all Relay expressions which could be + * 'left behind' for execution by the host (eg on the VM). This rule lets us simplify the + * overall Collage search algorithm. + * + * (Though not yet implemented, we'd like to allow a combinator rule which will union candidates + * based on their 'anchor' operators. This can be used to implement 'vertical' and 'horizontal' + * partitioning on more primitive candidates. Note that the \p SubGraph machinery supports + * multiple-input and -output sub-graphs and their validation, so horizontal partitioning is easy + * to implement.) + */ +class PartitionRuleNode : public Object { + public: + /*! + * \brief A unique (over all rules for the same target) name for the rule. Rule names are + * combined and captured with \p CandidatePartition rule names for debuggability and + * explainability. Some rules will copy the rule name into function attributes. + */ + String rule_name_; + + void VisitAttrs(AttrVisitor* v); + + /*! + * \brief Returns all the possible candidate partitions according to this rule for the overall + * expression corresponding to \p dataflow_graph. The candidates will generally have unknown + * target and cost: the target will be filled in by the \p PartitionSpec, while the cost will + * be filled in lazily.
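+ *
+ * \code
+ * // Sketch (names as in this header; spec as in partition_spec.h):
+ * std::vector<CandidatePartition> candidates = rule->AllCandidates(dataflow_graph, spec);
+ * \endcode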
+ */ + virtual std::vector AllCandidates(const DataflowGraph& dataflow_graph, + const PartitionSpec& spec) const; + + std::string ToString() const; + Doc ToDoc() const; + + protected: + virtual void AppendBodyItems(std::vector* body_items) const; + + public: + static constexpr const char* _type_key = "relay.collage.PartitionRule"; + static constexpr const uint32_t _type_child_slots = 10; + TVM_DECLARE_BASE_OBJECT_INFO(PartitionRuleNode, Object); +}; + +class PartitionRule : public ObjectRef { + public: + explicit PartitionRule(String rule_name); + + TVM_DEFINE_OBJECT_REF_METHODS(PartitionRule, ObjectRef, PartitionRuleNode); +}; + +/*! + * \brief Partition rule which fires on all sub-expressions matching a dataflow-pattern and pattern + * predicate. It is valid for matching candidates to overlap. + */ +class DFPatternPartitionRuleNode : public PartitionRuleNode { + public: + /*! + * \brief Relay pattern. + */ + DFPattern pattern_; + + /*! + * \brief Predicate on matched sub-expression to decide if partition rule should fire. + */ + TPatternPredicate predicate_; + + void VisitAttrs(AttrVisitor* v); + + std::vector AllCandidates(const DataflowGraph& dataflow_graph, + const PartitionSpec& spec) const override; + + void AppendBodyItems(std::vector* body_items) const override; + + static constexpr const char* _type_key = "relay.collage.DFPatternPartitionRule"; + TVM_DECLARE_FINAL_OBJECT_INFO(DFPatternPartitionRuleNode, PartitionRuleNode); +}; + +class DFPatternPartitionRule : public PartitionRule { + public: + DFPatternPartitionRule(String rule_name, DFPattern pattern, + TPatternPredicate predicate = DefaultPatternPredicate); + + TVM_DEFINE_OBJECT_REF_METHODS(DFPatternPartitionRule, PartitionRule, DFPatternPartitionRuleNode); +}; + +/*! + * \brief Partition rule which wraps candidates within a function with the "Composite" attribute + * bound to the given rule name. + * + * This is the standard way by which operators or operator groups are tagged as being supported + * by a particular externally provided function. It is up to the BYOC lowering function to + * recognize the "Composite" name and emit the appropriate code or call. + */ +class CompositePartitionRuleNode : public PartitionRuleNode { + public: + /*! \brief The sub-partition rule. */ + PartitionRule sub_rule_; + + void VisitAttrs(AttrVisitor* v); + + std::vector AllCandidates(const DataflowGraph& dataflow_graph, + const PartitionSpec& spec) const override; + + void AppendBodyItems(std::vector* body_items) const override; + + static constexpr const char* _type_key = "relay.collage.CompositePartitionRule"; + TVM_DECLARE_FINAL_OBJECT_INFO(CompositePartitionRuleNode, PartitionRuleNode); +}; + +class CompositePartitionRule : public PartitionRule { + public: + CompositePartitionRule(String rule_name, PartitionRule sub_rule); + + TVM_DEFINE_OBJECT_REF_METHODS(CompositePartitionRule, PartitionRule, CompositePartitionRuleNode); +}; + +/*! + * \brief Partition rule which wraps candidates within a function with the "Primitive" attribute + * bound to 1. If the partition spec target(s) have the "compiler" attribute then that name is + * also added to the function as a "Compiler" attribute. + * + * This is the standard way by which sub-graphs are marked as being in a 'partition' whose + * compilation will be managed by an external BYOC toolchain. It can also be used to mark + * sub-graphs for lowering to a single kernel by the built-in TVM lowering machinery. + */ +class PrimitivePartitionRuleNode : public PartitionRuleNode { + public: + /*!
\brief The sub-partition rule. */ + PartitionRule sub_rule_; + + void VisitAttrs(AttrVisitor* v); + + std::vector AllCandidates(const DataflowGraph& dataflow_graph, + const PartitionSpec& spec) const override; + + void AppendBodyItems(std::vector* body_items) const override; + + static constexpr const char* _type_key = "relay.collage.PrimitivePartitionRule"; + TVM_DECLARE_FINAL_OBJECT_INFO(PrimitivePartitionRuleNode, PartitionRuleNode); +}; + +class PrimitivePartitionRule : public PartitionRule { + public: + PrimitivePartitionRule(String rule_name, PartitionRule sub_rule); + + TVM_DEFINE_OBJECT_REF_METHODS(PrimitivePartitionRule, PartitionRule, PrimitivePartitionRuleNode); +}; + +/*! + * \brief Partition rule which simply unions all matches from all sub-partition rules. + * + * This can be used to combine the results of a set of, eg, DFPatternPartitionRules. + */ +class UnionPartitionRuleNode : public PartitionRuleNode { + public: + Array sub_rules_; + + void VisitAttrs(AttrVisitor* v); + + std::vector AllCandidates(const DataflowGraph& dataflow_graph, + const PartitionSpec& spec) const override; + + void AppendBodyItems(std::vector* body_items) const override; + + static constexpr const char* _type_key = "relay.collage.UnionPartitionRule"; + TVM_DECLARE_FINAL_OBJECT_INFO(UnionPartitionRuleNode, PartitionRuleNode); +}; + +class UnionPartitionRule : public PartitionRule { + public: + UnionPartitionRule(String rule_name, Array sub_rules); + + TVM_DEFINE_OBJECT_REF_METHODS(UnionPartitionRule, PartitionRule, UnionPartitionRuleNode) +}; + +/*! + * \brief Partition rule which places calls to Relay operators with a "TOpPattern" attribute of + * \p kOutEWiseFusable or less in their own singleton sub-graph. No other Relay sub-expressions + * (such as tuples or tuple projection) are selected, and it is up to outer partition rules to + * account for them. + */ +class OpCallByKindPartitionRuleNode : public PartitionRuleNode { + public: + void VisitAttrs(AttrVisitor* v); + + std::vector AllCandidates(const DataflowGraph& dataflow_graph, + const PartitionSpec& spec) const override; + + void AppendBodyItems(std::vector* body_items) const override; + + static constexpr const char* _type_key = "relay.collage.OpCallByKindPartitionRule"; + TVM_DECLARE_FINAL_OBJECT_INFO(OpCallByKindPartitionRuleNode, PartitionRuleNode); +}; + +class OpCallByKindPartitionRule : public PartitionRule { + public: + explicit OpCallByKindPartitionRule(String rule_name); + + TVM_DEFINE_OBJECT_REF_METHODS(OpCallByKindPartitionRule, PartitionRule, + OpCallByKindPartitionRuleNode); +}; + +/*! + * \brief Partition rule which keeps only candidates from the sub-rule whose sub-graphs are valid + * w.r.t. the given \p SubGraphConfig. + */ +class OnlyValidPartitionRuleNode : public PartitionRuleNode { + public: + PartitionRule sub_rule_; + SubGraphConfig config_; + + void VisitAttrs(AttrVisitor* v); + + std::vector AllCandidates(const DataflowGraph& dataflow_graph, + const PartitionSpec& spec) const override; + + void AppendBodyItems(std::vector* body_items) const override; + + public: + static constexpr const char* _type_key = "relay.collage.OnlyValidPartitionRule"; + TVM_DECLARE_FINAL_OBJECT_INFO(OnlyValidPartitionRuleNode, PartitionRuleNode); +}; + +class OnlyValidPartitionRule : public PartitionRule { + public: + OnlyValidPartitionRule(String rule_name, PartitionRule sub_rule, const SubGraphConfig& config); + + TVM_DEFINE_OBJECT_REF_METHODS(OnlyValidPartitionRule, PartitionRule, OnlyValidPartitionRuleNode); +}; + +/*!
+/*!
+ * \brief Partition rule which selects nodes which can be 'left behind' to be executed by the host
+ * (e.g. on the VM). This includes most of the 'interstitial' Relay constructs, such as let
+ * bindings, operators on references, calls to non-operator functions, and so on. It can also
+ * include the construction of and projection from tuples which may not be supported within a
+ * partition.
+ */
+class HostPartitionRuleNode : public PartitionRuleNode {
+ public:
+  void VisitAttrs(AttrVisitor* v);
+
+  std::vector<CandidatePartition> AllCandidates(const DataflowGraph& dataflow_graph,
+                                                const PartitionSpec& spec) const override;
+
+  void AppendBodyItems(std::vector<Doc>* body_items) const override;
+
+  static constexpr const char* _type_key = "relay.collage.HostPartitionRule";
+  TVM_DECLARE_FINAL_OBJECT_INFO(HostPartitionRuleNode, PartitionRuleNode);
+};
+
+class HostPartitionRule : public PartitionRule {
+ public:
+  explicit HostPartitionRule(String rule_name);
+
+  TVM_DEFINE_OBJECT_REF_METHODS(HostPartitionRule, PartitionRule, HostPartitionRuleNode);
+};
+
+}  // namespace collage
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_COLLAGE_PARTITION_RULE_H_
diff --git a/src/relay/collage/partition_spec.cc b/src/relay/collage/partition_spec.cc
new file mode 100644
index 0000000000000..b2095d0a594e5
--- /dev/null
+++ b/src/relay/collage/partition_spec.cc
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/collage/partition_spec.cc
+ * \brief Combine a \p PartitionRule with a \p Target.
+ */
+
+#include "./partition_spec.h"
+
+#include "./utils.h"
+
+namespace tvm {
+namespace relay {
+namespace collage {
+
+String DefaultValidateSubGraphFunc(const Function& function) { return String(); }
+
+TVM_REGISTER_NODE_TYPE(PartitionSpecNode);
+
+void PartitionSpecNode::VisitAttrs(AttrVisitor* v) {
+  // TODO(mbs)
+}
+
+std::vector<CandidatePartition> PartitionSpecNode::AllCandidates(
+    const DataflowGraph& dataflow_graph) const {
+  std::vector<CandidatePartition> result;
+  // Make sure the target is in scope for inspection by any predicates in
+  // DFPatternPartitionRuleNode rules.
+  With<Target> target_scope(target_);
+  // Gather all the candidates.
+  std::vector<CandidatePartition> candidates =
+      rule_->AllCandidates(dataflow_graph, GetRef<PartitionSpec>(this));
+  // Update the rule names.
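+  // For example a candidate found by rule "relu_pattern" within spec "test_spec" ends up with
+  // the nested rule name "test_spec.relu_pattern" (assuming NestLabels joins with a '.').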
+  for (const auto& candidate : candidates) {
+    ICHECK_EQ(candidate->spec_, GetRef<PartitionSpec>(this));
+    String rule_name = NestLabels(spec_name_, candidate->rule_name_);
+    CandidatePartition new_candidate = WithRuleName(candidate, std::move(rule_name));
+    result.emplace_back(std::move(new_candidate));
+  }
+  return result;
+}
+
+std::string PartitionSpecNode::ToString() const {
+  Doc doc;
+  doc << "PartitionSpec(" << Doc::NewLine(2);
+  std::vector<Doc> body_items;
+  body_items.emplace_back();
+  body_items.back() << "spec_name=" << Doc::StrLiteral(spec_name_);
+  body_items.emplace_back();
+  body_items.back() << "target=" << target_->ToDebugString();
+  body_items.emplace_back();
+  body_items.back() << "rule=" << rule_->ToDoc();
+  doc << Doc::Indent(2, Doc::Concat(body_items, Doc::NewLine())) << Doc::NewLine();
+  doc << ")";
+  return doc.str();
+}
+
+PartitionSpec::PartitionSpec(String spec_name, Target target, PartitionRule rule,
+                             TValidateSubGraphFunc validate_sub_graph_func) {
+  auto node = runtime::make_object<PartitionSpecNode>();
+  node->spec_name_ = std::move(spec_name);
+  node->target_ = std::move(target);
+  node->rule_ = std::move(rule);
+  node->validate_sub_graph_func_ = std::move(validate_sub_graph_func);
+  data_ = std::move(node);
+}
+
+}  // namespace collage
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/collage/partition_spec.h b/src/relay/collage/partition_spec.h
new file mode 100644
index 0000000000000..e8ce64c684688
--- /dev/null
+++ b/src/relay/collage/partition_spec.h
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/collage/partition_spec.h
+ * \brief Combine a \p PartitionRule with a \p Target.
+ */
+
+#ifndef TVM_RELAY_COLLAGE_PARTITION_SPEC_H_
+#define TVM_RELAY_COLLAGE_PARTITION_SPEC_H_
+
+#include <tvm/relay/function.h>
+#include <tvm/runtime/container/string.h>
+#include <tvm/target/target.h>
+
+#include <string>
+#include <vector>
+
+#include "./partition_rule.h"
+#include "./sub_graph.h"
+
+namespace tvm {
+namespace relay {
+namespace collage {
+
+/*!
+ * \brief Type of functions for checking the validity of partitions before they proceed to lowering
+ * and codegen. The argument is the function extracted from the overall expression to represent
+ * the partition. The result is a non-empty error message string if the candidate should be
+ * rejected.
+ */
+using TValidateSubGraphFunc = TypedPackedFunc<String(const Function& function)>;
+
+/*!
+ * \brief The default validation function. Always returns the empty string, i.e. no error.
+ */
+String DefaultValidateSubGraphFunc(const Function& function);
+
+/*!
+ * \brief Pairs a \p PartitionRule with the \p Target it can be used for.
+ */
+class PartitionSpecNode : public Object {
+ public:
+  /*!
+   * \brief Specification name to distinguish this spec from all others. Typically the BYOC
+   * 'compiler' name, "tvm", or "host".
+   */
+  String spec_name_;
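+
+  // Note: AllCandidates (see partition_spec.cc) nests this spec name into the rule name of
+  // every candidate it returns, so candidates can always be traced back to their spec.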
+
+  /*!
+   * \brief The target all candidate partitions should be compiled for.
+   *
+   * It's tempting to support multiple targets here since, e.g., the partitioning rules for
+   * TVM are the same irrespective of whether the target is "cuda" or "llvm", so it would make
+   * sense to build the candidate partitions first without committing to any target, then 'stamp'
+   * them for each target as the final step.
+   *
+   * However, we want to make sure any predicate in \p DFPatternPartitionRuleNode instances
+   * can have access to the current target instance. E.g. the predicate may need to consult
+   * build-time configuration to decide what operators, shapes etc are actually supported.
+   * That implies the specific target is known when the candidate partitions are being
+   * constructed.
+   *
+   * So for now we'll just force each spec to have exactly one target.
+   */
+  Target target_;
+
+  /*!
+   * \brief The partition rule to use to gather candidates.
+   */
+  PartitionRule rule_;
+
+  /*!
+   * \brief The validation function to apply to each candidate's extracted function before
+   * proceeding to lowering/codegen.
+   */
+  TValidateSubGraphFunc validate_sub_graph_func_ = DefaultValidateSubGraphFunc;
+
+  void VisitAttrs(AttrVisitor* v);
+
+  /*!
+   * \brief Returns all the candidate partitions found by this specification. The candidates
+   * will be for a specific target, but will not yet have an extracted function or cost.
+   */
+  std::vector<CandidatePartition> AllCandidates(const DataflowGraph& dataflow_graph) const;
+
+  std::string ToString() const;
+
+  static constexpr const char* _type_key = "relay.collage.PartitionSpec";
+  TVM_DECLARE_FINAL_OBJECT_INFO(PartitionSpecNode, Object);
+};
+
+class PartitionSpec : public ObjectRef {
+ public:
+  PartitionSpec(String spec_name, Target target, PartitionRule rule,
+                TValidateSubGraphFunc validate_sub_graph_func = DefaultValidateSubGraphFunc);
+
+  TVM_DEFINE_OBJECT_REF_METHODS(PartitionSpec, ObjectRef, PartitionSpecNode);
+};
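+
+/*
+ * For example (a sketch only; the spec name and target are hypothetical):
+ *
+ *   PartitionSpec spec("my_byoc", Target("cuda"), rule);
+ *   std::vector<CandidatePartition> candidates = spec->AllCandidates(dataflow_graph);
+ *
+ * Each returned candidate then has "my_byoc" nested into its rule name.
+ */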
+
+}  // namespace collage
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_COLLAGE_PARTITION_SPEC_H_
diff --git a/tests/cpp/relay/collage/partition_rule_test.cc b/tests/cpp/relay/collage/partition_rule_test.cc
new file mode 100644
index 0000000000000..fab34cd3d32d7
--- /dev/null
+++ b/tests/cpp/relay/collage/partition_rule_test.cc
@@ -0,0 +1,303 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "../../../src/relay/collage/partition_rule.h"
+
+#include <gtest/gtest.h>
+#include <tvm/parser/parser.h>
+#include <tvm/relay/dataflow_pattern.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/transform.h>
+
+#include "../../../src/relay/collage/partition_spec.h"
+
+namespace tvm {
+namespace relay {
+namespace collage {
+namespace {
+
+Constant MakeConstant(std::initializer_list<int64_t> shape) {
+  return Constant(runtime::NDArray::Empty(shape, DataType::Float(32), {kDLCPU, 0}));
+}
+
+Function MakeTestFunction(
+    const std::string& mod_text,
+    std::initializer_list<std::initializer_list<int64_t>> constant_shapes) {
+  Array<ObjectRef> constants;
+  for (const auto& shape : constant_shapes) {
+    constants.push_back(MakeConstant(shape));
+  }
+  Map<String, Array<ObjectRef>> metatable;
+  metatable.Set("relay.Constant", constants);
+  IRModule mod = parser::ParseModule("string", mod_text, {}, metatable);
+  mod = transform::CapturePostDfsIndexInSpans()(mod);
+  auto func = Downcast<Function>(mod->Lookup("main"));
+  LOG(INFO) << "------- input function -------";
+  LOG(INFO) << PrettyPrint(func);
+  LOG(INFO) << "------------------------------";
+  return func;
+}
+
+Function StandardTestFunction() {
+  constexpr const char* kMod = R"(
+    #[version = "0.0.5"]
+    def @main(%x: Tensor[(10, 10), float32]) {
+      %0 = abs(%x);      // 3
+      %1 = nn.relu(%0);  // 4
+      nn.relu(%1)        // 5
+    }
+  )";
+  return MakeTestFunction(kMod, /*constant_shapes=*/{});
+}
+
+std::vector<CandidatePartition> ActualCandidates(const DataflowGraph& graph, const Function& func,
+                                                 const PartitionSpec& spec,
+                                                 const PartitionRule& rule) {
+  auto candidates = rule->AllCandidates(graph, spec);
+  LOG(INFO) << "--------- actual candidates -------------";
+  for (const auto& candidate : candidates) {
+    LOG(INFO) << candidate->ToString();
+  }
+  LOG(INFO) << "-----------------------------------------";
+  return candidates;
+}
+
+std::vector<CandidatePartition> ExpectedCandidates(
+    const DataflowGraph& graph, const runtime::String rule_name, const PartitionSpec& spec,
+    const std::vector<std::vector<PostDfsIndex>> index_sets) {
+  std::vector<CandidatePartition> candidate_partitions;
+  for (const auto& indexes : index_sets) {
+    auto subgraph = SubGraph(graph, IndexSet(graph.size(), indexes));
+    auto candidate = CandidatePartition(rule_name, subgraph, spec);
+    candidate_partitions.emplace_back(std::move(candidate));
+  }
+  return candidate_partitions;
+}
+
+void AssertEqual(const std::vector<CandidatePartition>& actual,
+                 const std::vector<CandidatePartition>& expected) {
+  ASSERT_EQ(actual.size(), expected.size());
+  std::set<CandidatePartition, CandidatePartitionCompare> actual_set(actual.begin(), actual.end());
+  std::set<CandidatePartition, CandidatePartitionCompare> expected_set(expected.begin(),
+                                                                       expected.end());
+  ASSERT_EQ(actual_set.size(), expected_set.size());
+  for (const auto& actual_candidate : actual_set) {
+    ASSERT_EQ(expected_set.count(actual_candidate), 1);
+  }
+}
+
+TEST(PartitionRule, DFPatternSingleOp) {
+  auto func = StandardTestFunction();
+  auto graph = DataflowGraph(func);
+  Target target("llvm");
+  auto spec = PartitionSpec("test_spec", target, {});
+
+  {
+    auto pattern = IsOp("nn.relu")({IsWildcard()});
+    auto rule = DFPatternPartitionRule("relu_pattern", pattern);
+    auto expected_candidates = ExpectedCandidates(graph, "relu_pattern", spec, {{4}, {5}});
+
+    auto candidates = ActualCandidates(graph, func, spec, rule);
+
+    ICHECK_EQ(candidates.size(), 2);
+    for (size_t i = 0; i < candidates.size(); i++) {
+      ICHECK(CandidatePartitionEquals()(candidates[i], expected_candidates[i]));
+    }
+  }
+}
+
+TEST(PartitionRule, DFPatternOverlap) {
+  auto func = StandardTestFunction();
+  auto graph = DataflowGraph(func);
+  Target target("llvm");
+  auto spec = PartitionSpec("test_spec", target, {});
+
+  {
+    auto pattern =
+        IsOp("nn.relu")({IsOp("nn.relu")({IsWildcard()}) || IsOp("abs")({IsWildcard()})});
+    auto rule = DFPatternPartitionRule("relu+abs_pattern", pattern);
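+    // The pattern matches both nn.relu(abs(..)) (nodes {3, 4}) and nn.relu(nn.relu(..))
+    // (nodes {4, 5}) of StandardTestFunction, so the two expected candidates deliberately
+    // overlap at post-dfs node 4.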
DFPatternPartitionRule("relu+abs_pattern", pattern); + + auto candidates = ActualCandidates(graph, func, spec, rule); + + auto expected_candidates = + ExpectedCandidates(graph, "relu+abs_pattern", spec, {{3, 4}, {4, 5}}); + AssertEqual(candidates, expected_candidates); + } +} + +TEST(PartitionRule, Composite) { + auto func = StandardTestFunction(); + auto graph = DataflowGraph(func); + Target target("llvm"); + auto spec = PartitionSpec("test_spec", target, {}); + + { + auto pattern = IsOp("nn.relu")({IsWildcard()}); + auto df_rule = DFPatternPartitionRule("relu_pattern", pattern); + auto composite_rule = CompositePartitionRule("composite", df_rule); + + auto candidates = ActualCandidates(graph, func, spec, composite_rule); + auto rewrite_expr = CandidatePartition::ParallelRewrite(graph, candidates); + + ICHECK_EQ(candidates.size(), 2); + + constexpr const char* kExpectedMod = R"( + #[version = "0.0.5"] + def @main(%x: Tensor[(10, 10), float32]) { + %0 = abs(%x); + %1 = fn (%FunctionVar_01: Tensor[(10, 10), float32], Composite="composite") { + nn.relu(%FunctionVar_01) + }; + %2 = %1(%0); + %3 = fn (%FunctionVar_0: Tensor[(10, 10), float32], Composite="composite") { + nn.relu(%FunctionVar_0) + }; + %3(%2) + } + )"; + Expr expected_expr = MakeTestFunction(kExpectedMod, /*constant_shapes=*/{}); + ICHECK(StructuralEqual()(rewrite_expr, expected_expr)); + } +} + +TEST(PartitionRule, PrimitiveTVM) { + auto func = StandardTestFunction(); + auto graph = DataflowGraph(func); + Target target("llvm"); + auto spec = PartitionSpec("test_spec", target, {}); + + { + auto pattern = IsOp("nn.relu")({IsWildcard()}); + auto df_rule = DFPatternPartitionRule("relu_pattern", pattern); + auto primitive_rule = PrimitivePartitionRule("primitive", df_rule); + + auto candidates = ActualCandidates(graph, func, spec, primitive_rule); + auto rewrite_expr = CandidatePartition::ParallelRewrite(graph, candidates); + + ICHECK_EQ(candidates.size(), 2); + constexpr const char* kExpectedMod = R"( + #[version = "0.0.5"] + def @main(%x: Tensor[(10, 10), float32]) { + %0 = abs(%x); + %1 = fn (%FunctionVar_01: Tensor[(10, 10), float32], Primitive=1) { + nn.relu(%FunctionVar_01) + }; + %2 = %1(%0); + %3 = fn (%FunctionVar_0: Tensor[(10, 10), float32], Primitive=1) { + nn.relu(%FunctionVar_0) + }; + %3(%2) + } + )"; + Expr expected_expr = MakeTestFunction(kExpectedMod, /*constant_shapes=*/{}); + ICHECK(StructuralEqual()(rewrite_expr, expected_expr)); + } +} + +TVM_REGISTER_TARGET_KIND("test_ext_codegen", kDLCUDA) + .set_attr(tvm::attr::kIsExternalCodegen, Bool(true)); + +TEST(PartitionRule, PrimitiveExternal) { + auto func = StandardTestFunction(); + auto graph = DataflowGraph(func); + Target target("test_ext_codegen"); + auto spec = PartitionSpec("test_ext_codegen", target, {}); + + { + auto pattern = IsOp("nn.relu")({IsWildcard()}); + auto df_rule = DFPatternPartitionRule("relu_pattern", pattern); + auto primitive_rule = PrimitivePartitionRule("primitive", df_rule); + + auto candidates = ActualCandidates(graph, func, spec, primitive_rule); + auto rewrite_expr = CandidatePartition::ParallelRewrite(graph, candidates); + + ICHECK_EQ(candidates.size(), 2); + constexpr const char* kExpectedMod = R"( + #[version = "0.0.5"] + def @main(%x: Tensor[(10, 10), float32]) { + %0 = abs(%x); + %1 = fn (%FunctionVar_01: Tensor[(10, 10), float32], Primitive=1, Compiler="test_ext_codegen") { + nn.relu(%FunctionVar_01) + }; + %2 = %1(%0); + %3 = fn (%FunctionVar_0: Tensor[(10, 10), float32], Primitive=1, Compiler="test_ext_codegen") { + 
+TEST(PartitionRule, PrimitiveExternal) {
+  auto func = StandardTestFunction();
+  auto graph = DataflowGraph(func);
+  Target target("test_ext_codegen");
+  auto spec = PartitionSpec("test_ext_codegen", target, {});
+
+  {
+    auto pattern = IsOp("nn.relu")({IsWildcard()});
+    auto df_rule = DFPatternPartitionRule("relu_pattern", pattern);
+    auto primitive_rule = PrimitivePartitionRule("primitive", df_rule);
+
+    auto candidates = ActualCandidates(graph, func, spec, primitive_rule);
+    auto rewrite_expr = CandidatePartition::ParallelRewrite(graph, candidates);
+
+    ICHECK_EQ(candidates.size(), 2);
+    constexpr const char* kExpectedMod = R"(
+      #[version = "0.0.5"]
+      def @main(%x: Tensor[(10, 10), float32]) {
+        %0 = abs(%x);
+        %1 = fn (%FunctionVar_01: Tensor[(10, 10), float32], Primitive=1, Compiler="test_ext_codegen") {
+          nn.relu(%FunctionVar_01)
+        };
+        %2 = %1(%0);
+        %3 = fn (%FunctionVar_0: Tensor[(10, 10), float32], Primitive=1, Compiler="test_ext_codegen") {
+          nn.relu(%FunctionVar_0)
+        };
+        %3(%2)
+      }
+    )";
+    Expr expected_expr = MakeTestFunction(kExpectedMod, /*constant_shapes=*/{});
+    ICHECK(StructuralEqual()(rewrite_expr, expected_expr));
+  }
+}
+
+TEST(PartitionRule, Union) {
+  auto func = StandardTestFunction();
+  auto graph = DataflowGraph(func);
+  Target target("llvm");
+  auto spec = PartitionSpec("test_spec", target, {});
+
+  {
+    auto abs_pattern = IsOp("abs")({IsWildcard()});
+    auto abs_rule = DFPatternPartitionRule("abs_pattern", abs_pattern);
+    auto relu_pattern = IsOp("nn.relu")({IsWildcard()});
+    auto relu_rule = DFPatternPartitionRule("relu_pattern", relu_pattern);
+    auto union_rule = UnionPartitionRule("union", {abs_rule, relu_rule});
+
+    auto abs_candidates = ExpectedCandidates(graph, "abs_pattern", spec, {{3}});
+    auto relu_candidates = ExpectedCandidates(graph, "relu_pattern", spec, {{4}, {5}});
+
+    auto candidates = ActualCandidates(graph, func, spec, union_rule);
+
+    std::vector<CandidatePartition> expected_candidates;
+    expected_candidates.insert(expected_candidates.end(), abs_candidates.begin(),
+                               abs_candidates.end());
+    expected_candidates.insert(expected_candidates.end(), relu_candidates.begin(),
+                               relu_candidates.end());
+    AssertEqual(candidates, expected_candidates);
+  }
+}
+
+TEST(PartitionRule, OpCallByKind) {
+  constexpr const char* kMod = R"(
+    #[version = "0.0.5"]
+    def @main(%x: Tensor[(10, 10), float32]) {
+      %0 = abs(%x);      // 4
+      %1 = add(%0, %x);  // 5
+      shape_of(%1)       // 6
+    }
+  )";
+  auto func = MakeTestFunction(kMod, {});
+  auto graph = DataflowGraph(func);
+  Target target("llvm");
+  auto spec = PartitionSpec("test_spec", target, {});
+
+  {
+    auto rule = OpCallByKindPartitionRule("op_call_by_kind");
+    auto candidates = ActualCandidates(graph, func, spec, rule);
+
+    auto expected_candidates = ExpectedCandidates(graph, "op_call_by_kind", spec, {{4}, {5}});
+    AssertEqual(candidates, expected_candidates);
+  }
+}
+
+}  // namespace
+}  // namespace collage
+}  // namespace relay
+}  // namespace tvm

From 993a8ea094575f2823aebf2b8eb37e9f4ac44d7d Mon Sep 17 00:00:00 2001
From: Rafael Stahl
Date: Tue, 12 Jul 2022 18:57:17 +0200
Subject: [PATCH 095/111] [Frontend][TFLite] respect out type of Shape op
 (#11877)

* [Frontend][TFLite] respect out type of Shape op

* tests: update for changes to tflite shape handling

* lint fix
---
 python/tvm/relay/frontend/tflite.py          | 14 +++++++++++++-
 tests/python/frontend/tflite/test_forward.py | 12 ++++++++++--
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py
index c8352a9949e87..239d72055bff6 100644
--- a/python/tvm/relay/frontend/tflite.py
+++ b/python/tvm/relay/frontend/tflite.py
@@ -888,10 +888,22 @@ def convert_range(self, op):
 
     def convert_shape(self, op):
         """Convert TFLite Shape"""
+        try:
+            from tflite.BuiltinOptions import BuiltinOptions
+            from tflite.ShapeOptions import ShapeOptions
+        except ImportError:
+            raise ImportError("The tflite package must be installed")
+
         input_tensors = self.get_input_tensors(op)
         assert len(input_tensors) == 1, "input tensors length should be 1"
 
-        out = shape_of(self.get_tensor_expr(input_tensors[0]))
+        assert op.BuiltinOptionsType() == BuiltinOptions.ShapeOptions
+        op_options = op.BuiltinOptions()
+        shape_options = ShapeOptions()
+        shape_options.Init(op_options.Bytes, op_options.Pos)
+
+        out_type = self.get_tensor_type_str(shape_options.OutType())
+        out = shape_of(self.get_tensor_expr(input_tensors[0]), dtype=out_type)
 
         return out
 
diff --git a/tests/python/frontend/tflite/test_forward.py
b/tests/python/frontend/tflite/test_forward.py index c271a669e95cc..6acc8554b4ddc 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -1767,7 +1767,9 @@ def test_forward_range(): ####################################################################### # Shape # ----- -def test_forward_shape(): + + +def _test_shape(dtype): # tflite 1.13 convert method does not accept empty shapes if package_version.parse(tf.VERSION) >= package_version.parse("1.14.0"): tf.reset_default_graph() @@ -1777,7 +1779,8 @@ def test_forward_shape(): limit = tf.placeholder(dtype=tf.int32, shape=[], name="limit") delta = tf.placeholder(dtype=tf.int32, shape=[], name="delta") r = tf.range(start, limit, delta, tf.int32, name="range") - out = tf.shape(r, out_type=tf.dtypes.int32) + out = tf.shape(r, out_type=dtype) + out = tf.add(out, tf.constant([1], dtype=dtype)) compare_tflite_with_tvm( [x for x in np.nditer(data)], ["start", "limit", "delta"], @@ -1787,6 +1790,11 @@ def test_forward_shape(): ) +def test_forward_shape(): + _test_shape(tf.int32) + _test_shape(tf.int64) + + ####################################################################### # Concatenation # ------------- From 6d676badff499a3b87fb47370f2f0d1d1318e8ed Mon Sep 17 00:00:00 2001 From: zhaoyang-star Date: Wed, 13 Jul 2022 00:57:54 +0800 Subject: [PATCH 096/111] [QNN] Replace nn.leaky_relu with qnn.leaky_relu (#11930) * [QNN] Replace nn.leaky_relu with qnn.leaky_relu * jostle ci * fix typo --- python/tvm/relay/frontend/qnn_torch.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/frontend/qnn_torch.py b/python/tvm/relay/frontend/qnn_torch.py index 0485a993acfbb..824d3bbe64a7f 100644 --- a/python/tvm/relay/frontend/qnn_torch.py +++ b/python/tvm/relay/frontend/qnn_torch.py @@ -937,10 +937,9 @@ def _impl(inputs, _): return _impl -def _leaky_relu(): +def _leaky_relu(fp32_piggy_back=False): # refer to src/ATen/native/quantized/cpu/qrelu.cpp - def _impl(inputs, _): - assert len(inputs) == 7, "Input quant params not found in op inputs" + def _impl_fp32(inputs, _): alpha = inputs[1] output_scale = _expr.const(inputs[3]) output_zero_point = _expr.const(inputs[4]) @@ -952,6 +951,18 @@ def _impl(inputs, _): dequantized, output_scale, output_zero_point, out_dtype="uint8" ) + def _impl_int8(inputs, _): + alpha = inputs[1] + output_scale = _expr.const(inputs[3]) + output_zero_point = _expr.const(inputs[4]) + return relay.qnn.op.leaky_relu(inputs[0], alpha, output_scale, output_zero_point) + + def _impl(inputs, _): + assert len(inputs) == 7, "Input quant params not found in op inputs" + if fp32_piggy_back: + return _impl_fp32(inputs, _) + return _impl_int8(inputs, _) + return _impl From ef5c3ed872c33a1587dd41c6c97dd85350df7269 Mon Sep 17 00:00:00 2001 From: zhaoyang-star Date: Wed, 13 Jul 2022 00:58:11 +0800 Subject: [PATCH 097/111] [QNN] Use sigmoid Lookup Table method instead of fallback to fp32 (#12038) --- python/tvm/relay/frontend/pytorch.py | 2 +- python/tvm/relay/frontend/qnn_torch.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index d7e1a5dd1ddb7..7532f643dee4c 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -1567,7 +1567,7 @@ def func(x): assert len(inputs) == 3, "Input quant param not found in op inputs" input_scale = _expr.const(inputs[1]) input_zero_point = _expr.const(inputs[2]) - return 
qnn_torch.apply_with_fp32_fallback(data, input_scale, input_zero_point, func) + return qnn_torch.quantized_sigmoid(data, input_scale, input_zero_point) return func(data) diff --git a/python/tvm/relay/frontend/qnn_torch.py b/python/tvm/relay/frontend/qnn_torch.py index 824d3bbe64a7f..251f46630ab38 100644 --- a/python/tvm/relay/frontend/qnn_torch.py +++ b/python/tvm/relay/frontend/qnn_torch.py @@ -571,6 +571,14 @@ def quantized_relu(data, input_zero_point): return _op.tensor.maximum(data, zp) +def quantized_sigmoid(data, input_scale, input_zero_point): + output_scale = input_scale + output_zero_point = input_zero_point + return relay.qnn.op.sigmoid( + data, input_scale, input_zero_point, output_scale, output_zero_point + ) + + def _quantize_per_tensor(): def _impl(inputs, _): dim = len(infer_shape(inputs[0])) From 175e3a77b1124c1c1ef2e8b9741dce553e889671 Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Tue, 12 Jul 2022 19:19:37 +0100 Subject: [PATCH 098/111] [docs][tvmc] Fix ResNet50 model URL (#12040) Fix the ResNet50 Models in both tvmc tutorials so that the commands suggested will work fine. Co-Authored-By: Liam Sturge Co-authored-by: Liam Sturge --- gallery/tutorial/tvmc_command_line_driver.py | 2 +- gallery/tutorial/tvmc_python.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery/tutorial/tvmc_command_line_driver.py b/gallery/tutorial/tvmc_command_line_driver.py index 8a60f12a05b28..39e5f06311cd6 100644 --- a/gallery/tutorial/tvmc_command_line_driver.py +++ b/gallery/tutorial/tvmc_command_line_driver.py @@ -94,7 +94,7 @@ # # .. code-block:: bash # -# wget https://github.com/onnx/models/raw/652f4e4af7975c8e7a505c4b6e0f8ac72d8260ea/vision/classification/resnet/model/resnet50-v2-7.onnx +# wget https://github.com/onnx/models/raw/b9a54e89508f101a1611cd64f4ef56b9cb62c7cf/vision/classification/resnet/model/resnet50-v2-7.onnx # ################################################################################ diff --git a/gallery/tutorial/tvmc_python.py b/gallery/tutorial/tvmc_python.py index 28b0a97450461..9658036a2cc63 100644 --- a/gallery/tutorial/tvmc_python.py +++ b/gallery/tutorial/tvmc_python.py @@ -29,7 +29,7 @@ mkdir myscripts cd myscripts - wget https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-v2-7.onnx + wget https://github.com/onnx/models/raw/b9a54e89508f101a1611cd64f4ef56b9cb62c7cf/vision/classification/resnet/model/resnet50-v2-7.onnx mv resnet50-v2-7.onnx my_model.onnx touch tvmcpythonintro.py From ad44a0fe153e3394a7c987c640693b3e3c6764d7 Mon Sep 17 00:00:00 2001 From: Qingchao Shen Date: Wed, 13 Jul 2022 03:03:59 +0800 Subject: [PATCH 099/111] fix some typo in conv2d.py (#12067) --- python/tvm/topi/nn/conv2d.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index 5db752f6d54f0..d23b8d857e4e2 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -977,9 +977,9 @@ def _conv2d_winograd_nhwc_impl( Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 4-D with shape [batch, in_height, in_width, in_channel] - weight : tvm.Tensor + weight : tvm.te.Tensor 4-D with shape [filter_height, filter_width, in_channel, num_filter] strides : int or a list/tuple of two ints stride size, or [stride_height, stride_width] @@ -1000,7 +1000,7 @@ def _conv2d_winograd_nhwc_impl( Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 4-D with shape [batch, out_height, out_width, out_channel] """ N, H, W, 
CI = get_const_tuple(data.shape) @@ -1159,9 +1159,9 @@ def conv2d_winograd_nhwc( Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 4-D with shape [batch, in_height, in_width, in_channel] - weight : tvm.Tensor + weight : tvm.te.Tensor 4-D with shape [filter_height, filter_width, in_channel, num_filter] strides : int or a list/tuple of two ints stride size, or [stride_height, stride_width] @@ -1180,7 +1180,7 @@ def conv2d_winograd_nhwc( Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 4-D with shape [batch, out_height, out_width, out_channel] """ tile_size = 4 @@ -1214,9 +1214,9 @@ def conv2d_winograd_nhwc_without_weight_transform( Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 4-D with shape [batch, in_height, in_width, in_channel] - weight : tvm.Tensor + weight : tvm.te.Tensor 4-D with shape [filter_height, filter_width, in_channel, num_filter] strides : int or a list/tuple of two ints stride size, or [stride_height, stride_width] @@ -1233,7 +1233,7 @@ def conv2d_winograd_nhwc_without_weight_transform( Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 4-D with shape [batch, out_height, out_width, out_channel] """ From deda4d59686806afcd722c3a179016aebeed42ce Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Tue, 12 Jul 2022 13:09:35 -0700 Subject: [PATCH 100/111] [MetaSchedule][Test] Add unittests for DEP (#12071) --- .../unittest/test_meta_schedule_space_cpu.py | 161 ++++++++++++++++++ .../unittest/test_meta_schedule_space_cuda.py | 89 ++++++++++ 2 files changed, 250 insertions(+) diff --git a/tests/python/unittest/test_meta_schedule_space_cpu.py b/tests/python/unittest/test_meta_schedule_space_cpu.py index 87f61ec32880a..d757d4bef71df 100644 --- a/tests/python/unittest/test_meta_schedule_space_cpu.py +++ b/tests/python/unittest/test_meta_schedule_space_cpu.py @@ -741,8 +741,169 @@ def cap_2(inputs: T.Buffer[(1, 16, 16, 4, 4, 32), "float32"], weight: T.Buffer[( ) +def test_cpu_dep(): + # fmt: off + @T.prim_func + def dep_0(placeholder: T.Buffer[(1, 112, 112, 32), "float32"], placeholder_1: T.Buffer[(1, 3, 3, 32), "float32"], depth_conv2d_nhwc: T.Buffer[(1, 112, 112, 32), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + with T.block("root"): + T.reads() + T.writes() + T.block_attr({"meta_schedule.parallel":288, "meta_schedule.unroll_explicit":64, "meta_schedule.vectorize":64}) + PadInput = T.alloc_buffer([1, 114, 114, 32], dtype="float32") + depth_conv2d_nhwc_global = T.alloc_buffer([1, 112, 112, 32], dtype="float32") + for i0, i1, i2, i3 in T.grid(1, 114, 114, 32): + with T.block("PadInput"): + i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) + T.reads(placeholder[i0_1, i1_1 - 1, i2_1 - 1, i3_1]) + T.writes(PadInput[i0_1, i1_1, i2_1, i3_1]) + PadInput[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i1_1 and i1_1 < 113 and 1 <= i2_1 and i2_1 < 113, placeholder[i0_1, i1_1 - 1, i2_1 - 1, i3_1], T.float32(0), dtype="float32") + for i0_0, i1_0, i2_0, i3_0, i0_1_1, i1_1_1, i2_1_1, i3_1_1 in T.grid(1, 1, 1, 1, 1, 4, 4, 8): + for i4_0, i5_0, i0_2, i1_2, i2_2, i3_2, i4_1, i5_1, i0_3, i1_3, i2_3, i3_3 in T.grid(1, 1, 1, 2, 7, 2, 3, 3, 1, 14, 4, 2): + with T.block("depth_conv2d_nhwc"): + n = T.axis.spatial(1, i0_0 + i0_1_1 + i0_2 + i0_3) + h = T.axis.spatial(112, i1_0 * 112 + i1_1_1 * 28 + i1_2 * 14 + i1_3) + w = T.axis.spatial(112, i2_0 * 112 + i2_1_1 * 28 + i2_2 * 4 + i2_3) + c = T.axis.spatial(32, i3_0 * 32 + i3_1_1 * 4 + i3_2 * 2 + i3_3) + rh = 
T.axis.reduce(3, i4_0 * 3 + i4_1) + rw = T.axis.reduce(3, i5_0 * 3 + i5_1) + T.reads(PadInput[n, h + rh, w + rw, c], placeholder_1[0, rh, rw, c]) + T.writes(depth_conv2d_nhwc_global[n, h, w, c]) + T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"}) + with T.init(): + depth_conv2d_nhwc_global[n, h, w, c] = T.float32(0) + depth_conv2d_nhwc_global[n, h, w, c] = depth_conv2d_nhwc_global[n, h, w, c] + PadInput[n, h + rh, w + rw, c] * placeholder_1[0, rh, rw, c] + for ax0, ax1, ax2, ax3 in T.grid(1, 28, 28, 4): + with T.block("depth_conv2d_nhwc_global"): + v0 = T.axis.spatial(1, ax0) + v1 = T.axis.spatial(112, i1_1_1 * 28 + ax1) + v2 = T.axis.spatial(112, i2_1_1 * 28 + ax2) + v3 = T.axis.spatial(32, i3_1_1 * 4 + ax3) + T.reads(depth_conv2d_nhwc_global[v0, v1, v2, v3]) + T.writes(depth_conv2d_nhwc[v0, v1, v2, v3]) + depth_conv2d_nhwc[v0, v1, v2, v3] = depth_conv2d_nhwc_global[v0, v1, v2, v3] + @T.prim_func + def dep_1(placeholder: T.Buffer[(1, 112, 112, 32), "float32"], placeholder_1: T.Buffer[(1, 3, 3, 32), "float32"], depth_conv2d_nhwc: T.Buffer[(1, 112, 112, 32), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + with T.block("root"): + T.reads() + T.writes() + T.block_attr({"meta_schedule.parallel":288, "meta_schedule.unroll_explicit":16, "meta_schedule.vectorize":64}) + PadInput = T.alloc_buffer([1, 114, 114, 32], dtype="float32") + depth_conv2d_nhwc_global = T.alloc_buffer([1, 112, 112, 32], dtype="float32") + for i0, i1, i2, i3 in T.grid(1, 114, 114, 32): + with T.block("PadInput"): + i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) + T.reads(placeholder[i0_1, i1_1 - 1, i2_1 - 1, i3_1]) + T.writes(PadInput[i0_1, i1_1, i2_1, i3_1]) + PadInput[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i1_1 and i1_1 < 113 and 1 <= i2_1 and i2_1 < 113, placeholder[i0_1, i1_1 - 1, i2_1 - 1, i3_1], T.float32(0), dtype="float32") + for i0_0, i1_0, i2_0, i3_0 in T.grid(1, 1, 1, 1): + for i0_1_1, i1_1_1, i2_1_1, i3_1_1, i4_0, i5_0, i0_2, i1_2, i2_2, i3_2, i4_1, i5_1, i0_3, i1_3, i2_3, i3_3 in T.grid(1, 4, 4, 8, 1, 1, 1, 2, 7, 2, 3, 3, 1, 14, 4, 2): + with T.block("depth_conv2d_nhwc"): + n = T.axis.spatial(1, i0_0 + i0_1_1 + i0_2 + i0_3) + h = T.axis.spatial(112, i1_0 * 112 + i1_1_1 * 28 + i1_2 * 14 + i1_3) + w = T.axis.spatial(112, i2_0 * 112 + i2_1_1 * 28 + i2_2 * 4 + i2_3) + c = T.axis.spatial(32, i3_0 * 32 + i3_1_1 * 4 + i3_2 * 2 + i3_3) + rh = T.axis.reduce(3, i4_0 * 3 + i4_1) + rw = T.axis.reduce(3, i5_0 * 3 + i5_1) + T.reads(PadInput[n, h + rh, w + rw, c], placeholder_1[0, rh, rw, c]) + T.writes(depth_conv2d_nhwc_global[n, h, w, c]) + T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"}) + with T.init(): + depth_conv2d_nhwc_global[n, h, w, c] = T.float32(0) + depth_conv2d_nhwc_global[n, h, w, c] = depth_conv2d_nhwc_global[n, h, w, c] + PadInput[n, h + rh, w + rw, c] * placeholder_1[0, rh, rw, c] + for ax0, ax1, ax2, ax3 in T.grid(1, 112, 112, 32): + with T.block("depth_conv2d_nhwc_global"): + v0, v1, v2, v3 = T.axis.remap("SSSS", [ax0, ax1, ax2, ax3]) + T.reads(depth_conv2d_nhwc_global[v0, v1, v2, v3]) + T.writes(depth_conv2d_nhwc[v0, v1, v2, v3]) + depth_conv2d_nhwc[v0, v1, v2, v3] = depth_conv2d_nhwc_global[v0, v1, v2, v3] + @T.prim_func + def dep_2(placeholder: T.Buffer[(1, 112, 112, 32), "float32"], placeholder_1: T.Buffer[(1, 3, 3, 32), "float32"], depth_conv2d_nhwc: T.Buffer[(1, 112, 112, 32), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + 
with T.block("root"): + T.reads() + T.writes() + T.block_attr({"meta_schedule.parallel":288, "meta_schedule.unroll_explicit":0, "meta_schedule.vectorize":64}) + PadInput = T.alloc_buffer([1, 114, 114, 32], dtype="float32") + for i0_0, i1_0, i2_0, i3_0, i0_1, i1_1 in T.grid(1, 1, 1, 1, 1, 4): + for ax0, ax1, ax2, ax3 in T.grid(1, 30, 114, 32): + with T.block("PadInput"): + i0 = T.axis.spatial(1, ax0) + i1 = T.axis.spatial(114, i1_1 * 28 + ax1) + i2, i3 = T.axis.remap("SS", [ax2, ax3]) + T.reads(placeholder[i0, i1 - 1, i2 - 1, i3]) + T.writes(PadInput[i0, i1, i2, i3]) + PadInput[i0, i1, i2, i3] = T.if_then_else(1 <= i1 and i1 < 113 and 1 <= i2 and i2 < 113, placeholder[i0, i1 - 1, i2 - 1, i3], T.float32(0), dtype="float32") + for i2_1, i3_1, i4_0, i5_0, i0_2, i1_2, i2_2, i3_2, i4_1, i5_1, i0_3, i1_3, i2_3, i3_3 in T.grid(4, 8, 1, 1, 1, 2, 7, 2, 3, 3, 1, 14, 4, 2): + with T.block("depth_conv2d_nhwc"): + n = T.axis.spatial(1, i0_0 + i0_1 + i0_2 + i0_3) + h = T.axis.spatial(112, i1_0 * 112 + i1_1 * 28 + i1_2 * 14 + i1_3) + w = T.axis.spatial(112, i2_0 * 112 + i2_1 * 28 + i2_2 * 4 + i2_3) + c = T.axis.spatial(32, i3_0 * 32 + i3_1 * 4 + i3_2 * 2 + i3_3) + rh = T.axis.reduce(3, i4_0 * 3 + i4_1) + rw = T.axis.reduce(3, i5_0 * 3 + i5_1) + T.reads(PadInput[n, h + rh, w + rw, c], placeholder_1[0, rh, rw, c]) + T.writes(depth_conv2d_nhwc[n, h, w, c]) + T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"}) + with T.init(): + depth_conv2d_nhwc[n, h, w, c] = T.float32(0) + depth_conv2d_nhwc[n, h, w, c] = depth_conv2d_nhwc[n, h, w, c] + PadInput[n, h + rh, w + rw, c] * placeholder_1[0, rh, rw, c] + # fmt: on + decision_0 = [ + ("SamplePerfectTile", [1, 1, 1, 1]), + ("SamplePerfectTile", [1, 4, 2, 14]), + ("SamplePerfectTile", [1, 4, 7, 4]), + ("SamplePerfectTile", [1, 8, 2, 2]), + ("SamplePerfectTile", [1, 3]), + ("SamplePerfectTile", [1, 3]), + ("SampleCategorical", 2), + ("SampleComputeLocation", -1), + ] + decision_1 = [ + ("SamplePerfectTile", [1, 1, 1, 1]), + ("SamplePerfectTile", [1, 4, 2, 14]), + ("SamplePerfectTile", [1, 4, 7, 4]), + ("SamplePerfectTile", [1, 8, 2, 2]), + ("SamplePerfectTile", [1, 3]), + ("SamplePerfectTile", [1, 3]), + ("SampleCategorical", 1), + ("SampleComputeLocation", -1), + ] + decision_2 = [ + ("SamplePerfectTile", [1, 1, 1, 1]), + ("SamplePerfectTile", [1, 4, 2, 14]), + ("SamplePerfectTile", [1, 4, 7, 4]), + ("SamplePerfectTile", [1, 8, 2, 2]), + ("SamplePerfectTile", [1, 3]), + ("SamplePerfectTile", [1, 3]), + ("SampleCategorical", 0), + ("SampleComputeLocation", 5), + ] + mod = create_te_workload("DEP", 0) + actual = ms.TuneContext( + mod=mod, + target=_target(), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules="default", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[dep_0, dep_1, dep_2], + expected_decisions=[decision_0, decision_1, decision_2], + ) + + if __name__ == "__main__": test_cpu_c1d() test_cpu_c2d() test_cpu_c3d() test_cpu_cap() + test_cpu_dep() diff --git a/tests/python/unittest/test_meta_schedule_space_cuda.py b/tests/python/unittest/test_meta_schedule_space_cuda.py index bffb80436cad5..826a1ca062b58 100644 --- a/tests/python/unittest/test_meta_schedule_space_cuda.py +++ b/tests/python/unittest/test_meta_schedule_space_cuda.py @@ -396,8 +396,97 @@ def cap_0(inputs: T.Buffer[(1, 16, 16, 4, 4, 32), "float32"], weight: T.Buffer[( ) +def test_cuda_dep(): + # fmt: off + @T.prim_func + def dep_0(placeholder: T.Buffer[(1, 112, 112, 32), "float32"], placeholder_1: T.Buffer[(1, 3, 3, 32), "float32"], 
depth_conv2d_nhwc: T.Buffer[(1, 112, 112, 32), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + with T.block("root"): + T.reads() + T.writes() + T.block_attr({"meta_schedule.unroll_explicit":16}) + depth_conv2d_nhwc_local = T.alloc_buffer([1, 112, 112, 32], dtype="float32", scope="local") + PadInput_shared = T.alloc_buffer([1, 114, 114, 32], dtype="float32", scope="shared") + placeholder_shared = T.alloc_buffer([1, 3, 3, 32], dtype="float32", scope="shared") + for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(1, thread="blockIdx.x"): + for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(8, thread="vthread.x"): + for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(14, thread="threadIdx.x"): + for i4_0, i5_0 in T.grid(1, 1): + for ax0_ax1_ax2_ax3_fused in T.serial(415872): + with T.block("PadInput_shared"): + v0 = T.axis.spatial(1, 0) + v1 = T.axis.spatial(114, ax0_ax1_ax2_ax3_fused // 3648) + v2 = T.axis.spatial(114, ax0_ax1_ax2_ax3_fused % 3648 // 32) + v3 = T.axis.spatial(32, ax0_ax1_ax2_ax3_fused % 32) + T.reads(placeholder[v0, v1 - 1, v2 - 1, v3]) + T.writes(PadInput_shared[v0, v1, v2, v3]) + T.block_attr({"meta_schedule.cooperative_fetch":3}) + PadInput_shared[v0, v1, v2, v3] = T.if_then_else(1 <= v1 and v1 < 113 and 1 <= v2 and v2 < 113, placeholder[v0, v1 - 1, v2 - 1, v3], T.float32(0), dtype="float32") + for ax0_ax1_ax2_ax3_fused in T.serial(288): + with T.block("placeholder_shared"): + v0 = T.axis.spatial(1, 0) + v1 = T.axis.spatial(3, ax0_ax1_ax2_ax3_fused // 96) + v2 = T.axis.spatial(3, ax0_ax1_ax2_ax3_fused % 96 // 32) + v3 = T.axis.spatial(32, ax0_ax1_ax2_ax3_fused % 32) + T.reads(placeholder_1[v0, v1, v2, v3]) + T.writes(placeholder_shared[v0, v1, v2, v3]) + T.block_attr({"meta_schedule.cooperative_fetch":3}) + placeholder_shared[v0, v1, v2, v3] = placeholder_1[v0, v1, v2, v3] + for i4_1, i5_1, i0_3, i1_3, i2_3, i3_3, i4_2, i5_2, i0_4, i1_4, i2_4, i3_4 in T.grid(3, 1, 1, 4, 16, 8, 1, 3, 1, 7, 1, 1): + with T.block("depth_conv2d_nhwc"): + n = T.axis.spatial(1, i0_4 + i0_3 + 0 + 0 + 0) + h = T.axis.spatial(112, ((0 * 4 + i0_1_i1_1_i2_1_i3_1_fused % 8 // 2 + 0) * 4 + i1_3) * 7 + i1_4) + w = T.axis.spatial(112, ((0 + 0) * 7 + i0_2_i1_2_i2_2_i3_2_fused % 14 // 2) * 16 + i2_3 + i2_4) + c = T.axis.spatial(32, ((0 * 2 + i0_1_i1_1_i2_1_i3_1_fused % 2) * 2 + i0_2_i1_2_i2_2_i3_2_fused % 2) * 8 + i3_3 + i3_4) + rh = T.axis.reduce(3, i4_0 * 3 + i4_1 + i4_2) + rw = T.axis.reduce(3, (i5_0 + i5_1) * 3 + i5_2) + T.reads(PadInput_shared[n, h + rh, w + rw, c], placeholder_shared[0, rh, rw, c]) + T.writes(depth_conv2d_nhwc_local[n, h, w, c]) + T.block_attr({"meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS"}) + with T.init(): + depth_conv2d_nhwc_local[n, h, w, c] = T.float32(0) + depth_conv2d_nhwc_local[n, h, w, c] = depth_conv2d_nhwc_local[n, h, w, c] + PadInput_shared[n, h + rh, w + rw, c] * placeholder_shared[0, rh, rw, c] + for ax0, ax1, ax2, ax3 in T.grid(1, 28, 16, 8): + with T.block("depth_conv2d_nhwc_local"): + v0 = T.axis.spatial(1, ax0) + v1 = T.axis.spatial(112, i0_1_i1_1_i2_1_i3_1_fused // 2 * 28 + ax1) + v2 = T.axis.spatial(112, i0_2_i1_2_i2_2_i3_2_fused // 2 * 16 + ax2) + v3 = T.axis.spatial(32, i0_1_i1_1_i2_1_i3_1_fused % 2 * 16 + i0_2_i1_2_i2_2_i3_2_fused % 2 * 8 + ax3) + T.reads(depth_conv2d_nhwc_local[v0, v1, v2, v3]) + T.writes(depth_conv2d_nhwc[v0, v1, v2, v3]) + depth_conv2d_nhwc[v0, v1, v2, v3] = 
depth_conv2d_nhwc_local[v0, v1, v2, v3] + # fmt: on + decision_0 = [ + ("SamplePerfectTile", [1, 1, 1, 1, 1]), + ("SamplePerfectTile", [1, 4, 1, 4, 7]), + ("SamplePerfectTile", [1, 1, 7, 16, 1]), + ("SamplePerfectTile", [1, 2, 2, 8, 1]), + ("SamplePerfectTile", [1, 3, 1]), + ("SamplePerfectTile", [1, 1, 3]), + ("SampleCategorical", 2), + ("SampleCategorical", 2), + ("SampleCategorical", 1), + ] + mod = create_te_workload("DEP", 0) + actual = ms.TuneContext( + mod=mod, + target=_target(), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules="default", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[dep_0], + expected_decisions=[decision_0], + ) + + if __name__ == "__main__": test_cuda_c1d() test_cuda_c2d() test_cuda_c3d() test_cuda_cap() + test_cuda_dep() From 6536def6f9073c1c03ab61d33ebda9d31867862a Mon Sep 17 00:00:00 2001 From: arangasa <76030063+arangasa@users.noreply.github.com> Date: Wed, 13 Jul 2022 01:45:48 +0530 Subject: [PATCH 101/111] [Topi][Hexagon] Implement Cast F32ToF16 and F16ToF32 Slice Op (#11561) --- python/tvm/topi/hexagon/slice_ops/__init__.py | 6 + python/tvm/topi/hexagon/slice_ops/cast.py | 143 +++++++++++++ python/tvm/topi/hexagon/utils.py | 14 ++ .../contrib/test_hexagon/infrastructure.py | 12 ++ .../test_hexagon/topi/test_cast_slice.py | 199 ++++++++++++++++++ 5 files changed, 374 insertions(+) create mode 100644 python/tvm/topi/hexagon/slice_ops/cast.py create mode 100644 tests/python/contrib/test_hexagon/topi/test_cast_slice.py diff --git a/python/tvm/topi/hexagon/slice_ops/__init__.py b/python/tvm/topi/hexagon/slice_ops/__init__.py index 617aaed920d7d..931b703d7313e 100644 --- a/python/tvm/topi/hexagon/slice_ops/__init__.py +++ b/python/tvm/topi/hexagon/slice_ops/__init__.py @@ -23,5 +23,11 @@ from .batch_flatten import batch_flatten_compute, batch_flatten_stir_schedule from .softmax_slice import * from .clip import * +from .cast import ( + cast_f16_f32_compute, + cast_f16_f32_schedule, + cast_f32_f16_compute, + cast_f32_f16_schedule, +) from .conv2d import * from .reshape import reshape_compute, reshape_stir_schedule diff --git a/python/tvm/topi/hexagon/slice_ops/cast.py b/python/tvm/topi/hexagon/slice_ops/cast.py new file mode 100644 index 0000000000000..b4984763e0e05 --- /dev/null +++ b/python/tvm/topi/hexagon/slice_ops/cast.py @@ -0,0 +1,143 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+""" Hexagon slice cast op compute and schedule""" + +from tvm import te +from tvm import tir +from ..utils import get_layout_transform_fn + + +def get_layout_transform_for_f32(f32_layout_string): + """ + Given f32 layout string, return transform_layout function and + channel/height split factor to be used for scheduling + """ + layout_transform_fn = get_layout_transform_fn(f32_layout_string) + if f32_layout_string == "nhwc-8h2w32c2w-2d": + return [layout_transform_fn, 8] + if f32_layout_string == "nhwc-4h2w32c2w-2d": + return [layout_transform_fn, 4] + if f32_layout_string == "nc-1024c-2d": + return [layout_transform_fn, 1024] + if f32_layout_string == "nc-512c-2d": + return [layout_transform_fn, 512] + raise RuntimeError(f"Unexpected f32_layout '{f32_layout_string}'") + + +def cast_f16_f32_compute(in_tensor): + out_tensor = te.compute( + in_tensor.shape, lambda *indices: in_tensor[indices].astype("float32"), name="CastF16F32" + ) + return out_tensor + + +def cast_f16_f32_stir_schedule_nhwc(func, in_layout, out_layout, h_split_factor): + """Schedule for nhwc f16 to f32 cast: nhwc layout""" + sch = tir.Schedule(func, debug_mask="all") + block_name = "CastF16F32" + n_orig, h_orig, w_orig, c_orig = sch.get_loops(sch.get_block(block_name)) + h_outer, h_inner = sch.split(h_orig, [None, h_split_factor]) + w_outer, w_inner = sch.split(w_orig, [None, 4]) + c_outer, c_inner = sch.split(c_orig, [None, 32]) + w_inner_o, w_inner_i = sch.split(w_inner, [None, 2]) + sch.reorder(n_orig, h_outer, w_outer, c_outer, h_inner, w_inner_o, c_inner, w_inner_i) + sch.transform_layout(block_name, "A", in_layout) + sch.transform_layout(block_name, block_name, out_layout) + fused = sch.fuse(c_inner, w_inner_i) + sch.vectorize(fused) + return sch + + +def cast_f16_f32_stir_schedule_nc(func, in_layout, out_layout, c_split_factor): + """Schedule for nc f16 to f32 cast: nc layout""" + sch = tir.Schedule(func, debug_mask="all") + block_name = "CastF16F32" + _, c_orig = sch.get_loops(sch.get_block(block_name)) + _, c_inner = sch.split(c_orig, [None, c_split_factor]) + sch.transform_layout(block_name, "A", in_layout) + sch.transform_layout(block_name, block_name, out_layout) + sch.vectorize(c_inner) + return sch + + +def cast_f16_f32_schedule(cast_func, in_layout_str, out_layout_str): + """Schedule for f16 to f32 cast: top level function""" + f32_layout_transform_func, split_factor = get_layout_transform_for_f32(out_layout_str) + f16_layout_transform_func = get_layout_transform_fn(in_layout_str) + if in_layout_str == "nhwc-8h2w32c2w-2d": + return cast_f16_f32_stir_schedule_nhwc( + cast_func, + f16_layout_transform_func, + f32_layout_transform_func, + split_factor, + ) + if in_layout_str == "nc-1024c-2d": + return cast_f16_f32_stir_schedule_nc( + cast_func, f16_layout_transform_func, f32_layout_transform_func, split_factor + ) + raise RuntimeError(f"Unexpected input_layout, output_layout '{input_layout, output_layout}'") + + +def cast_f32_f16_compute(in_tensor): + out_tensor = te.compute( + in_tensor.shape, lambda *indices: in_tensor[indices].astype("float16"), name="CastF32F16" + ) + return out_tensor + + +def cast_f32_f16_stir_schedule_nhwc(func, in_layout, out_layout, h_split_factor): + """Schedule for nhwc f32 to f16 cast: nhwc layout""" + sch = tir.Schedule(func, debug_mask="all") + block_name = "CastF32F16" + n_orig, h_orig, w_orig, c_orig = sch.get_loops(sch.get_block(block_name)) + h_outer, h_inner = sch.split(h_orig, [None, h_split_factor]) + w_outer, w_inner = sch.split(w_orig, [None, 4]) + c_outer, c_inner = 
sch.split(c_orig, [None, 32]) + w_inner_o, w_inner_i = sch.split(w_inner, [None, 2]) + sch.reorder(n_orig, h_outer, w_outer, c_outer, h_inner, w_inner_o, c_inner, w_inner_i) + sch.transform_layout(block_name, "A", in_layout) + sch.transform_layout(block_name, block_name, out_layout) + fused = sch.fuse(c_inner, w_inner_i) + sch.vectorize(fused) + return sch + + +def cast_f32_f16_stir_schedule_nc(func, in_layout, out_layout, c_split_factor): + """Schedule for nc f32 to f16 cast: nc layout""" + sch = tir.Schedule(func, debug_mask="all") + block_name = "CastF32F16" + _, c_orig = sch.get_loops(sch.get_block(block_name)) + _, c_inner = sch.split(c_orig, [None, c_split_factor]) + sch.transform_layout(block_name, "A", in_layout) + sch.transform_layout(block_name, block_name, out_layout) + sch.vectorize(c_inner) + return sch + + +def cast_f32_f16_schedule(cast_func, in_layout_str, out_layout_str): + """Schedule for f32 to f16 cast: top level function""" + f32_layout_transform_func, split_factor = get_layout_transform_for_f32(in_layout_str) + f16_layout_transform_func = get_layout_transform_fn(out_layout_str) + if out_layout_str == "nhwc-8h2w32c2w-2d": + return cast_f32_f16_stir_schedule_nhwc( + cast_func, f32_layout_transform_func, f16_layout_transform_func, split_factor + ) + if out_layout_str == "nc-1024c-2d": + return cast_f32_f16_stir_schedule_nc( + cast_func, f32_layout_transform_func, f16_layout_transform_func, split_factor + ) + raise RuntimeError(f"Unexpected input_layout, output_layout '{in_layout_str, out_layout_str}'") diff --git a/python/tvm/topi/hexagon/utils.py b/python/tvm/topi/hexagon/utils.py index 58792fc3294fb..4458c55e62739 100644 --- a/python/tvm/topi/hexagon/utils.py +++ b/python/tvm/topi/hexagon/utils.py @@ -67,6 +67,16 @@ def nc_512c_2d(n, c): return [n, c // 512, te.AXIS_SEPARATOR, c % 512] +def nc_1024c_2d(n, c): + """Return index map for nc_1024c 2d layout""" + return [n, c // 1024, te.AXIS_SEPARATOR, c % 1024] + + +def nhwc_4h2w32c2w_2d(n, h, w, c): + """Return index map for nhwc_4h2w32c2w 2d layout""" + return [n, h // 4, w // 4, c // 32, te.AXIS_SEPARATOR, h % 4, (w % 4) // 2, c % 32, w % 2] + + def nhwc_1024c_2d(n, h, w, c): """Return index map for nhwc_1024 2d layout""" return [n, h, w, c // 1024, te.AXIS_SEPARATOR, c % 1024] @@ -113,6 +123,10 @@ def get_layout_transform_fn(layout): return nc_512c_2d if layout == "nc-512c-1d": return nc_512c_1d + if layout == "nhwc-4h2w32c2w-2d": + return nhwc_4h2w32c2w_2d + if layout == "nc-1024c-2d": + return nc_1024c_2d if layout == "iohw-16i32o2i-1d": return iohw_16i32o2i_1d raise RuntimeError(f"Unexpected layout '{layout}'") diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py index 53351854a06a3..a1fbfdefcdbd3 100644 --- a/tests/python/contrib/test_hexagon/infrastructure.py +++ b/tests/python/contrib/test_hexagon/infrastructure.py @@ -241,6 +241,11 @@ def transform_numpy(arr_np, current_layout: str, new_layout: str): return arr_np.reshape([n, h // 8, 8, w // 4, 2, 2, c // 32, 32]).transpose( 0, 1, 3, 6, 2, 4, 7, 5 ) + if new_layout in ["nhwc-4h2w32c2w-2d"]: + n, h, w, c = arr_np.shape + return arr_np.reshape([n, h // 4, 4, w // 4, 2, 2, c // 32, 32]).transpose( + 0, 1, 3, 6, 2, 4, 7, 5 + ) if new_layout in ["n11c-1024c-2d", "n11c-1024c-1d"]: n, h, w, c = arr_np.shape assert h == 1 and w == 1, "The size of h and w must be 1" @@ -251,7 +256,14 @@ def transform_numpy(arr_np, current_layout: str, new_layout: str): if new_layout == "nhwc-1024c-2d": N, H, W, C = 
arr_np.shape
             return arr_np.reshape([N, H, W, C // 1024, 1024])
+        raise RuntimeError(f"Unexpected new_layout '{new_layout}'")
+
+    if current_layout == "nc":
+        n, c = arr_np.shape
+        if new_layout in ["nc-1024c-2d"]:
+            return arr_np.reshape([n, c // 1024, 1024])
+        if new_layout in ["nc-512c-2d"]:
+            return arr_np.reshape([n, c // 512, 512])
         raise RuntimeError(f"Unexpected new_layout '{new_layout}'")
 
     if current_layout == "nhw":
diff --git a/tests/python/contrib/test_hexagon/topi/test_cast_slice.py b/tests/python/contrib/test_hexagon/topi/test_cast_slice.py
new file mode 100644
index 0000000000000..30ea4c94b8b16
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/topi/test_cast_slice.py
@@ -0,0 +1,199 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+""" Tests for Hexagon slice cast ops """
+import numpy as np
+import pytest
+
+import tvm
+import tvm.testing
+from tvm import te
+import tvm.topi.hexagon.slice_ops as sl
+from ..infrastructure import allocate_hexagon_array, transform_numpy
+
+
+class TestCastF16F32Slice2d:
+    """
+    For testing Cast F16 to F32 Slice ops
+    """
+
+    input_shape, orig_layout, input_layout, output_layout, axis_sep = tvm.testing.parameters(
+        ((1, 16, 12, 64), "nhwc", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", [4]),
+        ((1, 64, 64, 32), "nhwc", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", [4]),
+        ((1, 16, 12, 64), "nhwc", "nhwc-8h2w32c2w-2d", "nhwc-4h2w32c2w-2d", [4]),
+        ((1, 64, 64, 32), "nhwc", "nhwc-8h2w32c2w-2d", "nhwc-4h2w32c2w-2d", [4]),
+        ((1, 1024), "nc", "nc-1024c-2d", "nc-1024c-2d", [2]),
+        ((1, 1024), "nc", "nc-1024c-2d", "nc-512c-2d", [2]),
+    )
+    dtype = tvm.testing.parameter("float16")
+    working_scope = tvm.testing.parameter("global.vtcm")
+
+    @tvm.testing.fixture
+    def input_np(self, input_shape, dtype):
+        return np.random.uniform(size=input_shape).astype(dtype)
+
+    @tvm.testing.fixture
+    def transformed_input_np(self, input_np, orig_layout, input_layout):
+        return transform_numpy(input_np, orig_layout, input_layout)
+
+    @tvm.testing.fixture
+    def expected_output_np(self, input_np):
+        ref_np = input_np.astype("float32")
+        return ref_np
+
+    @tvm.testing.fixture
+    def transformed_expected_output_np(self, expected_output_np, orig_layout, output_layout):
+        return transform_numpy(expected_output_np, orig_layout, output_layout)
+
+    @tvm.testing.requires_hexagon
+    def test_cast_fp16_fp32_slice(
+        self,
+        input_shape,
+        dtype,
+        input_layout,
+        output_layout,
+        transformed_input_np,
+        transformed_expected_output_np,
+        axis_sep,
+        hexagon_session,
+        working_scope,
+    ):
+        """
+        Top level testing function for cast fp16 to fp32
+        """
+        if hexagon_session._launcher._serial_number != "simulator":
+            pytest.skip(msg="Due to https://github.com/apache/tvm/issues/11957")
+        target_hexagon = tvm.target.hexagon("v68")
+        target = tvm.target.Target(target_hexagon, 
host=target_hexagon) + cast_input = te.placeholder(input_shape, name="A", dtype=dtype) + cast_output = sl.cast_f16_f32_compute(cast_input) + cast_func = te.create_prim_func([cast_input, cast_output]) + tir_s = sl.cast_f16_f32_schedule(cast_func, input_layout, output_layout) + input_data = allocate_hexagon_array( + hexagon_session.device, + data=transformed_input_np, + axis_separators=axis_sep, + mem_scope=working_scope, + ) + output_data = allocate_hexagon_array( + hexagon_session.device, + tensor_shape=transformed_expected_output_np.shape, + dtype=transformed_expected_output_np.dtype, + axis_separators=axis_sep, + mem_scope=working_scope, + ) + with tvm.transform.PassContext(opt_level=3): + tir_irm = tvm.lower(tir_s.mod, [cast_input, cast_output], name="cast_f16_f32") + runtime_module = tvm.build(tir_irm, target=target, name="cast_f16_f32") + mod = hexagon_session.load_module(runtime_module) + + mod(input_data, output_data) + output_np = output_data.numpy() + tvm.testing.assert_allclose( + output_np, + transformed_expected_output_np, + 1e-3, + 1e-3, + ) + + +class TestCastF32F16Slice2d: + """ + For testing Cast F32 to F16 Slice ops + """ + + (input_shape, orig_layout, input_layout, output_layout, axis_sep,) = tvm.testing.parameters( + ((1, 16, 12, 64), "nhwc", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", [4]), + ((1, 64, 64, 32), "nhwc", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", [4]), + ((1, 16, 12, 64), "nhwc", "nhwc-4h2w32c2w-2d", "nhwc-8h2w32c2w-2d", [4]), + ((1, 64, 64, 32), "nhwc", "nhwc-4h2w32c2w-2d", "nhwc-8h2w32c2w-2d", [4]), + ((1, 1024), "nc", "nc-1024c-2d", "nc-1024c-2d", [2]), + ((1, 1024), "nc", "nc-512c-2d", "nc-1024c-2d", [2]), + ) + dtype = tvm.testing.parameter("float32") + working_scope = tvm.testing.parameter("global.vtcm") + + @tvm.testing.fixture + def input_np(self, input_shape, dtype): + return np.random.uniform(size=input_shape).astype(dtype) + + @tvm.testing.fixture + def transformed_input_np(self, input_np, orig_layout, input_layout): + return transform_numpy(input_np, orig_layout, input_layout) + + @tvm.testing.fixture + def expected_output_np(self, input_np): + ref_np = input_np.astype("float16") + return ref_np + + @tvm.testing.fixture + def transformed_expected_output_np(self, expected_output_np, orig_layout, output_layout): + return transform_numpy(expected_output_np, orig_layout, output_layout) + + @tvm.testing.requires_hexagon + def test_cast_fp32_fp16_slice( + self, + input_shape, + dtype, + input_layout, + output_layout, + transformed_input_np, + transformed_expected_output_np, + axis_sep, + hexagon_session, + working_scope, + ): + """ + Top level testing function for cast fp32 to fp16 + """ + if hexagon_session._launcher._serial_number != "simulator": + pytest.skip(msg="Due to https://github.com/apache/tvm/issues/11957") + + target_hexagon = tvm.target.hexagon("v68") + target = tvm.target.Target(target_hexagon, host=target_hexagon) + cast_input = te.placeholder(input_shape, name="A", dtype=dtype) + cast_output = sl.cast_f32_f16_compute(cast_input) + cast_func = te.create_prim_func([cast_input, cast_output]) + tir_s = sl.cast_f32_f16_schedule(cast_func, input_layout, output_layout) + input_data = allocate_hexagon_array( + hexagon_session.device, + data=transformed_input_np, + axis_separators=axis_sep, + mem_scope=working_scope, + ) + output_data = allocate_hexagon_array( + hexagon_session.device, + tensor_shape=transformed_expected_output_np.shape, + dtype=transformed_expected_output_np.dtype, + axis_separators=axis_sep, + mem_scope=working_scope, + ) + 
with tvm.transform.PassContext(opt_level=3):
+            tir_irm = tvm.lower(tir_s.mod, [cast_input, cast_output], name="cast_f32_f16")
+            runtime_module = tvm.build(tir_irm, target=target, name="cast_f32_f16")
+        mod = hexagon_session.load_module(runtime_module)
+
+        mod(input_data, output_data)
+        output_np = output_data.numpy()
+        tvm.testing.assert_allclose(
+            output_np,
+            transformed_expected_output_np,
+            1e-3,
+            1e-3,
+        )
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From b1a3817602b7b92a9bb333863e4206b99307ce12 Mon Sep 17 00:00:00 2001
From: Mark Shields <87091372+mbs-octoml@users.noreply.github.com>
Date: Tue, 12 Jul 2022 13:55:24 -0700
Subject: [PATCH 102/111] [Relay] Move TOpPattern registration for nn.* to C++ (#12072)

* [Relay] Move TOpPattern registration for nn.* to C++

Some of the Collage machinery is best tested from C++, but requires
Relay ops to have their "TOpPattern" attribute registered. However,
since the nn.* ops registered their patterns on the Python side, C++
tests could not rely on those ops.

The easy fix is to move the registration into the RELAY_REGISTER_OP
blocks. Since kOpaque is the default, I initially did not preserve
those registrations.

A few dozen more exotic ops are still registered on the Python side;
I've left them be.

* - D'oh! Even kOpaque ops must be registered.
---
 python/tvm/relay/op/nn/_nn.py  | 72 +---------------------------------
 src/relay/op/nn/bitserial.cc   |  9 +++--
 src/relay/op/nn/convolution.cc | 51 ++++++++++++++++--------
 src/relay/op/nn/correlation.cc |  3 +-
 src/relay/op/nn/nn.cc          | 56 ++++++++++++++++++--------
 src/relay/op/nn/pooling.cc     | 16 ++++++++
 src/relay/op/nn/sparse.cc      | 15 ++++---
 7 files changed, 109 insertions(+), 113 deletions(-)

diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index 3e16cae88db1b..ff213f0983194 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -31,27 +31,21 @@
 from .. import op as reg
 from .. 
import strategy from .._tensor import elemwise_shape_func -from ..op import OpPattern from ..strategy.generic import is_depthwise_conv2d # relu reg.register_broadcast_schedule("nn.relu") -reg.register_pattern("nn.relu", OpPattern.ELEMWISE) - # softmax reg.register_strategy("nn.softmax", strategy.softmax_strategy) -reg.register_pattern("nn.softmax", OpPattern.OUT_ELEMWISE_FUSABLE) # fast softmax reg.register_strategy("nn.fast_softmax", strategy.fast_softmax_strategy) -reg.register_pattern("nn.fast_softmax", OpPattern.OUT_ELEMWISE_FUSABLE) # log_softmax reg.register_strategy("nn.log_softmax", strategy.log_softmax_strategy) -reg.register_pattern("nn.log_softmax", OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_legalize("nn.matmul") @@ -77,7 +71,6 @@ def legalize_matmul(attrs, inputs, types): # matmul reg.register_strategy("nn.matmul", strategy.matmul_strategy) -reg.register_pattern("nn.matmul", reg.OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_legalize("nn.dense") @@ -103,7 +96,6 @@ def legalize_dense(attrs, inputs, types): # dense reg.register_strategy("nn.dense", strategy.dense_strategy) -reg.register_pattern("nn.dense", reg.OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_alter_op_layout("nn.dense") @@ -114,7 +106,6 @@ def alter_op_layout_dense(attrs, inputs, tinfos, out_type): # dense_pack reg.register_strategy("nn.contrib_dense_pack", strategy.dense_pack_strategy) -reg.register_pattern("nn.contrib_dense_pack", reg.OpPattern.OUT_ELEMWISE_FUSABLE) # fifo_buffer @@ -124,7 +115,6 @@ def compute_fifo_buffer(attrs, inputs, out_type): reg.register_injective_schedule("nn.fifo_buffer") -reg.register_pattern("nn.fifo_buffer", OpPattern.OPAQUE) @reg.register_legalize("nn.batch_matmul") @@ -150,12 +140,10 @@ def legalize_batch_matmul(attrs, inputs, types): # batch_matmul reg.register_strategy("nn.batch_matmul", strategy.batch_matmul_strategy) -reg.register_pattern("nn.batch_matmul", reg.OpPattern.OUT_ELEMWISE_FUSABLE) # batch_norm reg.register_strategy("nn.batch_norm", strategy.batch_norm_strategy) -reg.register_pattern("nn.batch_norm", reg.OpPattern.OUT_ELEMWISE_FUSABLE) # sparse_dense @@ -166,7 +154,6 @@ def compute_sparse_dense(attrs, inputs, out_type): reg.register_strategy("nn.sparse_dense", strategy.sparse_dense_strategy) -reg.register_pattern("nn.sparse_dense", reg.OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_alter_op_layout("nn.sparse_dense") @@ -177,7 +164,6 @@ def alter_op_layout_sparse_dense(attrs, inputs, tinfos, out_type): # sparse_add reg.register_strategy("nn.sparse_add", strategy.sparse_add_strategy) -reg.register_pattern("nn.sparse_add", reg.OpPattern.OPAQUE) @reg.register_compute("nn.internal.sparse_dense_padded") @@ -187,7 +173,6 @@ def compute_sparse_dense_padded(attrs, inputs, out_type): reg.register_strategy("nn.internal.sparse_dense_padded", strategy.sparse_dense_padded_strategy) -reg.register_pattern("nn.internal.sparse_dense_padded", reg.OpPattern.OUT_ELEMWISE_FUSABLE) # sparse_transpose @@ -198,7 +183,6 @@ def compute_sparse_transpose(attrs, inputs, out_type): reg.register_schedule("nn.sparse_transpose", strategy.schedule_sparse_transpose) -reg.register_pattern("nn.sparse_transpose", reg.OpPattern.OUT_ELEMWISE_FUSABLE) # sparse_conv2d @@ -213,17 +197,14 @@ def compute_sparse_conv2d(attrs, inputs, out_type): reg.register_strategy("nn.sparse_conv2d", strategy.sparse_conv2d_strategy) -reg.register_pattern("nn.sparse_conv2d", reg.OpPattern.OUT_ELEMWISE_FUSABLE) # conv1d reg.register_strategy("nn.conv1d", strategy.conv1d_strategy) -reg.register_pattern("nn.conv1d", 
OpPattern.OUT_ELEMWISE_FUSABLE) # conv2d reg.register_strategy("nn.conv2d", strategy.conv2d_strategy) -reg.register_pattern("nn.conv2d", OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_alter_op_layout("nn.conv2d") @@ -331,7 +312,6 @@ def convert_conv2d(attrs, inputs, tinfos, desired_layouts): # conv2d_transpose reg.register_strategy("nn.conv2d_transpose", strategy.conv2d_transpose_strategy) -reg.register_pattern("nn.conv2d_transpose", OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_legalize("nn.conv2d_transpose") @@ -400,7 +380,6 @@ def convert_conv2d_transpose(attrs, inputs, tinfos, desired_layouts): # conv3d_transpose reg.register_strategy("nn.conv3d_transpose", strategy.conv3d_transpose_strategy) -reg.register_pattern("nn.conv3d_transpose", OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_legalize("nn.conv3d_transpose") @@ -426,7 +405,6 @@ def legalize_conv3d_transpose(attrs, inputs, types): # conv3d reg.register_strategy("nn.conv3d", strategy.conv3d_strategy) -reg.register_pattern("nn.conv3d", OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_alter_op_layout("nn.conv3d") @@ -483,9 +461,6 @@ def convert_conv3d(attrs, inputs, tinfos, desired_layouts): "nn.contrib_conv3d_winograd_without_weight_transform", strategy.conv3d_winograd_without_weight_transfrom_strategy, ) -reg.register_pattern( - "nn.contrib_conv3d_winograd_without_weight_transform", OpPattern.OUT_ELEMWISE_FUSABLE -) @reg.register_compute("nn.contrib_conv3d_winograd_weight_transform") @@ -499,27 +474,22 @@ def compute_contrib_conv3d_winograd_weight_transform(attrs, inputs, out_dtype): "nn.contrib_conv3d_winograd_weight_transform", strategy.schedule_conv3d_winograd_weight_transform, ) -reg.register_pattern("nn.contrib_conv3d_winograd_weight_transform", OpPattern.OUT_ELEMWISE_FUSABLE) # conv1d_transpose reg.register_strategy("nn.conv1d_transpose", strategy.conv1d_transpose_strategy) -reg.register_pattern("nn.conv1d_transpose", OpPattern.OUT_ELEMWISE_FUSABLE) # bias_add reg.register_injective_schedule("nn.bias_add") -reg.register_pattern("nn.bias_add", OpPattern.BROADCAST) # max_pool1d reg.register_schedule("nn.max_pool1d", strategy.schedule_pool) -reg.register_pattern("nn.max_pool1d", OpPattern.OUT_ELEMWISE_FUSABLE) # max_pool2d reg.register_schedule("nn.max_pool2d", strategy.schedule_pool) -reg.register_pattern("nn.max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_convert_op_layout("nn.max_pool2d") @@ -548,17 +518,14 @@ def convert_max_pool2d(attrs, inputs, tinfos, desired_layouts): # max_pool3d reg.register_schedule("nn.max_pool3d", strategy.schedule_pool) -reg.register_pattern("nn.max_pool3d", OpPattern.OUT_ELEMWISE_FUSABLE) # avg_pool1d reg.register_schedule("nn.avg_pool1d", strategy.schedule_pool) -reg.register_pattern("nn.avg_pool1d", OpPattern.OUT_ELEMWISE_FUSABLE) # avg_pool2d reg.register_schedule("nn.avg_pool2d", strategy.schedule_pool) -reg.register_pattern("nn.avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_convert_op_layout("nn.avg_pool2d") @@ -587,32 +554,26 @@ def convert_avg_pool2d(attrs, inputs, tinfos, desired_layouts): # avg_pool3d reg.register_schedule("nn.avg_pool3d", strategy.schedule_pool) -reg.register_pattern("nn.avg_pool3d", OpPattern.OUT_ELEMWISE_FUSABLE) # max_pool2d_grad reg.register_schedule("nn.max_pool2d_grad", strategy.schedule_pool_grad) -reg.register_pattern("nn.max_pool2d_grad", OpPattern.OUT_ELEMWISE_FUSABLE) # avg_pool2d_grad reg.register_schedule("nn.avg_pool2d_grad", strategy.schedule_pool_grad) -reg.register_pattern("nn.avg_pool2d_grad", OpPattern.OUT_ELEMWISE_FUSABLE) # 
adaptive_max_pool1d reg.register_schedule("nn.adaptive_max_pool1d", strategy.schedule_adaptive_pool) -reg.register_pattern("nn.adaptive_max_pool1d", OpPattern.OUT_ELEMWISE_FUSABLE) # adaptive_avg_pool1d reg.register_schedule("nn.adaptive_avg_pool1d", strategy.schedule_adaptive_pool) -reg.register_pattern("nn.adaptive_avg_pool1d", OpPattern.OUT_ELEMWISE_FUSABLE) # global_max_pool2d reg.register_schedule("nn.global_max_pool2d", strategy.schedule_adaptive_pool) -reg.register_pattern("nn.global_max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_convert_op_layout("nn.global_max_pool2d") @@ -641,7 +602,6 @@ def convert_global_max_pool2d(attrs, inputs, tinfos, desired_layouts): # global_avg_pool2d reg.register_schedule("nn.global_avg_pool2d", strategy.schedule_adaptive_pool) -reg.register_pattern("nn.global_avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_convert_op_layout("nn.global_avg_pool2d") @@ -670,37 +630,30 @@ def convert_global_avg_pool2d(attrs, inputs, tinfos, desired_layouts): # adaptive_max_pool2d reg.register_schedule("nn.adaptive_max_pool2d", strategy.schedule_adaptive_pool) -reg.register_pattern("nn.adaptive_max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) # adaptive_avg_pool2d reg.register_schedule("nn.adaptive_avg_pool2d", strategy.schedule_adaptive_pool) -reg.register_pattern("nn.adaptive_avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) # adaptive_max_pool3d reg.register_schedule("nn.adaptive_max_pool3d", strategy.schedule_adaptive_pool) -reg.register_pattern("nn.adaptive_max_pool3d", OpPattern.OUT_ELEMWISE_FUSABLE) # adaptive_avg_pool3d reg.register_schedule("nn.adaptive_avg_pool3d", strategy.schedule_adaptive_pool) -reg.register_pattern("nn.adaptive_avg_pool3d", OpPattern.OUT_ELEMWISE_FUSABLE) # leaky_relu reg.register_broadcast_schedule("nn.leaky_relu") -reg.register_pattern("nn.leaky_relu", OpPattern.ELEMWISE) # prelu reg.register_broadcast_schedule("nn.prelu") -reg.register_pattern("nn.prelu", OpPattern.BROADCAST) # flatten reg.register_broadcast_schedule("nn.batch_flatten") -reg.register_pattern("nn.batch_flatten", OpPattern.INJECTIVE) # lrn @@ -712,7 +665,6 @@ def compute_lrn(attrs, inputs, out_dtype): reg.register_schedule("nn.lrn", strategy.schedule_lrn) -reg.register_pattern("nn.lrn", OpPattern.OPAQUE) # upsampling @@ -783,18 +735,13 @@ def mirror_pad_func(attrs, inputs, _): "nn.contrib_conv2d_winograd_without_weight_transform", strategy.conv2d_winograd_without_weight_transfrom_strategy, ) -reg.register_pattern( - "nn.contrib_conv2d_winograd_without_weight_transform", OpPattern.OUT_ELEMWISE_FUSABLE -) + # conv2d_gemm related operators reg.register_strategy( "nn.contrib_conv2d_gemm_without_weight_transform", strategy.conv2d_gemm_without_weight_transform_strategy, ) -reg.register_pattern( - "nn.contrib_conv2d_gemm_without_weight_transform", OpPattern.OUT_ELEMWISE_FUSABLE -) @reg.register_compute("nn.contrib_conv2d_gemm_weight_transform") @@ -807,7 +754,6 @@ def compute_contrib_conv2d_gemm_weight_transform(attrs, inputs, out_dtype): reg.register_schedule( "nn.contrib_conv2d_gemm_weight_transform", strategy.schedule_conv2d_gemm_weight_transform ) -reg.register_pattern("nn.contrib_conv2d_gemm_weight_transform", OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_compute("nn.contrib_conv2d_winograd_weight_transform") @@ -821,7 +767,6 @@ def compute_contrib_conv2d_winograd_weight_transform(attrs, inputs, out_dtype): "nn.contrib_conv2d_winograd_weight_transform", strategy.schedule_conv2d_winograd_weight_transform, ) 
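+# NOTE: the kOutEWiseFusable pattern for this weight-transform op is not
+# dropped; its registration moves to the RELAY_REGISTER_OP block in
+# src/relay/op/nn/convolution.cc below.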
-reg.register_pattern("nn.contrib_conv2d_winograd_weight_transform", OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_compute("nn.contrib_conv2d_winograd_nnpack_weight_transform") @@ -838,21 +783,17 @@ def compute_contrib_conv2d_winograd_nnpack_weight_transform(attrs, inputs, out_d "nn.contrib_conv2d_winograd_nnpack_weight_transform", strategy.schedule_conv2d_winograd_nnpack_weight_transform, ) -reg.register_pattern("nn.contrib_conv2d_winograd_nnpack_weight_transform", OpPattern.OPAQUE) # conv2d_NCHWc reg.register_strategy("nn.contrib_conv2d_NCHWc", strategy.conv2d_NCHWc_strategy) -reg.register_pattern("nn.contrib_conv2d_NCHWc", OpPattern.OUT_ELEMWISE_FUSABLE) # depthwise_conv2d_NCHWc reg.register_strategy("nn.contrib_depthwise_conv2d_NCHWc", strategy.depthwise_conv2d_NCHWc_strategy) -reg.register_pattern("nn.contrib_depthwise_conv2d_NCHWc", OpPattern.OUT_ELEMWISE_FUSABLE) # deformable_conv2d reg.register_strategy("nn.deformable_conv2d", strategy.deformable_conv2d_strategy) -reg.register_pattern("nn.deformable_conv2d", OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_alter_op_layout("nn.deformable_conv2d") @@ -950,12 +891,10 @@ def compute_bitpack(attrs, inputs, out_dtype): reg.register_schedule("nn.bitpack", strategy.schedule_bitpack) -reg.register_pattern("nn.bitpack", OpPattern.INJECTIVE) # bitserial_conv2d reg.register_strategy("nn.bitserial_conv2d", strategy.bitserial_conv2d_strategy) -reg.register_pattern("nn.bitserial_conv2d", OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_legalize("nn.bitserial_conv2d") @@ -981,7 +920,6 @@ def legalize_bitserial_conv2d(attrs, inputs, types): # bitserial_dense reg.register_strategy("nn.bitserial_dense", strategy.bitserial_dense_strategy) -reg.register_pattern("nn.bitserial_dense", reg.OpPattern.OUT_ELEMWISE_FUSABLE) # cross_entropy @@ -992,7 +930,6 @@ def compute_cross_entropy(attrs, inputs, out_dtype): reg.register_reduce_schedule("nn.cross_entropy") -reg.register_pattern("nn.cross_entropy", OpPattern.OPAQUE) # dilate @@ -1002,7 +939,6 @@ def compute_dilate(attrs, inputs, out_dtype): reg.register_broadcast_schedule("nn.dilate") -reg.register_pattern("nn.dilate", OpPattern.INJECTIVE) # cross_entropy_with_logits @@ -1013,7 +949,6 @@ def compute_cross_entropy_with_logits(attrs, inputs, out_dtype): reg.register_reduce_schedule("nn.cross_entropy_with_logits") -reg.register_pattern("nn.cross_entropy_with_logits", OpPattern.OPAQUE) # nll_loss @@ -1024,7 +959,6 @@ def compute_nll_loss(attrs, inputs, out_dtype): reg.register_reduce_schedule("nn.nll_loss") -reg.register_pattern("nn.nll_loss", OpPattern.OUT_ELEMWISE_FUSABLE) # depth_to_space @@ -1037,7 +971,6 @@ def compute_depth_to_space(attrs, inputs, out_dtype): reg.register_injective_schedule("nn.depth_to_space") -reg.register_pattern("nn.depth_to_space", OpPattern.INJECTIVE) # space_to_depth @@ -1049,12 +982,10 @@ def compute_space_to_depth(attrs, inputs, out_dtype): reg.register_injective_schedule("nn.space_to_depth") -reg.register_pattern("nn.space_to_depth", OpPattern.INJECTIVE) # correlation reg.register_strategy("nn.correlation", strategy.correlation_strategy) -reg.register_pattern("nn.correlation", OpPattern.OUT_ELEMWISE_FUSABLE) # space_to_batch_nd and batch_to_space_nd @@ -1063,7 +994,6 @@ def compute_space_to_depth(attrs, inputs, out_dtype): reg.register_strategy("nn.conv2d_backward_weight", strategy.conv2d_backward_weight_strategy) -reg.register_pattern("nn.conv2d_backward_weight", OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_legalize("nn.conv2d_backward_weight") diff --git 
a/src/relay/op/nn/bitserial.cc b/src/relay/op/nn/bitserial.cc index ddb582043ddbb..496aa3514d888 100644 --- a/src/relay/op/nn/bitserial.cc +++ b/src/relay/op/nn/bitserial.cc @@ -113,7 +113,8 @@ efficient implementation of bitserial operations. .set_attrs_type() .add_argument("data", "Tensor", "Input data.") .set_support_level(2) - .add_type_rel("BitPack", BitPackRel); + .add_type_rel("BitPack", BitPackRel) + .set_attr("TOpPattern", kInjective); // relay.nn.bitserial_conv2d TVM_REGISTER_NODE_TYPE(BinaryConv2DAttrs); @@ -192,7 +193,8 @@ on some platforms. .set_support_level(2) .add_type_rel("BinaryConv2D", BinaryConv2DRel) .set_attr("FInferCorrectLayout", - BinaryConv2DInferCorrectLayout); + BinaryConv2DInferCorrectLayout) + .set_attr("TOpPattern", kOutEWiseFusable); // relay.nn.bitserial_dense TVM_REGISTER_NODE_TYPE(BinaryDenseAttrs); @@ -251,7 +253,8 @@ RELAY_REGISTER_OP("nn.bitserial_dense") .add_argument("data", "2D Tensor", "Input data.") .add_argument("weight", "2D Tensor", "Weight matrix.") .set_support_level(1) - .add_type_rel("BinaryDense", BinaryDenseRel); + .add_type_rel("BinaryDense", BinaryDenseRel) + .set_attr("TOpPattern", kOutEWiseFusable); } // namespace relay } // namespace tvm diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc index de14d059df837..dabb1899713f6 100644 --- a/src/relay/op/nn/convolution.cc +++ b/src/relay/op/nn/convolution.cc @@ -173,7 +173,8 @@ with the layer input to produce a tensor of outputs. .add_argument("weight", "Tensor", "The weight tensor.") .set_support_level(2) .add_type_rel("Conv1D", Conv1DRel) - .set_attr("FInferCorrectLayout", ConvInferCorrectLayout); + .set_attr("FInferCorrectLayout", ConvInferCorrectLayout) + .set_attr("TOpPattern", kOutEWiseFusable); // relay.nn.conv2d TVM_REGISTER_NODE_TYPE(Conv2DAttrs); @@ -404,7 +405,8 @@ with the layer input to produce a tensor of outputs. .add_argument("weight", "Tensor", "The weight tensor.") .set_support_level(2) .add_type_rel("Conv2D", Conv2DRel) - .set_attr("FInferCorrectLayout", ConvInferCorrectLayout); + .set_attr("FInferCorrectLayout", ConvInferCorrectLayout) + .set_attr("TOpPattern", kOutEWiseFusable); // relay.nn.conv3d TVM_REGISTER_NODE_TYPE(Conv3DAttrs); @@ -577,7 +579,8 @@ with the layer input to produce a tensor of outputs. .add_argument("weight", "Tensor", "The weight tensor.") .set_support_level(2) .add_type_rel("Conv3D", Conv3DRel) - .set_attr("FInferCorrectLayout", ConvInferCorrectLayout); + .set_attr("FInferCorrectLayout", ConvInferCorrectLayout) + .set_attr("TOpPattern", kOutEWiseFusable); // relay.nn.conv3d_transpose TVM_REGISTER_NODE_TYPE(Conv3DTransposeAttrs); @@ -738,7 +741,8 @@ said convolution. .set_support_level(2) .set_attr("FInferCorrectLayout", ConvInferCorrectLayout) - .add_type_rel("Conv3DTranspose", Conv3DTransposeRel); + .add_type_rel("Conv3DTranspose", Conv3DTransposeRel) + .set_attr("TOpPattern", kOutEWiseFusable); // relay.nn.conv2d_transpose TVM_REGISTER_NODE_TYPE(Conv2DTransposeAttrs); @@ -906,7 +910,8 @@ v (batch_size, channels, out_height, out_width) if `layout` is `NCHW` .set_support_level(2) .set_attr("FInferCorrectLayout", ConvInferCorrectLayout) - .add_type_rel("Conv2DTranspose", Conv2DTransposeRel); + .add_type_rel("Conv2DTranspose", Conv2DTransposeRel) + .set_attr("TOpPattern", kOutEWiseFusable); // relay.nn.conv1d_transpose TVM_REGISTER_NODE_TYPE(Conv1DTransposeAttrs); @@ -1042,7 +1047,8 @@ said convolution. 
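 // A rough guide to the pattern kinds used throughout this file (enum
 // OpPatternKind in include/tvm/relay/op_attr_types.h): kElemWise and
 // kBroadcast ops can be fused into their neighbours, kInjective ops are
 // pure data reshuffles, kOutEWiseFusable ops (the conv/dense/pool-like
 // anchors) can absorb elementwise ops that follow them, and kOpaque ops
 // are excluded from fusion altogether.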
.add_argument("data", "Tensor", "The input tensor.") .add_argument("weight", "Tensor", "The weight tensor.") .set_support_level(2) - .add_type_rel("Conv1DTranspose", Conv1DTransposeRel); + .add_type_rel("Conv1DTranspose", Conv1DTransposeRel) + .set_attr("TOpPattern", kOutEWiseFusable); // relay.nn.contrib_conv2d_winograd_without_weight_transform TVM_REGISTER_NODE_TYPE(Conv2DWinogradAttrs); @@ -1077,7 +1083,8 @@ RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_without_weight_transform") .set_support_level(10) .add_type_rel("Conv2DWinograd", Conv2DWinogradRel) .set_attr("FInferCorrectLayout", - ConvInferCorrectLayout); + ConvInferCorrectLayout) + .set_attr("TOpPattern", kOutEWiseFusable); // relay.nn.contrib_conv2d_winograd_weight_transform TVM_REGISTER_NODE_TYPE(ConvWinogradWeightTransformAttrs); @@ -1122,7 +1129,8 @@ weight transformation in advance. .set_num_inputs(1) .add_argument("weight", "Tensor", "The weight tensor.") .set_support_level(10) - .add_type_rel("Conv2DWinogradWeightTransform", Conv2DWinogradWeightTransformRel); + .add_type_rel("Conv2DWinogradWeightTransform", Conv2DWinogradWeightTransformRel) + .set_attr("TOpPattern", kOutEWiseFusable); // relay.nn.contrib_conv3d_winograd_without_weight_transform TVM_REGISTER_NODE_TYPE(Conv3DWinogradAttrs); @@ -1239,7 +1247,8 @@ RELAY_REGISTER_OP("nn.contrib_conv3d_winograd_without_weight_transform") .set_support_level(10) .add_type_rel("Conv3DWinograd", Conv3DWinogradRel) .set_attr("FInferCorrectLayout", - ConvInferCorrectLayout); + ConvInferCorrectLayout) + .set_attr("TOpPattern", kOutEWiseFusable); // relay.nn.contrib_conv3d_winograd_weight_transform TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv3d_winograd_weight_transform") @@ -1289,7 +1298,8 @@ weight transformation in advance. .set_num_inputs(1) .add_argument("weight", "Tensor", "The weight tensor.") .set_support_level(10) - .add_type_rel("Conv3DWinogradWeightTransform", Conv3DWinogradWeightTransformRel); + .add_type_rel("Conv3DWinogradWeightTransform", Conv3DWinogradWeightTransformRel) + .set_attr("TOpPattern", kOutEWiseFusable); // relay.nn.contrib_conv2d_winograd_nnpack_weight_transform TVM_REGISTER_NODE_TYPE(Conv2DWinogradNNPACKWeightTransformAttrs); @@ -1347,7 +1357,8 @@ weight transformation in advance. .set_num_inputs(1) .add_argument("weight", "Tensor", "The weight tensor.") .set_support_level(10) - .add_type_rel("Conv2DWinogradNNPACKWeightTransform", Conv2DWinogradNNPACKWeightTransformRel); + .add_type_rel("Conv2DWinogradNNPACKWeightTransform", Conv2DWinogradNNPACKWeightTransformRel) + .set_attr("TOpPattern", kOpaque); // relay.nn.contrib_conv2d_gemm_without_weight_transform TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_gemm_without_weight_transform") @@ -1449,7 +1460,8 @@ RELAY_REGISTER_OP("nn.contrib_conv2d_gemm_without_weight_transform") .add_argument("weight", "Tensor", "The weight tensor.") .set_support_level(10) .add_type_rel("Conv2DGemm", Conv2DGemmRel) - .set_attr("FInferCorrectLayout", ConvInferCorrectLayout); + .set_attr("FInferCorrectLayout", ConvInferCorrectLayout) + .set_attr("TOpPattern", kOutEWiseFusable); // relay.nn.contrib_conv2d_gemm_weight_transform @@ -1531,7 +1543,8 @@ weight transformation in advance. 
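 // Note the asymmetry with the winograd and gemm weight-transform ops,
 // which are kOutEWiseFusable: the NNPACK variant above is registered
 // kOpaque, presumably because its compute is delegated to the external
 // NNPACK library and so cannot take part in TVM's operator fusion.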
.set_num_inputs(1) .add_argument("weights", "Tensor", "The weights tensor.") .set_support_level(10) - .add_type_rel("Conv2DGemmWeightTransform", Conv2DGemmWeightTransformRel); + .add_type_rel("Conv2DGemmWeightTransform", Conv2DGemmWeightTransformRel) + .set_attr("TOpPattern", kOutEWiseFusable); // Positional relay function to create conv2d NCHWc operator // used by frontend FFI. @@ -1558,7 +1571,8 @@ RELAY_REGISTER_OP("nn.contrib_conv2d_NCHWc") .add_argument("weight", "Tensor", "The weight tensor.") .set_support_level(10) .add_type_rel("Conv2DNCHWc", Conv2DWinogradRel) - .set_attr("FInferCorrectLayout", ConvInferCorrectLayout); + .set_attr("FInferCorrectLayout", ConvInferCorrectLayout) + .set_attr("TOpPattern", kOutEWiseFusable); // Positional relay function to create depthwise conv2d NCHWc operator // used by frontend FFI. @@ -1585,7 +1599,8 @@ RELAY_REGISTER_OP("nn.contrib_depthwise_conv2d_NCHWc") .add_argument("weight", "Tensor", "The weight tensor.") .set_support_level(10) .add_type_rel("Conv2D", Conv2DRel) - .set_attr("FInferCorrectLayout", ConvInferCorrectLayout); + .set_attr("FInferCorrectLayout", ConvInferCorrectLayout) + .set_attr("TOpPattern", kOutEWiseFusable); TVM_REGISTER_NODE_TYPE(DeformableConv2DAttrs); @@ -1738,7 +1753,8 @@ by concating all the *g* results. .add_argument("weight", "Tensor", "The weight tensor.") .set_support_level(5) .add_type_rel("DeformableConv2D", DeformableConv2DRel) - .set_attr("FInferCorrectLayout", DeformableConvInferCorrectLayout); + .set_attr("FInferCorrectLayout", DeformableConvInferCorrectLayout) + .set_attr("TOpPattern", kOutEWiseFusable); // Positional relay function to create deformable_conv2d operator // used by frontend FFI. @@ -1858,7 +1874,8 @@ given the original input data and the output gradient. 
.add_argument("data", "Tensor", "The input tensor.") .set_support_level(2) .add_type_rel("Conv2DBackwardWeight", Conv2DBackwardWeightRel) - .set_attr("FInferCorrectLayout", ConvInferCorrectLayout); + .set_attr("FInferCorrectLayout", ConvInferCorrectLayout) + .set_attr("TOpPattern", kOutEWiseFusable); } // namespace relay } // namespace tvm diff --git a/src/relay/op/nn/correlation.cc b/src/relay/op/nn/correlation.cc index 51b2eb55db7af..8abc9909e83c7 100644 --- a/src/relay/op/nn/correlation.cc +++ b/src/relay/op/nn/correlation.cc @@ -129,7 +129,8 @@ where :math:`i` and :math:`j` enumerate spatial locations in :math:`f_{1}`, and .add_argument("data2", "Tensor", "Input data2 to the correlation.") .set_support_level(2) .set_attr("FInferCorrectLayout", CorrelationInferCorrectLayout) - .add_type_rel("Correlation", CorrelationRel); + .add_type_rel("Correlation", CorrelationRel) + .set_attr("TOpPattern", kOutEWiseFusable); } // namespace relay } // namespace tvm diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index e3e3bfbb973e5..9e73c6456401e 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -95,6 +95,7 @@ RELAY_REGISTER_OP("nn.bias_add") .add_argument("bias", "1D Tensor", "Bias.") .set_support_level(1) .add_type_rel("BiasAdd", BiasAddRel) + .set_attr("TOpPattern", kBroadcast) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, const Type& out_type) { const auto* param = attrs.as(); @@ -160,7 +161,8 @@ Useful for .add_argument("data", "Tensor", "Latest input") .add_argument("buffer", "Tensor", "Buffer storing latest [length_buffer] inputs") .set_support_level(3) - .add_type_rel("FIFOBuffer", FIFOBufferRel); + .add_type_rel("FIFOBuffer", FIFOBufferRel) + .set_attr("TOpPattern", kOpaque); // ------------------- relay.nn.matmul TVM_REGISTER_NODE_TYPE(MatmulAttrs); @@ -191,7 +193,9 @@ RELAY_REGISTER_OP("nn.matmul") .add_argument("tensor_a", "nD Tensor", "The first input Tensor.") .add_argument("tensor_b", "2D Tensor", "The second input Tensor.") .set_support_level(1) - .add_type_rel("Matmul", MatmulRel); + .add_type_rel("Matmul", MatmulRel) + .set_attr("TOpPattern", kOutEWiseFusable); + // ------------------- relay.nn.matmul // ------------------- relay.nn.dense @@ -229,7 +233,8 @@ RELAY_REGISTER_OP("nn.dense") .add_argument("weight", "2D Tensor", "Weight matrix.") .set_support_level(1) .set_attr("FInferCorrectLayout", DenseInferCorrectLayout) - .add_type_rel("Dense", MatmulRel); + .add_type_rel("Dense", MatmulRel) + .set_attr("TOpPattern", kOutEWiseFusable); // ------------------- relay.nn.dense // ------------------- relay.nn.contrib_dense_pack @@ -296,7 +301,9 @@ RELAY_REGISTER_OP("nn.contrib_dense_pack") .add_argument("weight", "3D Tensor", "Packed weight matrix.") .set_support_level(10) .set_attr("FInferCorrectLayout", DensePackInferCorrectLayout) - .add_type_rel("DensePack", DensePackRel); + .add_type_rel("DensePack", DensePackRel) + .set_attr("TOpPattern", kOutEWiseFusable); + // ------------------- relay.nn.contrib_dense_pack // relay.leaky_relu @@ -324,6 +331,7 @@ RELAY_REGISTER_OP("nn.leaky_relu") .set_support_level(3) .add_type_rel("Identity", IdentityRel) .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) + .set_attr("TOpPattern", kElemWise) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, const Type& out_type) { const auto* param = attrs.as(); @@ -390,6 +398,7 @@ where :math:`*` is an channelwise multiplication for each sample in the batch. 
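 // Unlike nn.relu and nn.leaky_relu above, which are kElemWise, prelu is
 // registered kBroadcast: its alpha argument is a per-channel vector that
 // broadcasts against the input, so the op is not elementwise in every
 // input.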
.set_support_level(3) .add_type_rel("PRelu", PReluRel) .set_attr("FInferCorrectLayout", PReluInferCorrectLayout) + .set_attr("TOpPattern", kBroadcast) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, const Type& out_type) { const auto* param = attrs.as(); @@ -441,7 +450,8 @@ RELAY_REGISTER_OP("nn.softmax") .set_num_inputs(1) .add_argument("data", "Tensor", "The input tensor.") .set_support_level(1) - .add_type_rel("Softmax", SoftmaxRel); + .add_type_rel("Softmax", SoftmaxRel) + .set_attr("TOpPattern", kOutEWiseFusable); // relay.fast_softmax TVM_REGISTER_NODE_TYPE(SoftmaxAttrs); @@ -468,7 +478,8 @@ RELAY_REGISTER_OP("nn.fast_softmax") .set_num_inputs(1) .add_argument("data", "Tensor", "The input tensor.") .set_support_level(1) - .add_type_rel("Softmax", SoftmaxRel); + .add_type_rel("Softmax", SoftmaxRel) + .set_attr("TOpPattern", kOutEWiseFusable); // relay.nn.log_softmax TVM_REGISTER_GLOBAL("relay.op.nn._make.log_softmax").set_body_typed([](Expr data, int axis) { @@ -493,6 +504,7 @@ RELAY_REGISTER_OP("nn.log_softmax") .add_argument("data", "Tensor", "The input tensor.") .set_support_level(1) .add_type_rel("Softmax", SoftmaxRel) + .set_attr("TOpPattern", kOutEWiseFusable) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, const Type& out_type) { const auto* param = attrs.as(); @@ -561,6 +573,7 @@ Example:: .add_argument("data", "Tensor", "The input tensor.") .set_support_level(2) .add_type_rel("BatchFlatten", BatchFlattenRel) + .set_attr("TOpPattern", kInjective) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, const Type& out_type) { @@ -586,6 +599,7 @@ RELAY_REGISTER_OP("nn.relu") .set_support_level(1) .add_type_rel("Identity", IdentityRel) .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) + .set_attr("TOpPattern", kElemWise) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, const Type& out_type) { return Array{topi::relu(inputs[0], 0.0f)}; @@ -625,7 +639,8 @@ centered at that value (zero padding is added where necessary). .set_num_inputs(1) .add_argument("data", "Tensor", "The input tensor.") .set_support_level(2) - .add_type_rel("Identity", IdentityRel); + .add_type_rel("Identity", IdentityRel) + .set_attr("TOpPattern", kOpaque); // Positional relay function to create L2Normalize operator used by frontend FFI. TVM_REGISTER_NODE_TYPE(L2NormalizeAttrs); @@ -693,8 +708,8 @@ The whole array is rescaled by ``1/(1-p)`` to keep the expected sum of the input .set_num_inputs(1) .add_argument("data", "Tensor", "Input to which dropout will be applied.") .set_support_level(1) - .set_attr("TOpPattern", kOpaque) .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) + .set_attr("TOpPattern", kOpaque) .add_type_rel("Dropout", DropoutRel) .set_attr("TOpIsStateful", true); @@ -826,7 +841,8 @@ axis to be the last item in the input shape. .add_argument("moving_var", "Tensor", "Running variance of input.") .set_attr("FInferCorrectLayout", BatchNormInferCorrectLayout) .set_support_level(1) - .add_type_rel("BatchNorm", BatchNormRel); + .add_type_rel("BatchNorm", BatchNormRel) + .set_attr("TOpPattern", kOutEWiseFusable); // instance_norm TVM_REGISTER_NODE_TYPE(InstanceNormAttrs); @@ -1077,7 +1093,9 @@ Both `tensor_a` and `tensor_b` can be transposed. 
For legacy reason, we use NT f .add_argument("tensor_a", "3D Tensor", "The first input.") .add_argument("tensor_b", "3D Tensor", "The second input.") .set_support_level(10) - .add_type_rel("BatchMatmul", BatchMatmulRel); + .add_type_rel("BatchMatmul", BatchMatmulRel) + .set_attr("TOpPattern", kOutEWiseFusable); + // ------------------- relay.nn.batch_matmul // relay.nn.cross_entropy @@ -1121,7 +1139,8 @@ Do log on the data - do not accept logits. .add_argument("x", "1D Tensor", "Predictions.") .add_argument("y", "1D Tensor", "Targets.") .set_support_level(10) - .add_type_rel("CrossEntropy", CrossEntropyRel); + .add_type_rel("CrossEntropy", CrossEntropyRel) + .set_attr("TOpPattern", kOpaque); // relay.nn.dilate TVM_REGISTER_NODE_TYPE(DilateAttrs); @@ -1165,7 +1184,8 @@ Dilate data with given dilation value (0 by default). .set_num_inputs(1) .add_argument("x", "1D Tensor", "Data to dilate.") .set_support_level(10) - .add_type_rel("Dilate", DilateRel); + .add_type_rel("Dilate", DilateRel) + .set_attr("TOpPattern", kInjective); // relay.nn.cross_entropy_with_logits // Positional relay function to create cross_entropy_with_logits operator used by frontend FFI. @@ -1186,7 +1206,8 @@ Accept logits. .add_argument("x", "1D Tensor", "Predictions.") .add_argument("y", "1D Tensor", "Targets.") .set_support_level(10) - .add_type_rel("CrossEntropy", CrossEntropyRel); + .add_type_rel("CrossEntropy", CrossEntropyRel) + .set_attr("TOpPattern", kOpaque); // Depth to space and space to depth TVM_REGISTER_NODE_TYPE(SubPixelAttrs); @@ -1269,7 +1290,8 @@ Negative log likelihood loss for given prediction and target. .add_argument("predictions", "Tensor", "The prediction tensor.") .add_argument("targets", "Tensor", "The target tensor.") .add_argument("weights", "Tensor", "The weight of each target values.") - .add_type_rel("NLLLoss", NLLLossRel); + .add_type_rel("NLLLoss", NLLLossRel) + .set_attr("TOpPattern", kOutEWiseFusable); bool DepthToSpaceRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { @@ -1332,7 +1354,8 @@ RELAY_REGISTER_OP("nn.depth_to_space") .set_num_inputs(1) .add_argument("data", "Tensor", "The input tensor") .set_support_level(5) - .add_type_rel("DepthToSpace", DepthToSpaceRel); + .add_type_rel("DepthToSpace", DepthToSpaceRel) + .set_attr("TOpPattern", kInjective); bool SpaceToDepthRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { @@ -1394,7 +1417,8 @@ RELAY_REGISTER_OP("nn.space_to_depth") .set_num_inputs(1) .add_argument("data", "Tensor", "The input tensor") .set_support_level(5) - .add_type_rel("SpaceToDepth", SpaceToDepthRel); + .add_type_rel("SpaceToDepth", SpaceToDepthRel) + .set_attr("TOpPattern", kInjective); // Positional relay function to create SpaceToBatchND operator // used by frontend FFI diff --git a/src/relay/op/nn/pooling.cc b/src/relay/op/nn/pooling.cc index cf44b308ce023..c56039f13ba3a 100644 --- a/src/relay/op/nn/pooling.cc +++ b/src/relay/op/nn/pooling.cc @@ -216,6 +216,7 @@ RELAY_REGISTER_OP("nn.max_pool2d") .set_support_level(2) .add_type_rel("MaxPool2D", Pool2DRel) .set_attr("FInferCorrectLayout", PoolInferCorrectLayout) + .set_attr("TOpPattern", kOutEWiseFusable) .set_attr("FTVMCompute", Pool2DCompute); // AvgPool2D @@ -255,6 +256,7 @@ Average pooling operation for one dimensional data. 
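 // Every pooling op in this file takes kOutEWiseFusable, mirroring the
 // OUT_ELEMWISE_FUSABLE registrations removed from
 // python/tvm/relay/op/nn/_nn.py: a pooling output can still have
 // elementwise ops (e.g. an activation) fused onto it.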
.set_support_level(2) .add_type_rel("AvgPool2D", Pool2DRel) .set_attr("FInferCorrectLayout", PoolInferCorrectLayout) + .set_attr("TOpPattern", kOutEWiseFusable) .set_attr("FTVMCompute", Pool2DCompute); // relay.nn.global_pool_2d & relay.nn.max_pool_2d @@ -335,6 +337,7 @@ RELAY_REGISTER_OP("nn.global_avg_pool2d") .set_support_level(2) .add_type_rel("GlobalAvgPool2D", GlobalPool2DRel) .set_attr("FInferCorrectLayout", PoolInferCorrectLayout) + .set_attr("TOpPattern", kOutEWiseFusable) .set_attr("FTVMCompute", GlobalPool2DCompute); // GlobalMaxPool @@ -363,6 +366,7 @@ RELAY_REGISTER_OP("nn.global_max_pool2d") .set_support_level(2) .add_type_rel("GlobalMaxPool2D", GlobalPool2DRel) .set_attr("FInferCorrectLayout", PoolInferCorrectLayout) + .set_attr("TOpPattern", kOutEWiseFusable) .set_attr("FTVMCompute", GlobalPool2DCompute); // relay.nn.adaptive_pool_1d @@ -463,6 +467,7 @@ RELAY_REGISTER_OP("nn.adaptive_avg_pool1d") .add_type_rel("AdaptiveAvgPool1D", AdaptivePool1DRel) .set_attr("FInferCorrectLayout", PoolInferCorrectLayout) + .set_attr("TOpPattern", kOutEWiseFusable) .set_attr("FTVMCompute", AdaptivePool1DCompute); // relay.nn.adaptive_max_pool1d @@ -498,6 +503,7 @@ RELAY_REGISTER_OP("nn.adaptive_max_pool1d") .add_type_rel("AdaptiveMaxPool1D", AdaptivePool1DRel) .set_attr("FInferCorrectLayout", PoolInferCorrectLayout) + .set_attr("TOpPattern", kOutEWiseFusable) .set_attr("FTVMCompute", AdaptivePool1DCompute); // relay.nn.adaptive_pool_2d @@ -617,6 +623,7 @@ RELAY_REGISTER_OP("nn.adaptive_avg_pool2d") .add_type_rel("AdaptiveAvgPool2D", AdaptivePool2DRel) .set_attr("FInferCorrectLayout", PoolInferCorrectLayout) + .set_attr("TOpPattern", kOutEWiseFusable) .set_attr("FTVMCompute", AdaptivePool2DCompute); // relay.nn.adaptive_max_pool2d @@ -654,6 +661,7 @@ RELAY_REGISTER_OP("nn.adaptive_max_pool2d") .add_type_rel("AdaptiveMaxPool2D", AdaptivePool2DRel) .set_attr("FInferCorrectLayout", PoolInferCorrectLayout) + .set_attr("TOpPattern", kOutEWiseFusable) .set_attr("FTVMCompute", AdaptivePool2DCompute); // relay.nn.adaptive_pool3d @@ -788,6 +796,7 @@ RELAY_REGISTER_OP("nn.adaptive_max_pool3d") .add_type_rel("AdaptiveMaxPool3D", AdaptivePool3DRel) .set_attr("FInferCorrectLayout", PoolInferCorrectLayout) + .set_attr("TOpPattern", kOutEWiseFusable) .set_attr("FTVMCompute", AdaptivePool3DCompute); // relay.nn.adaptive_max_pool3d @@ -823,6 +832,7 @@ RELAY_REGISTER_OP("nn.adaptive_avg_pool3d") .add_type_rel("AdaptiveAvgPool3D", AdaptivePool3DRel) .set_attr("FInferCorrectLayout", PoolInferCorrectLayout) + .set_attr("TOpPattern", kOutEWiseFusable) .set_attr("FTVMCompute", AdaptivePool3DCompute); bool Pool2DGradRel(const Array& types, int num_inputs, const Attrs& attrs, @@ -930,6 +940,7 @@ RELAY_REGISTER_OP("nn.max_pool2d_grad") .add_argument("grad", "Tensor", "The grad tensor.") .set_support_level(2) .add_type_rel("MaxPool2DGrad", Pool2DGradRel) + .set_attr("TOpPattern", kOutEWiseFusable) .set_attr("FTVMCompute", Pool2DGradCompute); // AvgPool2DGrad @@ -979,6 +990,7 @@ RELAY_REGISTER_OP("nn.avg_pool2d_grad") .add_argument("grad", "Tensor", "The grad tensor.") .set_support_level(2) .add_type_rel("MaxPool2DGrad", Pool2DGradRel) + .set_attr("TOpPattern", kOutEWiseFusable) .set_attr("FTVMCompute", Pool2DGradCompute); // relay.nn.max_pool1d & relay.nn.avg_pool1d @@ -1101,6 +1113,7 @@ RELAY_REGISTER_OP("nn.max_pool1d") .set_support_level(2) .add_type_rel("MaxPool1D", Pool1DRel) .set_attr("FInferCorrectLayout", PoolInferCorrectLayout) + .set_attr("TOpPattern", kOutEWiseFusable) .set_attr("FTVMCompute", 
Pool1DCompute); // AvgPool1D @@ -1138,6 +1151,7 @@ Average pooling operation for one dimensional data. .set_support_level(2) .add_type_rel("AvgPool1D", Pool1DRel) .set_attr("FInferCorrectLayout", PoolInferCorrectLayout) + .set_attr("TOpPattern", kOutEWiseFusable) .set_attr("FTVMCompute", Pool1DCompute); // relay.nn.max_pool3d & relay.nn.avg_pool3d @@ -1291,6 +1305,7 @@ RELAY_REGISTER_OP("nn.max_pool3d") .set_support_level(2) .add_type_rel("MaxPool3D", Pool3DRel) .set_attr("FInferCorrectLayout", PoolInferCorrectLayout) + .set_attr("TOpPattern", kOutEWiseFusable) .set_attr("FTVMCompute", Pool3DCompute); // AvgPool3D @@ -1331,6 +1346,7 @@ Average pooling operation for three dimensional data. .set_support_level(2) .add_type_rel("AvgPool3D", Pool3DRel) .set_attr("FInferCorrectLayout", PoolInferCorrectLayout) + .set_attr("TOpPattern", kOutEWiseFusable) .set_attr("FTVMCompute", Pool3DCompute); } // namespace relay diff --git a/src/relay/op/nn/sparse.cc b/src/relay/op/nn/sparse.cc index 7d21005cb4dbc..e190a8b886e11 100644 --- a/src/relay/op/nn/sparse.cc +++ b/src/relay/op/nn/sparse.cc @@ -120,7 +120,8 @@ RELAY_REGISTER_OP("nn.sparse_dense") .add_argument("sparse_indices", "1D Tensor", "Sparse indices matrix.") .add_argument("sparse_indptr", "1D Tensor", "Sparse indptr matrix.") .set_support_level(1) - .add_type_rel("SparseDense", SparseDenseRel); + .add_type_rel("SparseDense", SparseDenseRel) + .set_attr("TOpPattern", kOutEWiseFusable); Expr MakeSparseDensePadded(Expr data, Expr weight_data, Expr weight_indices, Expr weight_indptr) { auto attrs = make_object(); @@ -151,7 +152,8 @@ which will be converted to this op when running on the GPU. .add_argument("weight_indices", "1D Tensor", "Weight indices matrix.") .add_argument("weight_indptr", "1D Tensor", "Weight indptr matrix.") .set_support_level(1) - .add_type_rel("SparseDense", SparseDenseRel); + .add_type_rel("SparseDense", SparseDenseRel) + .set_attr("TOpPattern", kOutEWiseFusable); // relay.nn.sparse_transpose TVM_REGISTER_NODE_TYPE(SparseTransposeAttrs); @@ -195,7 +197,8 @@ RELAY_REGISTER_OP("nn.sparse_transpose") .add_argument("sparse_indices", "1D Tensor", "Sparse indices matrix.") .add_argument("sparse_indptr", "1D Tensor", "Sparse index pointer matrix.") .set_support_level(1) - .add_type_rel("SparseTranspose", SparseTransposeRel); + .add_type_rel("SparseTranspose", SparseTransposeRel) + .set_attr("TOpPattern", kOutEWiseFusable); // relay.nn.sparse_add bool SparseAddRel(const Array& types, int num_inputs, const Attrs& attrs, @@ -236,7 +239,8 @@ RELAY_REGISTER_OP("nn.sparse_add") .add_argument("sparse_indices", "1D Tensor", "Sparse indices vector.") .add_argument("sparse_indptr", "1D Tensor", "Sparse index pointer vector.") .set_support_level(1) - .add_type_rel("SparseAdd", SparseAddRel); + .add_type_rel("SparseAdd", SparseAddRel) + .set_attr("TOpPattern", kOpaque); TVM_REGISTER_NODE_TYPE(SparseConv2DAttrs); @@ -300,7 +304,8 @@ RELAY_REGISTER_OP("nn.sparse_conv2d") .add_argument("sparse_indices", "1D Tensor", "Sparse indices matrix.") .add_argument("sparse_indptr", "1D Tensor", "Sparse indptr matrix.") .set_support_level(1) - .add_type_rel("SparseConv2d", SparseConv2dRel); + .add_type_rel("SparseConv2d", SparseConv2dRel) + .set_attr("TOpPattern", kOutEWiseFusable); } // namespace relay } // namespace tvm From d07f2fb8056413cc844ce61a883bf4f51fd42848 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Tue, 12 Jul 2022 15:04:04 -0700 Subject: [PATCH 103/111] [MetaSchedule][Test] Add unittests for DIL (#12077) --- 
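The new tests follow the design-space checking pattern already used in
these files: build the TE workload, ask MetaSchedule to generate every
sketch its default schedule rules produce for the target, then compare
both the scheduled TIR and the random-sampling decisions against golden
copies. A minimal sketch of that driver, assuming the _target() and
check_sketches helpers these test files already use:

    mod = create_te_workload("DIL", 0)
    space = ms.TuneContext(
        mod=mod,
        target=_target(),
        space_generator=ms.space_generator.PostOrderApply(),
        sch_rules="default",
    ).generate_design_space()
    # Each entry in `space` is one candidate tir.Schedule (a "sketch").
    # check_sketches() pairs each sketch with an expected module and an
    # expected list of ("SamplePerfectTile", ...) style decisions.
    check_sketches(mod, sketches=space, expected_mods=[dil_0], expected_decisions=[decision_0])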
.../unittest/test_meta_schedule_space_cpu.py | 179 ++++++++++++++++++ .../unittest/test_meta_schedule_space_cuda.py | 89 +++++++++ 2 files changed, 268 insertions(+) diff --git a/tests/python/unittest/test_meta_schedule_space_cpu.py b/tests/python/unittest/test_meta_schedule_space_cpu.py index d757d4bef71df..36f365e732524 100644 --- a/tests/python/unittest/test_meta_schedule_space_cpu.py +++ b/tests/python/unittest/test_meta_schedule_space_cpu.py @@ -901,9 +901,188 @@ def dep_2(placeholder: T.Buffer[(1, 112, 112, 32), "float32"], placeholder_1: T. ) +def test_cpu_dil(): + # fmt: off + @T.prim_func + def dil_0(inputs: T.Buffer[(1, 224, 224, 3), "float32"], weight: T.Buffer[(7, 7, 3, 64), "float32"], conv2d_nhwc: T.Buffer[(1, 109, 109, 64), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + with T.block("root"): + T.reads() + T.writes() + T.block_attr({"meta_schedule.parallel":288, "meta_schedule.unroll_explicit":64, "meta_schedule.vectorize":64}) + PadInput = T.alloc_buffer([1, 230, 230, 3], dtype="float32") + conv2d_nhwc_global = T.alloc_buffer([1, 109, 109, 64], dtype="float32") + for i0_0, i1_0, i2_0, i3_0, i0_1, i1_1, i2_1, i3_1 in T.grid(1, 109, 1, 4, 1, 1, 1, 2): + for ax0, ax1, ax2, ax3 in T.grid(1, 13, 229, 3): + with T.block("PadInput"): + i0 = T.axis.spatial(1, ax0) + i1 = T.axis.spatial(230, i1_0 * 2 + ax1) + i2 = T.axis.spatial(230, ax2) + i3 = T.axis.spatial(3, ax3) + T.reads(inputs[i0, i1 - 3, i2 - 3, i3]) + T.writes(PadInput[i0, i1, i2, i3]) + PadInput[i0, i1, i2, i3] = T.if_then_else(3 <= i1 and i1 < 227 and 3 <= i2 and i2 < 227, inputs[i0, i1 - 3, i2 - 3, i3], T.float32(0), dtype="float32") + for i4_0, i5_0, i6_0, i0_2, i1_2, i2_2, i3_2, i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3 in T.grid(7, 1, 1, 1, 1, 109, 8, 1, 7, 3, 1, 1, 1, 1): + with T.block("conv2d_nhwc"): + n = T.axis.spatial(1, i0_3 + i0_2 + i0_1 + i0_0) + h = T.axis.spatial(109, i1_0 + i1_1 + i1_2 + i1_3) + w = T.axis.spatial(109, (i2_0 + i2_1) * 109 + i2_2 + i2_3) + co = T.axis.spatial(64, (i3_0 * 2 + i3_1) * 8 + i3_2 + i3_3) + rh = T.axis.reduce(7, i4_0 + i4_1) + rw = T.axis.reduce(7, i5_0 * 7 + i5_1) + rc = T.axis.reduce(3, i6_0 * 3 + i6_1) + T.reads(PadInput[n, h * 2 + rh * 2, w * 2 + rw * 2, co // 64 * 3 + rc], weight[rh, rw, rc, co]) + T.writes(conv2d_nhwc_global[n, h, w, co]) + T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"}) + with T.init(): + conv2d_nhwc_global[n, h, w, co] = T.float32(0) + conv2d_nhwc_global[n, h, w, co] = conv2d_nhwc_global[n, h, w, co] + PadInput[n, h * 2 + rh * 2, w * 2 + rw * 2, co // 64 * 3 + rc] * weight[rh, rw, rc, co] + for ax0, ax1, ax2, ax3 in T.grid(1, 1, 109, 8): + with T.block("conv2d_nhwc_global"): + v0 = T.axis.spatial(1, ax0) + v1 = T.axis.spatial(109, i1_0 + ax1) + v2 = T.axis.spatial(109, ax2) + v3 = T.axis.spatial(64, i3_0 * 16 + i3_1 * 8 + ax3) + T.reads(conv2d_nhwc_global[v0, v1, v2, v3]) + T.writes(conv2d_nhwc[v0, v1, v2, v3]) + conv2d_nhwc[v0, v1, v2, v3] = conv2d_nhwc_global[v0, v1, v2, v3] + @T.prim_func + def dil_1(inputs: T.Buffer[(1, 224, 224, 3), "float32"], weight: T.Buffer[(7, 7, 3, 64), "float32"], conv2d_nhwc: T.Buffer[(1, 109, 109, 64), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + with T.block("root"): + T.reads() + T.writes() + T.block_attr({"meta_schedule.parallel":288, "meta_schedule.unroll_explicit":0, "meta_schedule.vectorize":64}) + PadInput = T.alloc_buffer([1, 230, 230, 3], dtype="float32") + 
conv2d_nhwc_global = T.alloc_buffer([1, 109, 109, 64], dtype="float32") + for i0_0, i1_0, i2_0, i3_0 in T.grid(1, 109, 1, 4): + for i0_1, i1_1, i2_1, i3_1, i4_0 in T.grid(1, 1, 1, 2, 7): + for ax0, ax1, ax2, ax3 in T.grid(1, 1, 229, 3): + with T.block("PadInput"): + i0 = T.axis.spatial(1, ax0) + i1 = T.axis.spatial(230, i1_0 * 2 + i4_0 * 2 + ax1) + i2 = T.axis.spatial(230, ax2) + i3 = T.axis.spatial(3, ax3) + T.reads(inputs[i0, i1 - 3, i2 - 3, i3]) + T.writes(PadInput[i0, i1, i2, i3]) + PadInput[i0, i1, i2, i3] = T.if_then_else(3 <= i1 and i1 < 227 and 3 <= i2 and i2 < 227, inputs[i0, i1 - 3, i2 - 3, i3], T.float32(0), dtype="float32") + for i5_0, i6_0, i0_2, i1_2, i2_2, i3_2, i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3 in T.grid(1, 1, 1, 1, 109, 8, 1, 7, 3, 1, 1, 1, 1): + with T.block("conv2d_nhwc"): + n = T.axis.spatial(1, i0_3 + i0_2 + i0_1 + i0_0) + h = T.axis.spatial(109, i1_0 + i1_1 + i1_2 + i1_3) + w = T.axis.spatial(109, (i2_0 + i2_1) * 109 + i2_2 + i2_3) + co = T.axis.spatial(64, (i3_0 * 2 + i3_1) * 8 + i3_2 + i3_3) + rh = T.axis.reduce(7, i4_0 + i4_1) + rw = T.axis.reduce(7, i5_0 * 7 + i5_1) + rc = T.axis.reduce(3, i6_0 * 3 + i6_1) + T.reads(PadInput[n, h * 2 + rh * 2, w * 2 + rw * 2, co // 64 * 3 + rc], weight[rh, rw, rc, co]) + T.writes(conv2d_nhwc_global[n, h, w, co]) + T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"}) + with T.init(): + conv2d_nhwc_global[n, h, w, co] = T.float32(0) + conv2d_nhwc_global[n, h, w, co] = conv2d_nhwc_global[n, h, w, co] + PadInput[n, h * 2 + rh * 2, w * 2 + rw * 2, co // 64 * 3 + rc] * weight[rh, rw, rc, co] + for ax0, ax1, ax2, ax3 in T.grid(1, 1, 109, 16): + with T.block("conv2d_nhwc_global"): + v0 = T.axis.spatial(1, ax0) + v1 = T.axis.spatial(109, i1_0 + ax1) + v2 = T.axis.spatial(109, ax2) + v3 = T.axis.spatial(64, i3_0 * 16 + ax3) + T.reads(conv2d_nhwc_global[v0, v1, v2, v3]) + T.writes(conv2d_nhwc[v0, v1, v2, v3]) + conv2d_nhwc[v0, v1, v2, v3] = conv2d_nhwc_global[v0, v1, v2, v3] + @T.prim_func + def dil_2(inputs: T.Buffer[(1, 224, 224, 3), "float32"], weight: T.Buffer[(7, 7, 3, 64), "float32"], conv2d_nhwc: T.Buffer[(1, 109, 109, 64), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + with T.block("root"): + T.reads() + T.writes() + T.block_attr({"meta_schedule.parallel":288, "meta_schedule.unroll_explicit":0, "meta_schedule.vectorize":64}) + PadInput = T.alloc_buffer([1, 230, 230, 3], dtype="float32") + for i0_0, i1_0 in T.grid(1, 109): + for ax0, ax1, ax2, ax3 in T.grid(1, 13, 229, 3): + with T.block("PadInput"): + i0 = T.axis.spatial(1, ax0) + i1 = T.axis.spatial(230, i1_0 * 2 + ax1) + i2 = T.axis.spatial(230, ax2) + i3 = T.axis.spatial(3, ax3) + T.reads(inputs[i0, i1 - 3, i2 - 3, i3]) + T.writes(PadInput[i0, i1, i2, i3]) + PadInput[i0, i1, i2, i3] = T.if_then_else(3 <= i1 and i1 < 227 and 3 <= i2 and i2 < 227, inputs[i0, i1 - 3, i2 - 3, i3], T.float32(0), dtype="float32") + for i2_0, i3_0, i0_1, i1_1, i2_1, i3_1, i4_0, i5_0, i6_0, i0_2, i1_2, i2_2, i3_2, i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3 in T.grid(1, 4, 1, 1, 1, 2, 7, 1, 1, 1, 1, 109, 8, 1, 7, 3, 1, 1, 1, 1): + with T.block("conv2d_nhwc"): + n = T.axis.spatial(1, i0_3 + i0_2 + i0_1 + i0_0) + h = T.axis.spatial(109, i1_0 + i1_1 + i1_2 + i1_3) + w = T.axis.spatial(109, (i2_0 + i2_1) * 109 + i2_2 + i2_3) + co = T.axis.spatial(64, (i3_0 * 2 + i3_1) * 8 + i3_2 + i3_3) + rh = T.axis.reduce(7, i4_0 + i4_1) + rw = T.axis.reduce(7, i5_0 * 7 + i5_1) + rc = T.axis.reduce(3, i6_0 * 3 + i6_1) + T.reads(PadInput[n, h * 2 
+ rh * 2, w * 2 + rw * 2, co // 64 * 3 + rc], weight[rh, rw, rc, co]) + T.writes(conv2d_nhwc[n, h, w, co]) + T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"}) + with T.init(): + conv2d_nhwc[n, h, w, co] = T.float32(0) + conv2d_nhwc[n, h, w, co] = conv2d_nhwc[n, h, w, co] + PadInput[n, h * 2 + rh * 2, w * 2 + rw * 2, co // 64 * 3 + rc] * weight[rh, rw, rc, co] + + # fmt: on + decision_0 = [ + ("SamplePerfectTile", [1, 1, 1, 1]), + ("SamplePerfectTile", [109, 1, 1, 1]), + ("SamplePerfectTile", [1, 1, 109, 1]), + ("SamplePerfectTile", [4, 2, 8, 1]), + ("SamplePerfectTile", [7, 1]), + ("SamplePerfectTile", [1, 7]), + ("SamplePerfectTile", [1, 3]), + ("SampleCategorical", 2), + ("SampleComputeLocation", 7), + ] + decision_1 = [ + ("SamplePerfectTile", [1, 1, 1, 1]), + ("SamplePerfectTile", [109, 1, 1, 1]), + ("SamplePerfectTile", [1, 1, 109, 1]), + ("SamplePerfectTile", [4, 2, 8, 1]), + ("SamplePerfectTile", [7, 1]), + ("SamplePerfectTile", [1, 7]), + ("SamplePerfectTile", [1, 3]), + ("SampleCategorical", 0), + ("SampleComputeLocation", 8), + ] + decision_2 = [ + ("SamplePerfectTile", [1, 1, 1, 1]), + ("SamplePerfectTile", [109, 1, 1, 1]), + ("SamplePerfectTile", [1, 1, 109, 1]), + ("SamplePerfectTile", [4, 2, 8, 1]), + ("SamplePerfectTile", [7, 1]), + ("SamplePerfectTile", [1, 7]), + ("SamplePerfectTile", [1, 3]), + ("SampleCategorical", 0), + ("SampleComputeLocation", 1), + ] + mod = create_te_workload("DIL", 0) + actual = ms.TuneContext( + mod=mod, + target=_target(), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules="default", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[dil_0, dil_1, dil_2], + expected_decisions=[decision_0, decision_1, decision_2], + ) + + if __name__ == "__main__": test_cpu_c1d() test_cpu_c2d() test_cpu_c3d() test_cpu_cap() test_cpu_dep() + test_cpu_dil() diff --git a/tests/python/unittest/test_meta_schedule_space_cuda.py b/tests/python/unittest/test_meta_schedule_space_cuda.py index 826a1ca062b58..b8723e286aef3 100644 --- a/tests/python/unittest/test_meta_schedule_space_cuda.py +++ b/tests/python/unittest/test_meta_schedule_space_cuda.py @@ -484,9 +484,98 @@ def dep_0(placeholder: T.Buffer[(1, 112, 112, 32), "float32"], placeholder_1: T. 
) +def test_cuda_dil(): + # fmt: off + @T.prim_func + def dil_0(inputs: T.Buffer[(1, 224, 224, 3), "float32"], weight: T.Buffer[(7, 7, 3, 64), "float32"], conv2d_nhwc: T.Buffer[(1, 109, 109, 64), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + with T.block("root"): + T.reads() + T.writes() + T.block_attr({"meta_schedule.unroll_explicit":512}) + conv2d_nhwc_local = T.alloc_buffer([1, 109, 109, 64], dtype="float32", scope="local") + PadInput_shared = T.alloc_buffer([1, 230, 230, 3], dtype="float32", scope="shared") + weight_shared = T.alloc_buffer([7, 7, 3, 64], dtype="float32", scope="shared") + for i0_0_i1_0_i2_0_i3_0_fused in T.thread_binding(218, thread="blockIdx.x"): + for i0_1_i1_1_i2_1_i3_1_fused in T.thread_binding(109, thread="vthread.x"): + for i0_2_i1_2_i2_2_i3_2_fused in T.thread_binding(1, thread="threadIdx.x"): + for i4_0, i5_0, i6_0 in T.grid(7, 7, 3): + for ax0_ax1_ax2_ax3_fused in T.serial(217): + with T.block("PadInput_shared"): + v0 = T.axis.spatial(1, 0) + v1 = T.axis.spatial(230, i0_0_i1_0_i2_0_i3_0_fused // 2 * 2 + i4_0 * 2 + 0) + v2 = T.axis.spatial(230, i5_0 * 2 + ax0_ax1_ax2_ax3_fused % 217) + v3 = T.axis.spatial(3, i6_0 + 0) + T.reads(inputs[v0, v1 - 3, v2 - 3, v3]) + T.writes(PadInput_shared[v0, v1, v2, v3]) + T.block_attr({"meta_schedule.cooperative_fetch":2}) + PadInput_shared[v0, v1, v2, v3] = T.if_then_else(3 <= v1 and v1 < 227 and 3 <= v2 and v2 < 227, inputs[v0, v1 - 3, v2 - 3, v3], T.float32(0), dtype="float32") + for ax0_ax1_ax2_ax3_fused in T.serial(32): + with T.block("weight_shared"): + v0, v1, v2 = T.axis.remap("SSS", [i4_0, i5_0, i6_0]) + v3 = T.axis.spatial(64, i0_0_i1_0_i2_0_i3_0_fused % 2 * 32 + ax0_ax1_ax2_ax3_fused) + T.reads(weight[v0, v1, v2, v3]) + T.writes(weight_shared[v0, v1, v2, v3]) + T.block_attr({"meta_schedule.cooperative_fetch":4}) + weight_shared[v0, v1, v2, v3] = weight[v0, v1, v2, v3] + for i4_1, i5_1, i6_1, i0_3, i1_3, i2_3, i3_3, i4_2, i5_2, i6_2, i0_4, i1_4, i2_4, i3_4 in T.grid(1, 1, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1, 1, 4): + with T.block("conv2d_nhwc"): + n = T.axis.spatial(1, i0_4 + i0_3 + 0 + 0 + 0) + h = T.axis.spatial(109, i0_0_i1_0_i2_0_i3_0_fused % 218 // 2 + 0 + 0 + i1_3 + i1_4) + w = T.axis.spatial(109, 0 * 109 + i0_1_i1_1_i2_1_i3_1_fused % 109 + 0 + i2_3 + i2_4) + co = T.axis.spatial(64, ((i0_0_i1_0_i2_0_i3_0_fused % 2 + 0 + 0) * 8 + i3_3) * 4 + i3_4) + rh = T.axis.reduce(7, i4_0 + i4_1 + i4_2) + rw = T.axis.reduce(7, i5_0 + i5_1 + i5_2) + rc = T.axis.reduce(3, i6_0 + i6_1 + i6_2) + T.reads(PadInput_shared[n, h * 2 + rh * 2, w * 2 + rw * 2, co // 64 * 3 + rc], weight_shared[rh, rw, rc, co]) + T.writes(conv2d_nhwc_local[n, h, w, co]) + T.block_attr({"meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "meta_schedule.tiling_structure":"SSSRRSRS"}) + with T.init(): + conv2d_nhwc_local[n, h, w, co] = T.float32(0) + conv2d_nhwc_local[n, h, w, co] = conv2d_nhwc_local[n, h, w, co] + PadInput_shared[n, h * 2 + rh * 2, w * 2 + rw * 2, co // 64 * 3 + rc] * weight_shared[rh, rw, rc, co] + for ax0, ax1, ax2, ax3 in T.grid(1, 1, 1, 32): + with T.block("conv2d_nhwc_local"): + v0 = T.axis.spatial(1, ax0) + v1 = T.axis.spatial(109, i0_0_i1_0_i2_0_i3_0_fused // 2 + ax1) + v2 = T.axis.spatial(109, i0_1_i1_1_i2_1_i3_1_fused + ax2) + v3 = T.axis.spatial(64, i0_0_i1_0_i2_0_i3_0_fused % 2 * 32 + ax3) + T.reads(conv2d_nhwc_local[v0, v1, v2, v3]) + T.writes(conv2d_nhwc[v0, v1, v2, v3]) + conv2d_nhwc[v0, v1, v2, v3] = 
conv2d_nhwc_local[v0, v1, v2, v3] + # fmt: on + decision_0 = [ + ("SamplePerfectTile", [1, 1, 1, 1, 1]), + ("SamplePerfectTile", [109, 1, 1, 1, 1]), + ("SamplePerfectTile", [1, 109, 1, 1, 1]), + ("SamplePerfectTile", [2, 1, 1, 8, 4]), + ("SamplePerfectTile", [7, 1, 1]), + ("SamplePerfectTile", [7, 1, 1]), + ("SamplePerfectTile", [3, 1, 1]), + ("SampleCategorical", 1), + ("SampleCategorical", 3), + ("SampleCategorical", 3), + ] + mod = create_te_workload("DIL", 0) + actual = ms.TuneContext( + mod=mod, + target=_target(), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules="default", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[dil_0], + expected_decisions=[decision_0], + ) + + if __name__ == "__main__": test_cuda_c1d() test_cuda_c2d() test_cuda_c3d() test_cuda_cap() test_cuda_dep() + test_cuda_dil() From 1d5d357d69ab8a0ceae047532ab778359676cda7 Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Tue, 12 Jul 2022 15:09:58 -0700 Subject: [PATCH 104/111] [Hexagon] Enable broken tests (#12073) --- .../contrib/test_hexagon/topi/test_add_subtract_multiply.py | 3 --- tests/python/contrib/test_hexagon/topi/test_argmax_slice.py | 3 --- tests/python/contrib/test_hexagon/topi/test_resize2d.py | 3 --- tests/python/contrib/test_hexagon/topi/test_softmax_slice.py | 3 --- 4 files changed, 12 deletions(-) diff --git a/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py b/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py index 0d8126072955a..606aa628d0097 100755 --- a/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py +++ b/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py @@ -158,9 +158,6 @@ def test_transform( input_B_layout, op_name, ): - if hexagon_session._launcher._serial_number != "simulator": - pytest.skip(msg="Due to https://github.com/apache/tvm/issues/11957") - target_hexagon = tvm.target.hexagon("v69") A = te.placeholder(input_shape_A, name="A", dtype=dtype) B = te.placeholder(input_shape_B, name="B", dtype=dtype) diff --git a/tests/python/contrib/test_hexagon/topi/test_argmax_slice.py b/tests/python/contrib/test_hexagon/topi/test_argmax_slice.py index 5431054d2ca22..eaba9fafde3af 100644 --- a/tests/python/contrib/test_hexagon/topi/test_argmax_slice.py +++ b/tests/python/contrib/test_hexagon/topi/test_argmax_slice.py @@ -77,9 +77,6 @@ def test_argmax_slice( working_scope, ): """Top level testing function for argmax""" - if hexagon_session._launcher._serial_number != "simulator": - pytest.skip(msg="Due to https://github.com/apache/tvm/issues/11957") - target_hexagon = tvm.target.hexagon("v69") target = tvm.target.Target(target_hexagon, host=target_hexagon) argmax_input = te.placeholder(input_shape, name="A", dtype=dtype) diff --git a/tests/python/contrib/test_hexagon/topi/test_resize2d.py b/tests/python/contrib/test_hexagon/topi/test_resize2d.py index 109eb5c4365d6..d0c2c1464a959 100755 --- a/tests/python/contrib/test_hexagon/topi/test_resize2d.py +++ b/tests/python/contrib/test_hexagon/topi/test_resize2d.py @@ -102,9 +102,6 @@ def test_resize2d( method, hexagon_session, ): - if hexagon_session._launcher._serial_number != "simulator": - pytest.skip(msg="Due to https://github.com/apache/tvm/issues/11957") - target_hexagon = tvm.target.hexagon("v69") A = te.placeholder(input_shape, name="A", dtype=dtype) diff --git a/tests/python/contrib/test_hexagon/topi/test_softmax_slice.py b/tests/python/contrib/test_hexagon/topi/test_softmax_slice.py index 
a39c6cd5163bc..9bbecdd7f81bd 100644 --- a/tests/python/contrib/test_hexagon/topi/test_softmax_slice.py +++ b/tests/python/contrib/test_hexagon/topi/test_softmax_slice.py @@ -79,9 +79,6 @@ def test_softmax_f32( axis_sep, hexagon_session, ): - if hexagon_session._launcher._serial_number != "simulator": - pytest.skip(msg="Due to https://github.com/apache/tvm/issues/11957") - target_hexagon = tvm.target.hexagon( "v69", llvm_options="--disable-loop-unrolling-pass", From 3992d2443acba6a824ec4da58bfc30f9e0e5d5b5 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Tue, 12 Jul 2022 15:15:10 -0700 Subject: [PATCH 105/111] [COMMUNITY] Add driazati key for release (#12076) As per https://tvm.apache.org/docs/contribute/release_process.html#id3 --- KEYS | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/KEYS b/KEYS index a819d8f3bdda8..41a620e796bc8 100644 --- a/KEYS +++ b/KEYS @@ -416,3 +416,51 @@ A1PPxm4/KsXX/IZZOuM/tlT0vAahQsvXMNUVMg7v/PWuB6V47UdenKpXd10oloF7 MMtVW5sxG8OoBpUIhJUCtYTlwGCyGWSR7+rsHSR2HydLk1RWcYNI3XgJ0ng= =+gLd -----END PGP PUBLIC KEY BLOCK----- +pub rsa3072 2022-07-12 [SC] [expires: 2024-07-11] + B3C6A14C13B8C6727BC2FD2F07FA463F1C926F48 +uid [ultimate] David Riazati +sig 3 07FA463F1C926F48 2022-07-12 David Riazati +sub rsa3072 2022-07-12 [E] [expires: 2024-07-11] +sig 07FA463F1C926F48 2022-07-12 David Riazati + +-----BEGIN PGP PUBLIC KEY BLOCK----- + +mQGNBGLNzqUBDAC9p7OMYiiVHTQIIUr1/fDXaJ3sJ0rlkaPQJpPBrtuqGjN5utDu +26BWQqPxx36aABw44UmTRwV4UNf+3McYSJoCODfVpHOKsKk0Ql5CDzG3Ngpdu9ZR +UxV6s2DNHkSUjpd5vRfZF09WnQ0WITEhKz8Wnm82B/NkvRmTzYqlpP+zOT3+WPFh +5maMPOP0bvEfiT22zQqOOyKraYPrtf5ZBSip1fYohOlyS/aJcqOChMuKMOBVrxqH +9EmHjEkN0a+nAdWnGmCoGZONsD4ifXL17AUOaGSpEko6Nj7nXyTKI0laBhj6f8uw +v8M3xDBkIm7oiTuwrCeDa4e9YtP6Vzvj6MxrpNIMN0XRs/DRYH0lgTI1Zv/0SzkO +OAa9tOCiq95jkMjZik/vyQ55WwkMgYDmngsP/PBEW2ztdVLoLeal2p4HNfBM1BQO +RFOGnurR2Vmy1jGPyfpuBNMyjRgFC43s7SLiTYKCi1QxyY5u6dRgjIxkG+jyiY3B +GFMAtPt5iJHUox0AEQEAAbQjRGF2aWQgUmlhemF0aSA8ZHJpYXphdGlAYXBhY2hl +Lm9yZz6JAdQEEwEKAD4WIQSzxqFME7jGcnvC/S8H+kY/HJJvSAUCYs3OpQIbAwUJ +A8JnAAULCQgHAgYVCgkICwIEFgIDAQIeAQIXgAAKCRAH+kY/HJJvSJHEC/wMgDH/ +jBI6AciNp9sPv2p8tFRDywq2nUOameYFaIMq1r644IAzUUU3PSACMT1lCxGjFFsE +vJYx1GORrpqjwArrK1D3zZ0yb4X38DFAU+7DTGEKzKoz3h+Ka0GyOe90CI/KqqWL +XNeePwvOzIWhZ0U8vqUkgXwHyfG1dwocEx5A1zlTeznkth2AnRELnjhFcj28V2VX +dUQmZ8qOYxXtjSk9xJtQ/BbARiNINeKqzG1aPWgjTtFFp3UTl/jWCr5RBlWMA+BU +N9alE/ozRPx89Uilz2reaC7xX8tHv5F+P7SPVwMhJyYQ7F577CtM0b4vTu4U15wE +VlWF25ymTbSt5kam9jFbeR0Zkc0/LuLEdGWRGbDFI9Hj1rGeBejTm+PjwK3TidDn +KbvpUgvseNfqUQPcbjEsuwYVUtR/LEeQxt2tK/odQwWlHR7BQApFhV7VSJVP99Fp +YNFN7AsiD7+k4fOl5Qeq/t6X7x+gXMkxsRvtJMwB/fTAWbuBxdQBdIkP/KC5AY0E +Ys3OpQEMALhC8woP92ONpgRKHhH3s65cY4EYLhfhkbOqU8KcbPJX0qx1gM68jWqm +aCvez9KO+aB2jEyWG65XsOJXM6RqFtgvFMKG+ETLIgPydqt9l4f5AhnrPXmrxf7l +b8unuFMyoga7DyKnB6hQzEVqZgbKR+U6lWaoFtGTFYlaOdUz268OErrW3592frh0 +VKTdCyBdGPfiwKnzL4+LjU7SuiI9r1nBH5ZYicGmgOKQHP0KQRUy66Cq0S7p0rpp +9owbh2FHkXJ0bryl7AMV5JurEk0FSA483qQjyqHEQCSKVySgUBBFw9UPH0LkUbYv +jk43VFoUYexlJ47KFIRJdQZdLyyqsSy0xzqiCQXFwQPECIFHN/GTMuAHcaCfah/z +u4KDkqArzNzG1pl/DYVuaMo9LmBtzB7kfxPKcvm0atp6WHydcQ92N9ZU9z2zBh7T +u6Akzl+eONsix7F0oldwtG7Glic+1HafyyjhZfV8o6r7rYURnsotDfdzYjpL/xWe +xWkUSv2GbwARAQABiQG8BBgBCgAmFiEEs8ahTBO4xnJ7wv0vB/pGPxySb0gFAmLN +zqUCGwwFCQPCZwAACgkQB/pGPxySb0g+0wv+MQO/9mVo4eblTeFMLpLlU1tbDXIF +n5bDxbd1ekq/fKLrWZpT+MQGprGMXbgTehgeBIMvFvANLr2KHUb4HpXTX1GceVHv +A5uN/JQ+/H+IF3SoipcFPDR67uESVSZQfrky6HG8M9hH4OPdW4LbyEBke13Z2LlK 
+sQWJFznDnqCqmvLDvvliGBGhMM3RvTn5upgA47gwcJ1Z4xZU+k1nyhAiAgxGxpjO +rtj/Dv7r7gdnDBo5omu0fQLqulSY1UeHsOQXlkR6zMOMDdKgybcScQHQhta0Hcs+ +DWxpfJ92vH/3wGchSA1f0Fp2WCiQ/wp7sfe1esShDN12AwlpDBjK583d0R+DLpVY +8DbRCdvtwIN2f5KD+LhBbBX66AADVKVRIPgGDRGxc85X06nVWOQGHrGD+tCjxBNM +aLLvg9K8HxeWTvQvowCAyFJo4NfIrS/7gMm5JcWMAqVFJ+IVxZNxZUIYV0VBC/AN +rSSBN90DWxIgPhlAqgO0ofkbPSVwF/9i7nd3 +=XBuV +-----END PGP PUBLIC KEY BLOCK----- From ca88c522fb3f85e2d1c968b984af866ac9d1d7b9 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Tue, 12 Jul 2022 20:50:02 -0500 Subject: [PATCH 106/111] [LLVM] Update creation of llvm::DebugLoc, remove TVM_LLVM_VERSION < 70 (#12069) * [LLVM] Update creation of llvm::DebugLoc, remove TVM_LLVM_VERSION < 70 * Properly deal with "handle" type * Emit correct subroutine flags * Fix llvm testcase to account for presence of debug metadata --- src/target/llvm/codegen_cpu.cc | 27 ++++++++++++------- .../unittest/test_target_codegen_llvm.py | 6 +++-- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc index b19dc216c8930..f2ce6fb848b48 100644 --- a/src/target/llvm/codegen_cpu.cc +++ b/src/target/llvm/codegen_cpu.cc @@ -196,7 +196,7 @@ void CodeGenCPU::AddFunction(const PrimFunc& f) { // Following Glow |DebugInfo::generateFunctionDebugInfo|, https://git.io/fjadv void CodeGenCPU::AddDebugInformation(PrimFunc f_tir, llvm::Function* f_llvm) { -#if TVM_LLVM_VERSION >= 50 && TVM_LLVM_VERSION < 70 +#if TVM_LLVM_VERSION >= 50 ICHECK(!f_llvm->getSubprogram()); llvm::SmallVector paramTys; // Functions in TIR can only return void or an int. @@ -213,16 +213,20 @@ void CodeGenCPU::AddDebugInformation(PrimFunc f_tir, llvm::Function* f_llvm) { auto* DIFunctionTy = dbg_info_->di_builder_->createSubroutineType( dbg_info_->di_builder_->getOrCreateTypeArray(paramTys)); + bool local_to_unit = llvm::GlobalValue::isLocalLinkage(f_llvm->getLinkage()); + #if TVM_LLVM_VERSION >= 80 + auto SPFlags = + llvm::DISubprogram::toSPFlags(local_to_unit, /*IsDefinition=*/true, /*IsOptimized=*/true); auto* DIFunction = dbg_info_->di_builder_->createFunction( /*Scope=*/dbg_info_->file_, /*Name=*/f_llvm->getName(), /*LinkageName=*/"", /*File=*/dbg_info_->file_, /*LineNo=*/0, /*Ty=*/DIFunctionTy, - /*ScopeLine=*/0); + /*ScopeLine=*/0, /*Flags=*/llvm::DINode::FlagZero, /*SPFlags=*/SPFlags); #else auto* DIFunction = dbg_info_->di_builder_->createFunction( /*Scope=*/dbg_info_->file_, /*Name=*/f_llvm->getName(), /*LinkageName=*/"", /*File=*/dbg_info_->file_, /*LineNo=*/0, /*Ty=*/DIFunctionTy, - /*isLocalToUnit=*/false, /*isDefinition=*/true, /*ScopeLine=*/0, + /*isLocalToUnit=*/local_to_unit, /*isDefinition=*/true, /*ScopeLine=*/0, /*Flags=*/llvm::DINode::FlagPrototyped, /*isOptimized=*/true); #endif @@ -244,9 +248,10 @@ void CodeGenCPU::AddDebugInformation(PrimFunc f_tir, llvm::Function* f_llvm) { GetDebugType(GetType(f_tir->params[i]), f_llvm->getFunctionType()->getParamType(i)), /*alwaysPreserve=*/true); auto* store = builder.CreateStore(f_llvm->arg_begin() + i, paramAlloca); + auto* di_loc = llvm::DILocation::get(*ctx_, 0, 0, DIFunction); dbg_info_->di_builder_->insertDeclare(paramAlloca, param, dbg_info_->di_builder_->createExpression(), - llvm::DebugLoc::get(0, 0, DIFunction), store); + llvm::DebugLoc(di_loc), store); } dbg_info_->di_builder_->finalizeSubprogram(f_llvm->getSubprogram()); auto* scope = f_llvm->getSubprogram(); @@ -258,7 +263,8 @@ void CodeGenCPU::AddDebugInformation(PrimFunc f_tir, llvm::Function* f_llvm) { if (I.getDebugLoc()) { 
continue; } - I.setDebugLoc(llvm::DebugLoc::get(0, 0, scope)); + auto* di_loc = llvm::DILocation::get(*ctx_, 0, 0, scope); + I.setDebugLoc(llvm::DebugLoc(di_loc)); } } #endif @@ -275,10 +281,13 @@ llvm::DIType* CodeGenCPU::GetDebugType(const Type& ty_tir, llvm::Type* ty_llvm) return dbg_info_->di_builder_->createBasicType("int32", 32, llvm::dwarf::DW_ATE_signed); } else if (ty_llvm->isPointerTy()) { auto* ptr_type = ty_tir.as<PointerTypeNode>(); - ICHECK(ptr_type != nullptr) << "Got LLVM pointer type from non-pointer IR type: " << ty_tir; - Type elem_type = ptr_type->element_type; - return dbg_info_->di_builder_->createPointerType( - GetDebugType(elem_type, GetLLVMType(elem_type)), ty_llvm->getPrimitiveSizeInBits()); + ICHECK(ptr_type != nullptr || GetRuntimeDataType(ty_tir).is_handle()) + << "Got LLVM pointer type from non-pointer IR type: " << ty_tir; + auto* pointee_type = ptr_type != nullptr ? GetDebugType(ptr_type->element_type, + GetLLVMType(ptr_type->element_type)) + : nullptr; + return dbg_info_->di_builder_->createPointerType(pointee_type, + ty_llvm->getPrimitiveSizeInBits()); } else { std::string type_str; llvm::raw_string_ostream rso(type_str); diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py index df2a394b16eb7..18bf9d118478a 100644 --- a/tests/python/unittest/test_target_codegen_llvm.py +++ b/tests/python/unittest/test_target_codegen_llvm.py @@ -958,7 +958,9 @@ def test_llvm_target_attributes(): functions_with_target = [] for line in llvm_ir_lines: - func_def = re.match("define.* @(?P<func_name>[^(]*)\(.* #(?P<attr_num>[0-9]+) {$", line) + func_def = re.match( + "define.* @(?P<func_name>[^(]*)[(].* #(?P<attr_num>[0-9]+) (!.* |){$", line + ) if func_def: functions_with_target.append(func_def.group("func_name")) attributes_with_target[func_def.group("attr_num")] = True @@ -969,7 +971,7 @@ def test_llvm_target_attributes(): for k in list(attributes_with_target.keys()): assert re.match('.*"target-cpu"="skylake".*', attribute_definitions[k]) - assert re.match('.*"target-features"=".*\+avx512f.*".*', attribute_definitions[k]) + assert re.match('.*"target-features"=".*[+]avx512f.*".*', attribute_definitions[k]) expected_functions = ["test_func", "test_func_compute_", "__tvm_parallel_lambda"] for n in expected_functions: From 5ad27ef6506b5e50b82ee97f1a0a6aaa5fe0dbbf Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com> Date: Wed, 13 Jul 2022 11:04:41 +0100 Subject: [PATCH 107/111] [CMSIS_NN] Align CMSIS-NN in TVM to TFLu SHA (#12030) * [CMSIS_NN] Align CMSIS-NN in TVM to TFLu SHA Change-Id: I7bb3b92196ad9f1a22eee87d704545e72b79ca0b * Updated CMSIS SHA to CMSIS TOT Change-Id: I0fec18e823478da991d49aa782f58f1c2f6212ba --- apps/microtvm/cmsisnn/Makefile | 39 ++++--------------- apps/microtvm/ethosu/Makefile | 30 ++++---------- docker/install/ubuntu_install_cmsis.sh | 4 +- .../backend/contrib/cmsisnn/tir_to_runtime.cc | 12 +++--- src/target/source/codegen_c_host.cc | 1 + tests/python/relay/aot/corstone300.mk | 16 ++++---- 6 files changed, 32 insertions(+), 70 deletions(-) diff --git a/apps/microtvm/cmsisnn/Makefile b/apps/microtvm/cmsisnn/Makefile index cf7d375b7e54c..db72ab8896632 100644 --- a/apps/microtvm/cmsisnn/Makefile +++ b/apps/microtvm/cmsisnn/Makefile @@ -56,6 +56,7 @@ DEMO_MAIN = src/demo_bare_metal.c CODEGEN_SRCS = $(wildcard $(abspath $(BUILD_DIR))/codegen/host/src/*.c) CODEGEN_OBJS = $(subst .c,.o,$(CODEGEN_SRCS)) CMSIS_STARTUP_SRCS = $(wildcard ${CMSIS_PATH}/Device/ARM/${ARM_CPU}/Source/*.c) +CMSIS_NN_SRCS = $(shell find
${CMSIS_PATH}/CMSIS/NN/Source/*/*.c) UART_SRCS = $(wildcard ${CORSTONE_300_PATH}/*.c) demo: $(BUILD_DIR)/demo @@ -81,42 +82,18 @@ ${BUILD_DIR}/libcmsis_startup.a: $(CMSIS_STARTUP_SRCS) $(QUIET)$(AR) -cr $(abspath $(BUILD_DIR)/libcmsis_startup.a) $(abspath $(BUILD_DIR))/libcmsis_startup/*.o $(QUIET)$(RANLIB) $(abspath $(BUILD_DIR)/libcmsis_startup.a) -CMSIS_SHA_FILE=${CMSIS_PATH}/977abe9849781a2e788b02282986480ff4e25ea6.sha -ifneq ("$(wildcard $(CMSIS_SHA_FILE))","") -${BUILD_DIR}/cmsis_nn/Source/libcmsis-nn.a: - $(QUIET)mkdir -p $(@D) - $(QUIET)cd $(CMSIS_PATH)/CMSIS/NN && $(CMAKE) -B $(abspath $(BUILD_DIR)/cmsis_nn) $(CMSIS_NN_CMAKE_FLAGS) - $(QUIET)cd $(abspath $(BUILD_DIR)/cmsis_nn) && $(MAKE) all -else -# Build CMSIS-NN -${BUILD_DIR}/cmsis_nn/Source/SoftmaxFunctions/libCMSISNNSoftmax.a: - $(QUIET)mkdir -p $(@D) - $(QUIET)cd $(CMSIS_PATH)/CMSIS/NN && $(CMAKE) -B $(abspath $(BUILD_DIR)/cmsis_nn) $(CMSIS_NN_CMAKE_FLAGS) - $(QUIET)cd $(abspath $(BUILD_DIR)/cmsis_nn) && $(MAKE) all -endif +# Build CMSIS-NN code +${BUILD_DIR}/libcmsis_nn.a: $(CMSIS_NN_SRCS) + $(QUIET)mkdir -p $(abspath $(BUILD_DIR)/libcmsis_nn) + $(QUIET)cd $(abspath $(BUILD_DIR)/libcmsis_nn) && $(CC) -c $(PKG_CFLAGS) -D${ARM_CPU} $^ + $(QUIET)$(AR) -cr $(abspath $(BUILD_DIR)/libcmsis_nn.a) $(abspath $(BUILD_DIR))/libcmsis_nn/*.o + $(QUIET)$(RANLIB) $(abspath $(BUILD_DIR)/libcmsis_nn.a) # Build demo application -ifneq ("$(wildcard $(CMSIS_SHA_FILE))","") -$(BUILD_DIR)/demo: $(DEMO_MAIN) $(UART_SRCS) $(BUILD_DIR)/stack_allocator.o $(BUILD_DIR)/crt_backend_api.o \ - ${BUILD_DIR}/libcodegen.a ${BUILD_DIR}/libcmsis_startup.a ${BUILD_DIR}/cmsis_nn/Source/libcmsis-nn.a - $(QUIET)mkdir -p $(@D) - $(QUIET)$(CC) $(PKG_CFLAGS) $(FREERTOS_FLAGS) -o $@ -Wl,--whole-archive $^ -Wl,--no-whole-archive $(PKG_LDFLAGS) -else $(BUILD_DIR)/demo: $(DEMO_MAIN) $(UART_SRCS) $(BUILD_DIR)/stack_allocator.o $(BUILD_DIR)/crt_backend_api.o \ - ${BUILD_DIR}/libcodegen.a ${BUILD_DIR}/libcmsis_startup.a \ - ${BUILD_DIR}/cmsis_nn/Source/SoftmaxFunctions/libCMSISNNSoftmax.a \ - ${BUILD_DIR}/cmsis_nn/Source/FullyConnectedFunctions/libCMSISNNFullyConnected.a \ - ${BUILD_DIR}/cmsis_nn/Source/SVDFunctions/libCMSISNNSVDF.a \ - ${BUILD_DIR}/cmsis_nn/Source/ReshapeFunctions/libCMSISNNReshape.a \ - ${BUILD_DIR}/cmsis_nn/Source/ActivationFunctions/libCMSISNNActivation.a \ - ${BUILD_DIR}/cmsis_nn/Source/NNSupportFunctions/libCMSISNNSupport.a \ - ${BUILD_DIR}/cmsis_nn/Source/ConcatenationFunctions/libCMSISNNConcatenation.a \ - ${BUILD_DIR}/cmsis_nn/Source/BasicMathFunctions/libCMSISNNBasicMaths.a \ - ${BUILD_DIR}/cmsis_nn/Source/ConvolutionFunctions/libCMSISNNConvolutions.a \ - ${BUILD_DIR}/cmsis_nn/Source/PoolingFunctions/libCMSISNNPooling.a + ${BUILD_DIR}/libcodegen.a ${BUILD_DIR}/libcmsis_startup.a ${BUILD_DIR}/libcmsis_nn.a $(QUIET)mkdir -p $(@D) $(QUIET)$(CC) $(PKG_CFLAGS) $(FREERTOS_FLAGS) -o $@ -Wl,--whole-archive $^ -Wl,--no-whole-archive $(PKG_LDFLAGS) -endif clean: $(QUIET)rm -rf $(BUILD_DIR)/codegen diff --git a/apps/microtvm/ethosu/Makefile b/apps/microtvm/ethosu/Makefile index ccfa8c1af083a..1b79548eaf626 100644 --- a/apps/microtvm/ethosu/Makefile +++ b/apps/microtvm/ethosu/Makefile @@ -78,6 +78,7 @@ endif CODEGEN_SRCS = $(wildcard $(abspath $(BUILD_DIR))/codegen/host/src/*.c) CODEGEN_OBJS = $(subst .c,.o,$(CODEGEN_SRCS)) CMSIS_STARTUP_SRCS = $(wildcard ${CMSIS_PATH}/Device/ARM/${ARM_CPU}/Source/*.c) +CMSIS_NN_SOFTMAX_SRCS = $(shell find ${CMSIS_PATH}/CMSIS/NN/Source/SoftmaxFunctions/*.c) UART_SRCS = $(wildcard ${CORSTONE_300_PATH}/*.c) demo: 
$(BUILD_DIR)/demo @@ -109,33 +110,16 @@ ${BUILD_DIR}/ethosu_core_driver/libethosu_core_driver.a: $(QUIET)cd $(ETHOSU_DRIVER_PATH) && $(CMAKE) -B $(abspath $(BUILD_DIR)/ethosu_core_driver) $(DRIVER_CMAKE_FLAGS) $(QUIET)cd $(abspath $(BUILD_DIR)/ethosu_core_driver) && $(MAKE) - -CMSIS_SHA_FILE=${CMSIS_PATH}/977abe9849781a2e788b02282986480ff4e25ea6.sha -ifneq ("$(wildcard $(CMSIS_SHA_FILE))","") -# Build CMSIS-NN -${BUILD_DIR}/cmsis_nn/Source/libcmsis-nn.a: - $(QUIET)mkdir -p $(@D) - $(QUIET)cd $(CMSIS_PATH)/CMSIS/NN && $(CMAKE) -B $(abspath $(BUILD_DIR)/cmsis_nn) $(CMSIS_NN_CMAKE_FLAGS) - $(QUIET)cd $(abspath $(BUILD_DIR)/cmsis_nn) && $(MAKE) all -else # Build CMSIS-NN Softmax -${BUILD_DIR}/cmsis_nn/Source/SoftmaxFunctions/libCMSISNNSoftmax.a: - $(QUIET)mkdir -p $(@D) - $(QUIET)cd $(CMSIS_PATH)/CMSIS/NN && $(CMAKE) -B $(abspath $(BUILD_DIR)/cmsis_nn) $(CMSIS_NN_CMAKE_FLAGS) - $(QUIET)cd $(abspath $(BUILD_DIR)/cmsis_nn) && $(MAKE) CMSISNNSoftmax -endif +${BUILD_DIR}/libcmsis_nn_softmax.a: $(CMSIS_NN_SOFTMAX_SRCS) + $(QUIET)mkdir -p $(abspath $(BUILD_DIR)/libcmsis_nn) + $(QUIET)cd $(abspath $(BUILD_DIR)/libcmsis_nn) && $(CC) -c $(PKG_CFLAGS) -D${ARM_CPU} $^ + $(QUIET)$(AR) -cr $(abspath $(BUILD_DIR)/libcmsis_nn_softmax.a) $(abspath $(BUILD_DIR))/libcmsis_nn/*.o + $(QUIET)$(RANLIB) $(abspath $(BUILD_DIR)/libcmsis_nn_softmax.a) - -# Build demo application -ifneq ("$(wildcard $(CMSIS_SHA_FILE))","") -$(BUILD_DIR)/demo: $(DEMO_MAIN) src/tvm_ethosu_runtime.c $(FREERTOS_SOURCES) $(UART_SRCS) $(BUILD_DIR)/stack_allocator.o $(BUILD_DIR)/crt_backend_api.o ${BUILD_DIR}/libcodegen.a ${BUILD_DIR}/libcmsis_startup.a ${BUILD_DIR}/ethosu_core_driver/libethosu_core_driver.a ${BUILD_DIR}/cmsis_nn/Source/libcmsis-nn.a +$(BUILD_DIR)/demo: $(DEMO_MAIN) src/tvm_ethosu_runtime.c $(FREERTOS_SOURCES) $(UART_SRCS) $(BUILD_DIR)/stack_allocator.o $(BUILD_DIR)/crt_backend_api.o ${BUILD_DIR}/libcodegen.a ${BUILD_DIR}/libcmsis_startup.a ${BUILD_DIR}/ethosu_core_driver/libethosu_core_driver.a ${BUILD_DIR}/libcmsis_nn_softmax.a $(QUIET)mkdir -p $(@D) $(QUIET)$(CC) $(PKG_CFLAGS) $(FREERTOS_FLAGS) -o $@ $^ $(PKG_LDFLAGS) -else -$(BUILD_DIR)/demo: $(DEMO_MAIN) src/tvm_ethosu_runtime.c $(FREERTOS_SOURCES) $(UART_SRCS) $(BUILD_DIR)/stack_allocator.o $(BUILD_DIR)/crt_backend_api.o ${BUILD_DIR}/libcodegen.a ${BUILD_DIR}/libcmsis_startup.a ${BUILD_DIR}/ethosu_core_driver/libethosu_core_driver.a ${BUILD_DIR}/cmsis_nn/Source/SoftmaxFunctions/libCMSISNNSoftmax.a - $(QUIET)mkdir -p $(@D) - $(QUIET)$(CC) $(PKG_CFLAGS) $(FREERTOS_FLAGS) -o $@ $^ $(PKG_LDFLAGS) -endif clean: $(QUIET)rm -rf $(BUILD_DIR)/codegen diff --git a/docker/install/ubuntu_install_cmsis.sh b/docker/install/ubuntu_install_cmsis.sh index 1fae6e57e006d..1116b5bd6929d 100755 --- a/docker/install/ubuntu_install_cmsis.sh +++ b/docker/install/ubuntu_install_cmsis.sh @@ -39,8 +39,8 @@ shift mkdir -p "${INSTALLATION_PATH}" # Download and extract CMSIS -CMSIS_SHA="977abe9849781a2e788b02282986480ff4e25ea6" -CMSIS_SHASUM="86c88d9341439fbb78664f11f3f25bc9fda3cd7de89359324019a4d87d169939eea85b7fdbfa6ad03aa428c6b515ef2f8cd52299ce1959a5444d4ac305f934cc" +CMSIS_SHA="e336766b1b5654f36244bca649917281f399bf37" +CMSIS_SHASUM="30c40824c4e008dcb9c6c77adee5115efa0cb04b6701fe2bc31ddf7be2da59f2161aeb4dbe5780cbaa709af23a3e21ea460bb2b84fa12418563125b4d426ac86" CMSIS_URL="http://github.com/ARM-software/CMSIS_5/archive/${CMSIS_SHA}.tar.gz" DOWNLOAD_PATH="/tmp/${CMSIS_SHA}.tar.gz" diff --git a/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc b/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc 
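For reference, the digest gate in ubuntu_install_cmsis.sh can be reproduced outside the container. A minimal Python sketch of the same SHA-512 check (standard library only; the tarball path mirrors the script's DOWNLOAD_PATH, and the constants are the pinned values above):

    import hashlib

    CMSIS_SHA = "e336766b1b5654f36244bca649917281f399bf37"
    CMSIS_SHASUM = (
        "30c40824c4e008dcb9c6c77adee5115efa0cb04b6701fe2bc31ddf7be2da59f2"
        "161aeb4dbe5780cbaa709af23a3e21ea460bb2b84fa12418563125b4d426ac86"
    )

    def verify_cmsis_tarball(path: str) -> None:
        # Hash the downloaded archive and compare with the pinned digest,
        # mirroring the sha512 check the install script performs.
        with open(path, "rb") as f:
            digest = hashlib.sha512(f.read()).hexdigest()
        if digest != CMSIS_SHASUM:
            raise RuntimeError(f"CMSIS tarball digest mismatch for {CMSIS_SHA}.tar.gz")

    verify_cmsis_tarball(f"/tmp/{CMSIS_SHA}.tar.gz")  # DOWNLOAD_PATH in the script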
index d5fb2ac97e838..50fa3821b7fa0 100644 --- a/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc +++ b/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc @@ -342,7 +342,7 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost { // Emit CMSIS-NN API PrintIndent(); - stream << "arm_status status = "; + stream << "arm_cmsis_nn_status status = "; stream << cmsis_func_name << "("; stream << "&" << context << ", "; stream << "&" << conv_params << ", "; @@ -352,7 +352,7 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost { stream << "&" << bias_dim << ", " << bias_data << ", "; stream << "&" << output_dim << ", " << output_data << ");\n"; PrintIndent(); - stream << "if (status != ARM_MATH_SUCCESS) {\n"; + stream << "if (status != ARM_CMSIS_NN_SUCCESS) {\n"; PrintIndent(); PrintIndent(); stream << "return -1;\n"; @@ -411,7 +411,7 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost { std::string output_dim = EmitCMSISNNDims(stream, "output", output_dims); PrintIndent(); - stream << "arm_status status = "; + stream << "arm_cmsis_nn_status status = "; stream << cmsis_func_name << "("; stream << "&" << context << ", "; stream << "&" << cmsisnn_fc_params << ", "; @@ -421,7 +421,7 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost { stream << "&" << bias_dim << ", " << bias_data << ", "; stream << "&" << output_dim << ", " << output_data << ");\n"; PrintIndent(); - stream << "if (status != ARM_MATH_SUCCESS) {\n"; + stream << "if (status != ARM_CMSIS_NN_SUCCESS) {\n"; PrintIndent(); PrintIndent(); stream << "return -1;\n"; @@ -467,7 +467,7 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost { std::string output_dim = EmitCMSISNNDims(stream, "output", output_dims); PrintIndent(); - stream << "arm_status status = "; + stream << "arm_cmsis_nn_status status = "; stream << cmsis_func_name << "("; stream << "&" << context << ", "; stream << "&" << cmsisnn_pool_params << ", "; @@ -475,7 +475,7 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost { stream << "&" << filter_dim << ", "; stream << "&" << output_dim << ", " << output_data << ");\n"; PrintIndent(); - stream << "if (status != ARM_MATH_SUCCESS) {\n"; + stream << "if (status != ARM_CMSIS_NN_SUCCESS) {\n"; PrintIndent(); PrintIndent(); stream << "return -1;\n"; diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc index 67106ff07f7ee..54975d166ea2f 100644 --- a/src/target/source/codegen_c_host.cc +++ b/src/target/source/codegen_c_host.cc @@ -62,6 +62,7 @@ void CodeGenCHost::Init(bool output_ssa, bool emit_asserts, std::string target_s decl_stream << "#include \n"; decl_stream << "#include \n"; decl_stream << "#include \n"; + decl_stream << "#include \n"; } CodeGenC::Init(output_ssa); } diff --git a/tests/python/relay/aot/corstone300.mk b/tests/python/relay/aot/corstone300.mk index 7f95c0af2b41f..374e2008f42bc 100644 --- a/tests/python/relay/aot/corstone300.mk +++ b/tests/python/relay/aot/corstone300.mk @@ -74,15 +74,9 @@ CC_CODEGEN_SRCS = $(shell find $(abspath $(CODEGEN_ROOT)/host/src/*.cc)) C_CODEGEN_OBJS = $(subst .c,.o,$(C_CODEGEN_SRCS)) CC_CODEGEN_OBJS = $(subst .cc,.o,$(CC_CODEGEN_SRCS)) CMSIS_STARTUP_SRCS = $(shell find ${CMSIS_PATH}/Device/ARM/${ARM_CPU}/Source/*.c) +CMSIS_NN_SRCS = $(shell find ${CMSIS_PATH}/CMSIS/NN/Source/*/*.c) UART_SRCS = $(shell find ${PLATFORM_PATH}/*.c) -CMSIS_SHA_FILE=${CMSIS_PATH}/977abe9849781a2e788b02282986480ff4e25ea6.sha -ifneq ("$(wildcard $(CMSIS_SHA_FILE))","") - CMSIS_NN_LIBS = $(wildcard ${CMSIS_PATH}/CMSIS/NN/build/Source/libcmsis-nn.a) -else - CMSIS_NN_LIBS 
= $(wildcard ${CMSIS_PATH}/CMSIS/NN/build/Source/*/*.a) endif - ifdef ETHOSU_TEST_ROOT ETHOSU_DRIVER_LIBS = $(wildcard ${DRIVER_PATH}/build/*.a) ETHOSU_RUNTIME=$(build_dir)/tvm_ethosu_runtime.o @@ -114,13 +108,19 @@ ${build_dir}/libcmsis_startup.a: $(CMSIS_STARTUP_SRCS) $(QUIET)$(AR) -cr $(abspath $(build_dir)/libcmsis_startup.a) $(abspath $(build_dir))/libcmsis_startup/*.o $(QUIET)$(RANLIB) $(abspath $(build_dir)/libcmsis_startup.a) +${build_dir}/libcmsis_nn.a: $(CMSIS_NN_SRCS) + $(QUIET)mkdir -p $(abspath $(build_dir)/libcmsis_nn) + $(QUIET)cd $(abspath $(build_dir)/libcmsis_nn) && $(CC) -c $(PKG_CFLAGS) -D${ARM_CPU} $^ + $(QUIET)$(AR) -cr $(abspath $(build_dir)/libcmsis_nn.a) $(abspath $(build_dir))/libcmsis_nn/*.o + $(QUIET)$(RANLIB) $(abspath $(build_dir)/libcmsis_nn.a) + ${build_dir}/libuart.a: $(UART_SRCS) $(QUIET)mkdir -p $(abspath $(build_dir)/libuart) $(QUIET)cd $(abspath $(build_dir)/libuart) && $(CC) -c $(PKG_CFLAGS) $^ $(QUIET)$(AR) -cr $(abspath $(build_dir)/libuart.a) $(abspath $(build_dir))/libuart/*.o $(QUIET)$(RANLIB) $(abspath $(build_dir)/libuart.a) -$(build_dir)/aot_test_runner: $(build_dir)/test.c $(build_dir)/crt_backend_api.o $(build_dir)/stack_allocator.o ${build_dir}/libcmsis_startup.a ${build_dir}/libuart.a $(build_dir)/libcodegen.a $(CMSIS_NN_LIBS) $(ETHOSU_DRIVER_LIBS) $(ETHOSU_RUNTIME) +$(build_dir)/aot_test_runner: $(build_dir)/test.c $(build_dir)/crt_backend_api.o $(build_dir)/stack_allocator.o $(build_dir)/libcodegen.a ${build_dir}/libcmsis_startup.a ${build_dir}/libcmsis_nn.a ${build_dir}/libuart.a $(ETHOSU_DRIVER_LIBS) $(ETHOSU_RUNTIME) $(QUIET)mkdir -p $(@D) $(QUIET)$(CC) $(PKG_CFLAGS) $(ETHOSU_INCLUDE) -o $@ -Wl,--whole-archive $^ -Wl,--no-whole-archive $(PKG_LDFLAGS) From c30b420f61295fb60530dd01e84f8988605d72a5 Mon Sep 17 00:00:00 2001 From: masahi Date: Wed, 13 Jul 2022 21:08:05 +0900 Subject: [PATCH 108/111] [TOPI, x86] Properly handle fused ops in TE softmax schedule (#12015) * fix x86 softmax fusion * properly handle the case where softmax and the fused op have different layouts * add test --- python/tvm/topi/x86/nn.py | 50 ++++++++++++------- tests/python/frontend/pytorch/test_forward.py | 40 +++++++++++++++ 2 files changed, 71 insertions(+), 19 deletions(-) diff --git a/python/tvm/topi/x86/nn.py b/python/tvm/topi/x86/nn.py index 5475fc772e77c..5fd1108811158 100644 --- a/python/tvm/topi/x86/nn.py +++ b/python/tvm/topi/x86/nn.py @@ -18,6 +18,7 @@ """x86 nn operators""" from tvm import te from ..utils import traverse_inline +from .injective import schedule_injective_from_existing def _schedule_softmax(softmax_op, s, outs): @@ -48,28 +49,39 @@ def _schedule_softmax(softmax_op, s, outs): ) ) - # only parallelize outer dimensions up to axis - outer_axes = [s[softmax_op].op.axis[i] for i in range(0, axis)] - fused_outer_axes = s[softmax_op].fuse(*outer_axes) - s[softmax_op].parallel(fused_outer_axes) + output = outs[0] - # move computations with the same outer dimensions under the same root - s[max_elem].compute_at(s[softmax_op], fused_outer_axes) - s[expsum].compute_at(s[softmax_op], fused_outer_axes) + def _schedule(output_op, softmax_op): + # only parallelize outer dimensions up to axis + outer_axes = [output_op.axis[i] for i in range(0, axis)] + fused_outer_axes = s[output_op].fuse(*outer_axes) + s[output_op].parallel(fused_outer_axes) - if delta is not None: - s[exp].compute_inline() - s[delta].compute_inline() - if exp is not None: - s[exp].compute_at(s[softmax_op], fused_outer_axes) + if softmax_op != output_op: + # fuse softmax output with
following elemwise ops. + s[softmax_op].compute_at(s[output_op], fused_outer_axes) - if softmax_op != outs[0].op: - # fuse softmax output with following elemwise ops. - output = outs[0] - outer_axes = [s[output].op.axis[i] for i in range(0, axis)] - fused_outer_axes = s[output].fuse(*outer_axes) - s[output].parallel(fused_outer_axes) - s[softmax_op].compute_at(s[output], fused_outer_axes) + # move computations with the same outer dimensions under the same root + s[max_elem].compute_at(s[output_op], fused_outer_axes) + s[expsum].compute_at(s[output_op], fused_outer_axes) + + if delta is not None: + s[exp].compute_inline() + s[delta].compute_inline() + if exp is not None: + s[exp].compute_at(s[output_op], fused_outer_axes) + + if list(output.shape) == list(softmax_op.output(0).shape): + _schedule(output.op, softmax_op) + else: + # This case can happen, for example, if the 4D input to softmax + # is in the NCHW layout while the fused elemwise op takes the NCHWc layout. + # Since we parallelize over outer axes up to the "axis" parameter of softmax, + # softmax and the fused op need to be in the same layout if we want to + # fuse them under the same parallel loop. + # This case can be removed if softmax supported AlterLayout. + schedule_injective_from_existing(s, output) + _schedule(softmax_op, softmax_op) def schedule_softmax(outs): diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index 30ba713396572..cd7c50d486866 100644 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -4544,5 +4544,45 @@ def test_remainder(x, y): verify_model(test_fn, [torch.tensor([1, 2, 3, 4, 5]), torch.tensor(-1.5)]) +def test_softmax_fuse(): + # https://github.com/apache/tvm/issues/12001 + class Model(torch.nn.Module): + def __init__(self, nchwc_post_op=False) -> None: + super().__init__() + self.conv = torch.nn.Conv2d(3, 3, (1, 1), 1) + self.nchwc_post_op = nchwc_post_op + + @torch.no_grad() + def forward(self, x): + t0a = self.conv(x) + t0b = torch.floor(x) + t2b = torch.softmax(t0a, dim=2) + + if self.nchwc_post_op: + t3a = t0a - t0b + t4a = t2b - t0b + t6a = t3a + t4a + return t6a + + return t2b + 1 + + sh = [3, 3, 10, 1] + inp = torch.ones(*sh, dtype=torch.float32) + + for model in [Model(nchwc_post_op=False).eval(), Model(nchwc_post_op=True).eval()]: + output_torch = model(inp).numpy() + + mod, params = relay.frontend.from_pytorch(torch.jit.trace(model, inp), [("inp0", sh)]) + + with tvm.transform.PassContext(opt_level=4): + out = ( + relay.create_executor("graph", mod, params=params) + .evaluate()(inp0=inp.numpy()) + .numpy() + ) + + tvm.testing.assert_allclose(out, output_torch, rtol=1e-5, atol=1e-5) + + if __name__ == "__main__": pytest.main([__file__]) From 6a6093bc180ed762b3e0d19eb37fcf10d97289c1 Mon Sep 17 00:00:00 2001 From: wrongtest Date: Wed, 13 Jul 2022 22:52:35 +0800 Subject: [PATCH 109/111] fold const or empty iter partition (#12080) --- src/tir/transforms/loop_partition.cc | 30 +++++++++++++++------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/tir/transforms/loop_partition.cc b/src/tir/transforms/loop_partition.cc index 59ac339006f41..677506889e57c 100644 --- a/src/tir/transforms/loop_partition.cc +++ b/src/tir/transforms/loop_partition.cc @@ -587,16 +587,17 @@ Stmt LoopPartitioner::TryPartition(const Stmt& stmt, Var var, PrimExpr min, Prim if (middle_interval_i->HasLowerBound()) { body_begin = analyzer_.Simplify(middle_interval.min()); if 
(!analyzer_.CanProve(body_begin == min)) { - PrimExpr cond = (body_begin - min >= 0); - if (!analyzer_.CanProve(cond)) { - LOG(WARNING) << "Cannot prove: " << cond << ", when generating the pre doubt loop"; - body_begin = Max(body_begin, min); + PrimExpr extent = analyzer_.Simplify(body_begin - min); + if (!analyzer_.CanProve(extent > 0)) { + body_begin = tvm::max(body_begin, min); // stop recursing on this interval if we can't prove it has non-negative length pre_stmt_recurse = false; } - if (!partition_thread_scope) { - Stmt pre_body = Substitute(body, {{Var{var}, var + min}}); - pre_stmt = MakeFor(stmt.get(), body_begin - min, pre_body); + if (!analyzer_.CanProve(extent <= 0)) { + if (!partition_thread_scope) { + Stmt pre_body = Substitute(body, {{Var{var}, var + min}}); + pre_stmt = MakeFor(stmt.get(), body_begin - min, pre_body); + } } } } else { @@ -612,16 +613,17 @@ Stmt LoopPartitioner::TryPartition(const Stmt& stmt, Var var, PrimExpr min, Prim post_doubt_begin = analyzer_.Simplify(middle_interval.max() + 1); if (!analyzer_.CanProve(middle_interval.max() == max)) { // require the extent to be non-negative - PrimExpr cond = (max - post_doubt_begin + 1 >= 0); - if (!analyzer_.CanProve(cond)) { - LOG(WARNING) << "Cannot prove: " << cond << ", when generating the post doubt loop"; - post_doubt_begin = Min(post_doubt_begin, max + 1); + PrimExpr extent = analyzer_.Simplify(max - post_doubt_begin + 1); + if (!analyzer_.CanProve(extent > 0)) { + post_doubt_begin = tvm::min(post_doubt_begin, max + 1); // stop recursing on this interval if we can't prove it has non-negative length post_stmt_recurse = false; } - if (!partition_thread_scope) { - Stmt post_body = Substitute(body, {{Var{var}, var + post_doubt_begin}}); - post_stmt = MakeFor(stmt.get(), max - post_doubt_begin + 1, post_body); + if (!analyzer_.CanProve(extent <= 0)) { + if (!partition_thread_scope) { + Stmt post_body = Substitute(body, {{Var{var}, var + post_doubt_begin}}); + post_stmt = MakeFor(stmt.get(), extent, post_body); + } } } } else { From 7d9a07ccc70eef951bcfff0333c2f82cdc6a3b12 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Wed, 13 Jul 2022 08:57:01 -0700 Subject: [PATCH 110/111] [TIR][Schedule] Refactor Tensorize (#12070) * Refactor blockize * Refactor tensorize * Address review comments * typo * rename variables according to review --- .../schedule/primitive/blockize_tensorize.cc | 853 ++++++++---------- .../unittest/test_tir_schedule_blockize.py | 322 ++++--- 2 files changed, 580 insertions(+), 595 deletions(-) diff --git a/src/tir/schedule/primitive/blockize_tensorize.cc b/src/tir/schedule/primitive/blockize_tensorize.cc index 4ede2dd90da80..9c3029ebf5137 100644 --- a/src/tir/schedule/primitive/blockize_tensorize.cc +++ b/src/tir/schedule/primitive/blockize_tensorize.cc @@ -24,6 +24,20 @@ namespace tvm { namespace tir { +template <class T> +bool UsesVar(const T& x, const Var& var) { + return UsesVar(x, [tgt = var.get()](const VarNode* v) { return v == tgt; }); +} + +Range RangeFromExtent(const PrimExpr& extent) { + return Range::FromMinExtent(make_zero(extent->dtype), extent); +} + +template <class T> +T DeepCopy(const T& stmt) { + return Downcast<T>(LoadJSON(SaveJSON(stmt))); +} + /*! * \brief ScheduleError that the bindings of the inner block are not divisible by the subspace * represented by the outer loops. @@ -64,16 +78,16 @@ class SubspaceNotDivisibleError : public ScheduleError { * * \param iter_vars The input iterators * \param bindings The values of iter_vars - * \param outer_loops Iterators outside the subspace.
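Before the implementation details, it helps to see what the primitive under refactor does from the schedule API. A minimal sketch on a toy 128x128 elementwise workload (not part of this patch; names are illustrative):

    import tvm
    from tvm import tir
    from tvm.script import tir as T

    @tvm.script.ir_module
    class Module:
        @T.prim_func
        def main(A: T.Buffer[(128, 128), "float32"], B: T.Buffer[(128, 128), "float32"]) -> None:
            for i, j in T.grid(128, 128):
                with T.block("B"):
                    vi, vj = T.axis.remap("SS", [i, j])
                    B[vi, vj] = A[vi, vj] * 2.0

    sch = tir.Schedule(Module)
    i, j = sch.get_loops(sch.get_block("B"))
    i0, i1 = sch.split(i, factors=[8, 16])
    j0, j1 = sch.split(j, factors=[8, 16])
    sch.reorder(i0, j0, i1, j1)
    outer = sch.blockize(i1)  # the subtree rooted at loop i1 becomes the inner block
    print(sch.mod.script())   # a new outer block "B_o" now wraps the 16x16 tile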
- \param inner_loops Iterators of the subspace * \param predicate The predicate constraint on the input iterators. + \param outer_iters The iters of the outer space + \param inner_iters The iters of the inner space * \return The result of the subspace division. */ Array<Array<arith::IterMark>> TrivialSubspaceDivision(const Array<IterVar>& iter_vars, const Array<PrimExpr>& bindings, + const PrimExpr& predicate, const Array<Var>& outer_iters, - const Array<Var>& inner_iters, - const PrimExpr& predicate) { if (!is_one(predicate)) return {}; Array<Array<arith::IterMark>> res; std::unordered_set<const VarNode*> outer_loop_vars; @@ -95,7 +109,7 @@ Array<Array<arith::IterMark>> TrivialSubspaceDivision(const Array<IterVar>& iter auto use_inner_loop_vars = make_uses_var(inner_iters); arith::IterMark unit_iter_mark(arith::IterSumExpr({}, 0), 1); - for (size_t i = 0; i < bindings.size(); ++i) { + for (int i = 0, n = bindings.size(); i < n; ++i) { bool outer = use_outer_loop_vars(bindings[i]); bool inner = use_inner_loop_vars(bindings[i]); arith::IterMark iter_mark; @@ -122,531 +136,462 @@ Array<Array<arith::IterMark>> TrivialSubspaceDivision(const Array<IterVar>& iter } /*! - * \brief Generate the blockized init block. - * \param block The original block with init. - * \param inner_block_realize The block realize of the inner block after blockize. - * \param inner_loops The inner loops after blockize. - * \return The subtree of the init block and its outer loops. + * \brief Subspace division. The space is divided into two subspaces: + * 1. The subspace represented by the outer loops above `loop_sref` (exclusive). + * 2. The subspace represented by the inner loops below `loop_sref` (inclusive). + * \param realize The inner block + * \param block_sref The sref to the inner block + * \param loop_sref The loop that is the root of the second subspace. + * \param loops The loops that represent the second part of the subspace. + * \param analyzer The arithmetic analyzer to use.
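The division implemented here is also exposed to Python as tvm.arith.subspace_divide, which makes the outer/inner split easy to inspect. A small sketch of the fused-binding case (variable names are illustrative):

    import tvm
    from tvm import tir

    i0 = tir.Var("i0", "int32")  # loop var of the outer subspace
    i1 = tir.Var("i1", "int32")  # loop var of the inner subspace
    dom = {i0: tvm.ir.Range(0, 8), i1: tvm.ir.Range(0, 16)}

    # Divide the single block binding i0 * 16 + i1 against the inner loops {i1}.
    division = tvm.arith.subspace_divide([i0 * 16 + i1], dom, [i1])

    # division[0] holds the outer and inner IterMark of the binding;
    # the last row carries the outer/inner predicates (both 1 here).
    outer_mark, inner_mark = division[0]
    print(outer_mark.extent, inner_mark.extent)  # 8 16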
*/ -Stmt GenerateBlockizedInit(const Block& block, const BlockRealize& inner_block_realize, - const std::vector& inner_loops) { - Array init_block_iters; - Array init_bindings; - const Block& inner_block = inner_block_realize->block; - - // Step 1: Collect data-parallel block iters - for (size_t i = 0; i < inner_block->iter_vars.size(); i++) { - const IterVar& iter_var = inner_block->iter_vars[i]; - const PrimExpr& binding = inner_block_realize->iter_values[i]; - if (iter_var->iter_type == IterVarType::kDataPar && - UsesVar(block->init.value(), - [tgt_var = iter_var->var.get()](const VarNode* var) { return var == tgt_var; })) { - init_block_iters.push_back(iter_var); - init_bindings.push_back(binding); +Array> SubspaceDivide(const BlockRealize& realize, + const StmtSRef& block_sref, // + const StmtSRef& loop_sref, // + std::vector* loops, + arith::Analyzer* analyzer) { + Array inner_vars; + Array outer_vars; + Map loop_var_domain; + bool inner = true; + for (StmtSRefNode* sref = block_sref->parent; // + sref && sref->stmt->IsInstance(); // + sref = sref->parent) { + const ForNode* loop = static_cast(sref->stmt); + if (inner) { + loops->push_back(loop); + inner_vars.push_back(loop->loop_var); + } else { + outer_vars.push_back(loop->loop_var); } - } - - // Step 2: Collect loops related to iters of the init block - std::vector init_loops; - for (const ForNode* inner_loop : inner_loops) { - for (const PrimExpr& init_binding : init_bindings) { - if (UsesVar(init_binding, [tgt_var = inner_loop->loop_var.get()](const VarNode* var) { - return var == tgt_var; - })) { - init_loops.push_back(inner_loop); - break; - } + loop_var_domain.Set(loop->loop_var, Range::FromMinExtent(loop->min, loop->extent)); + if (sref == loop_sref.get()) { + inner = false; } } - - // Step 3: Create new block iters for the init block - Map subst_map; - for (size_t i = 0; i < init_block_iters.size(); i++) { - IterVar new_iter_var = init_block_iters[i]; - Var old_var = new_iter_var->var; - Var new_var = old_var.copy_with_suffix("_init"); - new_iter_var.CopyOnWrite()->var = new_var; - subst_map.Set(old_var, new_var); - init_block_iters.Set(i, std::move(new_iter_var)); - } - - // Step 4: Generate loop nests and the init block - Stmt new_init = BlockRealize( - /*iter_values=*/init_bindings, - /*predicate=*/inner_block_realize->predicate, - /*block=*/ - Block{/*iter_vars=*/init_block_iters, - /*reads=*/{}, - /*writes=*/block->writes, - /*name_hint=*/block->name_hint + "_init", - /*body=*/block->init.value(), - /*init=*/NullOpt}); - - // Step 5: Generate the parent loops for the init block - for (const ForNode* init_loop : init_loops) { - ObjectPtr new_loop = make_object(*init_loop); - new_loop->loop_var = init_loop->loop_var.copy_with_suffix(""); - subst_map.Set(init_loop->loop_var, new_loop->loop_var); - new_loop->body = std::move(new_init); - new_init = For(new_loop); + Array> result = + arith::SubspaceDivide(realize->iter_values, loop_var_domain, inner_vars, realize->predicate, + arith::IterMapLevel::Surjective, analyzer); + if (!result.empty()) { + return result; } - - // Step 6: Substitute with new loop variables and block iters to prevent duplication of - // variables in the outer block. - new_init = Substitute(new_init, subst_map); - - return new_init; + return TrivialSubspaceDivision(realize->block->iter_vars, + realize->iter_values, // + realize->predicate, // + outer_vars, inner_vars); } /*! - * \brief A helper to collect the parent loops of the block. 
The loops are divided into two groups, - * 'outer_loops', and 'inner_loops', by a specified loop as the separator. 'outer_loops' are the - * ancestor loops of the separator loop. 'inner_loops' include the separator loop itself, and its - * successor loops. It is possible that 'outer_loops' is empty. + * \brief Derive the block bindings for both inner and outer block + * \param iter_vars The original block iterators to the inner block + * \param division The subspace division. + * \param outer_iter_vars The outer block iterators. + * \param outer_bindings The outer block bindings. + * \param inner_iter_vars The inner block iterators. + * \param inner_bindings The inner block bindings. + * \return A substitution plan to the iterators in the original inner block. */ -class LoopSubspaceCollector { - public: - /*! - * \brief Collect the parent loops of the block and store the result in the corresponding fields. - * \param block_sref The sref to the target block. - * \param loop_sref The sref to the separator loop. The loop itself is counted as an inner loop. - */ - void Collect(const StmtSRef& block_sref, const StmtSRef& loop_sref) { - bool inner = true; - for (StmtSRefNode* current_sref = block_sref->parent; - current_sref && current_sref->stmt->IsInstance(); - current_sref = current_sref->parent) { - const auto* current_loop = current_sref->StmtAs(); - ICHECK(current_loop); - if (inner) { - inner_loops.push_back(current_loop); - inner_loop_vars.push_back(current_loop->loop_var); - } else { - outer_loops.push_back(current_loop); - outer_loop_vars.push_back(current_loop->loop_var); - } - loop_var_domain.Set(current_loop->loop_var, - Range::FromMinExtent(current_loop->min, current_loop->extent)); - if (current_sref == loop_sref.get()) inner = false; +Map DeriveBlockBinding(const Array& iter_vars, // + const Array>& division, // + Array* outer_iter_vars, // + Array* outer_bindings, // + Array* inner_iter_vars, // + Array* inner_bindings) { + using arith::IterMapExpr; + using arith::IterMapExprNode; + using arith::NormalizeIterMapToExpr; + Map block_var_subst; + ICHECK_EQ(iter_vars.size() + 1, division.size()); + for (int i = 0, n = iter_vars.size(); i < n; ++i) { + const IterVar& iter_var = iter_vars[i]; + arith::IterMark outer_mark = division[i][0]; + arith::IterMark inner_mark = division[i][1]; + IterMapExpr outer_binding = Downcast(outer_mark->source); + IterMapExpr inner_binding = Downcast(inner_mark->source); + // After computing the subspace division, bindings[i] can be written as + // outer_binding * inner_binding->extent + inner_binding + // The outer block will have binding: iter_outer -> outer_binding + // The inner block will have binding: iter_inner -> inner_binding + // The iter in the original block will be substituted with base + iter_inner where + // base == iter_outer * iter_inner_extent + if (is_one(inner_mark->extent)) { // IsOuter + // extract this iter var to outer block directly + outer_bindings->push_back(NormalizeIterMapToExpr(outer_binding)); + outer_iter_vars->push_back(iter_var); + continue; } + // create iter var for the outer block + IterVar outer_iter(/*dom=*/RangeFromExtent(outer_mark->extent), + /*var=*/iter_var->var.copy_with_suffix("_o"), + /*iter_type=*/iter_var->iter_type); + outer_bindings->push_back(NormalizeIterMapToExpr(outer_binding)); + outer_iter_vars->push_back(outer_iter); + // create iter var for the inner block + IterVar inner_iter(/*dom=*/RangeFromExtent(inner_mark->extent), + /*var=*/iter_var->var.copy_with_suffix("_i"), + 
/*iter_type=*/iter_var->iter_type); + inner_bindings->push_back(NormalizeIterMapToExpr(inner_binding)); + inner_iter_vars->push_back(inner_iter); + // substitution + PrimExpr sub{nullptr}; + if (is_one(outer_mark->extent)) { + sub = inner_iter->var; + } else { + sub = outer_iter * inner_mark->extent + inner_iter->var; + } + block_var_subst.Set(iter_var->var, sub); } - /*! \brief Outer loops which are ancestors of the separator. */ - std::vector outer_loops; - /*! \brief Inner loops which are the separator itself or its successors. */ - std::vector inner_loops; - /*! \brief Loop variables of the outer loops. */ - Array outer_loop_vars; - /*! \brief Loop variables of the inner loops. */ - Array inner_loop_vars; - /*! \brief Domain of the loop variables. */ - Map loop_var_domain; -}; + return block_var_subst; +} /*! - * \brief Check the bindings of the block iters can be divided by a subspace collected by the - * collector. - * \param mod The current IR module. - * \param block_realize The block realize to be checked. - * \param collector The collector which has collected the loops of the block. - * \param analyzer The arithmetic analyzer. - * \return The result of the subspace division. - * \throws ScheduleError If the bindings are not divisible by the subspace. + * \brief Generate the inner block for blockization + * \param is_write_reduction Whether the write regions of the inner block are actually reduction. + * \param iter_vars IterVars used in the inner block. + * \param iter_values IterVar bindings used in the inner block. + * \param predicate The predicate of the inner block. + * \param block The inner block as a template to be created from. This method will modify its + * `iter_vars`, `init` and `reads` fields. + * \return The inner block created. */ -Array> CheckSubspaceDivisible(const IRModule& mod, - const BlockRealize& block_realize, - const LoopSubspaceCollector& collector, - arith::Analyzer* analyzer) { - const Block& block = block_realize->block; - - Array> division = arith::SubspaceDivide( - block_realize->iter_values, collector.loop_var_domain, collector.inner_loop_vars, - block_realize->predicate, arith::IterMapLevel::Surjective, analyzer); - - if (division.empty()) { - // If we can't do perfect subspace division, check if it is a trivial case of subspace division. - // In this case, we can still blockize. - division = TrivialSubspaceDivision(block->iter_vars, block_realize->iter_values, - collector.outer_loop_vars, collector.inner_loop_vars, - block_realize->predicate); - } - if (division.empty()) { - throw SubspaceNotDivisibleError(mod, GetRef(collector.inner_loops.back()), block); +BlockRealize GenerateInner(bool is_write_reduction, + const Array& iter_vars, // + const Array& iter_values, // + const PrimExpr& predicate, // + Block block) { + BlockNode* n = block.CopyOnWrite(); + n->iter_vars = iter_vars; + n->init = NullOpt; + if (is_write_reduction) { + Array reads; + reads.reserve(block->writes.size() + block->reads.size()); + reads.insert(reads.end(), block->writes.begin(), block->writes.end()); + reads.insert(reads.end(), block->reads.begin(), block->reads.end()); + n->reads = std::move(reads); } - return division; + return BlockRealize(/*iter_values=*/iter_values, /*predicate=*/predicate, + /*block=*/block); } /*! - * \brief The binding extractor to compute the bindings of the outer and the inner blocks after - * blockize. + * \brief Generate the init stmt for the outer block + * \param block The original block with init. 
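The init handling is what makes reduction patterns work. A sketch of the kind of block this code targets, a plain 128x128x128 matmul (again illustrative, not part of the patch):

    from tvm.script import tir as T

    @T.prim_func
    def matmul(
        A: T.Buffer[(128, 128), "float32"],
        B: T.Buffer[(128, 128), "float32"],
        C: T.Buffer[(128, 128), "float32"],
    ) -> None:
        for i, j, k in T.grid(128, 128, 128):
            with T.block("C"):
                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
                with T.init():
                    C[vi, vj] = T.float32(0)
                C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]

Blockizing below an outer split of the reduction loop k leaves the T.init() on the outer block (GenerateOuterInit) and turns the inner block into a read-modify-write of C, which is why GenerateInner prepends the write regions to the reads when is_write_reduction is true.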
+ * \param inner_realize The block realize of the inner block after blockize. + * \param loops The inner loops after blockize. + * \return The subtree of the init block and its outer loops. */ -class BlockizedBindingExtractor { - public: - /*! - * \brief Extract bindings for blockize. - * \param iter_vars The iter vars of the original inner block. - * \param division The result of the subspace division. - */ - void ExtractBindings(const Array& iter_vars, - const Array>& division, arith::Analyzer* analyzer) { - ICHECK_EQ(iter_vars.size() + 1, division.size()); - for (size_t i = 0; i < iter_vars.size(); ++i) { - const IterVar& iter_var = iter_vars[i]; - arith::IterMark outer_mark = division[i][0]; - arith::IterMark inner_mark = division[i][1]; - const auto* outer_binding = - TVM_TYPE_AS(outer_binding, outer_mark->source, arith::IterMapExprNode); - const auto* inner_binding = - TVM_TYPE_AS(inner_binding, inner_mark->source, arith::IterMapExprNode); - - // After computing the subspace division, bindings[i] can be written as - // outer_binding * inner_binding->extent + inner_binding - // The outer block will have binding: iter_outer -> outer_binding - // The inner block will have binding: iter_inner -> inner_binding - // The iter in the original block will be substituted with base + iter_inner where - // base == iter_outer * iter_inner_extent - - if (is_one(division[i][1]->extent)) { // IsOuter - // extract this iter var to outer block directly - outer_bindings.push_back( - arith::NormalizeIterMapToExpr(GetRef(outer_binding))); - outer_iter_vars.push_back(iter_var); - } else { - // create iter var for the outer block - const IterVar outer_var(/*dom=*/Range::FromMinExtent(0, division[i][0]->extent), - /*var=*/iter_var->var.copy_with_suffix("_o"), - /*iter_type=*/iter_var->iter_type); - outer_bindings.push_back( - arith::NormalizeIterMapToExpr(GetRef(outer_binding))); - outer_iter_vars.push_back(outer_var); - PrimExpr base = is_one(division[i][0]->extent) ? 
0 : outer_var * division[i][1]->extent; - // create iter var for the inner block - IterVar new_iter(Range::FromMinExtent(0, division[i][1]->extent), Var(iter_var->var), - iter_var->iter_type, iter_var->thread_tag, iter_var->span); - inner_iter_dom_map.Set(new_iter->var, arith::IntSet::FromRange(new_iter->dom)); - analyzer->Bind(new_iter->var, new_iter->dom); - inner_iter_vars.push_back(new_iter); - inner_bindings.push_back( - arith::NormalizeIterMapToExpr(GetRef(inner_binding))); - inner_iter_subst_map.Set(iter_var->var, base + new_iter->var); +Stmt GenerateOuterInit(const Stmt& block_init, const BlockRealize& inner_realize, + const std::vector& loops, String block_name) { + const Block& inner_block = inner_realize->block; + Map subst_map; + // Step 1: Create new block vars for the block inside the init stmt of outer block + // A iter is used in the block if + // 1) It is data parallel + // 2) It is used in the original init block + Array iter_vars; + Array iter_values; + ICHECK_EQ(inner_block->iter_vars.size(), inner_realize->iter_values.size()); + int n = inner_block->iter_vars.size(); + iter_vars.reserve(n); + iter_values.reserve(n); + for (int i = 0; i < n; ++i) { + const IterVar& old_iter_var = inner_block->iter_vars[i]; + const PrimExpr& iter_value = inner_realize->iter_values[i]; + if (old_iter_var->iter_type == IterVarType::kDataPar && + UsesVar(block_init, old_iter_var->var)) { + ObjectPtr new_iter_var = make_object(*old_iter_var.get()); + new_iter_var->var = new_iter_var->var.copy_with_suffix("_init"); + subst_map.Set(old_iter_var->var, new_iter_var->var); + iter_vars.push_back(IterVar(new_iter_var)); + iter_values.push_back(iter_value); + } + } + // Step 2: Generate the block inside init stmt of outer block + Stmt stmt = BlockRealize( + /*iter_values=*/iter_values, + /*predicate=*/inner_realize->predicate, + /*block=*/ + Block(/*iter_vars=*/iter_vars, + /*reads=*/{}, + /*writes=*/inner_block->writes, + /*name_hint=*/block_name, + /*body=*/block_init, + /*init=*/NullOpt)); + // Step 3. Create the loop nest on top of the block + for (const ForNode* loop : loops) { + bool is_init_loop = false; + for (const PrimExpr& init_binding : iter_values) { + if (UsesVar(init_binding, loop->loop_var)) { + is_init_loop = true; + break; } } + if (is_init_loop) { + ObjectPtr new_loop = make_object(*loop); + new_loop->loop_var = loop->loop_var.copy_with_suffix(""); + new_loop->body = std::move(stmt); + subst_map.Set(loop->loop_var, new_loop->loop_var); + stmt = For(new_loop); + } } - Map inner_iter_subst_map; - /*! \brief Iters of the outer block. */ - Array outer_iter_vars; - /*! \brief Iters of the outer block. */ - Array inner_iter_vars; - /*! \brief Binding values of the outer block. */ - Array outer_bindings; - /*! \brief Binding values of the inner block. */ - Array inner_bindings; - /*! \brief The domain of the inner block iters. */ - Map inner_iter_dom_map; -}; + // Step 4: Substitute the iter vars and loop vars + return Substitute(stmt, subst_map); +} /*! - * \brief Replacer for the inner block after blockize. Inner block iters will be replaced with - * base + inner_iter and the expressions after substituion will be simplified if possible. + * \brief Substitute variables in the stmt, do simplification and track block substitution + * \param stmt The stmt to be substituted. + * \param sub The substitution map. + * \param block_sref_reuse The block substitution happens during the substitution. + * \param analyzer The analyzer for arithmetic simplification. + * \return The substituted stmt. 
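The reason simplification runs here is that the substituted bindings only fold once the inner-iter ranges are bound on the analyzer. A tiny Python sketch of the effect (hypothetical variables):

    import tvm
    from tvm import tir

    vio = tir.Var("vio", "int32")  # outer block iter
    vii = tir.Var("vii", "int32")  # inner block iter

    ana = tvm.arith.Analyzer()
    ana.bind(vii, tvm.ir.Range(0, 16))  # inner iter known to lie in [0, 16)

    # After substituting vi -> vio * 16 + vii, index arithmetic like this folds:
    expr = ((vio * 16 + vii) // 16) * 16 + (vio * 16 + vii) % 16
    print(ana.simplify(expr))  # vio * 16 + vii, up to canonical ordering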
*/ -class InnerIterReplacer : public StmtExprMutator { - public: - /*! - * \brief The constructor - * \param subst_map The substitution map of the inner block iters. - * \param analyzer The arithmetic analyzer. - * \param block_sref_reuse The map to save the block reuse information. - */ - InnerIterReplacer(Map subst_map, arith::Analyzer* analyzer, - Map* block_sref_reuse) - : subst_map_(std::move(subst_map)), - analyzer_(analyzer), - block_sref_reuse_(block_sref_reuse) {} - - PrimExpr VisitExpr_(const VarNode* op) final { - auto it = subst_map_.find(GetRef(op)); - if (it != subst_map_.end()) { - return (*it).second; +Stmt Substitute(const Stmt& stmt, const Map& sub, + Map* block_sref_reuse, arith::Analyzer* analyzer) { + struct Replacer : public StmtExprMutator { + explicit Replacer(const Map& sub, Map* block_sref_reuse, + arith::Analyzer* analyzer) + : sub_(sub), block_sref_reuse_(block_sref_reuse), analyzer_(analyzer) {} + + PrimExpr VisitExpr(const PrimExpr& op) final { + PrimExpr result = StmtExprMutator::VisitExpr(op); + if (!result.same_as(op)) { + return analyzer_->Simplify(result); + } + return result; } - return StmtExprMutator::VisitExpr_(op); - } - PrimExpr VisitExpr(const PrimExpr& op) final { - PrimExpr result = StmtExprMutator::VisitExpr(op); - if (!result.same_as(op)) { - return analyzer_->Simplify(result); + PrimExpr VisitExpr_(const VarNode* op) final { + if (Optional e = sub_.Get(GetRef(op))) { + return e.value(); + } + return StmtExprMutator::VisitExpr_(op); } - return result; - } - Stmt VisitStmt_(const BlockNode* op) final { - Stmt result = StmtExprMutator::VisitStmt_(op); - if (!result.same_as(GetRef(op))) { - block_sref_reuse_->Set(GetRef(op), Downcast(result)); + Stmt VisitStmt_(const BlockNode* op) final { + Block src = GetRef(op); + Block tgt = Downcast(StmtExprMutator::VisitStmt_(op)); + if (!src.same_as(tgt)) { + block_sref_reuse_->Set(src, tgt); + } + return tgt; } - return result; - } - private: - Map subst_map_; - arith::Analyzer* analyzer_; - Map* block_sref_reuse_; -}; + const Map& sub_; + Map* block_sref_reuse_; + arith::Analyzer* analyzer_; + }; + return Replacer(sub, block_sref_reuse, analyzer)(stmt); +} /*! - * \brief Compute the access region of the outer block by relaxing the inner loops. - * \param buffer_region The original buffer region. - * \param The range of the inner loops. - * \return The new buffer region. + * \brief Relax the variables for the given regions + * \param regions The regions to be relaxed. 
+ * \param dom_map The variables to be relaxed + * \return The relaxed regions */ -BufferRegion RelaxBlockizedInnerIters(const BufferRegion& buffer_region, - const Map& inner_iter_relaxed_range) { - Array new_region; - new_region.reserve(buffer_region->region.size()); - Array relaxed_int_set = - arith::EvalSet(buffer_region->region, inner_iter_relaxed_range); - ICHECK(buffer_region->region.size() == buffer_region->buffer->shape.size()); - for (size_t i = 0; i < buffer_region->region.size(); i++) { - Range max_range = Range::FromMinExtent(0, buffer_region->buffer->shape[i]); - new_region.push_back(relaxed_int_set[i].CoverRange(max_range)); +Array EvalSetRegions(const Array& regions, + const Map& dom_map) { + Array results; + results.reserve(regions.size()); + for (const BufferRegion& buffer_region : regions) { + const Buffer& buffer = buffer_region->buffer; + Array relaxed = arith::EvalSet(buffer_region->region, dom_map); + ICHECK_EQ(relaxed.size(), buffer->shape.size()); + int ndim = buffer->shape.size(); + Array new_region; + new_region.reserve(ndim); + for (int i = 0; i < ndim; ++i) { + new_region.push_back(relaxed[i].CoverRange(RangeFromExtent(buffer->shape[i]))); + } + results.push_back(BufferRegion(buffer, new_region)); } - return BufferRegion(buffer_region->buffer, std::move(new_region)); + return results; } /*! - * \brief Generate the outer block after blockize. - * \param extractor The binding extractor which has extracted the blockized bindings. - * \param block The original inner block. - * \param inner_block_realize The block realize of the inner block after blockize. - * \param inner_loops The inner loops after blockize. - * \param predicate The outer predicate of the subspace division. - * \return The block realize of the outer block after blockize. + * \brief Create the loop nest on top of the given stmt. + * \param stmt The stmt to be wrapped. + * \param loops The loop nests + * \return The wrapped stmt. */ -BlockRealize GenerateBlockizedOuterBlock(const BlockizedBindingExtractor& extractor, - const Block& block, BlockRealize inner_block_realize, - const std::vector& inner_loops, - PrimExpr predicate) { - // Step 1: Generate the init block if needed - Optional new_init = NullOpt; - if (block->init.defined()) { - new_init = GenerateBlockizedInit(block, inner_block_realize, inner_loops); - } - - // Step 2: Compute the access regions of the outer block by relaxing the inner loops - Array new_reads = block->reads; - Array new_writes = block->writes; - - auto f_mutate = [&](const BufferRegion& buffer_region) { - return RelaxBlockizedInnerIters(buffer_region, extractor.inner_iter_dom_map); - }; - new_reads.MutateByApply(f_mutate); - new_writes.MutateByApply(f_mutate); - - // Step 3: Generate the body of the outer block. The body of the outer block is the inner block - // realize and its surrounding loops. - Stmt outer_block_body = inner_block_realize; - for (const ForNode* loop : inner_loops) { +Stmt MakeLoopNest(Stmt stmt, const std::vector& loops) { + for (const ForNode* loop : loops) { ObjectPtr new_loop = make_object(*loop); - new_loop->body = std::move(outer_block_body); - outer_block_body = For(new_loop); + new_loop->body = std::move(stmt); + stmt = For(new_loop); } - - // Step 4: Generate the outer block and block realize. 
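Applied to the elementwise sketch shown earlier, this relaxation gives the outer block read/write regions that cover the whole 16x16 tile; the result looks roughly like:

    with T.block("B_o"):
        vio, vjo = T.axis.remap("SS", [i0, j0])
        T.reads(A[vio * 16 : vio * 16 + 16, vjo * 16 : vjo * 16 + 16])
        T.writes(B[vio * 16 : vio * 16 + 16, vjo * 16 : vjo * 16 + 16])
        for i1, j1 in T.grid(16, 16):
            with T.block("B"):
                vi, vj = T.axis.remap("SS", [i1, j1])
                B[vio * 16 + vi, vjo * 16 + vj] = A[vio * 16 + vi, vjo * 16 + vj] * 2.0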
- return BlockRealize(/*iter_values=*/std::move(extractor.outer_bindings), - /*predicate=*/std::move(predicate), - /*block=*/ - Block(/*iter_vars=*/std::move(extractor.outer_iter_vars), // - /*reads=*/std::move(new_reads), // - /*writes=*/std::move(new_writes), // - /*name_hint=*/block->name_hint + "_o", // - /*body=*/std::move(outer_block_body), // - /*init=*/std::move(new_init))); + return stmt; } -StmtSRef Blockize(ScheduleState self, const StmtSRef& loop_sref) { +BlockRealize BlockizeImpl(const ScheduleState& self, const StmtSRef& loop_sref, + Map* block_sref_reuse, arith::Analyzer* analyzer) { const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref); - arith::Analyzer analyzer; - - // Step 1: Check the loop has a single child BlockRealize on the sref tree. + // Step 1: Check and get the only block under `loop`. BlockRealize block_realize = CheckGetSingleChildBlockRealizeOnSRefTree(self, loop_sref); Block block = block_realize->block; StmtSRef block_sref = self->stmt2ref.at(block.get()); - - // Step 2: Collect loops inside and outside loop_sref. - LoopSubspaceCollector collector; - collector.Collect(block_sref, loop_sref); - - // Step 3: Calculate subspace division for the inner loops. + // Step 2: Derive subspace division + std::vector loops; Array> division = - CheckSubspaceDivisible(self->mod, block_realize, collector, &analyzer); - - // Step 4: Generate bindings for the outer block and the inner block based on the result of - // the subspace division. - BlockizedBindingExtractor extractor; - extractor.ExtractBindings(block->iter_vars, division, &analyzer); - const PrimExpr& outer_pred = division.back()[0]->extent; - const PrimExpr& inner_pred = division.back()[1]->extent; - - // Step 5: Substitute the iter vars in the original block with the inner iters after the subspace - // division - Map block_sref_reuse; - InnerIterReplacer replacer(std::move(extractor.inner_iter_subst_map), &analyzer, - &block_sref_reuse); - Block new_block = Downcast(replacer(block)); - - // Step 6: Generate the inner block. - bool outer_reduction = false; // whether there are outer reduction iter vars. - for (const IterVar& iter_var : extractor.outer_iter_vars) { - if (iter_var->iter_type == kCommReduce) { - outer_reduction = true; - } + SubspaceDivide(block_realize, block_sref, loop_sref, &loops, analyzer); + if (division.empty()) { + throw SubspaceNotDivisibleError(self->mod, GetRef(loops.back()), block); } - BlockRealizeNode* inner_block_realize = block_realize.CopyOnWrite(); - inner_block_realize->iter_values = extractor.inner_bindings; - inner_block_realize->predicate = inner_pred; - inner_block_realize->block = new_block; - BlockNode* inner_block = inner_block_realize->block.CopyOnWrite(); - inner_block->iter_vars = extractor.inner_iter_vars; - inner_block->init = NullOpt; - /* Add write regions to read regions if - * 1. there are outer reduction iter vars. - * 2. the init block is defined for current block. - */ - if (outer_reduction && block->init.defined()) { - Array new_reads; - for (const BufferRegion& write_access : inner_block->writes) { - new_reads.push_back(write_access); - } - for (const BufferRegion& read_access : inner_block->reads) { - new_reads.push_back(read_access); + PrimExpr outer_predicate = division.back()[0]->extent; + PrimExpr inner_predicate = division.back()[1]->extent; + // Step 3. Derive block bindings for both outer and inner block. 
+  Array<IterVar> outer_iter_vars;
+  Array<IterVar> inner_iter_vars;
+  Array<PrimExpr> outer_bindings;
+  Array<PrimExpr> inner_bindings;
+  Map<Var, PrimExpr> block_var_subst =                     //
+      DeriveBlockBinding(block->iter_vars, division,       //
+                         &outer_iter_vars, &outer_bindings,  //
+                         &inner_iter_vars, &inner_bindings);
+  // Step 4: Do var substitution to adjust to the new block bindings
+  Map<Var, arith::IntSet> inner_iter_dom;
+  for (const IterVar& iter : inner_iter_vars) {
+    inner_iter_dom.Set(iter->var, arith::IntSet::FromRange(iter->dom));
+    analyzer->Bind(iter->var, iter->dom);
+  }
+  Block block_subst =
+      Downcast<Block>(Substitute(block, block_var_subst, block_sref_reuse, analyzer));
+  // Step 5: Generate the inner block. The write regions of the inner block are also added to its
+  // read regions (i.e. treated as a reduction update) if
+  // 1. the original block has an init stmt, and
+  // 2. there are outer reduction iter vars.
+  bool has_outer_reduction = false;
+  if (block_subst->init.defined()) {
+    for (const IterVar& iter_var : outer_iter_vars) {
+      if (iter_var->iter_type == kCommReduce) {
+        has_outer_reduction = true;
+        break;
+      }
     }
-    inner_block->reads = std::move(new_reads);
   }
-  block_sref_reuse.Set(block, inner_block_realize->block);
-
+  BlockRealize inner_realize = GenerateInner(/*is_write_reduction=*/has_outer_reduction,
+                                             /*iter_vars=*/inner_iter_vars,
+                                             /*iter_values=*/inner_bindings,
+                                             /*predicate=*/inner_predicate,
+                                             /*block=*/block_subst);
+  block_sref_reuse->Set(block, inner_realize->block);
   // Step 6: Generate the outer block.
-  BlockRealize outer_realize =
-      GenerateBlockizedOuterBlock(extractor, new_block, GetRef<BlockRealize>(inner_block_realize),
-                                  collector.inner_loops, outer_pred);
-  // Step 7: Do the actual replacement
-  self->Replace(loop_sref, outer_realize, block_sref_reuse);
-
-  // Step 8: Update the cached flags
-  StmtSRef outer_block_sref = self->stmt2ref.at(outer_realize->block.get());
-  StmtSRef scope_root = tir::GetScopeRoot(self, outer_block_sref, /*require_stage_pipeline=*/false);
+  return BlockRealize(
+      /*iter_values=*/std::move(outer_bindings),
+      /*predicate=*/std::move(outer_predicate),
+      /*block=*/
+      Block(/*iter_vars=*/std::move(outer_iter_vars),
+            /*reads=*/EvalSetRegions(block_subst->reads, inner_iter_dom),
+            /*writes=*/EvalSetRegions(block_subst->writes, inner_iter_dom),
+            /*name_hint=*/block_subst->name_hint + "_o",
+            /*body=*/MakeLoopNest(inner_realize, loops),
+            /*init=*/
+            block_subst->init.defined()  //
+                ? GenerateOuterInit(block_subst->init.value(), inner_realize, loops,
+                                    block_subst->name_hint + "_init")
+                : Optional<Stmt>(NullOpt)));
+}
+
+StmtSRef Blockize(ScheduleState self, const StmtSRef& loop_sref) {
+  arith::Analyzer analyzer;
+  Map<Block, Block> block_sref_reuse;
+  BlockRealize blockized = BlockizeImpl(self, loop_sref, &block_sref_reuse, &analyzer);
+  self->Replace(loop_sref, blockized, block_sref_reuse);
+  StmtSRef result = self->stmt2ref.at(blockized->block.get());
+  StmtSRef scope_root = tir::GetScopeRoot(self, result, /*require_stage_pipeline=*/false);
   bool scope_block_affine_binding = self->IsAffineBlockBinding(scope_root);
   self->UpdateScopeBlockInfo(tir::GetBlockRealize(self, scope_root));
   self->block_info[scope_root].affine_binding = scope_block_affine_binding;
-  return outer_block_sref;
-}
-
-/*!
- * \brief Update the map from the buffers in the desc to the impl of the tensor
- *        intrinsic.
- * \param intrinsic The tensor intrinsic.
- * \param buffer_map The map to be updated.
- */
-void RemapTensorIntrinBuffers(
-    const TensorIntrin& intrinsic,
-    std::unordered_map<Buffer, Buffer, ObjectPtrHash, ObjectPtrEqual>* buffer_map) {
-  ICHECK_EQ(intrinsic->desc->params.size(), intrinsic->impl->params.size());
-  for (size_t i = 0; i < intrinsic->desc->params.size(); ++i) {
-    const Var& lhs_var = intrinsic->desc->params[i];
-    const Buffer& lhs_buffer = intrinsic->desc->buffer_map[lhs_var];
-    const Var& rhs_var = intrinsic->impl->params[i];
-    const Buffer& rhs_buffer = intrinsic->impl->buffer_map[rhs_var];
-    (*buffer_map)[rhs_buffer] = lhs_buffer;
-  }
+  return result;
 }
 
-void Tensorize(ScheduleState self, const StmtSRef& block_or_loop_sref,
-               const TensorIntrin& intrinsic) {
-  /*!
-   * Check:
-   *   - Check buffer binding, including type, alignment, shape and etc.
-   *   - Check the sub AST is equal to the desc function.
-   *
-   * Mutate:
-   *   - Blockize the sub AST (please refer blockize for details)
-   *   - Bind buffers
-   *   - Mutate the impl of the tensor intrinsic by replacing its buffers with new
-   *     buffers created via match buffer region.
-   *   - Replace the sub tree with the mutated function.
-   */
-  const BlockRealize& desc_block_realize = Downcast<BlockRealize>(intrinsic->desc->body);
-  const BlockRealize& impl_block_realize = Downcast<BlockRealize>(intrinsic->impl->body);
-  Block impl_block = impl_block_realize->block;
-
+void Tensorize(ScheduleState self, const StmtSRef& sref, const TensorIntrin& intrin) {
   // Step 1: Blockize the subtree rooted at the given loop if needed
-  StmtSRef block_sref{nullptr};
-  if (block_or_loop_sref->StmtAs<ForNode>()) {
-    block_sref = Blockize(self, block_or_loop_sref);
+  BlockRealize block_realize{nullptr};
+  Optional<Block> old_block = NullOpt;
+  if (sref->stmt->IsInstance<BlockNode>()) {
+    block_realize = GetBlockRealize(self, sref);
+    old_block = block_realize->block;
+  } else if (sref->stmt->IsInstance<ForNode>()) {
+    arith::Analyzer analyzer;
+    Map<Block, Block> block_sref_reuse;
+    block_realize = BlockizeImpl(self, sref, &block_sref_reuse, &analyzer);
   } else {
-    ICHECK(block_or_loop_sref->StmtAs<BlockNode>());
-    block_sref = block_or_loop_sref;
+    LOG(FATAL) << "TypeError: Tensorize only supports For or Block, but got: "
+               << GetRef<Stmt>(sref->stmt);
+    throw;
   }
-  const BlockRealize& block_realize = GetBlockRealize(self, block_sref);
-
-  // Step 2: Compare the block with the desc of the tensor intrinsic, find the correspondence
-  // between buffers in the block and the desc.
+  PrimFunc intrin_desc = intrin->desc;
+  PrimFunc intrin_impl = DeepCopy(intrin->impl);
+  // Step 2: Structural pattern matching
   TensorizeComparator comparator(self->mod, /*assert_mode=*/true);
-  comparator.VisitStmt(block_realize, desc_block_realize);
-
-  // Step 3: Find the correspondence between buffers in the current AST and the impl of
-  // the tensor intrinsic
-  // Step 3.1: Map from intrinsic func buffer to desc func buffer
-  std::unordered_map<Buffer, Buffer, ObjectPtrHash, ObjectPtrEqual> intrin_buffer_map;
-  RemapTensorIntrinBuffers(intrinsic, &intrin_buffer_map);
-  // Step 3.2: Map form intrinsic func buffer to current AST buffer
-  std::unordered_map<Buffer, Buffer, ObjectPtrHash, ObjectPtrEqual> buffer_map;
-  for (const auto& pair : intrin_buffer_map) {
-    auto it = comparator.rhs_buffer_map_.find(pair.second);
-    ICHECK(it != comparator.rhs_buffer_map_.end()) << pair.second;
-    buffer_map[pair.first] = it->second;
+  comparator.VisitStmt(block_realize, intrin_desc->body);
+  // Step 3: Prepare the necessary mappings:
+  // 1) Buffer mapping from intrin impl buffers to intrin desc buffers.
+  // 2) Buffer mapping from intrin impl buffers to buffers in the current AST.
+  // 3) Mapping impl buffers to their accessed regions.
+ std::unordered_map impl2desc; + ICHECK_EQ(intrin_desc->params.size(), intrin_impl->params.size()); + for (int i = 0, n = intrin_desc->params.size(); i < n; ++i) { + const Buffer& desc = intrin_desc->buffer_map[intrin_desc->params[i]]; + const Buffer& impl = intrin_impl->buffer_map[intrin_impl->params[i]]; + impl2desc[impl] = desc; } - - // Step 4: Create MatchBufferRegion for the params of the impl function of the tensor - // intrin to make them subregions of the buffer in the original IR. - std::unordered_map, ObjectPtrHash, ObjectPtrEqual> buffer_region_map; + std::unordered_map impl2cur; + for (const auto& pair : impl2desc) { + const Buffer& impl = pair.first; + const Buffer& desc = pair.second; + ICHECK(comparator.rhs_buffer_map_.count(desc)); + impl2cur[impl] = comparator.rhs_buffer_map_[desc]; + } + std::unordered_map, ObjectPtrHash, ObjectPtrEqual> impl2region; + Block impl_block = Downcast(intrin_impl->body)->block; for (const BufferRegion& read : impl_block->reads) { - buffer_region_map.emplace(read->buffer, read->region); + impl2region.emplace(read->buffer, read->region); } for (const BufferRegion& write : impl_block->writes) { - buffer_region_map.emplace(write->buffer, write->region); + impl2region.emplace(write->buffer, write->region); } + // Step 4: Create MatchBufferRegion for the params of the impl function of the tensor + // intrin to make them subregions of the buffer in the original IR. Array match_buffer_regions; - match_buffer_regions.reserve(intrinsic->impl->params.size()); - for (size_t i = 0; i < intrinsic->impl->params.size(); ++i) { - const auto& param = intrinsic->impl->params[i]; - const auto& buffer = intrinsic->impl->buffer_map.at(param); - const auto& source = buffer_map.at(buffer); - // add the detected base indices to each buffer access region of the tensor intrinsic - Region old_region = buffer_region_map.at(buffer); - const auto& indices_base = comparator.buffer_indices_.at(source); + match_buffer_regions.reserve(intrin_impl->params.size()); + for (int i = 0, n = intrin_impl->params.size(); i < n; ++i) { + const Buffer& impl = intrin_impl->buffer_map.at(intrin_impl->params[i]); + const Buffer& cur = impl2cur.at(impl); + const Array& old_region = impl2region.at(impl); + const std::vector& indices_base = comparator.buffer_indices_.at(cur); int offset = static_cast(indices_base.size()) - static_cast(old_region.size()); ICHECK(offset >= 0); - Region new_region; - new_region.reserve(source->shape.size()); + Array new_region; + new_region.reserve(cur->shape.size()); for (int i = 0; i < offset; i++) { - new_region.push_back(Range::FromMinExtent(indices_base[i], 1)); + PrimExpr min = indices_base[i]; + PrimExpr extent = make_const(min.dtype(), 1); + new_region.push_back(Range::FromMinExtent(min, extent)); } for (int i = 0; i < static_cast(old_region.size()); i++) { - new_region.push_back(Range::FromMinExtent(indices_base[i + offset], old_region[i]->extent)); + PrimExpr min = indices_base[i + offset]; + PrimExpr extent = old_region[i]->extent; + new_region.push_back(Range::FromMinExtent(min, extent)); } - match_buffer_regions.push_back(MatchBufferRegion(buffer, BufferRegion(source, new_region))); + match_buffer_regions.push_back(MatchBufferRegion(impl, BufferRegion(cur, new_region))); } - // Step 5: Replace the subtree in the original IR with the tensor intrin impl. 
- ObjectPtr new_block_ptr = make_object(*block_realize->block.get()); - new_block_ptr->body = impl_block->body; - ICHECK(new_block_ptr->match_buffers.empty()); - new_block_ptr->match_buffers = std::move(match_buffer_regions); - Block new_block(new_block_ptr); - - self->Replace(block_sref, new_block, {{block_realize->block, new_block}}); - + { + BlockNode* block = block_realize.CopyOnWrite()->block.CopyOnWrite(); + block->body = impl_block->body; + block->match_buffers = std::move(match_buffer_regions); + } + if (old_block.defined()) { + self->Replace(sref, block_realize->block, {{old_block.value(), block_realize->block}}); + } else { + self->Replace(sref, block_realize, {}); + } // Step 6: Update the cached flags. - StmtSRef scope_root = tir::GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/false); - self->UpdateScopeBlockInfo(static_cast(scope_root->stmt)->body); + StmtSRef result = self->stmt2ref.at(block_realize->block.get()); + StmtSRef scope_root = tir::GetScopeRoot(self, result, /*require_stage_pipeline=*/false); + self->UpdateScopeBlockInfo(scope_root->StmtAs()->body); } /******** InstructionKind Registration ********/ diff --git a/tests/python/unittest/test_tir_schedule_blockize.py b/tests/python/unittest/test_tir_schedule_blockize.py index 481421cfdf78f..6d13281320c00 100644 --- a/tests/python/unittest/test_tir_schedule_blockize.py +++ b/tests/python/unittest/test_tir_schedule_blockize.py @@ -15,12 +15,10 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=missing-function-docstring,missing-module-docstring -import sys -import pytest import tvm import tvm.testing -from tvm.script import tir as T from tvm import tir +from tvm.script import tir as T from tvm.tir.schedule.testing import verify_trace_roundtrip # fmt: off @@ -33,177 +31,219 @@ def single_elementwise(A: T.Buffer[(128, 128), "float32"], B: T.Buffer[(128, 128 vi, vj = T.axis.remap("SS", [i, j]) B[vi, vj] = A[vi, vj] * 2.0 - -@T.prim_func -def single_elementwise_blockized1( - A: T.Buffer[(128, 128), "float32"], B: T.Buffer[(128, 128), "float32"] -) -> None: - with T.block("blockized_B"): - vio = T.axis.spatial(1, 0) - vjo = T.axis.spatial(1, 0) - T.reads(A[0:128, 0:128]) - T.writes(B[0:128, 0:128]) - for i, j in T.grid(128, 128): - with T.block("B"): - vi, vj = T.axis.remap("SS", [i, j]) - T.reads(A[vi, vj]) - T.writes(B[vi, vj]) - B[vi, vj] = A[vi, vj] * T.float32(2) +# fmt: on +# pylint: disable=no-member,invalid-name,unused-variable,line-too-long,redefined-outer-name,unexpected-keyword-arg,too-many-nested-blocks -@T.prim_func -def single_elementwise_blockized2( - A: T.Buffer[(128, 128), "float32"], B: T.Buffer[(128, 128), "float32"] -) -> None: - for i in T.serial(128): +def test_blockize_outer(): + @T.prim_func + def after_blockize_outer( + A: T.Buffer[(128, 128), "float32"], + B: T.Buffer[(128, 128), "float32"], + ) -> None: with T.block("blockized_B"): - vi = T.axis.spatial(128, i) + vio = T.axis.spatial(1, 0) vjo = T.axis.spatial(1, 0) - T.reads(A[vi, 0:128]) - T.writes(B[vi, 0:128]) - for j in T.serial(128): - with T.block("B"): - vj = T.axis.remap("S", [j]) - T.reads(A[vi, vj]) - T.writes(B[vi, vj]) - B[vi, vj] = A[vi, vj] * T.float32(2) - - -@T.prim_func -def two_elementwise(A: T.Buffer[(128, 128), "float32"], C: T.Buffer[(128, 128), "float32"]) -> None: - B = T.alloc_buffer([128, 128], dtype="float32") - for i, j in T.grid(128, 128): - with T.block("B"): - vi, vj = T.axis.remap("SS", [i, j]) - T.reads(A[vi, vj]) - T.writes(B[vi, vj]) - B[vi, vj] = A[vi, 
vj] * T.float32(2) - for i, j in T.grid(128, 128): - with T.block("C"): - vi, vj = T.axis.remap("SS", [i, j]) - T.reads(B[vi, vj]) - T.writes(C[vi, vj]) - C[vi, vj] = B[vi, vj] + T.float32(1) - - -@T.prim_func -def two_elementwise_blockized( - A: T.Buffer[(128, 128), "float32"], - C: T.Buffer[(128, 128), "float32"] -) -> None: - B = T.alloc_buffer([128, 128], dtype="float32") - for i_0, j_0 in T.grid(8, 8): - with T.block("blockized_B"): - vio, vjo = T.axis.remap("SS", [i_0, j_0]) - T.reads(A[vio * 16 : vio * 16 + 16, vjo * 16 : vjo * 16 + 16]) - T.writes(B[vio * 16 : vio * 16 + 16, vjo * 16 : vjo * 16 + 16]) - for i_1, j_1 in T.grid(16, 16): + for i, j in T.grid(128, 128): with T.block("B"): - vi, vj = T.axis.remap("SS", [i_1, j_1]) - T.reads(A[vio * 16 + vi, vjo * 16 + vj]) - T.writes(B[vio * 16 + vi, vjo * 16 + vj]) - B[vio * 16 + vi, vjo * 16 + vj] = A[vio * 16 + vi, vjo * 16 + vj] * T.float32(2) - with T.block("blockized_C"): - vio, vjo = T.axis.remap("SS", [i_0, j_0]) - T.reads(B[vio * 16 : vio * 16 + 16, vjo * 16 : vjo * 16 + 16]) - T.writes(C[vio * 16 : vio * 16 + 16, vjo * 16 : vjo * 16 + 16]) - for ax0, ax1 in T.grid(16, 16): - with T.block("C"): - vi, vj = T.axis.remap("SS", [ax0, ax1]) - T.reads(B[vio * 16 + vi, vjo * 16 + vj]) - T.writes(C[vio * 16 + vi, vjo * 16 + vj]) - C[vio * 16 + vi, vjo * 16 + vj] = B[vio * 16 + vi, vjo * 16 + vj] + T.float32(1) - - -@T.prim_func -def rowsum(A: T.Buffer[(128, 128), "float32"], B: T.Buffer[(128,), "float32"]) -> None: - for k, i in T.grid(128, 128): - with T.block("B"): - vk, vi = T.axis.remap("RS", [k, i]) - with T.init(): - B[vi] = 0.0 - B[vi] = B[vi] + A[vi, vk] - - -@T.prim_func -def rowsum_blockized(A: T.Buffer[(128, 128), "float32"], B: T.Buffer[(128,), "float32"]) -> None: - with T.block("blockized_B"): - vko = T.axis.R(1, 0) - vio = T.axis.S(1, 0) - with T.init(): - for i1 in T.serial(0, 128): - with T.block("B_init"): - vi_init = T.axis.S(128, i1) - B[vi_init] = T.float32(0) - for i0, i1_1 in T.grid(128, 128): - with T.block("B"): - vk, vi = T.axis.remap("RS", [i0, i1_1]) - B[vi] = B[vi] + A[vi, vk] + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 - -# fmt: off -# pylint: disable=no-member,invalid-name,unused-variable,line-too-long,redefined-outer-name,unexpected-keyword-arg,too-many-nested-blocks - -def test_blockize_outer(): func = single_elementwise - # schedule s = tir.Schedule(func, debug_mask="all") - B = s.get_block("B") - x, y = s.get_loops(B) + x, _ = s.get_loops(s.get_block("B")) s.blockize(x) - print(s.mod['main'].script()) - tvm.ir.assert_structural_equal(s.mod["main"], single_elementwise_blockized1) + tvm.ir.assert_structural_equal(s.mod["main"], after_blockize_outer) verify_trace_roundtrip(sch=s, mod=func) def test_blockize_inner(): + @T.prim_func + def after_blockize_inner( + A: T.Buffer[(128, 128), "float32"], + B: T.Buffer[(128, 128), "float32"], + ) -> None: + for i in T.serial(128): + with T.block("blockized_B"): + vi = T.axis.spatial(128, i) + vjo = T.axis.spatial(1, 0) + for j in T.serial(128): + with T.block("B"): + vj = T.axis.remap("S", [j]) + B[vi, vj] = A[vi, vj] * 2.0 + func = single_elementwise - # schedule s = tir.Schedule(func, debug_mask="all") - B = s.get_block("B") - x, y = s.get_loops(B) + _, y = s.get_loops(s.get_block("B")) s.blockize(y) - tvm.ir.assert_structural_equal(s.mod["main"], single_elementwise_blockized2) + tvm.ir.assert_structural_equal(s.mod["main"], after_blockize_inner) verify_trace_roundtrip(sch=s, mod=func) def 
test_two_elementwise_blockize_reverse_compute_at(): - func = two_elementwise + @T.prim_func + def before_blockize_rca( + A: T.Buffer[(128, 128), "float32"], + C: T.Buffer[(128, 128), "float32"], + ) -> None: + B = T.alloc_buffer([128, 128], dtype="float32") + for i, j in T.grid(8, 8): + with T.block("B_o"): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads(A[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) + T.writes(B[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) + for i_1, j_1 in T.grid(16, 16): + with T.block("B"): + vi_i, vj_i = T.axis.remap("SS", [i_1, j_1]) + T.reads(A[vi * 16 + vi_i, vj * 16 + vj_i]) + T.writes(B[vi * 16 + vi_i, vj * 16 + vj_i]) + B[vi * 16 + vi_i, vj * 16 + vj_i] = A[vi * 16 + vi_i, vj * 16 + vj_i] * 2.0 + for ax0, ax1 in T.grid(16, 16): + with T.block("C"): + vi = T.axis.spatial(128, i * 16 + ax0) + vj = T.axis.spatial(128, j * 16 + ax1) + T.reads(B[vi, vj]) + T.writes(C[vi, vj]) + C[vi, vj] = B[vi, vj] + 1.0 + + @T.prim_func + def after_blockize_rca( + A: T.Buffer[(128, 128), "float32"], + C: T.Buffer[(128, 128), "float32"], + ) -> None: + B = T.alloc_buffer([128, 128], dtype="float32") + for i, j in T.grid(8, 8): + with T.block("B_o"): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads(A[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) + T.writes(B[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) + for i_1, j_1 in T.grid(16, 16): + with T.block("B"): + vi_i, vj_i = T.axis.remap("SS", [i_1, j_1]) + T.reads(A[vi * 16 + vi_i, vj * 16 + vj_i]) + T.writes(B[vi * 16 + vi_i, vj * 16 + vj_i]) + B[vi * 16 + vi_i, vj * 16 + vj_i] = A[vi * 16 + vi_i, vj * 16 + vj_i] * 2.0 + with T.block("C_o"): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads(B[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) + T.writes(C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) + for ax0, ax1 in T.grid(16, 16): + with T.block("C"): + vi_i, vj_i = T.axis.remap("SS", [ax0, ax1]) + T.reads(B[vi * 16 + vi_i, vj * 16 + vj_i]) + T.writes(C[vi * 16 + vi_i, vj * 16 + vj_i]) + C[vi * 16 + vi_i, vj * 16 + vj_i] = B[vi * 16 + vi_i, vj * 16 + vj_i] + 1.0 + + func = before_blockize_rca s = tir.Schedule(func, debug_mask="all") - B = s.get_block("B") - C = s.get_block("C") - x, y = s.get_loops(B) - xo, xi = s.split(x, factors=[None, 16]) - yo, yi = s.split(y, factors=[None, 16]) - s.reorder(xo, yo, xi, yi) - s.blockize(xi) - s.reverse_compute_at(C, yo) - s.blockize(s.get_loops(C)[-2]) - tvm.ir.assert_structural_equal(s.mod["main"], two_elementwise_blockized) + _, _, x, _ = s.get_loops(s.get_block("C")) + s.blockize(x) + tvm.ir.assert_structural_equal(s.mod["main"], after_blockize_rca) verify_trace_roundtrip(sch=s, mod=func) def test_two_elementwise_blockize_compute_at(): - func = two_elementwise + @T.prim_func + def before_blockize_compute_at( + A: T.Buffer[(128, 128), "float32"], + C: T.Buffer[(128, 128), "float32"], + ) -> None: + # body + # with T.block("root") + B = T.alloc_buffer([128, 128], dtype="float32") + for i_0, j_0 in T.grid(8, 8): + for ax0, ax1 in T.grid(16, 16): + with T.block("B"): + vi = T.axis.spatial(128, i_0 * 16 + ax0) + vj = T.axis.spatial(128, j_0 * 16 + ax1) + T.reads(A[vi, vj]) + T.writes(B[vi, vj]) + B[vi, vj] = A[vi, vj] * 2.0 + with T.block("C_o"): + vi_o, vj_o = T.axis.remap("SS", [i_0, j_0]) + T.reads(B[vi_o * 16 : vi_o * 16 + 16, vj_o * 16 : vj_o * 16 + 16]) + T.writes(C[vi_o * 16 : vi_o * 16 + 16, vj_o * 16 : vj_o * 16 + 16]) + for i_1, j_1 in T.grid(16, 16): + with T.block("C"): + vi_i, vj_i = T.axis.remap("SS", [i_1, j_1]) + T.reads(B[vi_o * 16 + vi_i, vj_o * 16 + vj_i]) + 
T.writes(C[vi_o * 16 + vi_i, vj_o * 16 + vj_i]) + C[vi_o * 16 + vi_i, vj_o * 16 + vj_i] = ( + B[vi_o * 16 + vi_i, vj_o * 16 + vj_i] + 1.0 + ) + + @T.prim_func + def after_blockize_compute_at( + A: T.Buffer[(128, 128), "float32"], + C: T.Buffer[(128, 128), "float32"], + ) -> None: + B = T.alloc_buffer([128, 128], dtype="float32") + for i_0, j_0 in T.grid(8, 8): + with T.block("B_o"): + vi_o, vj_o = T.axis.remap("SS", [i_0, j_0]) + T.reads(A[vi_o * 16 : vi_o * 16 + 16, vj_o * 16 : vj_o * 16 + 16]) + T.writes(B[vi_o * 16 : vi_o * 16 + 16, vj_o * 16 : vj_o * 16 + 16]) + for ax0, ax1 in T.grid(16, 16): + with T.block("B"): + vi_i, vj_i = T.axis.remap("SS", [ax0, ax1]) + T.reads(A[vi_o * 16 + vi_i, vj_o * 16 + vj_i]) + T.writes(B[vi_o * 16 + vi_i, vj_o * 16 + vj_i]) + B[vi_o * 16 + vi_i, vj_o * 16 + vj_i] = ( + A[vi_o * 16 + vi_i, vj_o * 16 + vj_i] * 2.0 + ) + with T.block("C_o"): + vi_o, vj_o = T.axis.remap("SS", [i_0, j_0]) + T.reads(B[vi_o * 16 : vi_o * 16 + 16, vj_o * 16 : vj_o * 16 + 16]) + T.writes(C[vi_o * 16 : vi_o * 16 + 16, vj_o * 16 : vj_o * 16 + 16]) + for i_1, j_1 in T.grid(16, 16): + with T.block("C"): + vi_i, vj_i = T.axis.remap("SS", [i_1, j_1]) + T.reads(B[vi_o * 16 + vi_i, vj_o * 16 + vj_i]) + T.writes(C[vi_o * 16 + vi_i, vj_o * 16 + vj_i]) + C[vi_o * 16 + vi_i, vj_o * 16 + vj_i] = ( + B[vi_o * 16 + vi_i, vj_o * 16 + vj_i] + 1.0 + ) + + func = before_blockize_compute_at s = tir.Schedule(func, debug_mask="all") - B = s.get_block("B") - C = s.get_block("C") - x, y = s.get_loops(C) - xo, xi = s.split(x, factors=[None, 16]) - yo, yi = s.split(y, factors=[None, 16]) - s.reorder(xo, yo, xi, yi) - s.blockize(xi) - s.compute_at(B, yo) - s.blockize(s.get_loops(B)[-2]) - tvm.ir.assert_structural_equal(s.mod["main"], two_elementwise_blockized) + _, _, x, _ = s.get_loops(s.get_block("B")) + s.blockize(x) + tvm.ir.assert_structural_equal(s.mod["main"], after_blockize_compute_at) verify_trace_roundtrip(sch=s, mod=func) def test_blockize_init_loops(): + @T.prim_func + def rowsum(A: T.Buffer[(128, 128), "float32"], B: T.Buffer[(128,), "float32"]) -> None: + for k, i in T.grid(128, 128): + with T.block("B"): + vk, vi = T.axis.remap("RS", [k, i]) + with T.init(): + B[vi] = 0.0 + B[vi] = B[vi] + A[vi, vk] + + @T.prim_func + def after_rowsum_blockize( + A: T.Buffer[(128, 128), "float32"], + B: T.Buffer[(128,), "float32"], + ) -> None: + with T.block("blockized_B"): + vko = T.axis.R(1, 0) + vio = T.axis.S(1, 0) + with T.init(): + for i1 in T.serial(0, 128): + with T.block("B_init"): + vi_init = T.axis.S(128, i1) + B[vi_init] = T.float32(0) + for i0, i1_1 in T.grid(128, 128): + with T.block("B"): + vk, vi = T.axis.remap("RS", [i0, i1_1]) + B[vi] = B[vi] + A[vi, vk] + s = tir.Schedule(rowsum, debug_mask="all") k, _ = s.get_loops(s.get_block("B")) s.blockize(k) - tvm.ir.assert_structural_equal(s.mod["main"], rowsum_blockized) + tvm.ir.assert_structural_equal(s.mod["main"], after_rowsum_blockize) verify_trace_roundtrip(sch=s, mod=rowsum) From 4b5dd136d764fbef5f552ffce0759232c138e4e2 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Wed, 13 Jul 2022 11:17:53 -0500 Subject: [PATCH 111/111] [Arith] Updated BufferDomainTouched to use IRVisitorWithAnalyzer (#11970) * [Arith] Allow binding of Var in IntSetAnalyzer The other four subanalyzers in `arith::Analyzer` can each be provided with variable bindings/constraints that are remembered internally. 
This adds the same capability to `IntSetAnalyzer`, rather than requiring users to independently track and maintain a `Map<Var, IntSet>` containing the domain of each variable, and applies bindings/constraints alongside the other subanalyzers.

* [Arith] Updated IRVisitorWithAnalyzer to mimic IRMutatorWithAnalyzer

Previously, `IRVisitorWithAnalyzer` did not allow subclassing and could only be used to collect bounds of variables across an entire statement; it could not be used to perform scope-dependent analysis.

This commit removes `final` from `IRVisitorWithAnalyzer` and provides the same scope-based constraints/bindings during iteration as are provided by `IRMutatorWithAnalyzer`.

* [Arith] Moved IRVisitorWithAnalyzer to tvm::arith namespace

Changing for consistency, since `IRVisitorWithAnalyzer` is part of the `src/arith` directory and the analogous `IRMutatorWithAnalyzer` is already part of the `arith` namespace.

* [Arith] Updated BufferDomainTouched to use IRVisitorWithAnalyzer

This uses the earlier changes that allow subclassing of `IRVisitorWithAnalyzer` and that expose bindings/constraints to `IntSetAnalyzer`.

* Avoid accidental Bind with dynamic Range

* [Arith] Do not visit SelectNode in IRVisitorWithAnalyzer

Because both sides of a `Select` node are visited regardless of the condition, the `SelectNode::condition` should not be treated as a known value.

* [Arith][IntSet] Track global and scope-dependent bounds separately

Resolves a bug that was found in CI, where an earlier scope-dependent constraint was treated as a conflict by a later global bound.

* [Arith] Recovery function for each subanalyzer

This way, if a subanalyzer throws an exception during `EnterConstraint`, the other subanalyzers are still appropriately backed out of the constraint.

* [Arith][IntSet] Use CanProve instead of CanProveGreaterEqual

The `min_value - max_value` in the `CanProveGreaterEqual` argument can result in an exception being thrown for unsigned integers where subtraction would wrap.

* [Arith] Allow vector expressions in IntSet::operator()(PrimExpr)

Since these are tracked when lowering expressions, the analyzer should allow post-vectorization expressions.

To maintain previous behavior, this only applies when using the automatically tracked `Map<Var, IntSet> dom_map_`. If an explicit domain map is passed, the previous behavior of raising an error for vectorized expressions still occurs.

* Avoid comparisons between integer and handle datatypes

* [Arith] IntSet, Combine() extension

Previously, the Combine() method didn't handle values without a known lower bound for boolean operators.

* Added docstring

* Naming consistency of `IntSetAnalyzer` methods.

To be consistent with the other subanalyzers, "Update" is used when providing the analyzer with the same data structure as is used internally, and "Bind" when providing it with something that must be converted to the internal data structure.
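For illustration, a minimal usage sketch of the new entry points (not part of the patch; the function name and the literal bounds are invented):

    // Sketch only: exercises Analyzer::Bind feeding the int_set
    // subanalyzer, and the scope-dependent ConstraintContext handling.
    #include <tvm/arith/analyzer.h>
    #include <tvm/tir/op.h>

    void SketchIntSetUsage() {
      using namespace tvm;
      arith::Analyzer analyzer;
      tir::Var i("i"), n("n");
      // Global bound: Bind now also updates int_set, so callers no longer
      // need to thread an explicit Map<Var, IntSet> through the analysis.
      analyzer.Bind(i, Range::FromMinExtent(0, 16));
      arith::IntSet s = analyzer.int_set(i * 2);  // covers [0, 30]
      // Scope-dependent constraint: tracked separately from the global
      // bounds and backed out again when the scope exits.
      {
        With<arith::ConstraintContext> scope(&analyzer, n < 8);
        arith::IntSet scoped = analyzer.int_set(n);  // upper bound is 7 here
      }
    }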
---
 include/tvm/arith/analyzer.h          |  46 +++++-
 src/arith/analyzer.cc                 |  26 ++--
 src/arith/domain_touched.cc           |  43 ++----
 src/arith/int_set.cc                  | 211 ++++++++++++++++++++++++--
 src/arith/ir_visitor_with_analyzer.cc | 126 +++++++++++++++
 src/arith/ir_visitor_with_analyzer.h  |  45 +++---
 src/tir/transforms/storage_flatten.cc |   1 +
 src/tir/transforms/texture_flatten.cc |   1 +
 8 files changed, 409 insertions(+), 90 deletions(-)
 create mode 100644 src/arith/ir_visitor_with_analyzer.cc

diff --git a/include/tvm/arith/analyzer.h b/include/tvm/arith/analyzer.h
index 3704eff33ec28..ceb9f574f2c9c 100644
--- a/include/tvm/arith/analyzer.h
+++ b/include/tvm/arith/analyzer.h
@@ -135,7 +135,7 @@ class ConstIntBoundAnalyzer {
    *
    * \param var The variable of interest.
    * \param info The bound information.
-   * \param allow_override Whether do we allow override of existing information.
+   * \param allow_override whether we allow override of existing information.
    */
   TVM_DLL void Update(const Var& var, const ConstIntBound& info, bool allow_override = false);
   /*!
@@ -224,7 +224,7 @@ class ModularSetAnalyzer {
    *
    * \param var The variable of interest.
    * \param info The bound information.
-   * \param allow_override Whether do we allow override of existing information.
+   * \param allow_override whether we allow override of existing information.
    */
   TVM_DLL void Update(const Var& var, const ModularSet& info, bool allow_override = false);
 
@@ -263,10 +263,16 @@ class RewriteSimplifier {
    *
    * \param var The variable of interest.
    * \param new_expr
-   * \param allow_override Whether do we allow override of existing information.
+   * \param allow_override Whether we allow override of existing information.
    */
   TVM_DLL void Update(const Var& var, const PrimExpr& new_expr, bool allow_override = false);
 
+  /*!
+   * \brief Update the internal state to enter constraint.
+   * \param constraint A constraint expression.
+   *
+   * \return an exit function that must be called to clean up the constraint; can be nullptr.
+   */
   std::function<void()> EnterConstraint(const PrimExpr& constraint);
 
  private:
@@ -297,7 +303,7 @@ class CanonicalSimplifier {
    *
    * \param var The variable of interest.
    * \param new_expr
-   * \param allow_override Whether do we allow override of existing information.
+   * \param allow_override whether we allow override of existing information.
    */
   TVM_DLL void Update(const Var& var, const PrimExpr& new_expr, bool allow_override = false);
 
@@ -347,7 +353,7 @@ class ConstraintContext {
   /*! \brief The constraint */
   PrimExpr constraint_;
   /*! \brief function to be called in recovery */
-  std::function<void()> exit_;
+  std::vector<std::function<void()>> recovery_functions_;
 };
 
 /*!
@@ -365,6 +371,36 @@ class IntSetAnalyzer {
    */
   TVM_DLL IntSet operator()(const PrimExpr& expr, const Map<Var, IntSet>& dom_map);
 
+  /*!
+   * \brief Find a symbolic integer set that contains all possible
+   *        values of expr given the domain of each variable, using
+   *        the domain map defined by bound variables.
+   *
+   * \param expr The expression of interest.
+   * \return the result of the analysis.
+   */
+  TVM_DLL IntSet operator()(const PrimExpr& expr);
+
+  /*!
+   * \brief Update binding of var to a new interval set.
+   *
+   * \param var The variable of interest.
+   * \param new_interval_set The set of allowed values for this var.
+   * \param allow_override whether we allow override of existing information.
+   */
+  TVM_DLL void Update(const Var& var, const IntSet& new_interval_set, bool allow_override = false);
+
+  /*!
+   * \brief Bind var to a range of allowed values.
+   *
+   * \param var The variable of interest.
+ * \param new_range The range of allowed values for this var. + * \param allow_override whether we allow override of existing information. + */ + TVM_DLL void Bind(const Var& var, const Range& new_range, bool allow_override = false); + + std::function EnterConstraint(const PrimExpr& constraint); + private: friend class Analyzer; explicit IntSetAnalyzer(Analyzer* parent); diff --git a/src/arith/analyzer.cc b/src/arith/analyzer.cc index b922138057e96..f32c9b2ff4cf3 100644 --- a/src/arith/analyzer.cc +++ b/src/arith/analyzer.cc @@ -44,6 +44,7 @@ void Analyzer::Bind(const Var& var, const PrimExpr& expr, bool allow_override) { this->modular_set.Update(var, this->modular_set(new_expr), allow_override); this->rewrite_simplify.Update(var, new_expr, allow_override); this->canonical_simplify.Update(var, new_expr, allow_override); + this->int_set.Update(var, this->int_set(new_expr), allow_override); } void Analyzer::Bind(const Var& var, const Range& range, bool allow_override) { @@ -52,6 +53,7 @@ void Analyzer::Bind(const Var& var, const Range& range, bool allow_override) { this->Bind(var, range->min, allow_override); } else { this->const_int_bound.Bind(var, range, allow_override); + this->int_set.Bind(var, range, allow_override); } // skip modular_set // skip rewrite simplify @@ -64,22 +66,22 @@ void Analyzer::Bind(const Map& variables, bool allow_override) { } void ConstraintContext::EnterWithScope() { - ICHECK(exit_ == nullptr); + ICHECK(recovery_functions_.size() == 0); // entering the scope. - auto f0 = analyzer_->const_int_bound.EnterConstraint(constraint_); - auto f1 = analyzer_->modular_set.EnterConstraint(constraint_); - auto f2 = analyzer_->rewrite_simplify.EnterConstraint(constraint_); - // recovery function. - exit_ = [f0, f1, f2]() { - if (f2 != nullptr) f2(); - if (f1 != nullptr) f1(); - if (f0 != nullptr) f0(); - }; + recovery_functions_.push_back(analyzer_->const_int_bound.EnterConstraint(constraint_)); + recovery_functions_.push_back(analyzer_->modular_set.EnterConstraint(constraint_)); + recovery_functions_.push_back(analyzer_->rewrite_simplify.EnterConstraint(constraint_)); + recovery_functions_.push_back(analyzer_->int_set.EnterConstraint(constraint_)); } void ConstraintContext::ExitWithScope() { - ICHECK(exit_ != nullptr); - exit_(); + while (recovery_functions_.size()) { + auto& func = recovery_functions_.back(); + if (func) { + func(); + } + recovery_functions_.pop_back(); + } } bool Analyzer::CanProveGreaterEqual(const PrimExpr& expr, int64_t lower_bound) { diff --git a/src/arith/domain_touched.cc b/src/arith/domain_touched.cc index 403ea47f4e61b..d2c5d79a09606 100644 --- a/src/arith/domain_touched.cc +++ b/src/arith/domain_touched.cc @@ -30,6 +30,8 @@ #include #include +#include "ir_visitor_with_analyzer.h" + namespace tvm { namespace arith { @@ -56,7 +58,7 @@ using BufferDomainAccess = std::tuple; } // namespace // Find Read region of the tensor in the stmt. 
-class BufferTouchedDomain final : public StmtExprVisitor { +class BufferTouchedDomain final : public IRVisitorWithAnalyzer { public: BufferTouchedDomain(const Stmt& stmt) { operator()(stmt); } @@ -90,39 +92,17 @@ class BufferTouchedDomain final : public StmtExprVisitor { return ret; } - void VisitStmt_(const ForNode* op) final { - const VarNode* var = op->loop_var.get(); - dom_map_[var] = IntSet::FromRange(Range::FromMinExtent(op->min, op->extent)); - StmtExprVisitor::VisitStmt_(op); - dom_map_.erase(var); - } - - void VisitStmt_(const LetStmtNode* op) final { - dom_map_[op->var.get()] = arith::EvalSet(op->value, dom_map_); - StmtExprVisitor::VisitStmt_(op); - dom_map_.erase(op->var.get()); - } - - /* TODO: Thread extent unitest not generated.*/ - void VisitStmt_(const AttrStmtNode* op) final { - if (op->attr_key == tir::attr::thread_extent) { - const IterVarNode* thread_axis = op->node.as(); - ICHECK(thread_axis); - const VarNode* var = thread_axis->var.get(); - dom_map_[var] = IntSet::FromRange(Range(make_zero(op->value.dtype()), op->value)); - StmtExprVisitor::VisitStmt_(op); - dom_map_.erase(var); - } else { - StmtExprVisitor::VisitStmt_(op); - } - } + private: + using Parent = IRVisitorWithAnalyzer; + using Parent::VisitExpr_; + using Parent::VisitStmt_; void VisitExpr_(const BufferLoadNode* op) final { // Record load-exclusive buffer access Touch(&std::get(buffer_access_map_[op->buffer.get()]).set, op->indices); // Record load-store inclusive buffer access Touch(&std::get(buffer_access_map_[op->buffer.get()]).set, op->indices); - StmtExprVisitor::VisitExpr_(op); + Parent::VisitExpr_(op); } void VisitStmt_(const BufferStoreNode* op) final { @@ -130,11 +110,11 @@ class BufferTouchedDomain final : public StmtExprVisitor { Touch(&std::get(buffer_access_map_[op->buffer.get()]).set, op->indices); // Record load-store inclusive buffer access Touch(&std::get(buffer_access_map_[op->buffer.get()]).set, op->indices); - StmtExprVisitor::VisitStmt_(op); + Parent::VisitStmt_(op); } private: - void Touch(BufferTouches* bounds, const Array& args) const { + void Touch(BufferTouches* bounds, const Array& args) { if (args.size() > bounds->size()) { bounds->resize(args.size()); } @@ -142,13 +122,12 @@ class BufferTouchedDomain final : public StmtExprVisitor { if (args[i].as()) { (*bounds)[i].emplace_back(IntSet::Vector(args[i])); } else { - (*bounds)[i].emplace_back(EvalSet(args[i], dom_map_)); + (*bounds)[i].emplace_back(analyzer_.int_set(args[i])); } } } std::unordered_map buffer_access_map_; - std::unordered_map dom_map_; }; Region DomainTouched(const Stmt& stmt, const Buffer& buffer, bool consider_loads, diff --git a/src/arith/int_set.cc b/src/arith/int_set.cc index 48fae479b042b..6d48ad1ed1518 100644 --- a/src/arith/int_set.cc +++ b/src/arith/int_set.cc @@ -31,6 +31,7 @@ #include #include +#include "constraint_extract.h" #include "interval_set.h" #include "pattern_match.h" @@ -63,7 +64,7 @@ IntervalSet Intersect(Analyzer* analyzer, IntervalSet a, IntervalSet b) { PrimExpr min_value = max(a->min_value, b->min_value); if ((max_value.dtype().is_int() || max_value.dtype().is_uint()) && (min_value.dtype().is_int() || min_value.dtype().is_uint()) && - analyzer->CanProveGreaterEqual(min_value - max_value, 1)) { + analyzer->CanProve(max_value < min_value)) { return IntervalSet::Empty(); } else { return IntervalSet(min_value, max_value); @@ -105,14 +106,14 @@ TVM_DECLARE_LOGICAL_OP(Not); * \note this can possibly relax the set. 
*/ template -inline IntervalSet Combine(Analyzer* analyzer, IntervalSet a, IntervalSet b) { +inline IntervalSet Combine(Analyzer* analyzer, IntervalSet a, IntervalSet b, DataType dtype) { if (a->IsSinglePoint() && b->IsSinglePoint()) { PrimExpr res = TryConstFold(a->min_value, b->min_value); if (!res.defined()) res = Op(a->min_value, b->min_value); return IntervalSet::SinglePoint(res); } if (is_logical_op::value) { - return IntervalSet(make_const(a->min_value.dtype(), 0), make_const(a->min_value.dtype(), 1)); + return IntervalSet(make_const(dtype, 0), make_const(dtype, 1)); } if (a->IsEmpty()) return a; if (b->IsEmpty()) return b; @@ -122,7 +123,8 @@ inline IntervalSet Combine(Analyzer* analyzer, IntervalSet a, IntervalSet b) { } template <> -inline IntervalSet Combine(Analyzer* analyer, IntervalSet a, IntervalSet b) { +inline IntervalSet Combine(Analyzer* analyer, IntervalSet a, IntervalSet b, + DataType /* dtype */) { if (a->IsSinglePoint() && b->IsSinglePoint()) { return IntervalSet::SinglePoint(a->min_value + b->min_value); } @@ -136,7 +138,8 @@ inline IntervalSet Combine(Analyzer* analyer, IntervalSet a, IntervalS } template <> -inline IntervalSet Combine(Analyzer* analyer, IntervalSet a, IntervalSet b) { +inline IntervalSet Combine(Analyzer* analyer, IntervalSet a, IntervalSet b, + DataType /* dtype */) { if (a->IsSinglePoint() && b->IsSinglePoint()) { return IntervalSet::SinglePoint(a->min_value - b->min_value); } @@ -150,7 +153,8 @@ inline IntervalSet Combine(Analyzer* analyer, IntervalSet a, IntervalS } template <> -inline IntervalSet Combine(Analyzer* analyzer, IntervalSet a, IntervalSet b) { +inline IntervalSet Combine(Analyzer* analyzer, IntervalSet a, IntervalSet b, + DataType /* dtype */) { if (a->IsSinglePoint() && b->IsSinglePoint()) { return IntervalSet::SinglePoint(a->min_value * b->min_value); } @@ -183,7 +187,8 @@ inline IntervalSet Combine(Analyzer* analyzer, IntervalSet a, Interval } template <> -inline IntervalSet Combine(Analyzer* analyzer, IntervalSet a, IntervalSet b) { +inline IntervalSet Combine(Analyzer* analyzer, IntervalSet a, IntervalSet b, + DataType /* dtype */) { if (a->IsSinglePoint() && b->IsSinglePoint()) { return IntervalSet::SinglePoint(a->min_value / b->min_value); } @@ -216,7 +221,8 @@ inline IntervalSet Combine(Analyzer* analyzer, IntervalSet a, Interval } template <> -inline IntervalSet Combine(Analyzer* analyzer, IntervalSet a, IntervalSet b) { +inline IntervalSet Combine(Analyzer* analyzer, IntervalSet a, IntervalSet b, + DataType /* dtype */) { if (a->IsSinglePoint() && b->IsSinglePoint()) { return IntervalSet::SinglePoint(truncmod(a->min_value, b->min_value)); } @@ -244,7 +250,8 @@ inline IntervalSet Combine(Analyzer* analyzer, IntervalSet a, Interval } template <> -inline IntervalSet Combine(Analyzer* analyzer, IntervalSet a, IntervalSet b) { +inline IntervalSet Combine(Analyzer* analyzer, IntervalSet a, IntervalSet b, + DataType /* dtype */) { if (a->IsSinglePoint() && b->IsSinglePoint()) { return IntervalSet::SinglePoint(floordiv(a->min_value, b->min_value)); } @@ -277,7 +284,8 @@ inline IntervalSet Combine(Analyzer* analyzer, IntervalSet a, Int } template <> -inline IntervalSet Combine(Analyzer* analyzer, IntervalSet a, IntervalSet b) { +inline IntervalSet Combine(Analyzer* analyzer, IntervalSet a, IntervalSet b, + DataType /* dtype */) { if (a->IsSinglePoint() && b->IsSinglePoint()) { return IntervalSet::SinglePoint(floormod(a->min_value, b->min_value)); } @@ -294,7 +302,10 @@ inline IntervalSet Combine(Analyzer* analyzer, IntervalSet a, 
Int // a mod b = a - (a / b) * b if a_max / b == a_min / b auto qmax = a->HasUpperBound() ? floordiv(a->max_value, divisor) : pos_inf(); auto qmin = a->HasLowerBound() ? floordiv(a->min_value, divisor) : neg_inf(); - if (analyzer->CanProve(qmax == qmin)) { + // We can compare +/- inf against each other, but cannot use + // operator== between the symbolic limits and an integer. + bool compatible_dtypes = !(qmin.dtype().is_handle() ^ qmax.dtype().is_handle()); + if (compatible_dtypes && analyzer->CanProve(qmax == qmin)) { auto tmax = a->max_value - divisor * qmin; auto tmin = a->min_value - divisor * qmin; return IntervalSet(tmin, tmax); @@ -311,7 +322,8 @@ inline IntervalSet Combine(Analyzer* analyzer, IntervalSet a, Int } template <> -inline IntervalSet Combine(Analyzer* analzyer, IntervalSet a, IntervalSet b) { +inline IntervalSet Combine(Analyzer* analzyer, IntervalSet a, IntervalSet b, + DataType /* dtype */) { if (a->IsSinglePoint() && b->IsSinglePoint()) { return IntervalSet::SinglePoint(max(a->min_value, b->min_value)); } @@ -321,7 +333,8 @@ inline IntervalSet Combine(Analyzer* analzyer, IntervalSet a, Interval } template <> -inline IntervalSet Combine(Analyzer* analzyer, IntervalSet a, IntervalSet b) { +inline IntervalSet Combine(Analyzer* analzyer, IntervalSet a, IntervalSet b, + DataType /* dtype */) { if (a->IsSinglePoint() && b->IsSinglePoint()) { return IntervalSet::SinglePoint(min(a->min_value, b->min_value)); } @@ -423,10 +436,12 @@ class IntervalSetEvaluator : public ExprFunctor { int64_t vstride = stride.Eval()->value; if (vstride > 0) { return Combine(analyzer_, base, - IntervalSet(make_zero(t), make_const(t, vstride * op->lanes - 1))); + IntervalSet(make_zero(t), make_const(t, vstride * op->lanes - 1)), + op->dtype); } else { return Combine(analyzer_, base, - IntervalSet(make_const(t, vstride * op->lanes + 1), make_zero(t))); + IntervalSet(make_const(t, vstride * op->lanes + 1), make_zero(t)), + op->dtype); } } DLOG(WARNING) << "cannot evaluate set on expression " << GetRef(op); @@ -490,7 +505,7 @@ class IntervalSetEvaluator : public ExprFunctor { if (MatchPoint(a, op->a) && MatchPoint(b, op->b)) { return IntervalSet::SinglePoint(GetRef(op)); } - return Combine(analyzer_, a, b); + return Combine(analyzer_, a, b, op->dtype); } // recursive depth @@ -509,8 +524,37 @@ class IntSetAnalyzer::Impl { return IntervalSetEvaluator(analyzer_, dom_map).Eval(expr); } + IntSet Eval(const PrimExpr& expr) const { + return IntervalSetEvaluator(analyzer_, GetCurrentBounds(), true).Eval(expr); + } + + void Bind(const Var& var, const Range& range, bool allow_override) { + Update(var, IntSet::FromRange(range), allow_override); + } + + void Update(const Var& var, const IntSet& info, bool override_info); + void Bind(const Var& var, const PrimExpr& expr, bool override_info); + std::function EnterConstraint(const PrimExpr& constraint); + private: + // Get the current variable bounds, including both global bounds and + // scope-dependent bounds. + Map GetCurrentBounds() const; + + // Utility function to split a boolean condition into the domain + // bounds implied by that condition. + static std::vector> DetectBoundInfo(const PrimExpr& cond); + + // The parent arith::Analyzer Analyzer* analyzer_; + + // Map of variables to global variable bounds (e.g. loop iterator + // ranges) + Map dom_map_; + + // Map of variables to implicit scope-dependent bounds (e.g. 
inside + // the body of an if-statement) + Map constraints_; }; IntSetAnalyzer::IntSetAnalyzer(Analyzer* parent) : impl_(new Impl(parent)) {} @@ -521,6 +565,141 @@ IntSet IntSetAnalyzer::operator()(const PrimExpr& expr, const Map& return impl_->Eval(expr, dom_map); } +IntSet IntSetAnalyzer::operator()(const PrimExpr& expr) { return impl_->Eval(expr); } + +void IntSetAnalyzer::Update(const Var& var, const IntSet& info, bool allow_override) { + impl_->Update(var, info, allow_override); +} + +void IntSetAnalyzer::Bind(const Var& var, const Range& range, bool allow_override) { + impl_->Bind(var, range, allow_override); +} + +void IntSetAnalyzer::Impl::Update(const Var& var, const IntSet& info, bool can_override) { + if (!can_override) { + auto it = dom_map_.find(var); + if (it != dom_map_.end()) { + const IntSet& old_info = (*it).second; + + ICHECK(ExprDeepEqual()(old_info.min(), info.min())) + << "Trying to update var \'" << var << "\'" + << " with a different minimum value: " + << "original=" << old_info.min() << ", new=" << info.min(); + + ICHECK(ExprDeepEqual()(old_info.max(), info.max())) + << "Trying to update var \'" << var << "\'" + << " with a different maximum value: " + << "original=" << old_info.max() << ", new=" << info.max(); + } + } + dom_map_.Set(var, info); +} + +void IntSetAnalyzer::Impl::Bind(const Var& var, const PrimExpr& expr, bool can_override) { + Update(var, Eval(expr), can_override); +} + +Map IntSetAnalyzer::Impl::GetCurrentBounds() const { + // If either constraints_ or dom_map_ is empty, return the other to + // avoid constructing a new map. + if (constraints_.empty()) { + return dom_map_; + } else if (dom_map_.empty()) { + return constraints_; + } + + // If neither is empty, construct a merged domain map with + // information from both sources. 
+  Map<Var, IntSet> merged = dom_map_;
+  for (const auto& pair : constraints_) {
+    auto it = merged.find(pair.first);
+    if (it == merged.end()) {
+      merged.Set(pair.first, pair.second);
+    } else {
+      merged.Set(pair.first, Intersect({pair.second, (*it).second}));
+    }
+  }
+  return merged;
+}
+
+std::vector<std::pair<Var, IntSet>> IntSetAnalyzer::Impl::DetectBoundInfo(
+    const PrimExpr& constraint) {
+  PVar<Var> x;
+  PVar<PrimExpr> limit;
+
+  std::vector<std::pair<Var, IntSet>> bounds;
+  for (const PrimExpr& subconstraint : ExtractConstraints(constraint)) {
+    if ((x <= limit).Match(subconstraint)) {
+      bounds.push_back({x.Eval(), IntSet::Interval(SymbolicLimits::neg_inf_, limit.Eval())});
+    } else if ((x < limit).Match(subconstraint)) {
+      bounds.push_back({x.Eval(), IntSet::Interval(SymbolicLimits::neg_inf_, limit.Eval() - 1)});
+    } else if ((x >= limit).Match(subconstraint)) {
+      bounds.push_back({x.Eval(), IntSet::Interval(limit.Eval(), SymbolicLimits::pos_inf_)});
+    } else if ((x > limit).Match(subconstraint)) {
+      bounds.push_back({x.Eval(), IntSet::Interval(limit.Eval() + 1, SymbolicLimits::pos_inf_)});
+    } else if ((x == limit).Match(subconstraint)) {
+      bounds.push_back({x.Eval(), IntSet::SinglePoint(limit.Eval())});
+    }
+
+    if ((limit >= x).Match(subconstraint)) {
+      bounds.push_back({x.Eval(), IntSet::Interval(SymbolicLimits::neg_inf_, limit.Eval())});
+    } else if ((limit > x).Match(subconstraint)) {
+      bounds.push_back({x.Eval(), IntSet::Interval(SymbolicLimits::neg_inf_, limit.Eval() - 1)});
+    } else if ((limit <= x).Match(subconstraint)) {
+      bounds.push_back({x.Eval(), IntSet::Interval(limit.Eval(), SymbolicLimits::pos_inf_)});
+    } else if ((limit < x).Match(subconstraint)) {
+      bounds.push_back({x.Eval(), IntSet::Interval(limit.Eval() + 1, SymbolicLimits::pos_inf_)});
+    } else if ((limit == x).Match(subconstraint)) {
+      bounds.push_back({x.Eval(), IntSet::SinglePoint(limit.Eval())});
+    }
+  }
+  return bounds;
+}
+
+std::function<void()> IntSetAnalyzer::EnterConstraint(const PrimExpr& constraint) {
+  return impl_->EnterConstraint(constraint);
+}
+
+std::function<void()> IntSetAnalyzer::Impl::EnterConstraint(const PrimExpr& constraint) {
+  Map<Var, IntSet> cached_values;
+
+  auto bounds = DetectBoundInfo(constraint);
+
+  if (bounds.size() == 0) return nullptr;
+
+  // Collect the current values of each var that is changed by this
+  // constraint.
+  for (const auto& pair : bounds) {
+    auto it = constraints_.find(pair.first);
+    if (it == constraints_.end()) {
+      cached_values.Set(pair.first, IntSet());
+    } else {
+      cached_values.Set(pair.first, (*it).second);
+    }
+  }
+
+  // Update all constraints
+  for (const auto& pair : bounds) {
+    auto it = constraints_.find(pair.first);
+    if (it == constraints_.end()) {
+      constraints_.Set(pair.first, pair.second);
+    } else {
+      constraints_.Set(pair.first, Intersect({pair.second, (*it).second}));
+    }
+  }
+
+  auto frecover = [cached_values, this]() {
+    for (const auto& it : cached_values) {
+      if (it.second.defined()) {
+        constraints_.Set(it.first, it.second);
+      } else {
+        constraints_.erase(it.first);
+      }
+    }
+  };
+  return frecover;
+}
+
 // Quickly adapt to IntSet interface
 // TODO(tqchen): revisit IntSet interface as well.
 Range IntSet::CoverRange(Range max_range) const {
diff --git a/src/arith/ir_visitor_with_analyzer.cc b/src/arith/ir_visitor_with_analyzer.cc
new file mode 100644
index 0000000000000..75ae22ef9915c
--- /dev/null
+++ b/src/arith/ir_visitor_with_analyzer.cc
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tvm/arith/ir_visitor_with_analyzer.cc + */ +#include "ir_visitor_with_analyzer.h" + +#include +#include +#include + +namespace tvm { +namespace arith { + +using namespace tir; + +void IRVisitorWithAnalyzer::VisitStmt_(const ForNode* op) { + analyzer_.Bind(op->loop_var, Range::FromMinExtent(op->min, op->extent)); + StmtExprVisitor::VisitStmt_(op); +} + +void IRVisitorWithAnalyzer::VisitStmt_(const BlockNode* op) { + for (const auto& iter_var : op->iter_vars) { + analyzer_.Bind(iter_var->var, iter_var->dom); + } + StmtExprVisitor::VisitStmt_(op); +} + +void IRVisitorWithAnalyzer::VisitStmt_(const LetStmtNode* op) { + this->VisitExpr(op->value); + analyzer_.Bind(op->var, op->value); + this->VisitStmt(op->body); +} + +void IRVisitorWithAnalyzer::VisitStmt_(const IfThenElseNode* op) { + this->VisitExpr(op->condition); + + PrimExpr real_condition = ExtractRealCondition(op->condition); + + { + With constraint(&analyzer_, real_condition); + this->VisitStmt(op->then_case); + } + if (op->else_case.defined()) { + With constraint(&analyzer_, analyzer_.rewrite_simplify(Not(real_condition))); + this->VisitStmt(op->else_case); + } +} + +void IRVisitorWithAnalyzer::VisitStmt_(const AttrStmtNode* op) { + if (op->attr_key == tir::attr::thread_extent || op->attr_key == tir::attr::virtual_thread) { + IterVar iv = Downcast(op->node); + ICHECK_NE(iv->thread_tag.length(), 0U); + analyzer_.Bind(iv->var, Range::FromMinExtent(0, op->value)); + } + StmtExprVisitor::VisitStmt_(op); +} + +void IRVisitorWithAnalyzer::VisitStmt_(const AssertStmtNode* op) { + this->VisitExpr(op->condition); + this->VisitExpr(op->message); + With constraint(&analyzer_, op->condition); + this->VisitStmt(op->body); +} + +void IRVisitorWithAnalyzer::VisitExpr_(const CallNode* op) { + // add condition context to if_then_else + static auto op_if_then_else = Op::Get("tir.if_then_else"); + if (op->op.same_as(op_if_then_else)) { + PrimExpr cond = op->args[0]; + this->VisitExpr(op->args[0]); + { + With constraint(&analyzer_, cond); + this->VisitExpr(op->args[1]); + } + { + With constraint(&analyzer_, analyzer_.rewrite_simplify(Not(cond))); + this->VisitExpr(op->args[2]); + } + } else { + StmtExprVisitor::VisitExpr_(op); + } +} + +void IRVisitorWithAnalyzer::VisitExpr_(const LetNode* op) { + this->VisitExpr(op->value); + analyzer_.Bind(op->var, op->value); + this->VisitExpr(op->body); +} + +void IRVisitorWithAnalyzer::VisitExpr_(const ReduceNode* op) { + for (const IterVar& iv : op->axis) { + analyzer_.Bind(iv->var, iv->dom); + } + StmtExprVisitor::VisitExpr_(op); +} + +PrimExpr IRVisitorWithAnalyzer::ExtractRealCondition(PrimExpr condition) const { + if (auto call = condition.as()) { + if (call->op.same_as(builtin::likely())) { + return call->args[0]; + } + } + + return condition; +} + +} // namespace arith +} // namespace tvm diff 
--git a/src/arith/ir_visitor_with_analyzer.h b/src/arith/ir_visitor_with_analyzer.h index 058abc8c7d207..f41a628f3cc6d 100644 --- a/src/arith/ir_visitor_with_analyzer.h +++ b/src/arith/ir_visitor_with_analyzer.h @@ -30,42 +30,37 @@ #include namespace tvm { -namespace tir { +namespace arith { -class IRVisitorWithAnalyzer final : public StmtExprVisitor { +class IRVisitorWithAnalyzer : public tir::StmtExprVisitor { public: PrimExpr Simplify(const PrimExpr& expr) { return analyzer_.Simplify(expr); } - void VisitStmt_(const ForNode* op) { - analyzer_.Bind(op->loop_var, Range::FromMinExtent(op->min, op->extent)); - return StmtExprVisitor::VisitStmt_(op); - } + using StmtExprVisitor::VisitExpr_; + using StmtExprVisitor::VisitStmt_; - void VisitStmt_(const AttrStmtNode* op) { - if (op->attr_key == attr::thread_extent || op->attr_key == attr::virtual_thread) { - IterVar iv = Downcast(op->node); - ICHECK_NE(iv->thread_tag.length(), 0U); - analyzer_.Bind(iv->var, Range::FromMinExtent(0, op->value)); - StmtExprVisitor::VisitStmt_(op); - } else { - StmtExprVisitor::VisitStmt_(op); - } - } + void VisitStmt_(const tir::ForNode* op); + void VisitStmt_(const tir::BlockNode* op); + void VisitStmt_(const tir::LetStmtNode* op); + void VisitStmt_(const tir::IfThenElseNode* op); + void VisitStmt_(const tir::AttrStmtNode* op); + void VisitStmt_(const tir::AssertStmtNode* op); + void VisitExpr_(const tir::CallNode* op); + void VisitExpr_(const tir::LetNode* op); + void VisitExpr_(const tir::ReduceNode* op); - void VisitExpr_(const ReduceNode* op) { - // Setup the domain information before simplification. - for (const IterVar& iv : op->axis) { - analyzer_.Bind(iv->var, iv->dom); - } - // Recursively call simplification when necessary. - StmtExprVisitor::VisitExpr_(op); - } + // IRVisitorWithAnalyzer deliberately does not handle Select nodes, + // because both sides of a Select node are visited regardless of the + // condition. protected: /*! \brief internal analyzer field. */ arith::Analyzer analyzer_; + + private: + PrimExpr ExtractRealCondition(PrimExpr condition) const; }; -} // namespace tir +} // namespace arith } // namespace tvm #endif // TVM_ARITH_IR_VISITOR_WITH_ANALYZER_H_ diff --git a/src/tir/transforms/storage_flatten.cc b/src/tir/transforms/storage_flatten.cc index f2d9aba4fba84..dd236537e9c2a 100644 --- a/src/tir/transforms/storage_flatten.cc +++ b/src/tir/transforms/storage_flatten.cc @@ -47,6 +47,7 @@ namespace tvm { namespace tir { +using arith::IRVisitorWithAnalyzer; using runtime::StorageRank; using runtime::StorageScope; using runtime::ThreadScope; diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc index a607e5914b39e..3c35b73bc8d7c 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/transforms/texture_flatten.cc @@ -38,6 +38,7 @@ namespace tvm { namespace tir { +using arith::IRVisitorWithAnalyzer; using runtime::ApplyTexture2DFlattening; using runtime::DefaultTextureLayoutSeparator; using runtime::IsTextureStorage;
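
As a closing illustration (not part of the patch), a pass could now subclass `IRVisitorWithAnalyzer` and reuse its scope-aware bindings. The checker below is invented for this sketch; only `analyzer_`, `int_set`, and `CanProve` come from the code above:

    // Invented example: a scope-aware in-bounds checker built on the now
    // subclassable IRVisitorWithAnalyzer (in-tree include path assumed).
    #include <tvm/tir/stmt.h>

    #include "ir_visitor_with_analyzer.h"

    namespace tvm {
    namespace arith {

    class InBoundsChecker : public IRVisitorWithAnalyzer {
     public:
      bool all_in_bounds{true};

     private:
      using Parent = IRVisitorWithAnalyzer;
      using Parent::VisitStmt_;

      void VisitStmt_(const tir::BufferStoreNode* op) final {
        // analyzer_ already carries the For/Block/IfThenElse bindings of the
        // enclosing scopes, so int_set needs no explicit domain map. Assumes
        // one index per buffer dimension.
        for (size_t i = 0; i < op->indices.size(); ++i) {
          PrimExpr upper = analyzer_.int_set(op->indices[i]).max();
          // +inf is represented with a handle dtype; treat it as unprovable
          // rather than comparing across dtypes (see the commit notes).
          if (upper.dtype().is_handle() ||
              !analyzer_.CanProve(upper < op->buffer->shape[i])) {
            all_in_bounds = false;
          }
        }
        Parent::VisitStmt_(op);
      }
    };

    }  // namespace arith
    }  // namespace tvm

A caller would run `InBoundsChecker checker; checker(stmt);` and read `checker.all_in_bounds` afterwards.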