Merge branch 'master' into trt_8.4ga
peri044 committed Jun 30, 2022
2 parents a64956e + 5b03083 commit d85c327
Showing 266 changed files with 3,057 additions and 989 deletions.
96 changes: 96 additions & 0 deletions .circleci/config.yml
@@ -0,0 +1,96 @@
# Use the latest 2.1 version of CircleCI pipeline process engine.
# See: https://circleci.com/docs/2.0/configuration-reference
version: 2.1

# Define a job to be invoked later in a workflow.
# See: https://circleci.com/docs/2.0/configuration-reference/#jobs
jobs:
  build:
    machine:
      # Primary container image where all steps run.
      # image: nvcr.io/nvidia/tensorrt:22.01-py3 # does not work with customized image
      # https://circleci.com/docs/2.0/configuration-reference#available-linux-gpu-images
      image: ubuntu-2004-cuda-11.4:202110-01
    resource_class: gpu.nvidia.large
    steps:
      - checkout
      - run:
          name: install cudnn + tensorrt + bazel
          command: |
            cd ~
            OS=ubuntu2004
            CUDNN_VERSION=8.2.1.*-1+cuda11.3
            TRT_VERSION=8.2.4-1+cuda11.4
            BAZEL_VERSION=5.1.1
            wget https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin
            sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600
            sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/7fa2af80.pub
            sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 536F8F1DE80F6A35
            sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys A4B469963BF863CC
            sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/ /"
            sudo apt-get update
            sudo apt-get install libcudnn8=${CUDNN_VERSION}
            sudo apt-get install libcudnn8-dev=${CUDNN_VERSION}
            sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/3bf863cc.pub
            sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/ /"
            sudo apt-get update
            sudo apt-get install libnvinfer8=${TRT_VERSION} libnvonnxparsers8=${TRT_VERSION} libnvparsers8=${TRT_VERSION} libnvinfer-plugin8=${TRT_VERSION} libnvinfer-dev=${TRT_VERSION} libnvonnxparsers-dev=${TRT_VERSION} libnvparsers-dev=${TRT_VERSION} libnvinfer-plugin-dev=${TRT_VERSION} python3-libnvinfer=${TRT_VERSION}
            # to check available versions: apt list libnvinfer8 -a
            sudo wget -q https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-linux-x86_64 -O /usr/bin/bazel
            sudo chmod a+x /usr/bin/bazel
      - run:
          name: set up python environment
          command: |
            pip3 install nvidia-pyindex
            pip3 install nvidia-tensorrt==8.2.4.2
            pip3 install --pre torch==1.13.0.dev20220621 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu113
            pip3 install pytest parameterized expecttest
            # install torch_tensorrt
            mv WORKSPACE.ci WORKSPACE
            cd py
            python3 setup.py install
            # install fx2trt
            # cd py/torch_tensorrt/fx/setup
            # python3 setup.py install
      - run:
          name: run fx2trt tests
          command: |
            # a pending fix is needed before the commented lines below can be enabled
            # cd py/torch_tensorrt/fx/test
            # pytest $(find . -name '*.py' | grep -v test_dispatch* | grep -v test_setitem*)
            cd py/torch_tensorrt/fx/test
            pushd converters/acc_op
            pytest
            popd
            pushd passes
            list_passes=$(ls | grep -v test_setitem*)
            pytest $list_passes
            popd
            pushd core
            pytest
            popd
            # pushd quant
            # pytest
            # popd
            pushd tools
            pytest
            popd
            pushd trt_lower
            pytest
            popd
            pushd tracer
            list_tracer=$(ls | grep -v test_dispatch_*)
            pytest $list_tracer
            popd

# Invoke jobs via workflows
# See: https://circleci.com/docs/2.0/configuration-reference/#workflows
workflows:
  build_run:
    jobs:
      - build
1 change: 1 addition & 0 deletions .github/code-owners.yml
@@ -121,5 +121,6 @@

"component: fx":
- "frank-wei"
- "yinghai"
- "842974287"
- "wushirong"
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -2,7 +2,7 @@

### Developing Torch-TensorRT

-Do try to file an issue describing your feature or bug before filing a PR (op support is generally an exception, as long as you provide tests proving functionality). There is also a backlog (https://github.com/NVIDIA/Torch-TensorRT/issues) of issues that are tagged with the area of focus, a coarse priority level, and whether the issue may be accessible to new contributors. Let us know if you are interested in working on an issue; we are happy to provide guidance and mentorship for new contributors. Note, though, that there is no claiming of issues: we prefer getting working code quickly over addressing concerns about "wasted work".
+Do try to file an issue describing your feature or bug before filing a PR (op support is generally an exception, as long as you provide tests proving functionality). There is also a backlog (https://github.com/pytorch/TensorRT/issues) of issues that are tagged with the area of focus, a coarse priority level, and whether the issue may be accessible to new contributors. Let us know if you are interested in working on an issue; we are happy to provide guidance and mentorship for new contributors. Note, though, that there is no claiming of issues: we prefer getting working code quickly over addressing concerns about "wasted work".

#### Communication

10 changes: 8 additions & 2 deletions README.md
@@ -118,7 +118,7 @@ These are the following dependencies used to verify the testcases.

## Prebuilt Binaries and Wheel files

-Releases: https://github.com/NVIDIA/Torch-TensorRT/releases
+Releases: https://github.com/pytorch/TensorRT/releases

## Compiling Torch-TensorRT

@@ -212,6 +212,12 @@ new_local_repository(
bazel build //:libtorchtrt --compilation_mode opt
```

### FX path (Python only) installation
If you plan to try the FX path (Python only) and would like to avoid the bazel build, follow the steps below.
``` shell
cd py && python3 setup.py install --fx-only
```
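
Once installed this way, the FX path can be exercised directly from Python with no compiled extensions. Below is a minimal, hedged sketch of lowering a torchvision model through the FX frontend; the `torch_tensorrt.fx.lower.compile` entry point and its `max_batch_size` argument are assumptions based on the FX code merged here, not a documented stable API.

``` python
# Hedged sketch: compiling via the FX (Python-only) path after a `--fx-only` install.
# `torch_tensorrt.fx.lower.compile` and its arguments are assumptions.
import torch
import torchvision.models as models

from torch_tensorrt.fx.lower import compile as fx_compile  # assumed entry point

model = models.resnet18().eval().cuda()
inputs = [torch.randn(1, 3, 224, 224, device="cuda")]

# Trace with the acc tracer, split into TRT/Torch subgraphs, and build engines.
trt_model = fx_compile(model, inputs, max_batch_size=1)

with torch.no_grad():
    print(trt_model(*inputs).shape)
```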

### Debug build

``` shell
@@ -291,7 +297,7 @@ Supported Python versions:

### In Torch-TensorRT?

-Thanks for wanting to contribute! There are two main ways to add support for a new op: either write a converter for the op from scratch and register it in the NodeConverterRegistry, or, if you can map the op to a set of ops that already have converters, write a graph rewrite pass that replaces your new op with an equivalent subgraph of supported ops. Graph rewriting is preferred because it means we do not need to maintain a large library of op converters. Also look at the various op support trackers in the [issues](https://github.com/NVIDIA/Torch-TensorRT/issues) for information on the support status of various operators.
+Thanks for wanting to contribute! There are two main ways to add support for a new op: either write a converter for the op from scratch and register it in the NodeConverterRegistry, or, if you can map the op to a set of ops that already have converters, write a graph rewrite pass that replaces your new op with an equivalent subgraph of supported ops. Graph rewriting is preferred because it means we do not need to maintain a large library of op converters. Also look at the various op support trackers in the [issues](https://github.com/pytorch/TensorRT/issues) for information on the support status of various operators.
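
For the Python FX path merged in this commit, converters are plain Python functions registered with a decorator rather than C++ NodeConverterRegistry entries. The sketch below is modeled on the fx2trt converter pattern; the import paths (`torch_tensorrt.fx.converter_registry`, the `acc_ops` tracer module) and the converter signature are assumptions, not confirmed API.

``` python
# Hedged sketch of an FX-path converter, modeled on the fx2trt pattern;
# import paths and the converter signature are assumptions.
import tensorrt as trt

from torch_tensorrt.fx.converter_registry import tensorrt_converter  # assumed path
import torch_tensorrt.fx.tracer.acc_tracer.acc_ops as acc_ops  # assumed path


@tensorrt_converter(acc_ops.sigmoid)
def acc_ops_sigmoid(network, target, args, kwargs, name):
    # Map the traced acc_ops.sigmoid node onto a TensorRT activation layer.
    input_val = kwargs["input"]
    layer = network.add_activation(input=input_val, type=trt.ActivationType.SIGMOID)
    layer.name = name
    return layer.get_output(0)
```

Converters registered this way are picked up when the acc-traced graph is converted into a TensorRT network.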

### In my application?

147 changes: 147 additions & 0 deletions WORKSPACE.ci
@@ -0,0 +1,147 @@
workspace(name = "Torch-TensorRT")

load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")

http_archive(
    name = "rules_python",
    sha256 = "778197e26c5fbeb07ac2a2c5ae405b30f6cb7ad1f5510ea6fdac03bded96cc6f",
    url = "https://github.com/bazelbuild/rules_python/releases/download/0.2.0/rules_python-0.2.0.tar.gz",
)

load("@rules_python//python:pip.bzl", "pip_install")

http_archive(
    name = "rules_pkg",
    sha256 = "038f1caa773a7e35b3663865ffb003169c6a71dc995e39bf4815792f385d837d",
    urls = [
        "https://mirror.bazel.build/github.com/bazelbuild/rules_pkg/releases/download/0.4.0/rules_pkg-0.4.0.tar.gz",
        "https://github.com/bazelbuild/rules_pkg/releases/download/0.4.0/rules_pkg-0.4.0.tar.gz",
    ],
)

load("@rules_pkg//:deps.bzl", "rules_pkg_dependencies")

rules_pkg_dependencies()

git_repository(
    name = "googletest",
    commit = "703bd9caab50b139428cea1aaff9974ebee5742e",
    remote = "https://github.com/google/googletest",
    shallow_since = "1570114335 -0400",
)

# External dependency for torch_tensorrt if you already have precompiled binaries.
local_repository(
    name = "torch_tensorrt",
    path = "/opt/conda/lib/python3.8/site-packages/torch_tensorrt"
)

# CUDA should be installed on the system locally
new_local_repository(
    name = "cuda",
    build_file = "@//third_party/cuda:BUILD",
    path = "/usr/local/cuda/",
)

new_local_repository(
    name = "cublas",
    build_file = "@//third_party/cublas:BUILD",
    path = "/usr",
)
#############################################################################################################
# Tarballs and fetched dependencies (default - use in cases when building from precompiled bin and tarballs)
#############################################################################################################

#http_archive(
#    name = "libtorch",
#    build_file = "@//third_party/libtorch:BUILD",
#    sha256 = "8d9e829ce9478db4f35bdb7943308cf02e8a2f58cf9bb10f742462c1d57bf287",
#    strip_prefix = "libtorch",
#    urls = ["https://download.pytorch.org/libtorch/cu113/libtorch-cxx11-abi-shared-with-deps-1.11.0%2Bcu113.zip"],
#)
#
#http_archive(
#    name = "libtorch_pre_cxx11_abi",
#    build_file = "@//third_party/libtorch:BUILD",
#    sha256 = "90159ecce3ff451f3ef3f657493b6c7c96759c3b74bbd70c1695f2ea2f81e1ad",
#    strip_prefix = "libtorch",
#    urls = ["https://download.pytorch.org/libtorch/cu113/libtorch-shared-with-deps-1.11.0%2Bcu113.zip"],
#)

# Download these tarballs manually from the NVIDIA website
# Either place them in the distdir directory in third_party and use the --distdir flag
# or modify the urls to "file:///<PATH TO TARBALL>/<TARBALL NAME>.tar.gz"

#http_archive(
#    name = "cudnn",
#    build_file = "@//third_party/cudnn/archive:BUILD",
#    sha256 = "0e5d2df890b9967efa6619da421310d97323565a79f05a1a8cb9b7165baad0d7",
#    strip_prefix = "cuda",
#    urls = [
#        "https://developer.nvidia.com/compute/machine-learning/cudnn/secure/8.2.4/11.4_20210831/cudnn-11.4-linux-x64-v8.2.4.15.tgz",
#    ],
#)
#
#http_archive(
#    name = "tensorrt",
#    build_file = "@//third_party/tensorrt/archive:BUILD",
#    sha256 = "826180eaaecdf9a7e76116855b9f1f3400ea9b06e66b06a3f6a0747ba6f863ad",
#    strip_prefix = "TensorRT-8.2.4.2",
#    urls = [
#        "https://developer.nvidia.com/compute/machine-learning/tensorrt/secure/8.2.4/tars/tensorrt-8.2.4.2.linux.x86_64-gnu.cuda-11.4.cudnn8.2.tar.gz",
#    ],
#)

####################################################################################
# Locally installed dependencies (use in cases of custom dependencies or aarch64)
####################################################################################

# NOTE: In the case you are using just the pre-cxx11-abi path or just the cxx11 abi path
# with your local libtorch, just point deps at the same path to satisfy bazel.

# NOTE: NVIDIA's aarch64 PyTorch (python) wheel file uses the CXX11 ABI unlike PyTorch's standard
# x86_64 python distribution. If using NVIDIA's version just point to the root of the package
# for both versions here and do not use --config=pre-cxx11-abi

new_local_repository(
    name = "libtorch",
    path = "/opt/circleci/.pyenv/versions/3.9.4/lib/python3.9/site-packages/torch",
    build_file = "third_party/libtorch/BUILD"
)

new_local_repository(
    name = "libtorch_pre_cxx11_abi",
    path = "/opt/circleci/.pyenv/versions/3.9.4/lib/python3.9/site-packages/torch",
    build_file = "third_party/libtorch/BUILD"
)

new_local_repository(
    name = "cudnn",
    path = "/usr/",
    build_file = "@//third_party/cudnn/local:BUILD"
)

new_local_repository(
    name = "tensorrt",
    path = "/usr/",
    build_file = "@//third_party/tensorrt/local:BUILD"
)

# #########################################################################
# # Testing Dependencies (optional - comment out on aarch64)
# #########################################################################
# pip_install(
#     name = "torch_tensorrt_py_deps",
#     requirements = "//py:requirements.txt",
# )

# pip_install(
#     name = "py_test_deps",
#     requirements = "//tests/py:requirements.txt",
# )

pip_install(
    name = "pylinter_deps",
    requirements = "//tools/linter:requirements.txt",
)
16 changes: 10 additions & 6 deletions core/compiler.cpp
@@ -198,7 +198,8 @@ void AddIfBlockToGraph(

   auto env = [&](torch::jit::Value* v) { return util::getOrAddInputForValue(v, new_g, block_graph_to_new_g); };
   new_if_block->cloneFrom(cur_block_graph->block(), env);
-  if (cur_block_graph->inputs()[0]->type()->str().find("__torch__") != std::string::npos) {
+  if (cur_block_graph->inputs().size() &&
+      cur_block_graph->inputs()[0]->type()->str().find("__torch__") != std::string::npos) {
     if (new_g->inputs()[0]->type()->str().find("__torch__") == std::string::npos) {
       auto self = new_g->insertInput(0, "self_1");
       self->setType(cur_block_graph->inputs()[0]->type());
@@ -223,13 +224,14 @@ GraphAndMapping ConstructFallbackGraph(
     torch::jit::Block* block,
     std::unordered_map<const torch::jit::Value*, torch::jit::IValue> example_tensor_map,
     CompileSpec cfg,
-    ir::StaticParams static_params) {
+    ir::StaticParams static_params,
+    std::unordered_map<torch::jit::Node*, int>& fallback_nodes) {
   auto convert_cfg = cfg.convert_info;
   auto partition_info = cfg.partition_info;

   auto new_g = std::make_shared<torch::jit::Graph>();

-  auto segmented_blocks = partitioning::Partition(block, example_tensor_map, partition_info);
+  auto segmented_blocks = partitioning::Partition(block, example_tensor_map, partition_info, fallback_nodes);

   // the mapping from lowering graph => fallback global graph
   std::unordered_map<torch::jit::Value*, torch::jit::Value*> old_to_new_g;
@@ -270,7 +272,7 @@
   std::vector<GraphAndMapping> graph_and_mappings;
   for (auto cur_block : if_node->blocks()) {
     graph_and_mappings.push_back(
-        ConstructFallbackGraph(new_mod, cur_block, example_tensor_map, cfg, static_params));
+        ConstructFallbackGraph(new_mod, cur_block, example_tensor_map, cfg, static_params, fallback_nodes));
   }
   AddIfBlockToGraph(new_g, if_node, graph_and_mappings, old_to_new_g);

@@ -293,7 +295,7 @@
       // Set the output as the produced tuple
       new_g->registerOutput(return_tuple_node->outputs()[0]);
     } else {
-      if (old_to_new_g.count(block->outputs()[0])) {
+      if (block->outputs().size() && old_to_new_g.count(block->outputs()[0])) {
         new_g->registerOutput(old_to_new_g[block->outputs()[0]]);
       }
     }
@@ -430,7 +432,9 @@ torch::jit::Module CompileGraph(const torch::jit::Module& mod, CompileSpec cfg)
       !(cfg.lower_info.forced_fallback_modules.size() == 0 &&
         cfg.partition_info.forced_fallback_operators.size() == 0 && isBlockConvertible)) {
     auto input_ivalues_map = partitioning::generateRandomInputs(cfg.convert_info.inputs, first_use_types);
-    auto graph_and_mapping = ConstructFallbackGraph(new_mod, g->block(), input_ivalues_map, cfg, static_params);
+    std::unordered_map<torch::jit::Node*, int> fallback_nodes;
+    auto graph_and_mapping =
+        ConstructFallbackGraph(new_mod, g->block(), input_ivalues_map, cfg, static_params, fallback_nodes);
     new_g = graph_and_mapping.first;
     LOG_INFO("Segmented Graph: " << *new_g);

1 change: 1 addition & 0 deletions core/conversion/converters/BUILD
@@ -54,6 +54,7 @@ cc_library(
"NodeConverterRegistry.cpp",
"impl/activation.cpp",
"impl/batch_norm.cpp",
"impl/bitwise.cpp",
"impl/cast.cpp",
"impl/concat.cpp",
"impl/constant.cpp",
(The remaining changed files in this commit are not shown.)
