From e5f96d976cdbdb9ec61fe25f37a81bce821b78aa Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 21 Oct 2021 10:47:08 -0700 Subject: [PATCH] feat: Update documentation with new library name Torch-TensorRT Signed-off-by: Dheeraj Peri Signed-off-by: Dheeraj Peri Signed-off-by: Dheeraj Peri Signed-off-by: Dheeraj Peri --- docsrc/Makefile | 10 +- docsrc/RELEASE_CHECKLIST.md | 18 +- docsrc/conf.py | 40 +- docsrc/contributors/conversion.rst | 4 +- docsrc/contributors/lowering.rst | 20 +- docsrc/contributors/phases.rst | 4 +- docsrc/contributors/runtime.rst | 8 +- docsrc/contributors/system_overview.rst | 6 +- docsrc/contributors/useful_links.rst | 3 +- docsrc/contributors/writing_converters.rst | 6 +- docsrc/index.rst | 31 +- docsrc/py_api/logging.rst | 8 +- .../{trtorch.rst => torch_tensorrt.rst} | 15 +- docsrc/py_api/torch_tensorrt_ts.rst | 23 + .../creating_torchscript_module_in_python.rst | 138 +++++ docsrc/tutorials/getting_started.rst | 531 ------------------ .../getting_started_with_cpp_api.rst | 338 +++++++++++ .../getting_started_with_python_api.rst | 47 ++ docsrc/tutorials/installation.rst | 34 +- docsrc/tutorials/ptq.rst | 50 +- docsrc/tutorials/runtime.rst | 28 +- .../tutorials/{trtorchc.rst => torchtrtc.rst} | 18 +- docsrc/tutorials/use_from_pytorch.rst | 25 +- docsrc/tutorials/using_dla.rst | 16 +- .../BUILD | 0 .../Makefile | 0 .../README.md | 0 .../deps/.gitkeep | 0 .../main.cpp | 0 .../network.py | 0 py/torch_tensorrt/_compile.py | 2 +- py/torch_tensorrt/_version.py | 2 +- 32 files changed, 716 insertions(+), 709 deletions(-) rename docsrc/py_api/{trtorch.rst => torch_tensorrt.rst} (76%) create mode 100644 docsrc/py_api/torch_tensorrt_ts.rst create mode 100644 docsrc/tutorials/creating_torchscript_module_in_python.rst delete mode 100644 docsrc/tutorials/getting_started.rst create mode 100644 docsrc/tutorials/getting_started_with_cpp_api.rst create mode 100644 docsrc/tutorials/getting_started_with_python_api.rst rename docsrc/tutorials/{trtorchc.rst => torchtrtc.rst} (92%) rename examples/{trtorchrt_example => torchtrt_example}/BUILD (100%) rename examples/{trtorchrt_example => torchtrt_example}/Makefile (100%) rename examples/{trtorchrt_example => torchtrt_example}/README.md (100%) rename examples/{trtorchrt_example => torchtrt_example}/deps/.gitkeep (100%) rename examples/{trtorchrt_example => torchtrt_example}/main.cpp (100%) rename examples/{trtorchrt_example => torchtrt_example}/network.py (100%) diff --git a/docsrc/Makefile b/docsrc/Makefile index cc21b87e63..18c52c578b 100644 --- a/docsrc/Makefile +++ b/docsrc/Makefile @@ -21,14 +21,14 @@ check_clean: clean: check_clean rm -rf $(BUILDDIR)/* ifndef VERSION - rm -rf /tmp/trtorch_docs - mkdir -p /tmp/trtorch_docs - mv $(DESTDIR)/v* /tmp/trtorch_docs + rm -rf /tmp/torchtrt_docs + mkdir -p /tmp/torchtrt_docs + mv $(DESTDIR)/v* /tmp/torchtrt_docs endif rm -r $(DESTDIR)/* ifndef VERSION - mv /tmp/trtorch_docs/v* $(DESTDIR) - rm -rf /tmp/trtorch_docs + mv /tmp/torchtrt_docs/v* $(DESTDIR) + rm -rf /tmp/torchtrt_docs endif rm -rf $(SOURCEDIR)/_cpp_api rm -rf $(SOURCEDIR)/_notebooks diff --git a/docsrc/RELEASE_CHECKLIST.md b/docsrc/RELEASE_CHECKLIST.md index 6c59f1bcff..77bb973487 100644 --- a/docsrc/RELEASE_CHECKLIST.md +++ b/docsrc/RELEASE_CHECKLIST.md @@ -1,14 +1,14 @@ # Release Process -Here is the process we use for creating new releases of TRTorch +Here is the process we use for creating new releases of Torch-TensorRT ## Criteria for Release -While TRTorch is in alpha, patch versions are bumped sequentially on breaking 
changes in the compiler. +While Torch-TensorRT is in alpha, patch versions are bumped sequentially on breaking changes in the compiler. -In beta TRTorch will get a minor version bump on breaking changes, or upgrade to the next version of PyTorch, patch version will be incremented based on significant bug fixes, or siginficant new functionality in the compiler. +In beta Torch-TensorRT will get a minor version bump on breaking changes, or upgrade to the next version of PyTorch, patch version will be incremented based on significant bug fixes, or siginficant new functionality in the compiler. -Once TRTorch hits version 1.0.0, major versions are bumped on breaking API changes, breaking changes or significant new functionality in the compiler +Once Torch-TensorRT hits version 1.0.0, major versions are bumped on breaking API changes, breaking changes or significant new functionality in the compiler will result in a minor version bump and sigificant bug fixes will result in a patch version change. ## Steps to Packaging a Release @@ -20,7 +20,7 @@ will result in a minor version bump and sigificant bug fixes will result in a pa - Required, Python API and Optional Tests should pass on both x86_64 and aarch64 - All checked in applications (cpp and python) should compile and work 3. Generate new index of converters and evalutators - - `bazel run //tools/supportedops -- /docsrc/indices/supported_ops.rst` + - `bazel run //tools/supportedops -- /docsrc/indices/supported_ops.rst` 4. Version bump PR - There should be a PR which will be the PR that bumps the actual version of the library, this PR should contain the following - Bump version in `py/setup.py` @@ -49,7 +49,7 @@ will result in a minor version bump and sigificant bug fixes will result in a pa - `[3, 224, 224]` - `[3, 1920, 1080]` (P2) - Batch Sizes: 1, 4, 8, 16, 32 - - Frameworks: PyTorch, TRTorch, ONNX + TRT + - Frameworks: PyTorch, Torch-TensorRT, ONNX + TRT - If any models do not convert to ONNX / TRT, that is fine. Mark them as failling / no result - Devices: - A100 (P0) @@ -61,11 +61,11 @@ will result in a minor version bump and sigificant bug fixes will result in a pa 6. Once PR is merged tag commit and start creating release on GitHub - Paste in Milestone information and Changelog information into release notes - - Generate libtrtorch.tar.gz for the following platforms: + - Generate libtorchtrt.tar.gz for the following platforms: - x86_64 cxx11-abi - x86_64 pre-cxx11-abi - TODO: Add cxx11-abi build for aarch64 when a manylinux container for aarch64 exists - Generate Python packages for Python 3.6/3.7/3.8/3.9 for x86_64 - TODO: Build a manylinux container for aarch64 - - `docker run -it -v$(pwd)/..:/workspace/TRTorch build_trtorch_wheel /bin/bash /workspace/TRTorch/py/build_whl.sh` generates all wheels - - To build container `docker build -t build_trtorch_wheel .` + - `docker run -it -v$(pwd)/..:/workspace/Torch-TensorRT build_torch_tensorrt_wheel /bin/bash /workspace/Torch-TensorRT/py/build_whl.sh` generates all wheels + - To build container `docker build -t build_torch_tensorrt_wheel .` diff --git a/docsrc/conf.py b/docsrc/conf.py index 7760cb2c40..dbd26bf4e5 100644 --- a/docsrc/conf.py +++ b/docsrc/conf.py @@ -10,7 +10,7 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
# -import os +import os import sys sys.path.append(os.path.join(os.path.dirname(__name__), '../py')) @@ -18,7 +18,7 @@ import sphinx_material # -- Project information ----------------------------------------------------- -project = 'TRTorch' +project = 'Torch-TensorRT' copyright = '2021, NVIDIA Corporation' author = 'NVIDIA Corporation' @@ -63,15 +63,15 @@ html_static_path = ['_static'] # Setup the breathe extension -breathe_projects = {"TRTorch": "./_tmp/xml"} -breathe_default_project = "TRTorch" +breathe_projects = {"Torch-TensorRT": "./_tmp/xml"} +breathe_default_project = "Torch-TensorRT" # Setup the exhale extension exhale_args = { # These arguments are required "containmentFolder": "./_cpp_api", - "rootFileName": "trtorch_cpp.rst", - "rootFileTitle": "TRTorch C++ API", + "rootFileName": "torch_tensort_cpp.rst", + "rootFileTitle": "Torch-TensorRT C++ API", "doxygenStripFromPath": "..", # Suggested optional arguments "createTreeView": True, @@ -92,10 +92,10 @@ # Material theme options (see theme.conf for more information) html_theme_options = { # Set the name of the project to appear in the navigation. - 'nav_title': 'TRTorch', + 'nav_title': 'Torch-TensorRT', # Specify a base_url used to generate sitemap.xml. If not # specified, then no sitemap will be built. - 'base_url': 'https://nvidia.github.io/TRTorch/', + 'base_url': 'https://nvidia.github.io/Torch-TensorRT/', # Set the color and the accent color 'theme_color': '84bd00', @@ -107,8 +107,8 @@ "logo_icon": "", # Set the repo location to get a badge with stats - 'repo_url': 'https://github.com/nvidia/TRTorch/', - 'repo_name': 'TRTorch', + 'repo_url': 'https://github.com/nvidia/Torch-TensorRT/', + 'repo_name': 'Torch-TensorRT', # Visible levels of the global TOC; -1 means unlimited 'globaltoc_depth': 1, @@ -118,21 +118,21 @@ 'globaltoc_includehidden': True, 'master_doc': True, "version_info": { - "master": "https://nvidia.github.io/TRTorch/", - "v0.4.1": "https://nvidia.github.io/TRTorch/v0.4.1/", - "v0.4.0": "https://nvidia.github.io/TRTorch/v0.4.0/", - "v0.3.0": "https://nvidia.github.io/TRTorch/v0.3.0/", - "v0.2.0": "https://nvidia.github.io/TRTorch/v0.2.0/", - "v0.1.0": "https://nvidia.github.io/TRTorch/v0.1.0/", - "v0.0.3": "https://nvidia.github.io/TRTorch/v0.0.3/", - "v0.0.2": "https://nvidia.github.io/TRTorch/v0.0.2/", - "v0.0.1": "https://nvidia.github.io/TRTorch/v0.0.1/", + "master": "https://nvidia.github.io/Torch-TensorRT/", + "v0.4.1": "https://nvidia.github.io/Torch-TensorRT/v0.4.1/", + "v0.4.0": "https://nvidia.github.io/Torch-TensorRT/v0.4.0/", + "v0.3.0": "https://nvidia.github.io/Torch-TensorRT/v0.3.0/", + "v0.2.0": "https://nvidia.github.io/Torch-TensorRT/v0.2.0/", + "v0.1.0": "https://nvidia.github.io/Torch-TensorRT/v0.1.0/", + "v0.0.3": "https://nvidia.github.io/Torch-TensorRT/v0.0.3/", + "v0.0.2": "https://nvidia.github.io/Torch-TensorRT/v0.0.2/", + "v0.0.1": "https://nvidia.github.io/Torch-TensorRT/v0.0.1/", } } # Tell sphinx what the primary language being documented is. primary_domain = 'cpp' -cpp_id_attributes = ["TRTORCH_API"] +cpp_id_attributes = ["TORCHTRT_API"] # Tell sphinx what the pygments highlight language should be. highlight_language = 'cpp' diff --git a/docsrc/contributors/conversion.rst b/docsrc/contributors/conversion.rst index 155e07cbb3..deb6d85a49 100644 --- a/docsrc/contributors/conversion.rst +++ b/docsrc/contributors/conversion.rst @@ -32,7 +32,7 @@ inputs and assemble an array of resources to pass to the converter. 
Inputs can b static value has been evaluated * The input is from a node that has not been converted - * TRTorch will error out here + * Torch-TensorRT will error out here Node Evaluation ----------------- @@ -49,4 +49,4 @@ Node converters map JIT nodes to layers or subgraphs of layers. They then associ and the TRT graph together in the conversion context. This allows the conversion stage to assemble the inputs for the next node. There are some cases where a node produces an output that is not a Tensor but a static result from a calculation done on inputs which need to be converted first. In this case the converter may associate the outputs in -the ``evaluated_value_map`` instead of the ``value_tensor_map``. For more information take a look at: :ref:`writing_converters` \ No newline at end of file +the ``evaluated_value_map`` instead of the ``value_tensor_map``. For more information take a look at: :ref:`writing_converters` diff --git a/docsrc/contributors/lowering.rst b/docsrc/contributors/lowering.rst index 669664efbf..7208e9c4bf 100644 --- a/docsrc/contributors/lowering.rst +++ b/docsrc/contributors/lowering.rst @@ -33,7 +33,7 @@ Dead code elimination will check if a node has side effects and not delete it if Eliminate Exeception Or Pass Pattern *************************************** - `trtorch/core/lowering/passes/exception_elimination.cpp `_ + `Torch-TensorRT/core/lowering/passes/exception_elimination.cpp `_ A common pattern in scripted modules are dimension gaurds which will throw execptions if the input dimension is not what was expected. @@ -68,7 +68,7 @@ Freeze attributes and inline constants and modules. Propogates constants in the Fuse AddMM Branches *************************************** - `trtorch/core/lowering/passes/fuse_addmm_branches.cpp `_ + `Torch-TensorRT/core/lowering/passes/fuse_addmm_branches.cpp `_ A common pattern in scripted modules is tensors of different dimensions use different constructions for implementing linear layers. We fuse these different varients into a single one that will get caught by the Unpack AddMM pass. @@ -101,7 +101,7 @@ This pass fuse the addmm or matmul + add generated by JIT back to linear Fuse Flatten Linear *************************************** - `trtorch/core/lowering/passes/fuse_flatten_linear.cpp `_ + `Torch-TensorRT/core/lowering/passes/fuse_flatten_linear.cpp `_ TensorRT implicity flattens input layers into fully connected layers when they are higher than 1D. So when there is a ``aten::flatten`` -> ``aten::linear`` pattern we remove the ``aten::flatten``. @@ -134,7 +134,7 @@ Removes _all_ tuples and raises an error if some cannot be removed, this is used Module Fallback ***************** - `trtorch/core/lowering/passes/module_fallback.cpp ` + `Torch-TensorRT/core/lowering/passes/module_fallback.cpp ` Module fallback consists of two lowering passes that must be run as a pair. The first pass is run before freezing to place delimiters in the graph around modules that should run in PyTorch. The second pass marks nodes between these delimiters after freezing to signify they should run in PyTorch. @@ -162,7 +162,7 @@ Right now, it does: Remove Contiguous *************************************** - `trtorch/core/lowering/passes/remove_contiguous.cpp `_ + `Torch-TensorRT/core/lowering/passes/remove_contiguous.cpp `_ Removes contiguous operators since we are doing TensorRT memory is already contiguous. 
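As a rough illustration of what the Remove Contiguous pass targets, here is a minimal sketch using only plain PyTorch (the module name and shapes are made up for demonstration): a ``.contiguous()`` call in a ``forward`` method shows up as an ``aten::contiguous`` node in the TorchScript IR, and the lowering phase drops it because TensorRT memory is already contiguous.

.. code-block:: python

    import torch
    import torch.nn as nn

    class TransposeFlatten(nn.Module):
        def forward(self, x):
            # transpose() returns a non-contiguous view, so .contiguous() is the
            # idiomatic way to make view() legal in eager PyTorch ...
            y = x.transpose(1, 2).contiguous()
            # ... but it appears as an aten::contiguous node in the TorchScript IR.
            return y.view(y.size(0), -1)

    scripted = torch.jit.script(TransposeFlatten())
    # The printed graph contains the aten::contiguous node that the
    # Remove Contiguous lowering pass strips out before conversion.
    print(scripted.graph)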
@@ -170,14 +170,14 @@ Removes contiguous operators since we are doing TensorRT memory is already conti Remove Dropout *************************************** - `trtorch/core/lowering/passes/remove_dropout.cpp `_ + `Torch-TensorRT/core/lowering/passes/remove_dropout.cpp `_ Removes dropout operators since we are doing inference. Remove To *************************************** - `trtorch/core/lowering/passes/remove_to.cpp `_ + `Torch-TensorRT/core/lowering/passes/remove_to.cpp `_ Removes ``aten::to`` operators that do casting, since TensorRT mangages it itself. It is important that this is one of the last passes run so that other passes have a change to move required cast operators out of the main namespace. @@ -185,7 +185,7 @@ other passes have a change to move required cast operators out of the main names Unpack AddMM *************************************** - `trtorch/core/lowering/passes/unpack_addmm.cpp `_ + `Torch-TensorRT/core/lowering/passes/unpack_addmm.cpp `_ Unpacks ``aten::addmm`` into ``aten::matmul`` and ``aten::add_`` (with an additional ``trt::const`` op to freeze the bias in the TensorRT graph). This lets us reuse the ``aten::matmul`` and ``aten::add_`` @@ -194,7 +194,7 @@ converters instead of needing a dedicated converter. Unpack LogSoftmax *************************************** - `trtorch/core/lowering/passes/unpack_log_softmax.cpp `_ + `Torch-TensorRT/core/lowering/passes/unpack_log_softmax.cpp `_ Unpacks ``aten::logsoftmax`` into ``aten::softmax`` and ``aten::log``. This lets us reuse the ``aten::softmax`` and ``aten::log`` converters instead of needing a dedicated converter. @@ -204,4 +204,4 @@ Unroll Loops `torch/csrc/jit/passes/loop_unrolling.h `_ -Unrolls the operations of compatable loops (e.g. sufficently short) so that you only have to go through the loop once. \ No newline at end of file +Unrolls the operations of compatable loops (e.g. sufficently short) so that you only have to go through the loop once. diff --git a/docsrc/contributors/phases.rst b/docsrc/contributors/phases.rst index 996714af14..b654bdc569 100644 --- a/docsrc/contributors/phases.rst +++ b/docsrc/contributors/phases.rst @@ -15,7 +15,7 @@ Lowering ^^^^^^^^^^^ :ref:`lowering` -The lowering is made up of a set of passes (some from PyTorch and some specific to TRTorch) +The lowering is made up of a set of passes (some from PyTorch and some specific to Torch-TensorRT) run over the graph IR to map the large PyTorch opset to a reduced opset that is easier to convert to TensorRT. @@ -43,4 +43,4 @@ Compilation and Runtime The final compilation phase constructs a TorchScript program to run the converted TensorRT engine. It takes a serialized engine and instantiates it within a engine manager, then the compiler will build out a JIT graph that references this engine and wraps it in a module to return to the user. -When the user executes the module, the JIT program run in the JIT runtime extended by TRTorch with the data providied from the user. \ No newline at end of file +When the user executes the module, the JIT program run in the JIT runtime extended by Torch-TensorRT with the data providied from the user. diff --git a/docsrc/contributors/runtime.rst b/docsrc/contributors/runtime.rst index ed5f3a7242..23d83b6db2 100644 --- a/docsrc/contributors/runtime.rst +++ b/docsrc/contributors/runtime.rst @@ -21,7 +21,7 @@ torch::jit::Value type). 
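For context on how the runtime pieces described below are used in practice, here is a minimal Python sketch (the file name and input shape are placeholders): importing ``torch_tensorrt`` loads the runtime extension, which registers the ``trt::execute_engine`` operator discussed in the next section, so a previously compiled module can be deserialized with plain ``torch.jit.load`` and called like any other TorchScript module.

.. code-block:: python

    import torch
    import torch_tensorrt  # importing the package registers the TensorRT runtime ops

    # "trt_mod.ts" stands in for a module previously compiled and saved with Torch-TensorRT.
    trt_mod = torch.jit.load("trt_mod.ts")
    out = trt_mod(torch.randn((1, 3, 224, 224), device="cuda"))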
TensorRT Engine Executor Op ---------------------------- -When the TRTorch is loaded, it registers an operator in the PyTorch JIT operator library called +When the Torch-TensorRT is loaded, it registers an operator in the PyTorch JIT operator library called ``trt::execute_engine(Tensor[] inputs, __torch__.torch.classes.tensorrt.Engine engine) -> Tensor[]`` which takes an instantiated engine and list of inputs. Compiled graphs store this engine in an attribute so that it is portable and serializable. When the op is called, an instnantiated engine and input tensors are popped off the runtime stack. These inputs are passed into a generic engine execution function which @@ -72,8 +72,8 @@ execution. ABI Versioning and Serialization Format ========================================= -TRTorch programs are standard TorchScript with TensorRT engines as objects embedded in the graph. Therefore there is a serialization format -for the TensorRT engines. The format for TRTorch serialized programs are versioned with an "ABI" version which tells the runtime about runtime compatibility. +Torch-TensorRT programs are standard TorchScript with TensorRT engines as objects embedded in the graph. Therefore there is a serialization format +for the TensorRT engines. The format for Torch-TensorRT serialized programs are versioned with an "ABI" version which tells the runtime about runtime compatibility. > Current ABI version is 3 @@ -82,4 +82,4 @@ The format is a vector of serialized strings. They encode the following informat * ABI Version for the program * Name of the TRT engine * Device information: Includes the target device the engine was built on, SM capability and other device information. This information is used at deserialization time to select the correct device to run the engine -* Serialized TensorRT engine \ No newline at end of file +* Serialized TensorRT engine diff --git a/docsrc/contributors/system_overview.rst b/docsrc/contributors/system_overview.rst index 75c60501ab..fb552e0101 100644 --- a/docsrc/contributors/system_overview.rst +++ b/docsrc/contributors/system_overview.rst @@ -3,7 +3,7 @@ System Overview ================ -TRTorch is primarily a C++ Library with a Python API planned. We use Bazel as our build system and target Linux x86_64 and +Torch-TensorRT is primarily a C++ Library with a Python API planned. We use Bazel as our build system and target Linux x86_64 and Linux aarch64 (only natively) right now. The compiler we use is GCC 7.5.0 and the library is untested with compilers before that version so there may be compilation errors if you try to use an older compiler. @@ -13,7 +13,7 @@ The repository is structured into: * cpp: C++ API * tests: tests of the C++ API, the core and converters * py: Python API -* notebooks: Example applications built with TRTorch +* notebooks: Example applications built with Torch-TensorRT * docs: Documentation * docsrc: Documentation Source * third_party: BUILD files for dependency libraries @@ -26,4 +26,4 @@ The core has a couple major parts: The top level compiler interface which coordi converting and generating a new module and returning it back to the user. The there are the three main phases of the compiler, the lowering phase, the conversion phase, and the execution phase. -.. include:: phases.rst \ No newline at end of file +.. 
include:: phases.rst diff --git a/docsrc/contributors/useful_links.rst b/docsrc/contributors/useful_links.rst index fb5bfa95d9..0d3719a6a6 100644 --- a/docsrc/contributors/useful_links.rst +++ b/docsrc/contributors/useful_links.rst @@ -1,6 +1,6 @@ .. _useful_links: -Useful Links for TRTorch Development +Useful Links for Torch-TensorRT Development ===================================== TensorRT Available Layers and Expected Dimensions @@ -32,4 +32,3 @@ PyTorch IR Documentation ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ * https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit/OVERVIEW.md - diff --git a/docsrc/contributors/writing_converters.rst b/docsrc/contributors/writing_converters.rst index 251f39d882..990c4dc77d 100644 --- a/docsrc/contributors/writing_converters.rst +++ b/docsrc/contributors/writing_converters.rst @@ -27,7 +27,7 @@ which will do the actual conversion: .. code-block:: c++ - auto acthardtanh TRTORCH_UNUSED = RegisterNodeConversionPatterns() + auto acthardtanh TORCHTRT_UNUSED = RegisterNodeConversionPatterns() .pattern({ "aten::hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> (Tensor)", [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { @@ -36,7 +36,7 @@ which will do the actual conversion: auto max = args[2].unwrapToDouble(); auto new_layer = ctx->net->addActivation(*in, nvinfer1::ActivationType::kCLIP); - TRTORCH_CHECK(new_layer, "Unable to create layer for aten::hardtanh"); + TORCHTRT_CHECK(new_layer, "Unable to create layer for aten::hardtanh"); new_layer->setAlpha(min); new_layer->setBeta(max); @@ -126,4 +126,4 @@ Other advice You have the benefit of the full aten library when dealing with weights and other static values. This means that you can do quite a bit of work during conversion time to produce efficient conversion. A good example is batch_norm -converter where the converter does fusion of operations with PyTorch before creating the TensorRT layer. \ No newline at end of file +converter where the converter does fusion of operations with PyTorch before creating the TensorRT layer. diff --git a/docsrc/index.rst b/docsrc/index.rst index 770b8f7ec0..a9900895c7 100644 --- a/docsrc/index.rst +++ b/docsrc/index.rst @@ -1,16 +1,16 @@ -.. TRTorch documentation master file, created by +.. Torch-TensorRT documentation master file, created by sphinx-quickstart on Mon May 4 13:43:16 2020. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -TRTorch -======== +Torch-TensorRT +============== Ahead-of-time compilation of TorchScript / PyTorch JIT for NVIDIA GPUs ----------------------------------------------------------------------- -TRTorch is a compiler for PyTorch/TorchScript, targeting NVIDIA GPUs via NVIDIA's TensorRT Deep Learning Optimizer and Runtime. -Unlike PyTorch's Just-In-Time (JIT) compiler, TRTorch is an Ahead-of-Time (AOT) compiler, meaning that before you deploy your +Torch-TensorRT is a compiler for PyTorch/TorchScript, targeting NVIDIA GPUs via NVIDIA's TensorRT Deep Learning Optimizer and Runtime. +Unlike PyTorch's Just-In-Time (JIT) compiler, Torch-TensorRT is an Ahead-of-Time (AOT) compiler, meaning that before you deploy your TorchScript code, you go through an explicit compile step to convert a standard TorchScript program into an module targeting -a TensorRT engine. TRTorch operates as a PyTorch extention and compiles modules that integrate into the JIT runtime seamlessly. +a TensorRT engine. 
Torch-TensorRT operates as a PyTorch extention and compiles modules that integrate into the JIT runtime seamlessly. After compilation using the optimized graph should feel no different than running a TorchScript module. You also have access to TensorRT's suite of configurations at compile time, so you are able to specify operating precision (FP32/FP16/INT8) and other settings for your module. @@ -24,7 +24,7 @@ Getting Started * :ref:`installation` * :ref:`getting_started` * :ref:`ptq` -* :ref:`trtorchc` +* :ref:`torchtrtc` * :ref:`use_from_pytorch` * :ref:`runtime` * :ref:`using_dla` @@ -35,9 +35,11 @@ Getting Started :hidden: tutorials/installation - tutorials/getting_started + tutorials/getting_started_with_cpp_api + tutorials/getting_started_with_python_api + tutorials/creating_torchscript_module_in_python tutorials/ptq - tutorials/trtorchc + tutorials/torchtrtc tutorials/use_from_pytorch tutorials/runtime tutorials/using_dla @@ -55,26 +57,27 @@ Getting Started Python API Documenation ------------------------ -* :ref:`trtorch_py` +* :ref:`torch_tensorrt_py` .. toctree:: :caption: Python API Documenation :maxdepth: 0 :hidden: - py_api/trtorch + py_api/torch_tensorrt + py_api/torch_tensorrt_ts py_api/logging C++ API Documenation ---------------------- -* :ref:`namespace_trtorch` +* :ref:`namespace_torch_tensorrt` .. toctree:: :caption: C++ API Documenation :maxdepth: 1 :hidden: - _cpp_api/trtorch_cpp + _cpp_api/torch_tensorrt_cpp Contributor Documentation -------------------------------- @@ -103,5 +106,3 @@ Indices :hidden: indices/supported_ops - - diff --git a/docsrc/py_api/logging.rst b/docsrc/py_api/logging.rst index 1d3fee38d1..685ae929e8 100644 --- a/docsrc/py_api/logging.rst +++ b/docsrc/py_api/logging.rst @@ -1,11 +1,11 @@ -trtorch.logging +torch_tensorrt.logging ---------------------- -.. currentmodule:: trtorch.logging +.. currentmodule:: torch_tensorrt.logging -.. automodule:: trtorch.logging +.. automodule:: torch_tensorrt.logging :members: :undoc-members: :show-inheritance: -.. autoclass:: py trtorch.logging.Level \ No newline at end of file +.. autoclass:: py torch_tensorrt.logging.Level diff --git a/docsrc/py_api/trtorch.rst b/docsrc/py_api/torch_tensorrt.rst similarity index 76% rename from docsrc/py_api/trtorch.rst rename to docsrc/py_api/torch_tensorrt.rst index 058c1afa72..5fd6e8f09a 100644 --- a/docsrc/py_api/trtorch.rst +++ b/docsrc/py_api/torch_tensorrt.rst @@ -1,12 +1,12 @@ -.. _trtorch_py: +.. _torch_tensorrt_py: -.. automodule trtorch +.. automodule torch_tensorrt :undoc-members: -trtorch +torch_tensorrt =============== -.. automodule:: trtorch +.. automodule:: torch_tensorrt :members: :undoc-members: :show-inheritance: @@ -20,16 +20,10 @@ Functions .. autofunction:: convert_method_to_trt_engine -.. autofunction:: check_method_op_support - -.. autofunction:: embed_engine_in_new_module - .. autofunction:: get_build_info .. autofunction:: dump_build_info -.. autofunction:: TensorRTCompileSpec - Classes --------- @@ -59,4 +53,3 @@ Submodules :maxdepth: 1 logging - diff --git a/docsrc/py_api/torch_tensorrt_ts.rst b/docsrc/py_api/torch_tensorrt_ts.rst new file mode 100644 index 0000000000..bd72f471f9 --- /dev/null +++ b/docsrc/py_api/torch_tensorrt_ts.rst @@ -0,0 +1,23 @@ +.. _torch_tensorrt_ts_py: + +.. automodule torch_tensorrt.ts + :undoc-members: + +torch_tensorrt +=============== + +.. automodule:: torch_tensorrt.ts + :members: + :undoc-members: + :show-inheritance: + +Functions +------------ + +.. autofunction:: convert_method_to_trt_engine + +.. 
autofunction:: check_method_op_support + +.. autofunction:: embed_engine_in_new_module + +.. autofunction:: TensorRTCompileSpec diff --git a/docsrc/tutorials/creating_torchscript_module_in_python.rst b/docsrc/tutorials/creating_torchscript_module_in_python.rst new file mode 100644 index 0000000000..1914d161ff --- /dev/null +++ b/docsrc/tutorials/creating_torchscript_module_in_python.rst @@ -0,0 +1,138 @@ +.. _creating_a_ts_mod: + +Creating a TorchScript Module +------------------------------ + +Once you have a trained model you want to compile with TRTorch, you need to start by converting that model from Python code to TorchScript code. +PyTorch has detailed documentation on how to do this https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html but briefly here is the +here is key background information and the process: + +PyTorch programs are based around ``Module`` s which can be used to compose higher level modules. ``Modules`` contain a constructor to set up the modules, parameters and sub-modules +and a forward function which describes how to use the parameters and submodules when the module is invoked. + +For example, we can define a LeNet module like this: + +.. code-block:: python + :linenos: + + import torch.nn as nn + import torch.nn.functional as F + + class LeNetFeatExtractor(nn.Module): + def __init__(self): + super(LeNetFeatExtractor, self).__init__() + self.conv1 = nn.Conv2d(1, 6, 3) + self.conv2 = nn.Conv2d(6, 16, 3) + + def forward(self, x): + x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2)) + x = F.max_pool2d(F.relu(self.conv2(x)), 2) + return x + + class LeNetClassifier(nn.Module): + def __init__(self): + super(LeNetClassifier, self).__init__() + self.fc1 = nn.Linear(16 * 6 * 6, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = torch.flatten(x,1) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + class LeNet(nn.Module): + def __init__(self): + super(LeNet, self).__init__() + self.feat = LeNetFeatExtractor() + self.classifer = LeNetClassifier() + + def forward(self, x): + x = self.feat(x) + x = self.classifer(x) + return x + +. + + Obviously you may want to consolidate such a simple model into a single module but we can see the composability of PyTorch here + +From here are two pathways for going from PyTorch Python code to TorchScript code: Tracing and Scripting. + +Tracing follows the path of execution when the module is called and records what happens. +To trace an instance of our LeNet module, we can call ``torch.jit.trace`` with an example input. + +.. code-block:: python + + import torch.jit + + model = LeNet() + input_data = torch.empty([1,1,32,32]) + traced_model = torch.jit.trace(model, input_data) + +Scripting actually inspects your code with a compiler and generates an equivalent TorchScript program. The difference is that since tracing +is following the execution of your module, it cannot pick up control flow for instance. By working from the Python code, the compiler can +include these components. We can run the script compiler on our LeNet module by calling ``torch.jit.script`` + +.. code-block:: python + + import torch.jit + + model = LeNet() + script_model = torch.jit.script(model) + +There are reasons to use one path or another, the PyTorch documentation has information on how to choose. 
From a Torch-TensorRT perspective, there is
+better support (i.e. your module is more likely to compile) for traced modules because it doesn't include all the complexities of a complete
+programming language, though both paths are supported.
+
+After scripting or tracing your module, you are given back a TorchScript Module. This contains the code and parameters used to run the module stored
+in an intermediate representation that Torch-TensorRT can consume.
+
+Here is what the LeNet traced module IR looks like:
+
+.. code-block:: none
+
+    graph(%self.1 : __torch__.___torch_mangle_10.LeNet,
+        %input.1 : Float(1, 1, 32, 32)):
+      %129 : __torch__.___torch_mangle_9.LeNetClassifier = prim::GetAttr[name="classifer"](%self.1)
+      %119 : __torch__.___torch_mangle_5.LeNetFeatExtractor = prim::GetAttr[name="feat"](%self.1)
+      %137 : Tensor = prim::CallMethod[name="forward"](%119, %input.1)
+      %138 : Tensor = prim::CallMethod[name="forward"](%129, %137)
+      return (%138)
+
+and the LeNet scripted module IR:
+
+.. code-block:: none
+
+    graph(%self : __torch__.LeNet,
+        %x.1 : Tensor):
+      %2 : __torch__.LeNetFeatExtractor = prim::GetAttr[name="feat"](%self)
+      %x.3 : Tensor = prim::CallMethod[name="forward"](%2, %x.1) # x.py:38:12
+      %5 : __torch__.LeNetClassifier = prim::GetAttr[name="classifer"](%self)
+      %x.5 : Tensor = prim::CallMethod[name="forward"](%5, %x.3) # x.py:39:12
+      return (%x.5)
+
+You can see that the IR preserves the module structure we have in our Python code.
+
+.. _ts_in_py:
+
+Working with TorchScript in Python
+-----------------------------------
+
+TorchScript Modules are run the same way you run normal PyTorch modules. You can run the forward pass using the
+``forward`` method or just calling the module ``torch_script_module(in_tensor)``. The JIT compiler will compile
+and optimize the module on the fly and then return the results.
+
+Saving TorchScript Module to Disk
+-----------------------------------
+
+For either traced or scripted modules, you can save the module to disk with the following command:
+
+.. code-block:: python
+
+    import torch.jit
+
+    model = LeNet()
+    script_model = torch.jit.script(model)
+    script_model.save("lenet_scripted.ts")
diff --git a/docsrc/tutorials/getting_started.rst b/docsrc/tutorials/getting_started.rst
deleted file mode 100644
index e2f6735d3e..0000000000
--- a/docsrc/tutorials/getting_started.rst
+++ /dev/null
@@ -1,531 +0,0 @@
-.. _getting_started:
-
-Getting Started
-================
-
-If you haven't already, aquire a tarball of the library by following the instructions in :ref:`Installation`
-
-Background
-*********************
-
-.. _creating_a_ts_mod:
-Creating a TorchScript Module
-------------------------------
-
-Once you have a trained model you want to compile with TRTorch, you need to start by converting that model from Python code to TorchScript code.
-PyTorch has detailed documentation on how to do this https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html but briefly here is the
-here is key background information and the process:
-
-PyTorch programs are based around ``Module`` s which can be used to compose higher level modules. ``Modules`` contain a constructor to set up the modules, parameters and sub-modules
-and a forward function which describes how to use the parameters and submodules when the module is invoked.
-
-For example, we can define a LeNet module like this:
-
-..
code-block:: python - :linenos: - - import torch.nn as nn - import torch.nn.functional as F - - class LeNetFeatExtractor(nn.Module): - def __init__(self): - super(LeNetFeatExtractor, self).__init__() - self.conv1 = nn.Conv2d(1, 6, 3) - self.conv2 = nn.Conv2d(6, 16, 3) - - def forward(self, x): - x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2)) - x = F.max_pool2d(F.relu(self.conv2(x)), 2) - return x - - class LeNetClassifier(nn.Module): - def __init__(self): - super(LeNetClassifier, self).__init__() - self.fc1 = nn.Linear(16 * 6 * 6, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - x = torch.flatten(x,1) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - - class LeNet(nn.Module): - def __init__(self): - super(LeNet, self).__init__() - self.feat = LeNetFeatExtractor() - self.classifer = LeNetClassifier() - - def forward(self, x): - x = self.feat(x) - x = self.classifer(x) - return x - -. - - Obviously you may want to consolidate such a simple model into a single module but we can see the composability of PyTorch here - -From here are two pathways for going from PyTorch Python code to TorchScript code: Tracing and Scripting. - -Tracing follows the path of execution when the module is called and records what happens. -To trace an instance of our LeNet module, we can call ``torch.jit.trace`` with an example input. - -.. code-block:: python - - import torch.jit - - model = LeNet() - input_data = torch.empty([1,1,32,32]) - traced_model = torch.jit.trace(model, input_data) - -Scripting actually inspects your code with a compiler and generates an equivalent TorchScript program. The difference is that since tracing -is following the execution of your module, it cannot pick up control flow for instance. By working from the Python code, the compiler can -include these components. We can run the script compiler on our LeNet module by calling ``torch.jit.script`` - -.. code-block:: python - - import torch.jit - - model = LeNet() - script_model = torch.jit.script(model) - -There are reasons to use one path or another, the PyTorch documentation has information on how to choose. From a TRTorch prespective, there is -better support (i.e your module is more likely to compile) for traced modules because it doesn't include all the complexities of a complete -programming language, though both paths supported. - -After scripting or tracing your module, you are given back a TorchScript Module. This contains the code and parameters used to run the module stored -in a intermediate representation that TRTorch can consume. - -Here is what the LeNet traced module IR looks like: - -.. code-block:: none - - graph(%self.1 : __torch__.___torch_mangle_10.LeNet, - %input.1 : Float(1, 1, 32, 32)): - %129 : __torch__.___torch_mangle_9.LeNetClassifier = prim::GetAttr[name="classifer"](%self.1) - %119 : __torch__.___torch_mangle_5.LeNetFeatExtractor = prim::GetAttr[name="feat"](%self.1) - %137 : Tensor = prim::CallMethod[name="forward"](%119, %input.1) - %138 : Tensor = prim::CallMethod[name="forward"](%129, %137) - return (%138) - -and the LeNet scripted module IR: - -.. 
code-block:: none - - graph(%self : __torch__.LeNet, - %x.1 : Tensor): - %2 : __torch__.LeNetFeatExtractor = prim::GetAttr[name="feat"](%self) - %x.3 : Tensor = prim::CallMethod[name="forward"](%2, %x.1) # x.py:38:12 - %5 : __torch__.LeNetClassifier = prim::GetAttr[name="classifer"](%self) - %x.5 : Tensor = prim::CallMethod[name="forward"](%5, %x.3) # x.py:39:12 - return (%x.5) - -You can see that the IR preserves the module structure we have in our python code. - -.. _ts_in_py: - -Working with TorchScript in Python ------------------------------------ - -TorchScript Modules are run the same way you run normal PyTorch modules. You can run the forward pass using the -``forward`` method or just calling the module ``torch_scirpt_module(in_tensor)`` The JIT compiler will compile -and optimize the module on the fly and then returns the results. - -Saving TorchScript Module to Disk ------------------------------------ - -For either traced or scripted modules, you can save the module to disk with the following command - -.. code-block:: python - - import torch.jit - - model = LeNet() - script_model = torch.jit.script(model) - script_model.save("lenet_scripted.ts") - -Using TRTorch -********************* - -Now that there is some understanding of TorchScript and how to use it, we can now complete the pipeline and compile -our TorchScript into TensorRT accelerated TorchScript. Unlike the PyTorch JIT compiler, TRTorch is an Ahead-of-Time -(AOT) compiler. This means that unlike with PyTorch where the JIT compiler compiles from the high level PyTorch IR -to kernel implementation at runtime, modules that are to be compiled with TRTorch are compiled fully before runtime -(consider how you use a C compiler for an analogy). TRTorch has 3 main interfaces for using the compiler. You can -use a CLI application similar to how you may use GCC called ``trtorchc``, or you can embed the compiler in a model -freezing application / pipeline. - -.. _trtorch_quickstart: - -[TRTorch Quickstart] Compiling TorchScript Modules with ``trtorchc`` ---------------------------------------------------------------------- - -An easy way to get started with TRTorch and to check if your model can be supported without extra work is to run it through -``trtorchc``, which supports almost all features of the compiler from the command line including post training quantization -(given a previously created calibration cache). For example we can compile our lenet model by setting our preferred operating -precision and input size. This new TorchScript file can be loaded into Python (note: you need to ``import trtorch`` before loading -these compiled modules because the compiler extends the PyTorch the deserializer and runtime to execute compiled modules). - -.. code-block:: shell - - ❯ trtorchc -p f16 lenet_scripted.ts trt_lenet_scripted.ts "(1,1,32,32)" - - ❯ python3 - Python 3.6.9 (default, Apr 18 2020, 01:56:04) - [GCC 8.4.0] on linux - Type "help", "copyright", "credits" or "license" for more information. - >>> import torch - >>> import trtorch - >>> ts_model = torch.jit.load(“trt_lenet_scripted.ts”) - >>> ts_model(torch.randn((1,1,32,32)).to(“cuda”).half()) - -You can learn more about ``trtorchc`` usage here: :ref:`trtorchc` - -.. 
_compile_py: - -Compiling with TRTorch in Python ---------------------------------- - -To compile your TorchScript module with TRTorch embedded into Python, all you need to do is provide the module and some compiler settings -to TRTorch and you will be returned an optimized TorchScript module to run or add into another PyTorch module. The -only required setting is the input size or input range which is defined as a list of either list types like ``lists``, ``tuples`` -or PyTorch ``size`` objects or dictionaries of minimum, optimial and maximum sizes. You can also specify settings such as -operating precision for the engine or target device. After compilation you can save the module just like any other module -to load in a deployment application. In order to load a TensorRT/TorchScript module, make sure you first import ``trtorch``. - -.. code-block:: python - - import trtorch - - ... - - script_model.eval() # torch module needs to be in eval (not training) mode - - compile_settings = { - "inputs": [trtorch.Input( - min_shape=[1, 1, 16, 16], - opt_shape=[1, 1, 32, 32], - max_shape=[1, 1, 64, 64], - dtype=torch.half, - ), - ], - "enable_precisions": {torch.float, torch.half} # Run with fp16 - } - - trt_ts_module = trtorch.compile(script_model, compile_settings) - - input_data = input_data.to('cuda').half() - result = trt_ts_module(input_data) - torch.jit.save(trt_ts_module, "trt_ts_module.ts") - -.. code-block:: python - - # Deployment application - import torch - import trtorch - - trt_ts_module = torch.jit.load("trt_ts_module.ts") - input_data = input_data.to('cuda').half() - result = trt_ts_module(input_data) - -.. _ts_in_cc: - -Working with TorchScript in C++ --------------------------------- - -If we are developing an application to deploy with C++, we can save either our traced or scripted module using ``torch.jit.save`` -which will serialize the TorchScript code, weights and other information into a package. This is also where our dependency on Python ends. - -.. code-block:: python - - torch_script_module.save("lenet.jit.pt") - -From here we can now load our TorchScript module in C++ - -.. code-block:: c++ - - #include // One-stop header. - - #include - #include - - int main(int argc, const char* argv[]) { - torch::jit::Module module; - try { - // Deserialize the ScriptModule from a file using torch::jit::load(). - module = torch::jit::load(""); - } - catch (const c10::Error& e) { - std::cerr << "error loading the model\n"; - return -1; - } - - std::cout << "ok\n"; - - -You can do full training and inference in C++ with PyTorch / LibTorch if you would like, you can even define your modules in C++ and -have access to the same powerful tensor library that backs PyTorch. (For more information: https://pytorch.org/cppdocs/). -For instance we can do inference with our LeNet module like this: - -.. code-block:: c++ - - mod.eval(); - torch::Tensor in = torch::randn({1, 1, 32, 32}); - auto out = mod.forward(in); - -and to run on the GPU: - -.. code-block:: c++ - - mod.eval(); - mod.to(torch::kCUDA); - torch::Tensor in = torch::randn({1, 1, 32, 32}, torch::kCUDA); - auto out = mod.forward(in); - -As you can see it is pretty similar to the Python API. When you call the ``forward`` method, you invoke the PyTorch JIT compiler, which will optimize and run your TorchScript code. - -.. 
_compile_cpp: - -Compiling with TRTorch in C++ ------------------------------- -We are also at the point were we can compile and optimize our module with TRTorch, but instead of in a JIT fashion we must do it ahead-of-time (AOT) i.e. before we start doing actual inference work -since it takes a bit of time to optimize the module, it would not make sense to do this every time you run the module or even the first time you run it. - -With out module loaded, we can feed it into the TRTorch compiler. When we do so we must provide some information on the expected input size and also configure any additional settings. - -.. code-block:: c++ - - #include "torch/script.h" - #include "trtorch/trtorch.h" - ... - - mod.to(at::kCUDA); - mod.eval(); - - auto in = torch::randn({1, 1, 32, 32}, {torch::kCUDA}); - auto trt_mod = trtorch::CompileGraph(mod, std::vector{{in.sizes()}}); - auto out = trt_mod.forward({in}); - -Thats it! Now the graph runs primarily not with the JIT compiler but using TensorRT (though we execute the graph using the JIT runtime). - -We can also set settings like operating precision to run in FP16. - -.. code-block:: c++ - - #include "torch/script.h" - #include "trtorch/trtorch.h" - ... - - mod.to(at::kCUDA); - mod.eval(); - - auto in = torch::randn({1, 1, 32, 32}, {torch::kCUDA}).to(torch::kHALF); - auto input_sizes = std::vector({in.sizes()}); - trtorch::CompileSpec info(input_sizes); - info.enable_precisions.insert(torch::kHALF); - auto trt_mod = trtorch::CompileGraph(mod, info); - auto out = trt_mod.forward({in}); - -And now we are running the module in FP16 precision. You can then save the module to load later. - -.. code-block:: c++ - - trt_mod.save("") - -TRTorch compiled TorchScript modules are loaded in the same way as normal TorchScript module. Make sure your deployment application is linked against ``libtrtorch.so`` - -.. code-block:: c++ - - #include "torch/script.h" - #include "trtorch/trtorch.h" - - int main(int argc, const char* argv[]) { - torch::jit::Module module; - try { - // Deserialize the ScriptModule from a file using torch::jit::load(). - module = torch::jit::load(""); - } - catch (const c10::Error& e) { - std::cerr << "error loading the model\n"; - return -1; - } - - torch::Tensor in = torch::randn({1, 1, 32, 32}, torch::kCUDA); - auto out = mod.forward(in); - - std::cout << "ok\n"; - } - -If you want to save the engine produced by TRTorch to use in a TensorRT application you can use the ``ConvertGraphToTRTEngine`` API. - -.. code-block:: c++ - - #include "torch/script.h" - #include "trtorch/trtorch.h" - ... - - mod.to(at::kCUDA); - mod.eval(); - - auto in = torch::randn({1, 1, 32, 32}, {torch::kCUDA}).to(torch::kHALF); - auto input_sizes = std::vector({in.sizes()}); - trtorch::CompileSpec info(input_sizes); - info.enabled_precisions.insert(torch::kHALF); - auto trt_mod = trtorch::ConvertGraphToTRTEngine(mod, "forward", info); - std::ofstream out("/tmp/engine_converted_from_jit.trt"); - out << engine; - out.close(); - -.. _under_the_hood: - -Under The Hood ---------------- - -When a module is provided to TRTorch, the compiler starts by mapping a graph like you saw above to a graph like this: - -.. 
code-block:: none - - graph(%input.2 : Tensor): - %2 : Float(84, 10) = prim::Constant[value=]() - %3 : Float(120, 84) = prim::Constant[value=]() - %4 : Float(576, 120) = prim::Constant[value=]() - %5 : int = prim::Constant[value=-1]() # x.py:25:0 - %6 : int[] = prim::Constant[value=annotate(List[int], [])]() - %7 : int[] = prim::Constant[value=[2, 2]]() - %8 : int[] = prim::Constant[value=[0, 0]]() - %9 : int[] = prim::Constant[value=[1, 1]]() - %10 : bool = prim::Constant[value=1]() # ~/.local/lib/python3.6/site-packages/torch/nn/modules/conv.py:346:0 - %11 : int = prim::Constant[value=1]() # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:539:0 - %12 : bool = prim::Constant[value=0]() # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:539:0 - %self.classifer.fc3.bias : Float(10) = prim::Constant[value= 0.0464 0.0383 0.0678 0.0932 0.1045 -0.0805 -0.0435 -0.0818 0.0208 -0.0358 [ CUDAFloatType{10} ]]() - %self.classifer.fc2.bias : Float(84) = prim::Constant[value=]() - %self.classifer.fc1.bias : Float(120) = prim::Constant[value=]() - %self.feat.conv2.weight : Float(16, 6, 3, 3) = prim::Constant[value=]() - %self.feat.conv2.bias : Float(16) = prim::Constant[value=]() - %self.feat.conv1.weight : Float(6, 1, 3, 3) = prim::Constant[value=]() - %self.feat.conv1.bias : Float(6) = prim::Constant[value= 0.0530 -0.1691 0.2802 0.1502 0.1056 -0.1549 [ CUDAFloatType{6} ]]() - %input0.4 : Tensor = aten::_convolution(%input.2, %self.feat.conv1.weight, %self.feat.conv1.bias, %9, %8, %9, %12, %8, %11, %12, %12, %10) # ~/.local/lib/python3.6/site-packages/torch/nn/modules/conv.py:346:0 - %input0.5 : Tensor = aten::relu(%input0.4) # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:1063:0 - %input1.2 : Tensor = aten::max_pool2d(%input0.5, %7, %6, %8, %9, %12) # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:539:0 - %input0.6 : Tensor = aten::_convolution(%input1.2, %self.feat.conv2.weight, %self.feat.conv2.bias, %9, %8, %9, %12, %8, %11, %12, %12, %10) # ~/.local/lib/python3.6/site-packages/torch/nn/modules/conv.py:346:0 - %input2.1 : Tensor = aten::relu(%input0.6) # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:1063:0 - %x.1 : Tensor = aten::max_pool2d(%input2.1, %7, %6, %8, %9, %12) # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:539:0 - %input.1 : Tensor = aten::flatten(%x.1, %11, %5) # x.py:25:0 - %27 : Tensor = aten::matmul(%input.1, %4) - %28 : Tensor = trt::const(%self.classifer.fc1.bias) - %29 : Tensor = aten::add_(%28, %27, %11) - %input0.2 : Tensor = aten::relu(%29) # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:1063:0 - %31 : Tensor = aten::matmul(%input0.2, %3) - %32 : Tensor = trt::const(%self.classifer.fc2.bias) - %33 : Tensor = aten::add_(%32, %31, %11) - %input1.1 : Tensor = aten::relu(%33) # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:1063:0 - %35 : Tensor = aten::matmul(%input1.1, %2) - %36 : Tensor = trt::const(%self.classifer.fc3.bias) - %37 : Tensor = aten::add_(%36, %35, %11) - return (%37) - (CompileGraph) - -The graph has now been transformed from a collection of modules, each managing their own parameters into a single graph with the parameters inlined -into the graph and all of the operations laid out. TRTorch has also executed a number of optimizations and mappings to make the graph easier to translate to TensorRT. -From here the compiler can assemble the TensorRT engine by following the dataflow through the graph. 
- -When the graph construction phase is complete, TRTorch produces a serialized TensorRT engine. From here depending on the API, this engine is returned -to the user or moves into the graph construction phase. Here TRTorch creates a JIT Module to execute the TensorRT engine which will be instantiated and managed -by the TRTorch runtime. - -Here is the graph that you get back after compilation is complete: - -.. code-block:: none - - graph(%self_1 : __torch__.lenet, %input_0 : Tensor): - %1 : ...trt.Engine = prim::GetAttr[name="lenet"](%self_1) - %3 : Tensor[] = prim::ListConstruct(%input_0) - %4 : Tensor[] = trt::execute_engine(%3, %1) - %5 : Tensor = prim::ListUnpack(%4) - return (%5) - - -You can see the call where the engine is executed, after extracting the attribute containing the engine and constructing a list of inputs, then returns the tensors back to the user. - -.. _unsupported_ops: - -Working with Unsupported Operators ------------------------------------ - -TRTorch is a new library and the PyTorch operator library is quite large, so there will be ops that aren't supported natively by the compiler. You can either use the composition techinques -shown above to make modules are fully TRTorch supported and ones that are not and stitch the modules together in the deployment application or you can register converters for missing ops. - - You can check support without going through the full compilation pipleine using the ``trtorch::CheckMethodOperatorSupport(const torch::jit::Module& module, std::string method_name)`` api - to see what operators are not supported. ``trtorchc`` automatically checks modules with this method before starting compilation and will print out a list of operators that are not supported. - -.. _custom_converters: - -Registering Custom Converters -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Operations are mapped to TensorRT through the use of modular converters, a function that takes a node from a the JIT graph and produces an equivalent layer or subgraph in TensorRT. -TRTorch ships with a library of these converters stored in a registry, that will be executed depending on the node being parsed. For instance a ``aten::relu(%input0.4)`` instruction will trigger -the relu converter to be run on it, producing an activation layer in the TensorRT graph. But since this library is not exhaustive you may need to write your own to get TRTorch -to support your module. - -Shipped with the TRTorch distribution are the internal core API headers. You can therefore access the converter registry and add a converter for the op you need. - -For example, if we try to compile a graph with a build of TRTorch that doesn't support the flatten operation (``aten::flatten``) you may see this error: - -.. code-block:: none - - terminate called after throwing an instance of 'trtorch::Error' - what(): [enforce fail at core/conversion/conversion.cpp:109] Expected converter to be true but got false - Unable to convert node: %input.1 : Tensor = aten::flatten(%x.1, %11, %5) # x.py:25:0 (conversion.AddLayer) - Schema: aten::flatten.using_ints(Tensor self, int start_dim=0, int end_dim=-1) -> (Tensor) - Converter for aten::flatten requested, but no such converter was found. - If you need a converter for this operator, you can try implementing one yourself - or request a converter: https://www.github.com/NVIDIA/TRTorch/issues - -We can register a converter for this operator in our application. 
All of the tools required to build a converter can be imported by including ``trtorch/core/conversion/converters/converters.h``. -We start by creating an instance of the self-registering class ``trtorch::core::conversion::converters::RegisterNodeConversionPatterns()`` which will register converters -in the global converter registry, associating a function schema like ``aten::flatten.using_ints(Tensor self, int start_dim=0, int end_dim=-1) -> (Tensor)`` with a lambda that -will take the state of the conversion, the node/operation in question to convert and all of the inputs to the node and produces as a side effect a new layer in the TensorRT network. -Arguments are passed as a vector of inspectable unions of TensorRT ``ITensors`` and Torch ``IValues`` in the order arguments are listed in the schema. - -Below is a implementation of a ``aten::flatten`` converter that we can use in our application. You have full access to the Torch and TensorRT libraries in the converter implementation. So -for example we can quickly get the output size by just running the operation in PyTorch instead of implementing the full calculation outself like we do below for this flatten converter. - -.. code-block:: c++ - - #include "torch/script.h" - #include "trtorch/trtorch.h" - #include "trtorch/core/conversion/converters/converters.h" - - static auto flatten_converter = trtorch::core::conversion::converters::RegisterNodeConversionPatterns() - .pattern({ - "aten::flatten.using_ints(Tensor self, int start_dim=0, int end_dim=-1) -> (Tensor)", - [](trtorch::core::conversion::ConversionCtx* ctx, - const torch::jit::Node* n, - trtorch::core::conversion::converters::args& args) -> bool { - auto in = args[0].ITensor(); - auto start_dim = args[1].unwrapToInt(); - auto end_dim = args[2].unwrapToInt(); - auto in_shape = trtorch::core::util::toVec(in->getDimensions()); - auto out_shape = torch::flatten(torch::rand(in_shape), start_dim, end_dim).sizes(); - - auto shuffle = ctx->net->addShuffle(*in); - shuffle->setReshapeDimensions(trtorch::core::util::toDims(out_shape)); - shuffle->setName(trtorch::core::util::node_info(n).c_str()); - - auto out_tensor = ctx->AssociateValueAndTensor(n->outputs()[0], shuffle->getOutput(0)); - return true; - } - }); - - int main() { - ... - -To use this converter in Python, it is recommended to use PyTorch's `C++ / CUDA Extention `_ -template to wrap your library of converters into a ``.so`` that you can load with ``ctypes.CDLL()`` in your Python application. - -You can find more information on all the details of writing converters in the contributors documentation (:ref:`writing_converters`). -If you find yourself with a large library of converter implementations, do consider upstreaming them, PRs are welcome and it would be great for the community to benefit as well. - diff --git a/docsrc/tutorials/getting_started_with_cpp_api.rst b/docsrc/tutorials/getting_started_with_cpp_api.rst new file mode 100644 index 0000000000..7e9ee2c618 --- /dev/null +++ b/docsrc/tutorials/getting_started_with_cpp_api.rst @@ -0,0 +1,338 @@ +.. _getting_started: + +Getting Started with C++ +======================== + +If you haven't already, acquire a tarball of the library by following the instructions in :ref:`Installation` + +Using Torch-TensorRT in C++ +*************************** +Torch-TensorRT C++ API accepts TorchScript modules (generated either from ``torch.jit.script`` or ``torch.jit.trace``) as an input and returns +a Torchscript module (optimized using TensorRT). 
This requires users to use PyTorch (in Python) to generate TorchScript modules beforehand.
+Please refer to the :ref:`creating_torchscript_module_in_python` section to generate TorchScript graphs.
+
+
+.. _torch_tensorrt_quickstart:
+
+[Torch-TensorRT Quickstart] Compiling TorchScript Modules with ``torchtrtc``
+------------------------------------------------------------------------------
+
+An easy way to get started with Torch-TensorRT and to check if your model can be supported without extra work is to run it through
+``torchtrtc``, which supports almost all features of the compiler from the command line, including post training quantization
+(given a previously created calibration cache). For example, we can compile our LeNet model by setting our preferred operating
+precision and input size. This new TorchScript file can be loaded into Python (note: you need to ``import torch_tensorrt`` before loading
+these compiled modules because the compiler extends the PyTorch deserializer and runtime to execute compiled modules).
+
+.. code-block:: shell
+
+   ❯ torchtrtc -p f16 lenet_scripted.ts trt_lenet_scripted.ts "(1,1,32,32)"
+
+   ❯ python3
+   Python 3.6.9 (default, Apr 18 2020, 01:56:04)
+   [GCC 8.4.0] on linux
+   Type "help", "copyright", "credits" or "license" for more information.
+   >>> import torch
+   >>> import torch_tensorrt
+   >>> ts_model = torch.jit.load("trt_lenet_scripted.ts")
+   >>> ts_model(torch.randn((1,1,32,32)).to("cuda").half())
+
+You can learn more about ``torchtrtc`` usage here: :ref:`torchtrtc`
+
+.. _ts_in_cc:
+
+Working with TorchScript in C++
+--------------------------------
+
+If we are developing an application to deploy with C++, we can save either our traced or scripted module using ``torch.jit.save``,
+which will serialize the TorchScript code, weights and other information into a package. This is also where our dependency on Python ends.
+
+.. code-block:: python
+
+   torch_script_module.save("lenet.jit.pt")
+
+From here we can now load our TorchScript module in C++:
+
+.. code-block:: c++
+
+   #include <torch/script.h> // One-stop header.
+
+   #include <iostream>
+   #include <memory>
+
+   int main(int argc, const char* argv[]) {
+     torch::jit::Module module;
+     try {
+       // Deserialize the ScriptModule from a file using torch::jit::load().
+       module = torch::jit::load("");
+     }
+     catch (const c10::Error& e) {
+       std::cerr << "error loading the model\n";
+       return -1;
+     }
+
+     std::cout << "ok\n";
+
+
+You can do full training and inference in C++ with PyTorch / LibTorch if you would like; you can even define your modules in C++ and
+have access to the same powerful tensor library that backs PyTorch (for more information see https://pytorch.org/cppdocs/).
+For instance, we can do inference with our LeNet module like this:
+
+.. code-block:: c++
+
+   mod.eval();
+   torch::Tensor in = torch::randn({1, 1, 32, 32});
+   auto out = mod.forward(in);
+
+and to run on the GPU:
+
+.. code-block:: c++
+
+   mod.eval();
+   mod.to(torch::kCUDA);
+   torch::Tensor in = torch::randn({1, 1, 32, 32}, torch::kCUDA);
+   auto out = mod.forward(in);
+
+As you can see it is pretty similar to the Python API. When you call the ``forward`` method, you invoke the PyTorch JIT compiler, which will optimize and run your TorchScript code.
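+
+Note that ``torch::jit::Module::forward`` formally takes a ``std::vector<torch::jit::IValue>``, so in a full application the call usually looks
+more like the sketch below. This is only an illustration of the LibTorch calling convention (``mod`` is assumed to be the module loaded above), not part of the Torch-TensorRT API:
+
+.. code-block:: c++
+
+   mod.eval();
+   mod.to(torch::kCUDA);
+
+   // wrap the input tensor in IValues and unpack the result back into a Tensor
+   std::vector<torch::jit::IValue> inputs;
+   inputs.push_back(torch::randn({1, 1, 32, 32}, torch::kCUDA));
+   auto out = mod.forward(inputs).toTensor();
+   std::cout << out.sizes() << std::endl;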
+
+.. _compile_cpp:
+
+Compiling with Torch-TensorRT in C++
+-------------------------------------
+We are also at the point where we can compile and optimize our module with Torch-TensorRT, but instead of in a JIT fashion we must do it ahead-of-time (AOT), i.e. before we start doing actual inference work.
+Since it takes a bit of time to optimize the module, it would not make sense to do this every time you run the module or even the first time you run it.
+
+With our module loaded, we can feed it into the Torch-TensorRT compiler. When we do so, we must provide some information on the expected input size and also configure any additional settings.
+
+.. code-block:: c++
+
+   #include "torch/script.h"
+   #include "torch_tensorrt/torch_tensorrt.h"
+   ...
+
+   mod.to(at::kCUDA);
+   mod.eval();
+
+   auto in = torch::randn({1, 1, 32, 32}, {torch::kCUDA});
+   auto trt_mod = torch_tensorrt::CompileGraph(mod, std::vector<std::vector<int64_t>>{{in.sizes()}});
+   auto out = trt_mod.forward({in});
+
+That's it! Now the graph runs primarily not with the JIT compiler but using TensorRT (though we still execute the graph using the JIT runtime).
+
+We can also set settings like the operating precision to run in FP16.
+
+.. code-block:: c++
+
+   #include "torch/script.h"
+   #include "torch_tensorrt/torch_tensorrt.h"
+   ...
+
+   mod.to(at::kCUDA);
+   mod.eval();
+
+   auto in = torch::randn({1, 1, 32, 32}, {torch::kCUDA}).to(torch::kHALF);
+   auto input_sizes = std::vector<std::vector<int64_t>>({in.sizes()});
+   torch_tensorrt::CompileSpec info(input_sizes);
+   info.enabled_precisions.insert(torch::kHALF);
+   auto trt_mod = torch_tensorrt::CompileGraph(mod, info);
+   auto out = trt_mod.forward({in});
+
+And now we are running the module in FP16 precision. You can then save the module to load later.
+
+.. code-block:: c++
+
+   trt_mod.save("");
+
+Torch-TensorRT compiled TorchScript modules are loaded in the same way as normal TorchScript modules. Make sure your deployment application is linked against ``libtorchtrt.so``.
+
+.. code-block:: c++
+
+   #include "torch/script.h"
+   #include "torch_tensorrt/torch_tensorrt.h"
+
+   int main(int argc, const char* argv[]) {
+     torch::jit::Module module;
+     try {
+       // Deserialize the ScriptModule from a file using torch::jit::load().
+       module = torch::jit::load("");
+     }
+     catch (const c10::Error& e) {
+       std::cerr << "error loading the model\n";
+       return -1;
+     }
+
+     torch::Tensor in = torch::randn({1, 1, 32, 32}, torch::kCUDA);
+     auto out = module.forward({in});
+
+     std::cout << "ok\n";
+   }
+
+If you want to save the engine produced by Torch-TensorRT to use in a TensorRT application, you can use the ``ConvertGraphToTRTEngine`` API.
+
+.. code-block:: c++
+
+   #include "torch/script.h"
+   #include "torch_tensorrt/torch_tensorrt.h"
+   ...
+
+   mod.to(at::kCUDA);
+   mod.eval();
+
+   auto in = torch::randn({1, 1, 32, 32}, {torch::kCUDA}).to(torch::kHALF);
+   auto input_sizes = std::vector<std::vector<int64_t>>({in.sizes()});
+   torch_tensorrt::CompileSpec info(input_sizes);
+   info.enabled_precisions.insert(torch::kHALF);
+   auto engine = torch_tensorrt::ConvertGraphToTRTEngine(mod, "forward", info);
+   std::ofstream out("/tmp/engine_converted_from_jit.trt");
+   out << engine;
+   out.close();
+
+.. _under_the_hood:
+
+Under The Hood
+---------------
+
+When a module is provided to Torch-TensorRT, the compiler starts by mapping a graph like you saw above to a graph like this:
+
+..
code-block:: none + + graph(%input.2 : Tensor): + %2 : Float(84, 10) = prim::Constant[value=]() + %3 : Float(120, 84) = prim::Constant[value=]() + %4 : Float(576, 120) = prim::Constant[value=]() + %5 : int = prim::Constant[value=-1]() # x.py:25:0 + %6 : int[] = prim::Constant[value=annotate(List[int], [])]() + %7 : int[] = prim::Constant[value=[2, 2]]() + %8 : int[] = prim::Constant[value=[0, 0]]() + %9 : int[] = prim::Constant[value=[1, 1]]() + %10 : bool = prim::Constant[value=1]() # ~/.local/lib/python3.6/site-packages/torch/nn/modules/conv.py:346:0 + %11 : int = prim::Constant[value=1]() # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:539:0 + %12 : bool = prim::Constant[value=0]() # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:539:0 + %self.classifer.fc3.bias : Float(10) = prim::Constant[value= 0.0464 0.0383 0.0678 0.0932 0.1045 -0.0805 -0.0435 -0.0818 0.0208 -0.0358 [ CUDAFloatType{10} ]]() + %self.classifer.fc2.bias : Float(84) = prim::Constant[value=]() + %self.classifer.fc1.bias : Float(120) = prim::Constant[value=]() + %self.feat.conv2.weight : Float(16, 6, 3, 3) = prim::Constant[value=]() + %self.feat.conv2.bias : Float(16) = prim::Constant[value=]() + %self.feat.conv1.weight : Float(6, 1, 3, 3) = prim::Constant[value=]() + %self.feat.conv1.bias : Float(6) = prim::Constant[value= 0.0530 -0.1691 0.2802 0.1502 0.1056 -0.1549 [ CUDAFloatType{6} ]]() + %input0.4 : Tensor = aten::_convolution(%input.2, %self.feat.conv1.weight, %self.feat.conv1.bias, %9, %8, %9, %12, %8, %11, %12, %12, %10) # ~/.local/lib/python3.6/site-packages/torch/nn/modules/conv.py:346:0 + %input0.5 : Tensor = aten::relu(%input0.4) # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:1063:0 + %input1.2 : Tensor = aten::max_pool2d(%input0.5, %7, %6, %8, %9, %12) # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:539:0 + %input0.6 : Tensor = aten::_convolution(%input1.2, %self.feat.conv2.weight, %self.feat.conv2.bias, %9, %8, %9, %12, %8, %11, %12, %12, %10) # ~/.local/lib/python3.6/site-packages/torch/nn/modules/conv.py:346:0 + %input2.1 : Tensor = aten::relu(%input0.6) # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:1063:0 + %x.1 : Tensor = aten::max_pool2d(%input2.1, %7, %6, %8, %9, %12) # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:539:0 + %input.1 : Tensor = aten::flatten(%x.1, %11, %5) # x.py:25:0 + %27 : Tensor = aten::matmul(%input.1, %4) + %28 : Tensor = trt::const(%self.classifer.fc1.bias) + %29 : Tensor = aten::add_(%28, %27, %11) + %input0.2 : Tensor = aten::relu(%29) # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:1063:0 + %31 : Tensor = aten::matmul(%input0.2, %3) + %32 : Tensor = trt::const(%self.classifer.fc2.bias) + %33 : Tensor = aten::add_(%32, %31, %11) + %input1.1 : Tensor = aten::relu(%33) # ~/.local/lib/python3.6/site-packages/torch/nn/functional.py:1063:0 + %35 : Tensor = aten::matmul(%input1.1, %2) + %36 : Tensor = trt::const(%self.classifer.fc3.bias) + %37 : Tensor = aten::add_(%36, %35, %11) + return (%37) + (CompileGraph) + +The graph has now been transformed from a collection of modules, each managing their own parameters into a single graph with the parameters inlined +into the graph and all of the operations laid out. Torch-TensorRT has also executed a number of optimizations and mappings to make the graph easier to translate to TensorRT. +From here the compiler can assemble the TensorRT engine by following the dataflow through the graph. 
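+
+If you would like to inspect this intermediate representation for your own module, the TorchScript IR can be printed directly from Python before it is handed to Torch-TensorRT. A minimal sketch (``MyModel`` is a placeholder for your own module; the printed graph is the one produced by TorchScript, prior to Torch-TensorRT's lowering passes):
+
+.. code-block:: python
+
+    import torch
+
+    scripted_model = torch.jit.script(MyModel().eval())
+    # prints the TorchScript IR of forward() as TorchScript sees it
+    print(scripted_model.graph)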
+
+When the graph construction phase is complete, Torch-TensorRT produces a serialized TensorRT engine. From here, depending on the API, this engine is either returned
+to the user or passed on to the next stage of compilation, where Torch-TensorRT creates a JIT module to execute the TensorRT engine; this module will be instantiated and managed
+by the Torch-TensorRT runtime.
+
+Here is the graph that you get back after compilation is complete:
+
+.. code-block:: none
+
+    graph(%self_1 : __torch__.lenet, %input_0 : Tensor):
+      %1 : ...trt.Engine = prim::GetAttr[name="lenet"](%self_1)
+      %3 : Tensor[] = prim::ListConstruct(%input_0)
+      %4 : Tensor[] = trt::execute_engine(%3, %1)
+      %5 : Tensor = prim::ListUnpack(%4)
+      return (%5)
+
+
+You can see the call to ``trt::execute_engine`` where the engine is executed: the attribute containing the engine is extracted, a list of inputs is constructed, the engine is run, and the resulting tensors are returned to the user.
+
+.. _unsupported_ops:
+
+Working with Unsupported Operators
+-----------------------------------
+
+Torch-TensorRT is a new library and the PyTorch operator library is quite large, so there will be ops that aren't supported natively by the compiler. You can either use the composition techniques
+shown above to split your model into modules that are fully supported by Torch-TensorRT and ones that are not, and stitch the modules together in the deployment application, or you can register converters for missing ops.
+
+You can check support without going through the full compilation pipeline using the ``torch_tensorrt::CheckMethodOperatorSupport(const torch::jit::Module& module, std::string method_name)`` API
+to see what operators are not supported. ``torchtrtc`` automatically checks modules with this method before starting compilation and will print out a list of operators that are not supported.
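+
+For example, a quick support check from C++ might look roughly like the sketch below. It is built around the signature quoted above and assumes the call returns a boolean; treat it as an illustration rather than definitive usage:
+
+.. code-block:: c++
+
+   #include "torch/script.h"
+   #include "torch_tensorrt/torch_tensorrt.h"
+   ...
+
+   auto mod = torch::jit::load("lenet_scripted.ts");
+   // false indicates that forward() contains at least one operator without a registered converter
+   if (!torch_tensorrt::CheckMethodOperatorSupport(mod, "forward")) {
+     std::cerr << "forward() uses operators that Torch-TensorRT cannot convert" << std::endl;
+   }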
+
+.. _custom_converters:
+
+Registering Custom Converters
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Operations are mapped to TensorRT through the use of modular converters, functions that take a node from the JIT graph and produce an equivalent layer or subgraph in TensorRT.
+Torch-TensorRT ships with a library of these converters stored in a registry, which are executed depending on the node being parsed. For instance, an ``aten::relu(%input0.4)`` instruction will trigger
+the ReLU converter to be run on it, producing an activation layer in the TensorRT graph. But since this library is not exhaustive, you may need to write your own to get Torch-TensorRT
+to support your module.
+
+Shipped with the Torch-TensorRT distribution are the internal core API headers. You can therefore access the converter registry and add a converter for the op you need.
+
+For example, if we try to compile a graph with a build of Torch-TensorRT that doesn't support the flatten operation (``aten::flatten``), you may see this error:
+
+.. code-block:: none
+
+   terminate called after throwing an instance of 'torch_tensorrt::Error'
+   what():  [enforce fail at core/conversion/conversion.cpp:109] Expected converter to be true but got false
+   Unable to convert node: %input.1 : Tensor = aten::flatten(%x.1, %11, %5) # x.py:25:0 (conversion.AddLayer)
+   Schema: aten::flatten.using_ints(Tensor self, int start_dim=0, int end_dim=-1) -> (Tensor)
+   Converter for aten::flatten requested, but no such converter was found.
+   If you need a converter for this operator, you can try implementing one yourself
+   or request a converter: https://www.github.com/NVIDIA/Torch-TensorRT/issues
+
+We can register a converter for this operator in our application. All of the tools required to build a converter can be imported by including ``torch_tensorrt/core/conversion/converters/converters.h``.
+We start by creating an instance of the self-registering class ``torch_tensorrt::core::conversion::converters::RegisterNodeConversionPatterns()`` which will register converters
+in the global converter registry, associating a function schema like ``aten::flatten.using_ints(Tensor self, int start_dim=0, int end_dim=-1) -> (Tensor)`` with a lambda that
+will take the state of the conversion, the node/operation in question to convert, and all of the inputs to the node, and produce, as a side effect, a new layer in the TensorRT network.
+Arguments are passed as a vector of inspectable unions of TensorRT ``ITensors`` and Torch ``IValues`` in the order arguments are listed in the schema.
+
+Below is an implementation of an ``aten::flatten`` converter that we can use in our application. You have full access to the Torch and TensorRT libraries in the converter implementation. So,
+for example, we can quickly get the output size by just running the operation in PyTorch instead of implementing the full calculation ourselves, as we do below for this flatten converter.
+
+.. code-block:: c++
+
+   #include "torch/script.h"
+   #include "torch_tensorrt/torch_tensorrt.h"
+   #include "torch_tensorrt/core/conversion/converters/converters.h"
+
+   static auto flatten_converter = torch_tensorrt::core::conversion::converters::RegisterNodeConversionPatterns()
+     .pattern({
+         "aten::flatten.using_ints(Tensor self, int start_dim=0, int end_dim=-1) -> (Tensor)",
+         [](torch_tensorrt::core::conversion::ConversionCtx* ctx,
+            const torch::jit::Node* n,
+            torch_tensorrt::core::conversion::converters::args& args) -> bool {
+             auto in = args[0].ITensor();
+             auto start_dim = args[1].unwrapToInt();
+             auto end_dim = args[2].unwrapToInt();
+             auto in_shape = torch_tensorrt::core::util::toVec(in->getDimensions());
+             auto out_shape = torch::flatten(torch::rand(in_shape), start_dim, end_dim).sizes();
+
+             auto shuffle = ctx->net->addShuffle(*in);
+             shuffle->setReshapeDimensions(torch_tensorrt::core::util::toDims(out_shape));
+             shuffle->setName(torch_tensorrt::core::util::node_info(n).c_str());
+
+             auto out_tensor = ctx->AssociateValueAndTensor(n->outputs()[0], shuffle->getOutput(0));
+             return true;
+         }
+     });
+
+   int main() {
+      ...
+
+To use this converter in Python, it is recommended to use PyTorch's `C++ / CUDA Extension <https://pytorch.org/tutorials/advanced/cpp_extension.html>`_
+template to wrap your library of converters into a ``.so`` that you can load with ``ctypes.CDLL()`` in your Python application.
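+
+For instance, loading and using such a converter library from Python could look roughly like the sketch below. The shared object name and the model are placeholders for this illustration, not artifacts shipped with Torch-TensorRT:
+
+.. code-block:: python
+
+    import ctypes
+
+    import torch
+    import torch_tensorrt
+
+    # loading the .so runs the self-registering patterns and adds the converters to the global registry
+    ctypes.CDLL("libflatten_converter.so")
+
+    trt_ts_module = torch_tensorrt.compile(my_model,
+        inputs=[torch_tensorrt.Input((1, 1, 32, 32))],
+        enabled_precisions={torch.float})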
+
+You can find more information on all the details of writing converters in the contributors documentation (:ref:`writing_converters`).
+If you find yourself with a large library of converter implementations, do consider upstreaming them; PRs are welcome and it would be great for the community to benefit as well.
diff --git a/docsrc/tutorials/getting_started_with_python_api.rst b/docsrc/tutorials/getting_started_with_python_api.rst
new file mode 100644
index 0000000000..7861ad6d93
--- /dev/null
+++ b/docsrc/tutorials/getting_started_with_python_api.rst
@@ -0,0 +1,47 @@
+.. _getting_started_with_python_api:
+
+Using Torch-TensorRT in Python
+*******************************
+
+Torch-TensorRT Python API accepts a ``torch.nn.Module`` as an input. Under the hood, it uses ``torch.jit.script`` to convert the input module into a
+TorchScript module. To compile your input ``torch.nn.Module`` with Torch-TensorRT, all you need to do is provide the module and inputs
+to Torch-TensorRT and you will be returned an optimized TorchScript module to run or add into another PyTorch module. ``inputs``
+is a list of ``torch_tensorrt.Input`` objects which define the input's shape, datatype and memory format. You can also specify settings such as
+the operating precision for the engine or the target device. After compilation you can save the module just like any other module
+to load in a deployment application. In order to load a TensorRT/TorchScript module, make sure you first import ``torch_tensorrt``.
+
+.. code-block:: python
+
+    import torch
+    import torch_tensorrt
+
+    ...
+
+    model = MyModel().eval()  # torch module needs to be in eval (not training) mode
+
+    inputs = [torch_tensorrt.Input(
+        min_shape=[1, 1, 16, 16],
+        opt_shape=[1, 1, 32, 32],
+        max_shape=[1, 1, 64, 64],
+        dtype=torch.half,
+    )]
+    enabled_precisions = {torch.float, torch.half}  # Run with fp16
+
+    trt_ts_module = torch_tensorrt.compile(model, inputs=inputs, enabled_precisions=enabled_precisions)
+
+    input_data = input_data.to('cuda').half()
+    result = trt_ts_module(input_data)
+    torch.jit.save(trt_ts_module, "trt_ts_module.ts")
+
+.. code-block:: python
+
+    # Deployment application
+    import torch
+    import torch_tensorrt
+
+    trt_ts_module = torch.jit.load("trt_ts_module.ts")
+    input_data = input_data.to('cuda').half()
+    result = trt_ts_module(input_data)
+
+Torch-TensorRT Python API also provides ``torch_tensorrt.ts.compile`` which accepts a TorchScript module as input.
+The TorchScript module can be obtained via scripting or tracing (refer to :ref:`creating_torchscript_module_in_python`). ``torch_tensorrt.ts.compile`` accepts a TorchScript module
+and a list of ``torch_tensorrt.Input`` classes.
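+
+A minimal sketch of that entry point is shown below (``MyModel`` is the same placeholder as above, and the keyword arguments are assumed to mirror ``torch_tensorrt.compile``; they may differ slightly between releases):
+
+.. code-block:: python
+
+    import torch
+    import torch_tensorrt
+
+    scripted_model = torch.jit.script(MyModel().eval())
+
+    trt_ts_module = torch_tensorrt.ts.compile(scripted_model,
+        inputs=[torch_tensorrt.Input((1, 3, 224, 224))],
+        enabled_precisions={torch.half})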
diff --git a/docsrc/tutorials/installation.rst b/docsrc/tutorials/installation.rst
index 8efeba131e..e251d6c171 100644
--- a/docsrc/tutorials/installation.rst
+++ b/docsrc/tutorials/installation.rst
@@ -25,14 +25,14 @@ You can install the python package using
 
 .. code-block:: sh
 
-    pip3 install trtorch -f https://github.com/NVIDIA/TRTorch/releases
+    pip3 install torch_tensorrt -f https://github.com/NVIDIA/Torch-TensorRT/releases
 
 .. _bin-dist:
 
 C++ Binary Distribution
 ------------------------
 
-Precompiled tarballs for releases are provided here: https://github.com/NVIDIA/TRTorch/releases
+Precompiled tarballs for releases are provided here: https://github.com/NVIDIA/Torch-TensorRT/releases
 
 .. _compile-from-source:
 
@@ -44,7 +44,7 @@ Compiling From Source
 Dependencies for Compilation
 -------------------------------
 
-TRTorch is built with Bazel, so begin by installing it.
+Torch-TensorRT is built with Bazel, so begin by installing it.
 
     * The easiest way is to install bazelisk using the method of you choosing https://github.com/bazelbuild/bazelisk
    * Otherwise you can use the following instructions to install binaries https://docs.bazel.build/versions/master/install.html
@@ -66,7 +66,7 @@ the CUDA driver installed and the container must have CUDA)
 
 The correct LibTorch version will be pulled down for you by bazel.
 
-    NOTE: For best compatability with official PyTorch, use torch==1.9.1+cuda111, TensorRT 8.0 and cuDNN 8.2 for CUDA 11.1 however TRTorch itself supports
+    NOTE: For best compatibility with official PyTorch, use torch==1.9.1+cuda111, TensorRT 8.0 and cuDNN 8.2 for CUDA 11.1; however, Torch-TensorRT itself supports
     TensorRT and cuDNN for CUDA versions other than 11.1 for usecases such as using NVIDIA compiled distributions of PyTorch that use other versions of CUDA
     e.g. aarch64 or custom compiled version of PyTorch.
 
@@ -77,9 +77,9 @@ You then have two compilation options:
 **Building using cuDNN & TensorRT tarball distributions**
 --------------------------------------------------------------
 
-    This is recommended so as to build TRTorch hermetically and insures any compilation errors are not caused by version issues
+    This is recommended so as to build Torch-TensorRT hermetically and ensure any compilation errors are not caused by version issues
 
-    Make sure when running TRTorch that these versions of the libraries are prioritized in your ``$LD_LIBRARY_PATH``
+    Make sure when running Torch-TensorRT that these versions of the libraries are prioritized in your ``$LD_LIBRARY_PATH``
 
 You need to download the tarball distributions of TensorRT and cuDNN from the NVIDIA website.
     * https://developer.nvidia.com/cudnn
@@ -96,7 +96,7 @@ Release Build
 
 .. code-block:: shell
 
-    bazel build //:libtrtorch -c opt --distdir thrid_party/distdir/[x86_64-linux-gnu | aarch64-linux-gnu]
+    bazel build //:libtorchtrt -c opt --distdir thrid_party/distdir/[x86_64-linux-gnu | aarch64-linux-gnu]
 
 A tarball with the include files and library can then be found in ``bazel-bin``
 
@@ -109,7 +109,7 @@ To build with debug symbols use the following command
 
 .. code-block:: shell
 
-    bazel build //:libtrtorch -c dbg --distdir thrid_party/distdir/[x86_64-linux-gnu | aarch64-linux-gnu]
+    bazel build //:libtorchtrt -c dbg --distdir thrid_party/distdir/[x86_64-linux-gnu | aarch64-linux-gnu]
 
 A tarball with the include files and library can then be found in ``bazel-bin``
 
@@ -120,7 +120,7 @@ To build using the pre-CXX11 ABI use the ``pre_cxx11_abi`` config
 
 .. code-block:: shell
 
-    bazel build //:libtrtorch --config pre_cxx11_abi -c [dbg/opt] --distdir thrid_party/distdir/[x86_64-linux-gnu | aarch64-linux-gnu]
+    bazel build //:libtorchtrt --config pre_cxx11_abi -c [dbg/opt] --distdir thrid_party/distdir/[x86_64-linux-gnu | aarch64-linux-gnu]
 
 A tarball with the include files and library can then be found in ``bazel-bin``
 
@@ -180,7 +180,7 @@ Compile using:
 
 .. code-block:: shell
 
-    bazel build //:libtrtorch -c opt
+    bazel build //:libtorchtrt -c opt
 
 A tarball with the include files and library can then be found in ``bazel-bin``
 
@@ -193,7 +193,7 @@ To build with debug symbols use the following command
 
 .. code-block:: shell
 
-    bazel build //:libtrtorch -c dbg
+    bazel build //:libtorchtrt -c dbg
 
 A tarball with the include files and library can then be found in ``bazel-bin``
 
@@ -205,14 +205,14 @@ To build using the pre-CXX11 ABI use the ``pre_cxx11_abi`` config
 
 .. code-block:: shell
 
-    bazel build //:libtrtorch --config pre_cxx11_abi -c [dbg/opt]
+    bazel build //:libtorchtrt --config pre_cxx11_abi -c [dbg/opt]
 
 **Building the Python package**
 --------------------------------
 
 Begin by installing ``ninja``
 
-You can build the Python package using ``setup.py`` (this will also build the correct version of ``libtrtorch.so``)
+You can build the Python package using ``setup.py`` (this will also build the correct version of ``libtorchtrt.so``)
 
 ..
code-block:: shell @@ -225,7 +225,7 @@ Debug Build python3 setup.py develop [--user] -This also compiles a debug build of ``libtrtorch.so`` +This also compiles a debug build of ``libtorchtrt.so`` **Building Natively on aarch64 (Jetson)** ------------------------------------------- @@ -334,11 +334,11 @@ Compile C++ Library and Compiler CLI --platforms //toolchains:jetpack_4.x -Compile TRTorch library using bazel command: +Compile Torch-TensorRT library using bazel command: .. code-block:: shell - bazel build //:libtrtorch --platforms //toolchains:jetpack_4.6 + bazel build //:libtorchtrt --platforms //toolchains:jetpack_4.6 Compile Python API ^^^^^^^^^^^^^^^^^^^^ @@ -353,4 +353,4 @@ Compile the Python API using the following command from the ``//py`` directory: If you have a build of PyTorch that uses Pre-CXX11 ABI drop the ``--use-cxx11-abi`` flag -If you are building for Jetpack 4.5 add the ``--jetpack-version 4.5`` flag \ No newline at end of file +If you are building for Jetpack 4.5 add the ``--jetpack-version 4.5`` flag diff --git a/docsrc/tutorials/ptq.rst b/docsrc/tutorials/ptq.rst index 25145415a7..0d87eeb9f3 100644 --- a/docsrc/tutorials/ptq.rst +++ b/docsrc/tutorials/ptq.rst @@ -10,7 +10,7 @@ and track the activations in FP32 to calibrate a mapping to INT8 that minimizes FP32 inference and INT8 inference. Users writing TensorRT applications are required to setup a calibrator class which will provide sample data to -the TensorRT calibrator. With TRTorch we look to leverage existing infrastructure in PyTorch to make implementing +the TensorRT calibrator. With Torch-TensorRT we look to leverage existing infrastructure in PyTorch to make implementing calibrators easier. LibTorch provides a ``DataLoader`` and ``Dataset`` API which steamlines preprocessing and batching input data. @@ -18,9 +18,9 @@ These APIs are exposed via both C++ and Python interface which makes it easier f For C++ interface, we use ``torch::Dataset`` and ``torch::data::make_data_loader`` objects to construct and perform pre-processing on datasets. The equivalent functionality in python interface uses ``torch.utils.data.Dataset`` and ``torch.utils.data.DataLoader``. This section of the PyTorch documentation has more information https://pytorch.org/tutorials/advanced/cpp_frontend.html#loading-data and https://pytorch.org/tutorials/recipes/recipes/loading_data_recipe.html. -TRTorch uses Dataloaders as the base of a generic calibrator implementation. So you will be able to reuse or quickly +Torch-TensorRT uses Dataloaders as the base of a generic calibrator implementation. So you will be able to reuse or quickly implement a ``torch::Dataset`` for your target domain, place it in a DataLoader and create a INT8 Calibrator -which you can provide to TRTorch to run INT8 Calibration during compliation of your module. +which you can provide to Torch-TensorRT to run INT8 Calibration during compliation of your module. .. _writing_ptq_cpp: @@ -97,17 +97,17 @@ some take time, then define the preprocessing to apply to the images in the data .workers(2)); -Next we create a calibrator from the ``calibration_dataloader`` using the calibrator factory (found in ``trtorch/ptq.h``): +Next we create a calibrator from the ``calibration_dataloader`` using the calibrator factory (found in ``torch_tensorrt/ptq.h``): .. code-block:: c++ - #include "trtorch/ptq.h" + #include "torch_tensorrt/ptq.h" ... 
- auto calibrator = trtorch::ptq::make_int8_calibrator(std::move(calibration_dataloader), calibration_cache_file, true); + auto calibrator = torch_tensorrt::ptq::make_int8_calibrator(std::move(calibration_dataloader), calibration_cache_file, true); Here we also define a location to write a calibration cache file to which we can use to reuse the calibration data without needing the dataset and whether or not -we should use the cache file if it exists. There also exists a ``trtorch::ptq::make_int8_cache_calibrator`` factory which creates a calibrator that uses the cache +we should use the cache file if it exists. There also exists a ``torch_tensorrt::ptq::make_int8_cache_calibrator`` factory which creates a calibrator that uses the cache only for cases where you may do engine building on a machine that has limited storage (i.e. no space for a full dataset) or to have a simpiler deployment application. The calibrator factories create a calibrator that inherits from a ``nvinfer1::IInt8Calibrator`` virtual class (``nvinfer1::IInt8EntropyCalibrator2`` by default) which @@ -116,15 +116,15 @@ defines the calibration algorithm used when calibrating. You can explicitly make .. code-block:: c++ // MinMax Calibrator is geared more towards NLP tasks - auto calibrator = trtorch::ptq::make_int8_calibrator(std::move(calibration_dataloader), calibration_cache_file, true); + auto calibrator = torch_tensorrt::ptq::make_int8_calibrator(std::move(calibration_dataloader), calibration_cache_file, true); -Then all thats required to setup the module for INT8 calibration is to set the following compile settings in the `trtorch::CompileSpec` struct and compiling the module: +Then all thats required to setup the module for INT8 calibration is to set the following compile settings in the `torch_tensorrt::CompileSpec` struct and compiling the module: .. code-block:: c++ std::vector> input_shape = {{32, 3, 32, 32}}; /// Configure settings for compilation - auto compile_spec = trtorch::CompileSpec({input_shape}); + auto compile_spec = torch_tensorrt::CompileSpec({input_shape}); /// Set operating precision to INT8 compile_spec.enabled_precisions.insert(torch::kF16); compile_spec.enabled_precisions.insert(torch::kI8); @@ -133,19 +133,19 @@ Then all thats required to setup the module for INT8 calibration is to set the f /// Set a larger workspace (you may get better performace from doing so) compile_spec.workspace_size = 1 << 28; - auto trt_mod = trtorch::CompileGraph(mod, compile_spec); + auto trt_mod = torch_tensorrt::CompileGraph(mod, compile_spec); If you have an existing Calibrator implementation for TensorRT you may directly set the ``ptq_calibrator`` field with a pointer to your calibrator and it will work as well. From here not much changes in terms of how to execution works. You are still able to fully use LibTorch as the sole interface for inference. Data should remain -in FP32 precision when it's passed into `trt_mod.forward`. There exists an example application in the TRTorch demo that takes you from training a VGG16 network on -CIFAR10 to deploying in INT8 with TRTorch here: https://github.com/NVIDIA/TRTorch/tree/master/cpp/ptq +in FP32 precision when it's passed into `trt_mod.forward`. There exists an example application in the Torch-TensorRT demo that takes you from training a VGG16 network on +CIFAR10 to deploying in INT8 with Torch-TensorRT here: https://github.com/NVIDIA/Torch-TensorRT/tree/master/cpp/ptq .. 
_writing_ptq_python: How to create your own PTQ application in Python -------------------------------------------------- -TRTorch Python API provides an easy and convenient way to use pytorch dataloaders with TensorRT calibrators. ``DataLoaderCalibrator`` class can be used to create +Torch-TensorRT Python API provides an easy and convenient way to use pytorch dataloaders with TensorRT calibrators. ``DataLoaderCalibrator`` class can be used to create a TensorRT calibrator by providing desired configuration. The following code demonstrates an example on how to use it .. code-block:: python @@ -163,49 +163,49 @@ a TensorRT calibrator by providing desired configuration. The following code dem batch_size=1, shuffle=False, num_workers=1) - calibrator = trtorch.ptq.DataLoaderCalibrator(testing_dataloader, + calibrator = torch_tensorrt.ptq.DataLoaderCalibrator(testing_dataloader, cache_file='./calibration.cache', use_cache=False, - algo_type=trtorch.ptq.CalibrationAlgo.ENTROPY_CALIBRATION_2, + algo_type=torch_tensorrt.ptq.CalibrationAlgo.ENTROPY_CALIBRATION_2, device=torch.device('cuda:0')) compile_spec = { - "inputs": [trtorch.Input((1, 3, 32, 32))], + "inputs": [torch_tensorrt.Input((1, 3, 32, 32))], "enabled_precisions": {torch.float, torch.half, torch.int8}, "calibrator": calibrator, "device": { - "device_type": trtorch.DeviceType.GPU, + "device_type": torch_tensorrt.DeviceType.GPU, "gpu_id": 0, "dla_core": 0, "allow_gpu_fallback": False, "disable_tf32": False } } - trt_mod = trtorch.compile(model, compile_spec) + trt_mod = torch_tensorrt.compile(model, compile_spec) In the cases where there is a pre-existing calibration cache file that users want to use, ``CacheCalibrator`` can be used without any dataloaders. The following example demonstrates how to use ``CacheCalibrator`` to use in INT8 mode. .. code-block:: python - calibrator = trtorch.ptq.CacheCalibrator("./calibration.cache") + calibrator = torch_tensorrt.ptq.CacheCalibrator("./calibration.cache") compile_settings = { - "inputs": [trtorch.Input([1, 3, 32, 32])], + "inputs": [torch_tensorrt.Input([1, 3, 32, 32])], "enabled_precisions": {torch.float, torch.half, torch.int8}, "calibrator": calibrator, "max_batch_size": 32, } - trt_mod = trtorch.compile(model, compile_settings) + trt_mod = torch_tensorrt.compile(model, compile_settings) If you already have an existing calibrator class (implemented directly using TensorRT API), you can directly set the calibrator field to your class which can be very convenient. -For a demo on how PTQ can be performed on a VGG network using TRTorch API, you can refer to https://github.com/NVIDIA/TRTorch/blob/master/tests/py/test_ptq_dataloader_calibrator.py -and https://github.com/NVIDIA/TRTorch/blob/master/tests/py/test_ptq_trt_calibrator.py +For a demo on how PTQ can be performed on a VGG network using Torch-TensorRT API, you can refer to https://github.com/NVIDIA/Torch-TensorRT/blob/master/tests/py/test_ptq_dataloader_calibrator.py +and https://github.com/NVIDIA/Torch-TensorRT/blob/master/tests/py/test_ptq_trt_calibrator.py Citations ^^^^^^^^^^^ Krizhevsky, A., & Hinton, G. (2009). Learning multiple layers of features from tiny images. -Simonyan, K., & Zisserman, A. (2014). Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556. \ No newline at end of file +Simonyan, K., & Zisserman, A. (2014). Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556. 
diff --git a/docsrc/tutorials/runtime.rst b/docsrc/tutorials/runtime.rst index 2aecc007b6..4f278e20b2 100644 --- a/docsrc/tutorials/runtime.rst +++ b/docsrc/tutorials/runtime.rst @@ -1,13 +1,13 @@ .. _runtime: -Deploying TRTorch Programs +Deploying Torch-TensorRT Programs =========================== -After compiling and saving TRTorch programs there is no longer a strict dependency on the full -TRTorch library. All that is required to run a compiled program is the runtime. There are therfore a couple -options to deploy your programs other than shipping the full trtorch compiler with your applications. +After compiling and saving Torch-TensorRT programs there is no longer a strict dependency on the full +Torch-TensorRT library. All that is required to run a compiled program is the runtime. There are therfore a couple +options to deploy your programs other than shipping the full Torch-TensorRT compiler with your applications. -TRTorch package / libtrtorch.so +Torch-TensorRT package / libtorchtrt.so --------------------------------- Once a program is compiled, you run it using the standard PyTorch APIs. All that is required is that the package @@ -16,21 +16,21 @@ must be imported in python or linked in C++. Runtime Library ----------------- -Distributed with the C++ distribution is ``libtrtorchrt.so``. This library only contains the components -necessary to run TRTorch programs. Instead of linking ``libtrtorch.so`` or importing ``trtorch`` you can -link ``libtrtorchrt.so`` in your deployment programs or use ``DL_OPEN`` or ``LD_PRELOAD``. For python -you can load the runtime with ``torch.ops.load_library("libtrtorchrt.so")``. You can then continue to use +Distributed with the C++ distribution is ``libtorchtrt_runtime.so``. This library only contains the components +necessary to run Torch-TensorRT programs. Instead of linking ``libtorchtrt.so`` or importing ``torch_tensorrt`` you can +link ``libtorchtrt_runtime.so`` in your deployment programs or use ``DL_OPEN`` or ``LD_PRELOAD``. For python +you can load the runtime with ``torch.ops.load_library("libtorchtrt_runtime.so")``. You can then continue to use programs just as you would otherwise via PyTorch API. -.. note:: If you are using the standard distribution of PyTorch in Python on x86, likely you will need the pre-cxx11-abi variant of ``libtrtorchrt.so``, check :ref:`Installation` documentation for more details. +.. note:: If you are using the standard distribution of PyTorch in Python on x86, likely you will need the pre-cxx11-abi variant of ``libtorchtrt_runtime.so``, check :ref:`Installation` documentation for more details. -.. note:: If you are linking ``libtrtorchrt.so``, likely using the following flags will help ``-Wl,--no-as-needed -ltrtorchrt -Wl,--as-needed`` as theres no direct symbol dependency to anything in the TRTorch runtime for most TRTorch runtime applications +.. 
note:: If you are linking ``libtorchtrt_runtime.so``, likely using the following flags will help ``-Wl,--no-as-needed -ltorchtrt -Wl,--as-needed`` as theres no direct symbol dependency to anything in the Torch-TensorRT runtime for most Torch-TensorRT runtime applications -An example of how to use ``libtrtorchrt.so`` can be found here: https://github.com/NVIDIA/TRTorch/tree/master/examples/trtorchrt_example +An example of how to use ``libtorchtrt_runtime.so`` can be found here: https://github.com/NVIDIA/Torch-TensorRT/tree/master/examples/torchtrt_example Plugin Library --------------- -In the case you use TRTorch as a converter to a TensorRT engine and your engine uses plugins provided by TRTorch, TRTorch -ships the library ``libtrtorch_plugins.so`` which contains the implementation of the TensorRT plugins used by TRTorch during +In the case you use Torch-TensorRT as a converter to a TensorRT engine and your engine uses plugins provided by Torch-TensorRT, Torch-TensorRT +ships the library ``libtorchtrt_plugins.so`` which contains the implementation of the TensorRT plugins used by Torch-TensorRT during compilation. This library can be ``DL_OPEN`` or ``LD_PRELOAD`` similar to other TensorRT plugin libraries. diff --git a/docsrc/tutorials/trtorchc.rst b/docsrc/tutorials/torchtrtc.rst similarity index 92% rename from docsrc/tutorials/trtorchc.rst rename to docsrc/tutorials/torchtrtc.rst index c98e84a64e..9d870d9ed6 100644 --- a/docsrc/tutorials/trtorchc.rst +++ b/docsrc/tutorials/torchtrtc.rst @@ -1,25 +1,25 @@ -.. _trtorchc: +.. _torchtrtc: -trtorchc +torchtrtc ================================= -``trtorchc`` is a CLI application for using the TRTorch compiler. It serves as an easy way to compile a -TorchScript Module with TRTorch from the command-line to quickly check support or as part of +``torchtrtc`` is a CLI application for using the Torch-TensorRT compiler. It serves as an easy way to compile a +TorchScript Module with Torch-TensorRT from the command-line to quickly check support or as part of a deployment pipeline. All basic features of the compiler are supported including post training quantization (though you must already have a calibration cache file to use the PTQ feature). The compiler can output two formats, either a TorchScript program with the TensorRT engine embedded or the TensorRT engine itself as a PLAN file. -All that is required to run the program after compilation is for C++ linking against ``libtrtorch.so`` -or in Python importing the trtorch package. All other aspects of using compiled modules are identical +All that is required to run the program after compilation is for C++ linking against ``libtorchtrt.so`` +or in Python importing the torch_tensorrt package. All other aspects of using compiled modules are identical to standard TorchScript. Load with ``torch.jit.load()`` and run like you would run any other module. .. code-block:: txt - trtorchc [input_file_path] [output_file_path] + torchtrtc [input_file_path] [output_file_path] [input_specs...] {OPTIONS} - TRTorch is a compiler for TorchScript, it will compile and optimize + Torch-TensorRT is a compiler for TorchScript, it will compile and optimize TorchScript programs to run on NVIDIA GPUs using TensorRT OPTIONS: @@ -128,4 +128,4 @@ e.g. .. 
code-block:: shell - trtorchc tests/modules/ssd_traced.jit.pt ssd_trt.ts "[(1,3,300,300); (1,3,512,512); (1, 3, 1024, 1024)]@f16%contiguous" -p f16 + torchtrtc tests/modules/ssd_traced.jit.pt ssd_trt.ts "[(1,3,300,300); (1,3,512,512); (1, 3, 1024, 1024)]@f16%contiguous" -p f16 diff --git a/docsrc/tutorials/use_from_pytorch.rst b/docsrc/tutorials/use_from_pytorch.rst index 2a6b87f8c6..3eb86647c5 100644 --- a/docsrc/tutorials/use_from_pytorch.rst +++ b/docsrc/tutorials/use_from_pytorch.rst @@ -1,17 +1,17 @@ .. _use_from_pytorch: -Using TRTorch Directly From PyTorch +Using Torch-TensorRT Directly From PyTorch ==================================== -Starting in TRTorch 0.1.0, you will now be able to directly access TensorRT from PyTorch APIs. The process to use this feature -is very similar to the compilation workflow described in :ref:`getting_started` +You will now be able to directly access TensorRT from PyTorch APIs. The process to use this feature +is very similar to the compilation workflow described in :ref:`getting_started_with_python_api` -Start by loading ``trtorch`` into your application. +Start by loading ``torch_tensorrt`` into your application. .. code-block:: python import torch - import trtorch + import torch_tensorrt Then given a TorchScript module, you can compile it with TensorRT using the ``torch._C._jit_to_backend("tensorrt", ...)`` API. @@ -23,36 +23,36 @@ Then given a TorchScript module, you can compile it with TensorRT using the ``to model = models.mobilenet_v2(pretrained=True) script_model = torch.jit.script(model) -Unlike the ``compile`` API in TRTorch which assumes you are trying to compile the ``forward`` function of a module +Unlike the ``compile`` API in Torch-TensorRT which assumes you are trying to compile the ``forward`` function of a module or the ``convert_method_to_trt_engine`` which converts a specified function to a TensorRT engine, the backend API will take a dictionary which maps names of functions to compile to Compilation Spec objects which wrap the same sort of dictionary you would provide to ``compile``. For more information on the compile spec dictionary take a look -at the documentation for the TRTorch ``TensorRTCompileSpec`` API. +at the documentation for the Torch-TensorRT ``TensorRTCompileSpec`` API. .. code-block:: python spec = { "forward": - trtorch.TensorRTCompileSpec({ - "inputs": [trtorch.Input([1, 3, 300, 300])], + torch_tensorrt.TensorRTCompileSpec({ + "inputs": [torch_tensorrt.Input([1, 3, 300, 300])], "enabled_precisions": {torch.float, torch.half}, "refit": False, "debug": False, "strict_types": False, "device": { - "device_type": trtorch.DeviceType.GPU, + "device_type": torch_tensorrt.DeviceType.GPU, "gpu_id": 0, "dla_core": 0, "allow_gpu_fallback": True }, - "capability": trtorch.EngineCapability.default, + "capability": torch_tensorrt.EngineCapability.default, "num_min_timing_iters": 2, "num_avg_timing_iters": 1, "max_batch_size": 0, }) } -Now to compile with TRTorch, provide the target module objects and the spec dictionary to ``torch._C._jit_to_backend("tensorrt", ...)`` +Now to compile with Torch-TensorRT, provide the target module objects and the spec dictionary to ``torch._C._jit_to_backend("tensorrt", ...)`` .. code-block:: python @@ -64,4 +64,3 @@ To run explicitly call the function of the method you want to run (vs. 
how you c input = torch.randn((1, 3, 300, 300)).to("cuda").to(torch.half) print(trt_model.forward(input)) - diff --git a/docsrc/tutorials/using_dla.rst b/docsrc/tutorials/using_dla.rst index 6e70e407b3..ab40b942b3 100644 --- a/docsrc/tutorials/using_dla.rst +++ b/docsrc/tutorials/using_dla.rst @@ -3,26 +3,26 @@ DLA ================================= -``DLA`` NVIDIA Deep Learning Accelerator is a fixed-function accelerator engine targeted for deep learning operations. DLA is designed to do full hardware acceleration of convolutional neural networks. DLA supports various layers such as convolution, deconvolution, fully-connected, activation, pooling, batch normalization, etc. ``trtorch`` supports compilation of TorchScript Module and deployment pipeline on the DLA hardware available on NVIDIA embedded platforms. +``DLA`` NVIDIA Deep Learning Accelerator is a fixed-function accelerator engine targeted for deep learning operations. DLA is designed to do full hardware acceleration of convolutional neural networks. DLA supports various layers such as convolution, deconvolution, fully-connected, activation, pooling, batch normalization, etc. ``torch_tensorrt`` supports compilation of TorchScript Module and deployment pipeline on the DLA hardware available on NVIDIA embedded platforms. NOTE: DLA supports fp16 and int8 precision only. -Using DLA with trtorchc +Using DLA with torchtrtc .. code-block:: shell - trtorchc [input_file_path] [output_file_path] [input_shapes...] -p f16 -d dla {OPTIONS} + torchtrtc [input_file_path] [output_file_path] [input_shapes...] -p f16 -d dla {OPTIONS} Using DLA in a C++ application .. code-block:: c++ std::vector> input_shape = {{32, 3, 32, 32}}; - auto compile_spec = trtorch::CompileSpec({input_shape}); + auto compile_spec = torch_tensorrt::CompileSpec({input_shape}); # Set a precision. DLA supports fp16 or int8 only compile_spec.enabled_precisions = {torch::kF16}; - compile_spec.device.device_type = trtorch::CompileSpec::DeviceType::kDLA; + compile_spec.device.device_type = torch_tensorrt::CompileSpec::DeviceType::kDLA; # Make sure the gpu id is set to Xavier id for DLA compile_spec.device.gpu_id = 0; @@ -42,9 +42,9 @@ Using DLA in a python application .. 
code-block:: python compile_spec = { - "inputs": [trtorch.Input(self.input.shape)], - "device": trtorch.Device("dla:0", allow_gpu_fallback=True), + "inputs": [torch_tensorrt.Input(self.input.shape)], + "device": torch_tensorrt.Device("dla:0", allow_gpu_fallback=True), "enalbed_precisions": {torch.half} } - trt_mod = trtorch.compile(self.scripted_model, compile_spec) + trt_mod = torch_tensorrt.compile(self.scripted_model, compile_spec) diff --git a/examples/trtorchrt_example/BUILD b/examples/torchtrt_example/BUILD similarity index 100% rename from examples/trtorchrt_example/BUILD rename to examples/torchtrt_example/BUILD diff --git a/examples/trtorchrt_example/Makefile b/examples/torchtrt_example/Makefile similarity index 100% rename from examples/trtorchrt_example/Makefile rename to examples/torchtrt_example/Makefile diff --git a/examples/trtorchrt_example/README.md b/examples/torchtrt_example/README.md similarity index 100% rename from examples/trtorchrt_example/README.md rename to examples/torchtrt_example/README.md diff --git a/examples/trtorchrt_example/deps/.gitkeep b/examples/torchtrt_example/deps/.gitkeep similarity index 100% rename from examples/trtorchrt_example/deps/.gitkeep rename to examples/torchtrt_example/deps/.gitkeep diff --git a/examples/trtorchrt_example/main.cpp b/examples/torchtrt_example/main.cpp similarity index 100% rename from examples/trtorchrt_example/main.cpp rename to examples/torchtrt_example/main.cpp diff --git a/examples/trtorchrt_example/network.py b/examples/torchtrt_example/network.py similarity index 100% rename from examples/trtorchrt_example/network.py rename to examples/torchtrt_example/network.py diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index 330661878f..d377d912df 100644 --- a/py/torch_tensorrt/_compile.py +++ b/py/torch_tensorrt/_compile.py @@ -78,4 +78,4 @@ def convert_method_to_trt_engine(module: Any, elif target_ir == _IRType.fx: raise RuntimeError("fx is currently not supported") else: - raise RuntimeError("Module is an unknown format or the ir requested is unknown") \ No newline at end of file + raise RuntimeError("Module is an unknown format or the ir requested is unknown") diff --git a/py/torch_tensorrt/_version.py b/py/torch_tensorrt/_version.py index 919f0a67b1..603ad3d010 100644 --- a/py/torch_tensorrt/_version.py +++ b/py/torch_tensorrt/_version.py @@ -1 +1 @@ -__version__ = "1.0.0a0+808e603f" \ No newline at end of file +__version__ = "1.0.0a0+483ef591" \ No newline at end of file