diff --git a/.vscode/launch.json b/.vscode/launch.json index ccd7bf5f..7bbe9a08 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -1520,6 +1520,29 @@ } ] }, + { + "name": "(gdb) Holoinfer: tests", + "type": "cppdbg", + "request": "launch", + "program": "${command:cmake.buildDirectory}/gtests/HOLOINFER_TEST", + "args": [], + "stopAtEntry": false, + "cwd": "${command:cmake.buildDirectory}", + "externalConsole": false, + "MIMode": "gdb", + "setupCommands": [ + { + "description": "Enable pretty-printing for gdb", + "text": "-enable-pretty-printing", + "ignoreFailures": true + }, + { + "description": "Set Disassembly Flavor to Intel", + "text": "-gdb-set disassembly-flavor intel", + "ignoreFailures": true + } + ] + }, { "name": "(gdb) Holoviz: functional tests", "type": "cppdbg", diff --git a/.vscode/settings.json b/.vscode/settings.json index 6174db62..91e12916 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -121,7 +121,10 @@ "ranges": "cpp", "barrier": "cpp", "latch": "cpp", - "syncstream": "cpp" + "syncstream": "cpp", + "__functional_base_03": "cpp", + "annotated_ptr": "cpp", + "stream_ref": "cpp" }, "git.alwaysSignOff": true, "git.untrackedChanges": "separate", diff --git a/CMakeLists.txt b/CMakeLists.txt index 49f489b7..ad320d71 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,7 @@ option(HOLOSCAN_DOWNLOAD_DATASETS "Download SDK Datasets" ON) option(HOLOSCAN_BUILD_TESTS "Build Holoscan SDK Tests" ON) option(HOLOSCAN_USE_CCACHE "Use ccache for building Holoscan SDK" OFF) option(HOLOSCAN_INSTALL_EXAMPLE_SOURCE "Install the example source code" ON) +option(HOLOSCAN_ENABLE_CLANG_TIDY "Enable use of clang-tidy" OFF) # ############################################################################## # # Prerequisite statements @@ -254,6 +255,12 @@ install(DIRECTORY ${cli11_SOURCE_DIR}/include/CLI/ COMPONENT holoscan-core ) +# Copy bundled spdlog headers +install(DIRECTORY ${spdlog_SOURCE_DIR}/include/spdlog/ + DESTINATION include/3rdparty/spdlog + COMPONENT holoscan-core +) + # Copy version file install(FILES ${${HOLOSCAN_PACKAGE_NAME}_BINARY_DIR}/include/holoscan/version_config.hpp DESTINATION include/holoscan @@ -268,6 +275,7 @@ install(DIRECTORY ${GXF_INCLUDE_DIR}/gxf/cuda ${GXF_INCLUDE_DIR}/gxf/logger ${GXF_INCLUDE_DIR}/gxf/multimedia + ${GXF_INCLUDE_DIR}/gxf/rmm ${GXF_INCLUDE_DIR}/gxf/serialization ${GXF_INCLUDE_DIR}/gxf/std ${GXF_INCLUDE_DIR}/gxf/ucx @@ -438,6 +446,11 @@ if(HOLOSCAN_BUILD_TESTS) add_test(NAME HOLOVIZ_UNIT_TEST COMMAND holoscan::viz::unittests) endif() +# If enabling clang-tidy +if(HOLOSCAN_ENABLE_CLANG_TIDY) + set(CMAKE_CXX_CLANG_TIDY "clang-tidy;--checks=*,-llvmlibc-restrict-system-libc-headers,-fuchsia-default-arguments-calls,-llvmlibc-implementation-in-namespace,-readability-magic-numbers,-readability-identifier-length,-readability-magic-numbers,-cppcoreguidelines-avoid-magic-numbers,-altera-unroll-loops,-llvmlibc-callee-namespace,-google-build-using-namespace,-llvm-include-order,-bugprone-exception-escape,-cppcoreguidelines-pro-bounds-pointer-arithmetic,-modernize-use-trailing-return-type,-altera-id-dependent-backward-branch,-fuchsia-default-arguments-declarations,-bugprone-easily-swappable-parameters,-clang-diagnostic-ignored-optimization-argument;--extra-arg=-std=c++17") +endif() + if(HOLOSCAN_BUILD_PYTHON) add_subdirectory(python) endif() diff --git a/CPPLINT.cfg b/CPPLINT.cfg index 4bb37344..26abe35c 100644 --- a/CPPLINT.cfg +++ b/CPPLINT.cfg @@ -1,6 +1,7 @@ # mark cpp and cu files as header too so build/include_what_you_use 
will be applied headers=cpp,cu,cuh,h,hpp -filter=-build/header_guard,-readability/todo,-runtime/references,-build/c++11,-runtime/int,-build/include_subdir,-build/namespaces,-readability/casting +# need -readability/nolint to ignore unrecognized NOLINT categories from clang-tidy +filter=-build/header_guard,-readability/todo,-readability/nolint,-runtime/references,-build/c++11,-runtime/int,-build/include_subdir,-build/namespaces,-readability/casting # CPPLINT.cfg file's 'exclude_files' option works only for file/folder names in the same directory as the .cfg file # See https://github.com/google/styleguide/issues/220 for more details exclude_files=\.cache|build|build-|install|data diff --git a/DEVELOP.md b/DEVELOP.md index b4eab4d0..cfaf1b63 100644 --- a/DEVELOP.md +++ b/DEVELOP.md @@ -81,15 +81,16 @@ To build the Holoscan SDK on a local environment, the following versions of dev | Dependency | Min version | Needed by | Dockerfile stage | |---|---|---|---| -| CUDA | 12.2 | Core SDK | base | +| CUDA | 12.6 | Core SDK | base | | gRPC | 1.54.2 | Core SDK | grpc-builder | -| UCX | 1.15.0 | Core SDK | ucx-builder | -| GXF | 4.0 | Core SDK | gxf-downloader | -| MOFED | 23.07 | ConnectX | mofed-installer | -| TensorRT | 8.6.1 | Inference operator | base | -| ONNX Runtime | 1.15.1 | Inference operator | onnxruntime-downloader | -| LibTorch | 2.1.0 | Inference operator
(torch plugin) | torch-downloader-[x86_64\|arm64] | -| TorchVision | 0.16.0 | Inference operator
(torch plugin) | torchvision-downloader-[x86_64\|arm64] |
+| UCX | 1.17.0 | Core SDK | base |
+| GXF | 4.1 | Core SDK | gxf-downloader |
+| MOFED | 24.07 | ConnectX | mofed-installer |
+| TensorRT | 10.3 | Inference operator | base |
+| NVPL | 24.03 | LibTorch | build |
+| ONNX Runtime | 1.18.1 | Inference operator | onnxruntime-downloader |
+| LibTorch | 2.5.0 | Inference operator<br>
(torch plugin) | torch-downloader-[x86_64\|arm64] |
+| TorchVision | 0.20.0 | Inference operator<br>
(torch plugin) | torchvision-downloader-[x86_64\|arm64] |
 | Vulkan SDK | 1.3.216 | Holoviz operator | vulkansdk-builder |
 | Vulkan loader and<br>
validation layers | 1.3.204 | Holoviz operator | dev | | spirv-tools | 2022.1 | Holoviz operator | dev | diff --git a/Dockerfile b/Dockerfile index 64a597f3..a96dc1d1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,20 +20,19 @@ ############################################################ # Dependencies ending in _YY.MM are built or extracted from # the TensorRT or PyTorch NGC containers of that same version -ARG ONNX_RUNTIME_VERSION=1.15.1_23.08 -ARG LIBTORCH_VERSION=2.1.0_23.08 -ARG TORCHVISION_VERSION=0.16.0_23.08 +ARG ONNX_RUNTIME_VERSION=1.18.1_38712740_24.08-cuda-12.6 +ARG LIBTORCH_VERSION=2.5.0_24.08 +ARG TORCHVISION_VERSION=0.20.0_24.08 ARG GRPC_VERSION=1.54.2 -ARG UCX_VERSION=1.15.0 -ARG GXF_VERSION=4.0_20240409_bc03d9d -ARG MOFED_VERSION=23.10-2.1.3.1 +ARG GXF_VERSION=447_20241004_bf72709 +ARG MOFED_VERSION=24.07-0.6.1.0 ############################################################ # Base image ############################################################ ARG GPU_TYPE=dgpu -FROM nvcr.io/nvidia/tensorrt:23.08-py3 AS dgpu_base -FROM nvcr.io/nvidia/tensorrt:23.12-py3-igpu AS igpu_base +FROM nvcr.io/nvidia/tensorrt:24.08-py3 AS dgpu_base +FROM nvcr.io/nvidia/tensorrt:24.08-py3-igpu AS igpu_base FROM ${GPU_TYPE}_base AS base ARG DEBIAN_FRONTEND=noninteractive @@ -84,7 +83,7 @@ ARG ONNX_RUNTIME_VERSION # note: built with CUDA and TensorRT providers WORKDIR /opt/onnxruntime RUN curl -S -L -# -o ort.tgz \ - https://edge.urm.nvidia.com/artifactory/sw-holoscan-thirdparty-generic-local/onnxruntime/onnxruntime-${ONNX_RUNTIME_VERSION}-cuda-12.2-$(uname -m).tar.gz + https://edge.urm.nvidia.com/artifactory/sw-holoscan-thirdparty-generic-local/onnxruntime/onnxruntime-${ONNX_RUNTIME_VERSION}-$(uname -m).tar.gz RUN mkdir -p ${ONNX_RUNTIME_VERSION} RUN tar -xf ort.tgz -C ${ONNX_RUNTIME_VERSION} --strip-components 2 @@ -96,15 +95,20 @@ ARG LIBTORCH_VERSION ARG GPU_TYPE # Download libtorch binaries from artifactory -# note: extracted from nvcr.io/nvidia/pytorch:23.07-py3 +# note: extracted from nvcr.io/nvidia/pytorch:24.08-py3 WORKDIR /opt/libtorch/ RUN ARCH=$(uname -m) && if [ "$ARCH" = "aarch64" ]; then ARCH="${ARCH}-${GPU_TYPE}"; fi && \ curl -S -# -o libtorch.tgz -L \ https://edge.urm.nvidia.com/artifactory/sw-holoscan-thirdparty-generic-local/libtorch/libtorch-${LIBTORCH_VERSION}-${ARCH}.tar.gz RUN mkdir -p ${LIBTORCH_VERSION} RUN tar -xf libtorch.tgz -C ${LIBTORCH_VERSION} --strip-components 1 -# Remove kineto from config to remove warning, not needed by holoscan + +# Patch step to remove kineto from config to remove warning, not needed by holoscan RUN find . 
-type f -name "*Config.cmake" -exec sed -i '/kineto/d' {} + +# Patch step for CMake configuration warning +COPY patches/libtorch.Caffe2.cmake.patch ${LIBTORCH_VERSION}/share/cmake/Caffe2/cuda.patch +WORKDIR ${LIBTORCH_VERSION} +RUN patch -p1 < share/cmake/Caffe2/cuda.patch ############################################################ # TorchVision @@ -172,44 +176,21 @@ RUN UBUNTU_VERSION=$(cat /etc/lsb-release | grep DISTRIB_RELEASE | cut -d= -f2) ############################################################ # UCX ############################################################ -FROM mofed-installer AS ucx-builder -ARG UCX_VERSION - -# Clone -WORKDIR /opt/ucx/ -RUN git clone --depth 1 --branch v${UCX_VERSION} https://github.com/openucx/ucx.git src - -# Patch -WORKDIR /opt/ucx/src -RUN curl -L https://github.com/openucx/ucx/pull/9341.patch | git apply +FROM build-tools AS ucx-patcher -# Prerequisites to build -RUN apt-get update \ - && apt-get install --no-install-recommends -y \ - libtool="2.4.6-*" \ - automake="1:1.16.5-*" \ - && rm -rf /var/lib/apt/lists/* - -# Build and install -RUN ./autogen.sh -WORKDIR /opt/ucx/build -RUN ../src/contrib/configure-release-mt --with-cuda=/usr/local/cuda-12 \ - --prefix=/opt/ucx/${UCX_VERSION} -RUN make -j $(( `nproc` > ${MAX_PROC} ? ${MAX_PROC} : `nproc` )) install - -# Apply patches for import and run +# The base container provides custom builds of HPCX libraries without +# the necessary rpath for non-containerized applications. We patch RPATH +# for portability when we later repackage these libraries for distribution +# outside of the container. WORKDIR /opt/ucx/${UCX_VERSION} -# patch cmake config -RUN sed -i "s|set(prefix.*)|set(prefix \"$(pwd)\")|" lib/cmake/ucx/ucx-targets.cmake -# patch rpath (relative to ORIGIN) -RUN patchelf --set-rpath '$ORIGIN' lib/libuc*.so* -RUN patchelf --set-rpath '$ORIGIN:$ORIGIN/..' lib/ucx/libuc*.so* -RUN patchelf --set-rpath '$ORIGIN/../lib' bin/* +RUN patchelf --set-rpath '$ORIGIN' /opt/hpcx/ucx/lib/libuc*.so* \ + && patchelf --set-rpath '$ORIGIN:$ORIGIN/..' 
/opt/hpcx/ucx/lib/ucx/libuc*.so* \ + && patchelf --set-rpath '$ORIGIN/../lib' /opt/hpcx/ucx/bin/* ############################################################ # GXF ############################################################ -FROM base AS gxf-builder +FROM base AS gxf-downloader ARG GXF_VERSION WORKDIR /opt/nvidia/gxf @@ -259,22 +240,21 @@ ENV GRPC=/opt/grpc/${GRPC_VERSION} COPY --from=grpc-builder ${GRPC} ${GRPC} ENV CMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}:${GRPC}" -# Copy UCX -ARG UCX_VERSION -ENV UCX=/opt/ucx/${UCX_VERSION} -COPY --from=ucx-builder ${UCX} ${UCX} -ENV PATH="${PATH}:${UCX}/bin" -ENV CMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}:${UCX}" -# remove older version of UCX in hpcx install -RUN rm -rf /opt/hpcx/ucx /usr/local/ucx -RUN unset OPENUCX_VERSION -# required for gxf_ucx.so to find ucx -ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${UCX}/lib" +# Copy UCX and set other HPC-X runtime paths +ENV HPCX=/opt/hpcx +COPY --from=ucx-patcher ${HPCX}/ucx ${HPCX}/ucx +ENV PATH="${PATH}:${HPCX}/ucx/bin:${HPCX}/ucc/bin:${HPCX}/ompi/bin" +ENV CMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}:${HPCX}/ucx" +# Constrain HPCX's ld config to Holoscan/Torch explicit dependencies, +# to prevent inadvertently picking up non-expected libraries +RUN echo "${HPCX}/ucx/lib" > /etc/ld.so.conf.d/hpcx.conf \ + && echo "${HPCX}/ucc/lib" >> /etc/ld.so.conf.d/hpcx.conf \ + && echo "${HPCX}/ompi/lib" >> /etc/ld.so.conf.d/hpcx.conf # Copy GXF ARG GXF_VERSION ENV GXF=/opt/nvidia/gxf/${GXF_VERSION} -COPY --from=gxf-builder ${GXF} ${GXF} +COPY --from=gxf-downloader ${GXF} ${GXF} ENV CMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}:${GXF}" # Setup Docker & NVIDIA Container Toolkit's apt repositories to enable DooD @@ -289,6 +269,21 @@ RUN install -m 0755 -d /etc/apt/keyrings \ "$(. /etc/os-release && echo "$VERSION_CODENAME")" stable" | \ tee /etc/apt/sources.list.d/docker.list > /dev/null +# Install NVIDIA Performance Libraries on arm64 dGPU platform +# as a runtime requirement for the Holoinfer `libtorch` backend (2.5.0). 
+ARG GPU_TYPE +RUN if [[ $(uname -m) = "aarch64" && ${GPU_TYPE} = "dgpu" ]]; then \ + curl -L https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/sbsa/cuda-keyring_1.1-1_all.deb -O \ + && dpkg -i cuda-keyring_1.1-1_all.deb \ + && apt-get update \ + && apt-get install --no-install-recommends -y \ + nvpl-blas=0.2.0.1-* \ + nvpl-lapack=0.2.2.1-* \ + && apt-get purge -y cuda-keyring \ + && rm cuda-keyring_1.1-1_all.deb \ + && rm -rf /var/lib/apt/lists/* \ + ; fi + # APT INSTALLS # valgrind - dynamic analysis # clang-tidy - static analysis diff --git a/NOTICE.txt b/NOTICE.txt index ed6065c6..fc6ad65d 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -42,8 +42,8 @@ expected (https://github.com/TartanLlama/expected) Licensed under CC0-1.0 (https://github.com/TartanLlama/expected/blob/v1.1.0/COPYING) fmt (https://github.com/fmtlib/fmt) -Copyright (c) 2012 - present, Victor Zverovich -Licensed under MIT (https://github.com/fmtlib/fmt/blob/8.1.1/LICENSE.rst) +Copyright (c) 2012 - present, Victor Zverovich and {fmt} contributors +Licensed under MIT (https://github.com/fmtlib/fmt/blob/10.1.1/LICENSE.rst) GLFW (https://www.glfw.org/) Copyright (c) 2002-2006 Marcus Geelnard @@ -237,7 +237,7 @@ Licensed under Apache-2.0 (http://www.apache.org/licenses/LICENSE-2.0) ONNX Runtime (https://github.com/microsoft/onnxruntime) Copyright (c) Microsoft Corporation -Licensed under MIT (https://github.com/microsoft/onnxruntime/blob/v1.15.1/LICENSE) +Licensed under MIT (https://github.com/microsoft/onnxruntime/blob/v1.18.1/LICENSE) openblas (https://packages.ubuntu.com/jammy/libopenblas0) 2011-2021 The OpenBLAS Project @@ -275,12 +275,16 @@ RAPIDS rapids-cmake (https://github.com/rapidsai/rapids-cmake) Copyright (c) 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. Licensed under Apache-2.0 (https://github.com/rapidsai/rapids-cmake/blob/branch-23.06/LICENSE) +RMM (https://github.com/rapidsai/rmm) +Copyright (c) 2018-2024, NVIDIA CORPORATION. +Licensed under Apache 2.0 (https://github.com/rapidsai/rmm/blob/branch-24.04/LICENSE) + Google Fonts Roboto (https://github.com/googlefonts/roboto/releases/download/v2.138/roboto-android.zip) Licensed under Apache-2.0 (https://github.com/googlefonts/roboto/blob/v2.138/LICENSE) spdlog (https://github.com/gabime/spdlog) Copyright (c) 2016 Gabi Melman. -Licensed under MIT (https://github.com/gabime/spdlog/blob/v1.10.0/LICENSE) +Licensed under MIT (https://github.com/gabime/spdlog/blob/v1.12.0/LICENSE) spirv-tools (https://packages.ubuntu.com/jammy/spirv-tools) 2016-2017 Google Inc. @@ -310,7 +314,7 @@ Copyright (C) 2016-2020 Advanced Micro Devices, Inc. All rights reserved. Copyright (C) 2019 UChicago Argonne, LLC. All rights reserved. Copyright (C) 2020 Huawei Technologies Co., Ltd. All rights reserved. Copyright (C) 2016-2020 Stony Brook University. All rights reserved. -Licensed under BSD-3-clause (https://github.com/openucx/ucx/blob/v1.15.0-rc2/LICENSE) +Licensed under BSD-3-clause (https://github.com/openucx/ucx/blob/v1.17.0/LICENSE) v4l-utils (https://packages.ubuntu.com/jammy/v4l-utils) 2006-2010 Mauro Carvalho Chehab diff --git a/README.md b/README.md index 085eeb8e..d094b6e5 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ The Holoscan User Guide includes: - Requirements and setup steps; - Detailed SDK documentation, including a developer introduction, examples, and API details. 
-We also recommend visiting [NVIDIA HoloHub](https://nvidia-holoscan.github.io/holohub/) to view +We also recommend visiting [NVIDIA HoloHub](https://github.com/nvidia-holoscan/holohub) to view community projects and reusable components available for your Holoscan project. ## Obtaining the Holoscan SDK @@ -54,7 +54,7 @@ and may include them in Holoscan SDK releases at our discretion. Please refer to ### Relation to NVIDIA Clara -In previous releases, the prefix [`Clara`](https://developer.nvidia.com/industries/healthcare) was used to define Holoscan as a platform designed initially for [medical devices](https://www.nvidia.com/en-us/clara/developer-kits/). Starting with version 0.4.0, the Holoscan SDK is built to be domain-agnostic and can be used to build sensor AI applications in multiple domains. Domain specific content will be hosted on the [HoloHub](https://nvidia-holoscan.github.io/holohub) repository. +In previous releases, the prefix [`Clara`](https://developer.nvidia.com/industries/healthcare) was used to define Holoscan as a platform designed initially for [medical devices](https://www.nvidia.com/en-us/clara/developer-kits/). Starting with version 0.4.0, the Holoscan SDK is built to be domain-agnostic and can be used to build sensor AI applications in multiple domains. Domain specific content will be hosted on the [HoloHub](https://github.com/nvidia-holoscan/holohub) repository. ### Repository structure diff --git a/VERSION b/VERSION index 437459cd..e70b4523 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.5.0 +2.6.0 diff --git a/cmake/deps/fmt_rapids.cmake b/cmake/deps/fmt_rapids.cmake index f102de45..e3ca3635 100644 --- a/cmake/deps/fmt_rapids.cmake +++ b/cmake/deps/fmt_rapids.cmake @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -16,13 +16,13 @@ # https://docs.rapids.ai/api/rapids-cmake/stable/command/rapids_cpm_find.html include(${rapids-cmake-dir}/cpm/find.cmake) -rapids_cpm_find(fmt 8.1.1 +rapids_cpm_find(fmt 10.1.1 GLOBAL_TARGETS fmt fmt-header-only BUILD_EXPORT_SET ${HOLOSCAN_PACKAGE_NAME}-exports CPM_ARGS GITHUB_REPOSITORY fmtlib/fmt - GIT_TAG 8.1.1 + GIT_TAG 10.1.1 GIT_SHALLOW TRUE OPTIONS diff --git a/cmake/deps/gxf.cmake b/cmake/deps/gxf.cmake index 33b9a7ca..de94264f 100644 --- a/cmake/deps/gxf.cmake +++ b/cmake/deps/gxf.cmake @@ -21,13 +21,14 @@ set(HOLOSCAN_GXF_COMPONENTS gxe logger multimedia + rmm sample # dependency of GXF::app serialization std ucx ) -find_package(GXF 4.0 CONFIG REQUIRED +find_package(GXF 4.1 CONFIG REQUIRED COMPONENTS ${HOLOSCAN_GXF_COMPONENTS} ) message(STATUS "Found GXF: ${GXF_DIR}") @@ -92,7 +93,7 @@ foreach(component ${HOLOSCAN_GXF_COMPONENTS}) ) # Patch `gxe` executable RUNPATH to find required GXF libraries in the self-contained HSDK installation. - # GXF 4.0 libraries are entirely self-contained and do not require RPATH updates. + # GXF libraries are entirely self-contained and do not require RPATH updates. find_program(PATCHELF_EXECUTABLE patchelf) if(PATCHELF_EXECUTABLE) execute_process( diff --git a/cmake/deps/patches/rmm.patch b/cmake/deps/patches/rmm.patch new file mode 100644 index 00000000..e51b3e14 --- /dev/null +++ b/cmake/deps/patches/rmm.patch @@ -0,0 +1,111 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +diff --git CMakeLists.txt CMakeLists.txt +index 56454d4b..0a59296 100644 +--- CMakeLists.txt ++++ CMakeLists.txt +@@ -12,7 +12,7 @@ + # the License. + # ============================================================================= + +-cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) ++cmake_minimum_required(VERSION 3.24.0 FATAL_ERROR) + + include(rapids_config.cmake) + +@@ -131,6 +131,7 @@ endif() + + # ################################################################################################## + # * install targets -------------------------------------------------------------------------------- ++if(OFF) # disable exports in Holoscan SDK build + + include(CPack) + +@@ -167,6 +168,7 @@ rapids_export( + NAMESPACE rmm:: + DOCUMENTATION doc_string) + ++endif() + # ################################################################################################## + # * make documentation ----------------------------------------------------------------------------- + +diff --git include/rmm/cuda_stream_view.hpp include/rmm/cuda_stream_view.hpp +index a34897d..231aae9 100644 +--- include/rmm/cuda_stream_view.hpp ++++ include/rmm/cuda_stream_view.hpp +@@ -18,6 +18,7 @@ + + #include + ++#define LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE + #include + #include + +diff --git include/rmm/logger.hpp include/rmm/logger.hpp +index 6213221..edaf605 100644 +--- include/rmm/logger.hpp ++++ include/rmm/logger.hpp +@@ -16,6 +16,7 @@ + + #pragma once + ++#define FMT_HEADER_ONLY + #include + #include + #include + +diff --git include/rmm/mr/device/cuda_async_memory_resource.hpp include/rmm/mr/device/cuda_async_memory_resource.hpp +index ac6b7207..702efae6 100644 +--- include/rmm/mr/device/cuda_async_memory_resource.hpp ++++ include/rmm/mr/device/cuda_async_memory_resource.hpp +@@ -85,22 +85,22 @@ class cuda_async_memory_resource final : public device_memory_resource { + * resource should support interprocess communication (IPC). Default is + * `cudaMemHandleTypeNone` for no IPC support. 
+ */ +- // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) +- template , +- thrust::optional>, +- int> = 0> +- [[deprecated("Use std::optional instead of thrust::optional.")]] // +- explicit cuda_async_memory_resource( +- Optional initial_pool_size, +- Optional release_threshold = {}, +- thrust::optional export_handle_type = {}) +- : cuda_async_memory_resource(initial_pool_size.value_or(std::nullopt), +- release_threshold.value_or(std::nullopt), +- export_handle_type.value_or(std::nullopt)) ++ // // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) ++ // template , ++ // thrust::optional>, ++ // int> = 0> ++ // [[deprecated("Use std::optional instead of thrust::optional.")]] // ++ // explicit cuda_async_memory_resource( ++ // Optional initial_pool_size, ++ // Optional release_threshold = {}, ++ // thrust::optional export_handle_type = {}) ++ // : cuda_async_memory_resource(initial_pool_size.value_or(std::nullopt), ++ // release_threshold.value_or(std::nullopt), ++ // export_handle_type.value_or(std::nullopt)) + +- { +- } ++ // { ++ // } + + /** + * @brief Constructs a cuda_async_memory_resource with the optionally specified initial pool size diff --git a/cmake/deps/pybind11.cmake b/cmake/deps/pybind11.cmake index 4cebdcf5..3aa140e6 100644 --- a/cmake/deps/pybind11.cmake +++ b/cmake/deps/pybind11.cmake @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -17,13 +17,13 @@ # https://docs.rapids.ai/api/rapids-cmake/stable/command/rapids_find_package.html# include(${rapids-cmake-dir}/cpm/find.cmake) -rapids_cpm_find(pybind11 2.11.1 +rapids_cpm_find(pybind11 2.13.6 GLOBAL_TARGETS pybind11 CPM_ARGS GITHUB_REPOSITORY pybind/pybind11 - GIT_TAG v2.11.1 + GIT_TAG v2.13.6 GIT_SHALLOW TRUE EXCLUDE_FROM_ALL ) diff --git a/cmake/deps/rmm.cmake b/cmake/deps/rmm.cmake new file mode 100644 index 00000000..4fbde882 --- /dev/null +++ b/cmake/deps/rmm.cmake @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# https://docs.rapids.ai/api/rapids-cmake/stable/command/rapids_cpm_find.html +include(${rapids-cmake-dir}/cpm/find.cmake) + +set(rmm_PATCH_FILEPATH "${CMAKE_SOURCE_DIR}/cmake/deps/patches/rmm.patch") + +rapids_cpm_find(rmm 24.04.00 + GLOBAL_TARGETS rmm + BUILD_EXPORT_SET ${HOLOSCAN_PACKAGE_NAME}-exports + CPM_ARGS + + GITHUB_REPOSITORY rapidsai/rmm + GIT_TAG v24.04.00 + GIT_SHALLOW TRUE + PATCH_COMMAND git apply -p0 "${rmm_PATCH_FILEPATH}" + + EXCLUDE_FROM_ALL +) + +if(rmm_ADDED) + # Install the headers needed for development with the SDK + install(DIRECTORY ${rmm_SOURCE_DIR}/include/rmm + DESTINATION "include" + COMPONENT "holoscan-dependencies" + ) +endif() diff --git a/cmake/deps/spdlog_rapids.cmake b/cmake/deps/spdlog_rapids.cmake index 0fb2e5c2..74baaa7a 100644 --- a/cmake/deps/spdlog_rapids.cmake +++ b/cmake/deps/spdlog_rapids.cmake @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -19,7 +19,7 @@ include(${rapids-cmake-dir}/cpm/find.cmake) # Here we are using rapids_cpm_find() function instead of rapids_cpm_spdlog() function # (https://docs.rapids.ai/api/rapids-cmake/stable/packages/rapids_cpm_spdlog.html), to # override the default options. -set(version 1.10.0) +set(version 1.14.1) rapids_cpm_find(spdlog ${version} GLOBAL_TARGETS spdlog::spdlog spdlog::spdlog_header_only diff --git a/cmake/deps/tensorrt.cmake b/cmake/deps/tensorrt.cmake index b9f12e8e..1ce9cc25 100644 --- a/cmake/deps/tensorrt.cmake +++ b/cmake/deps/tensorrt.cmake @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -find_package(TensorRT 8 MODULE REQUIRED) +find_package(TensorRT 10 MODULE REQUIRED) diff --git a/cmake/deps/ucx.cmake b/cmake/deps/ucx.cmake index 20fa4f2f..af23ffc6 100644 --- a/cmake/deps/ucx.cmake +++ b/cmake/deps/ucx.cmake @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -find_package(ucx 1.14.0 REQUIRED) +find_package(ucx 1.17.0 REQUIRED) install( DIRECTORY ${UCX_LIBRARIES} diff --git a/cmake/modules/FindONNXRuntime.cmake b/cmake/modules/FindONNXRuntime.cmake index 1e0aee1d..9f8c10a7 100644 --- a/cmake/modules/FindONNXRuntime.cmake +++ b/cmake/modules/FindONNXRuntime.cmake @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -29,10 +29,10 @@ # Find headers find_path(ONNXRuntime_INCLUDE_DIR - NAMES onnxruntime/core/session/onnxruntime_c_api.h + NAMES onnxruntime/onnxruntime_c_api.h ) list(APPEND ONNXRuntime_INCLUDE_DIRS "${ONNXRuntime_INCLUDE_DIR}") -list(APPEND ONNXRuntime_INCLUDE_DIRS "${ONNXRuntime_INCLUDE_DIR}/onnxruntime/core/session") +list(APPEND ONNXRuntime_INCLUDE_DIRS "${ONNXRuntime_INCLUDE_DIR}/onnxruntime") mark_as_advanced(ONNXRuntime_INCLUDE_DIR) mark_as_advanced(ONNXRuntime_INCLUDE_DIRS) diff --git a/cmake/modules/FindTensorRT.cmake b/cmake/modules/FindTensorRT.cmake index 544a58f4..641d6a94 100644 --- a/cmake/modules/FindTensorRT.cmake +++ b/cmake/modules/FindTensorRT.cmake @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -34,14 +34,14 @@ mark_as_advanced(TensorRT_INCLUDE_DIR) # Find version function(read_version name str) - string(REGEX MATCH "${name} ([0-9]\\d*)" _ ${str}) + string(REGEX MATCH "${name} ([0-9]+)" _ "${str}") set(${name} ${CMAKE_MATCH_1} PARENT_SCOPE) endfunction() file(READ "${TensorRT_INCLUDE_DIR}/NvInferVersion.h" _TRT_VERSION_FILE) -read_version(NV_TENSORRT_MAJOR ${_TRT_VERSION_FILE}) -read_version(NV_TENSORRT_MINOR ${_TRT_VERSION_FILE}) -read_version(NV_TENSORRT_PATCH ${_TRT_VERSION_FILE}) +read_version(NV_TENSORRT_MAJOR "${_TRT_VERSION_FILE}") +read_version(NV_TENSORRT_MINOR "${_TRT_VERSION_FILE}") +read_version(NV_TENSORRT_PATCH "${_TRT_VERSION_FILE}") set(TensorRT_VERSION "${NV_TENSORRT_MAJOR}.${NV_TENSORRT_MINOR}.${NV_TENSORRT_PATCH}") unset(_TRT_VERSION_FILE) @@ -60,9 +60,7 @@ endmacro() find_trt_library(nvinfer) find_trt_library(nvinfer_plugin) -find_trt_library(nvcaffe_parser) find_trt_library(nvonnxparser) -find_trt_library(nvparsers) # Generate TensorRT_FOUND include(FindPackageHandleStandardArgs) diff --git a/cmake/modules/HoloscanCPack.cmake b/cmake/modules/HoloscanCPack.cmake index 661e62b3..ca695054 100644 --- a/cmake/modules/HoloscanCPack.cmake +++ b/cmake/modules/HoloscanCPack.cmake @@ -65,17 +65,19 @@ set(CPACK_COMPONENTS_ALL # - cuda-cudart-dev: needed for holoscan core and some operators # Note: only cuda-cudart (non dev) needed at runtime set(CPACK_DEBIAN_PACKAGE_DEPENDS - "cuda-nvcc-12-2 | cuda-nvcc-12-9 | cuda-nvcc-12-8 | cuda-nvcc-12-7 | cuda-nvcc-12-6 | cuda-nvcc-12-5 | cuda-nvcc-12-4 | cuda-nvcc-12-3 | cuda-nvcc-12-1 | cuda-nvcc-12-0, \ - cuda-cudart-dev-12-2 | libcudart.so.12-dev" + "cuda-nvcc-12-6 | cuda-nvcc-12-9 | cuda-nvcc-12-8 | cuda-nvcc-12-7 | cuda-nvcc-12-5 | cuda-nvcc-12-4 | cuda-nvcc-12-3 | cuda-nvcc-12-2 |cuda-nvcc-12-1 | cuda-nvcc-12-0, \ + cuda-cudart-dev-12-6 | libcudart.so.12-dev" ) -# - libnvinfer-bin: meta package including required nvinfer libs, cublas, and cudnn. + +# Recommended packages for core runtime functionality: +# - libnvinfer-bin: meta package including required nvinfer libs. 
# Needed for all inference backends -# Note: only libnvonnxparsers and libnvinfer-plugin needed at runtime -# - libcublas: needed by all inference backends +# Note: only libnvinfer, libnvonnxparsers, libnvinfer-plugin needed at runtime +# - libcublas: needed by CuPy, libtorch, and OnnxRuntime # Note: also a dependency of the libnvinfer packages # - cuda-nvrtc: libtorch & CuPy dependency # Note: also a dependency of cuda-nvcc -# Note: should be able to use libnvrtc.so.12, but doesn't work +# Note: should be able to use libnvrtc.so.12, but doesn't work as of Holoscan SDK 2.4 # - libcufft: needed by cupy and OnnxRuntime inference backend # - libcurand: needed by libtorch and cupy # - libcusolver: needed by cupy @@ -83,22 +85,26 @@ set(CPACK_DEBIAN_PACKAGE_DEPENDS # - libnpp-dev: needed for format_converter and bayer_demosaic operators # Note: only libnpp (non dev) needed at runtime # - libnvjitlink: needed by cupy +# - nccl2: needed by cupy and Torch # - libgomp1: needed by cupy # - libvulkan1: needed for holoviz operator # - libegl1: needed for holoviz operator in headless mode # - libv4l-0: needed for v4l2 operator # - python3-cloudpickle: needed for python distributed applications # - python3-pip: needed for holoscan CLI (packager, runner) +# - libnuma1: needed for holoscan::core on ARM64 set(CPACK_DEBIAN_PACKAGE_RECOMMENDS "\ -libnvinfer-bin (>=8.6), libnvinfer-bin (<<9), \ -libcublas-12-2 | libcublas.so.12, \ -cuda-nvrtc-12-2 | cuda-nvrtc-12-9 | cuda-nvrtc-12-8 | cuda-nvrtc-12-7 | cuda-nvrtc-12-6 | cuda-nvrtc-12-5 | cuda-nvrtc-12-4 | cuda-nvrtc-12-3 | cuda-nvrtc-12-1 | cuda-nvrtc-12-0, \ -libcufft-12-2 | libcufft.so.11, \ -libcurand-12-2 | libcurand.so.10, \ -libcusolver-12-2 | libcusolver.so.11, \ -libcusparse-12-2 | libcusparse.so.12, \ -libnpp-dev-12-2 | libnpp.so.12-dev, \ -libnvjitlink-12-2 | libnvjitlink.so.12, \ +libnvinfer-bin (>=10.3), \ +libcublas-12-6 | libcublas.so.12, \ +cudnn9-cuda-12-6 | libcudnn.so.9, \ +cuda-nvrtc-12-6 | cuda-nvrtc-12-9 | cuda-nvrtc-12-8 | cuda-nvrtc-12-7 | cuda-nvrtc-12-5 | cuda-nvrtc-12-4 | cuda-nvrtc-12-3 | cuda-nvrtc-12-2 | cuda-nvrtc-12-1 | cuda-nvrtc-12-0, \ +libcufft-12-6 | libcufft.so.11, \ +libcurand-12-6 | libcurand.so.10, \ +libcusolver-12-6 | libcusolver.so.11, \ +libcusparse-12-6 | libcusparse.so.12, \ +libnpp-dev-12-6 | libnpp.so.12-dev, \ +libnvjitlink-12-6 | libnvjitlink.so.12, \ +libnccl2 | libnccl.so.2, \ libgomp1, \ libvulkan1, \ libegl1, \ @@ -106,9 +112,27 @@ libv4l-0, \ python3-cloudpickle, \ python3-pip" ) +if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + set(CPACK_DEBIAN_PACKAGE_RECOMMENDS "${CPACK_DEBIAN_PACKAGE_RECOMMENDS}, \ +libnuma1") +endif() + +# Packages for optional features: +# - libcupti: needed for Torch inference backend +# - libnvToolsExt: needed for Torch inference backend +# - libcudnn: needed for Torch and OnnxRuntime +# - libcusparselt: needed for Torch inference backend # - libpng, libjpeg, libopenblas: needed for Torch inference backend. 
# - libjpeg needed by v4l2 for mjpeg support -set(CPACK_DEBIAN_PACKAGE_SUGGESTS "libpng16-16, libjpeg-turbo8, libopenblas0") +set(CPACK_DEBIAN_PACKAGE_SUGGESTS "\ +cuda-cupti-12-6 | libcupti.so.12, \ +cuda-nvtx-12-6 | libnvToolsExt.so.1, \ +libcudnn9-cuda-12 | libcudnn.so.9, \ +libcusparselt0 | libcusparselt.so.0, \ +libpng16-16, \ +libjpeg-turbo8, \ +libopenblas0" +) include(CPack) diff --git a/cmake/modules/HoloscanDownloadData.cmake b/cmake/modules/HoloscanDownloadData.cmake index 034a15bc..e77d9b51 100644 --- a/cmake/modules/HoloscanDownloadData.cmake +++ b/cmake/modules/HoloscanDownloadData.cmake @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -72,7 +72,6 @@ function(holoscan_download_data dataname) --download_dir ${DATA_DOWNLOAD_DIR} --download_name ${dataname} ${extra_data_options} - WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}/scripts" BYPRODUCTS ${DATA_BYPRODUCTS} ) diff --git a/cmake/modules/SetupRapidsCMake.cmake b/cmake/modules/SetupRapidsCMake.cmake index e0f51ece..549865d7 100644 --- a/cmake/modules/SetupRapidsCMake.cmake +++ b/cmake/modules/SetupRapidsCMake.cmake @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -14,7 +14,7 @@ # limitations under the License. # https://github.com/rapidsai/rapids-cmake#installation -file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.06/RAPIDS.cmake +file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.04/RAPIDS.cmake ${CMAKE_BINARY_DIR}/RAPIDS.cmake ) include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) diff --git a/cmake/modules/cpack/NOTICE.txt b/cmake/modules/cpack/NOTICE.txt index 9acf490c..9b8e3906 100644 --- a/cmake/modules/cpack/NOTICE.txt +++ b/cmake/modules/cpack/NOTICE.txt @@ -26,8 +26,8 @@ expected (https://github.com/TartanLlama/expected) Licensed under CC0-1.0 (https://github.com/TartanLlama/expected/blob/v1.1.0/COPYING) fmt (https://github.com/fmtlib/fmt) -Copyright (c) 2012 - present, Victor Zverovich -Licensed under MIT (https://github.com/fmtlib/fmt/blob/8.1.1/LICENSE.rst) +Copyright (c) 2012 - present, Victor Zverovich and {fmt} contributors +Licensed under MIT (https://github.com/fmtlib/fmt/blob/10.1.1/LICENSE.rst) GLFW (https://www.glfw.org/) Copyright (c) 2002-2006 Marcus Geelnard @@ -144,12 +144,16 @@ python-on-whales (https://github.com/gabrieldemarmiesse/python-on-whales) Copyright (c) 2020 Gabriel de Marmiesse de Lussan Licensed under MIT (https://github.com/gabrieldemarmiesse/python-on-whales/raw/master/LICENSE) +RMM (https://github.com/rapidsai/rmm) +Copyright (c) 2018-2024, NVIDIA CORPORATION. +Licensed under Apache 2.0 (https://github.com/rapidsai/rmm/blob/branch-24.04/LICENSE) + Google Fonts Roboto (https://github.com/googlefonts/roboto/releases/download/v2.138/roboto-android.zip) Licensed under Apache-2.0 (https://github.com/googlefonts/roboto/blob/v2.138/LICENSE) spdlog (https://github.com/gabime/spdlog) Copyright (c) 2016 Gabi Melman. 
-Licensed under MIT (https://github.com/gabime/spdlog/blob/v1.10.0/LICENSE) +Licensed under MIT (https://github.com/gabime/spdlog/blob/v1.12.0/LICENSE) UCX (https://openucx.org/) Copyright (c) 2014-2015 UT-Battelle, LLC. All rights reserved. @@ -163,7 +167,7 @@ Copyright (C) 2016-2020 Advanced Micro Devices, Inc. All rights reserved. Copyright (C) 2019 UChicago Argonne, LLC. All rights reserved. Copyright (C) 2020 Huawei Technologies Co., Ltd. All rights reserved. Copyright (C) 2016-2020 Stony Brook University. All rights reserved. -Licensed under BSD-3-clause (https://github.com/openucx/ucx/blob/v1.15.0-rc2/LICENSE) +Licensed under BSD-3-clause (https://github.com/openucx/ucx/blob/v1.17.0/LICENSE) yaml-cpp (https://github.com/jbeder/yaml-cpp) Copyright (c) 2008-2015 Jesse Beder. diff --git a/cmake/modules/wrap_operator_as_gxf_template/codelet.cpp.in b/cmake/modules/wrap_operator_as_gxf_template/codelet.cpp.in index 65cab5b3..d0480d4d 100644 --- a/cmake/modules/wrap_operator_as_gxf_template/codelet.cpp.in +++ b/cmake/modules/wrap_operator_as_gxf_template/codelet.cpp.in @@ -20,8 +20,10 @@ namespace @CODELET_NAMESPACE@ { +// NOLINTBEGIN(readability-redundant-member-init) @CODELET_NAME@::@CODELET_NAME@() : holoscan::gxf::OperatorWrapper() { op_ = std::make_shared<@OPERATOR_CLASS@>(); } +// NOLINTEND(readability-redundant-member-init) } // namespace @CODELET_NAMESPACE@ diff --git a/cmake/setup_dependencies.cmake b/cmake/setup_dependencies.cmake index 4c47646b..cfd60ce4 100644 --- a/cmake/setup_dependencies.cmake +++ b/cmake/setup_dependencies.cmake @@ -44,6 +44,7 @@ superbuild_depend(grpc) superbuild_depend(hwloc) superbuild_depend(magic_enum) superbuild_depend(spdlog_rapids) +superbuild_depend(rmm) # fetches spdlog superbuild_depend(tensorrt) superbuild_depend(threads) superbuild_depend(ucx) diff --git a/docs/Dockerfile b/docs/Dockerfile index fe19ed3f..d06afcfb 100644 --- a/docs/Dockerfile +++ b/docs/Dockerfile @@ -73,7 +73,7 @@ RUN python3 -m pip install --no-cache-dir \ ################################################################ FROM $BASE_IMAGE AS docs-html -# Copy over installed denpendencies from docs-base +# Copy over installed dependencies from docs-base COPY --from=docs-base /usr/bin/dot /usr/bin/dot COPY --from=docs-base /usr/local/bin/doxygen /usr/local/bin/doxygen COPY --from=docs-base /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages diff --git a/docs/api/holoscan_cpp_api.md b/docs/api/holoscan_cpp_api.md index feaf1336..b5c4ba92 100644 --- a/docs/api/holoscan_cpp_api.md +++ b/docs/api/holoscan_cpp_api.md @@ -138,7 +138,11 @@ - {ref}`exhale_class_classholoscan_1_1AsynchronousCondition` - {ref}`exhale_class_classholoscan_1_1BooleanCondition` - {ref}`exhale_class_classholoscan_1_1CountCondition` +- {ref}`exhale_class_classholoscan_1_1CudaBufferAvailableCondition` +- {ref}`exhale_class_classholoscan_1_1CudaEventCondition` +- {ref}`exhale_class_classholoscan_1_1CudaStreamCondition` - {ref}`exhale_class_classholoscan_1_1DownstreamMessageAffordableCondition` +- {ref}`exhale_class_classholoscan_1_1ExpiringMessageAvailableCondition` - {ref}`exhale_class_classholoscan_1_1MessageAvailableCondition` - {ref}`exhale_class_classholoscan_1_1PeriodicCondition` @@ -147,6 +151,7 @@ - {ref}`exhale_class_classholoscan_1_1Allocator` - {ref}`exhale_class_classholoscan_1_1BlockMemoryPool` - {ref}`exhale_class_classholoscan_1_1Clock` +- {ref}`exhale_class_classholoscan_1_1CudaAllocator` - {ref}`exhale_class_classholoscan_1_1CudaStreamPool` - 
{ref}`exhale_class_classholoscan_1_1DoubleBufferReceiver`
 - {ref}`exhale_class_classholoscan_1_1DoubleBufferTransmitter`
@@ -154,9 +159,11 @@
 - {ref}`exhale_class_classholoscan_1_1ManualClock`
 - {ref}`exhale_class_classholoscan_1_1RealtimeClock`
 - {ref}`exhale_class_classholoscan_1_1Receiver`
+- {ref}`exhale_class_classholoscan_1_1RMMAllocator`
 - {ref}`exhale_class_classholoscan_1_1SerializationBuffer`
 - {ref}`exhale_class_classholoscan_1_1StdComponentSerializer`
 - {ref}`exhale_class_classholoscan_1_1StdEntitySerializer`
+- {ref}`exhale_class_classholoscan_1_1StreamOrderedAllocator`
 - {ref}`exhale_class_classholoscan_1_1Transmitter`
 - {ref}`exhale_class_classholoscan_1_1UcxComponentSerializer`
 - {ref}`exhale_class_classholoscan_1_1UcxEntitySerializer`
diff --git a/docs/components/resources.md b/docs/components/resources.md
index 17ced065..5cc3efea 100644
--- a/docs/components/resources.md
+++ b/docs/components/resources.md
@@ -21,6 +21,21 @@ This is a memory pool which provides a user-specified number of equally sized bl
 - The `num_blocks` parameter controls the total number of blocks that are allocated in the memory pool.
 - The `dev_id` parameter is an optional parameter that can be used to specify the CUDA ID of the device on which the memory pool will be created.
+### RMMAllocator
+
+This allocator provides a pair of memory pools (one is a CUDA device memory pool and the other corresponds to pinned host memory). The underlying implementation is based on the [RAPIDS memory manager](https://github.com/rapidsai/rmm) (RMM) and uses a pair of `rmm::mr::pool_memory_resource` resource types (the device memory pool is a `rmm::mr::cuda_memory_resource` and the host pool is a `rmm::mr::pinned_memory_resource`). Unlike `BlockMemoryPool`, this allocator can be used with operators like `VideoStreamReplayerOp` that require an allocator capable of allocating both host and device memory. Rather than fixed block sizes, it uses just an initial memory size to allocate and a maximum size that the pool can expand to.
+
+- The `device_memory_initial_size` parameter specifies the initial size of the device (GPU) memory pool. This is an optional parameter that defaults to 8 MB on aarch64 and 16 MB on x86_64. See note below on the format used to specify the value.
+- The `device_memory_max_size` parameter specifies the maximum size of the device (GPU) memory pool. This is an optional parameter that defaults to twice the value of `device_memory_initial_size`. See note below on the format used to specify the value.
+- The `host_memory_initial_size` parameter specifies the initial size of the host (CPU) memory pool. This is an optional parameter that defaults to 8 MB on aarch64 and 16 MB on x86_64. See note below on the format used to specify the value.
+- The `host_memory_max_size` parameter specifies the maximum size of the host (CPU) memory pool. This is an optional parameter that defaults to twice the value of `host_memory_initial_size`. See note below on the format used to specify the value.
+- The `dev_id` parameter is an optional parameter that can be used to specify the GPU device ID (as an integer) on which the memory pool will be created.
+
+:::{note}
+The values for the memory parameters, such as `device_memory_initial_size`, must be specified in the form of a string containing a non-negative integer value followed by a suffix representing the units. Supported units are B, KB, MB, GB and TB where the values are powers of 1024 bytes
+(e.g. MB = 1024 * 1024 bytes). Examples of valid units are "512MB", "256 KB", "1 GB". If a floating point number is specified, the decimal portion will be truncated (i.e. the value is rounded down to the nearest integer).
+:::
+
 
 ### CudaStreamPool
 
 This allocator creates a pool of CUDA streams.
diff --git a/docs/examples/byom.md b/docs/examples/byom.md
index f198b4c8..14f7f335 100644
--- a/docs/examples/byom.md
+++ b/docs/examples/byom.md
@@ -172,7 +172,7 @@ Next, we look at the operators and their parameters defined in the application Y
 ```
 - An instance of the `UnboundedAllocator` resource class is created (line `2`) and used by subsequent operators for memory allocation. This allocator allocates memory dynamically on the host as needed. For applications where latency
-becomes an issue, there is the `BlockMemoryPool` allocator.
+becomes an issue, an allocator supporting a memory pool such as `BlockMemoryPool` or `RMMAllocator` could be used instead.
 - The preprocessor operator (line `8`) takes care of converting the input video from the source video to a format that can be used by the AI model.
 - The inference operator (line `12`) feeds the output from the preprocessor to the AI model to perform inference.
 - The postprocessor operator (line `20`) postprocesses the output from the inference operator before passing it downstream to the visualizer.
@@ -381,7 +381,7 @@ In general, when deploying your own AI models, you will need to consider the ope
 - **Output**: The postprocessed stream can be displayed or used by other downstream operators.
 
 The Holoscan SDK comes with a number of [built-in operators](https://github.com/nvidia-holoscan/holoscan-sdk/tree/main/src/operators) that you can use to configure your own workflow.
-If needed, you can write your own custom operators or visit [Holohub](https://nvidia-holoscan.github.io/holohub/) for additional implementations and ideas for operators.
+If needed, you can write your own custom operators or visit [Holohub](https://github.com/nvidia-holoscan/holohub) for additional implementations and ideas for operators.
 
 ## Running the Application
diff --git a/docs/flow_tracking.md b/docs/flow_tracking.md
index 3fca40b4..b45e6bab 100644
--- a/docs/flow_tracking.md
+++ b/docs/flow_tracking.md
@@ -2,15 +2,12 @@
 # Data Flow Tracking
 
 :::{warning}
-Data Flow Tracking is currently not supported between multiple fragments in a [distributed application](./holoscan_create_distributed_app.md).
+Data Flow Tracking is currently only supported between multiple fragments of a [distributed application](./holoscan_create_distributed_app.md) running on a single machine.
 :::
 
-The Holoscan SDK provides the Data Flow Tracking APIs as a mechanism to profile your application and
-analyze the fine-grained timing properties and data flow between operators in the graph of a fragment.
+The Holoscan SDK provides the Data Flow Tracking APIs as a mechanism to profile your application and analyze the fine-grained timing properties and data flow between operators in the graph of a fragment.
 
-Currently, data flow tracking is only supported between the root operators and leaf operators of a
-graph and in simple cycles in a graph (support for tracking data flow between any pair of operators
-in a graph is planned for the future).
+Currently, data flow tracking is only supported between the root operators and leaf operators of a graph and in simple cycles in a graph (support for tracking data flow between any pair of operators in a graph is planned for the future).
 
 - A *root operator* is an operator without any predecessor nodes.
- A *leaf operator* (also known as a *sink operator*) is an operator without any successor nodes. @@ -32,10 +29,7 @@ The API also provides the ability to retrieve the number of messages sent from t ## Enabling Data Flow Tracking -Before an application ({cpp:class}`C++ `/{py:class}`python `) is run with the `run()` method, -data flow tracking can be enabled by calling the `track()` method in -{cpp:func}`C++ ` and using the `Tracker` class in -{py:class}`python `. +Before an application ({cpp:class}`C++ `/{py:class}`python `) is run with the `run()` method, data flow tracking can be enabled. For single fragment applications, this can be done by calling the `track()` method in {cpp:func}`C++ ` and using the `Tracker` class in {py:class}`python `. `````{tab-set} ````{tab-item} C++ @@ -64,10 +58,37 @@ with Tracker(app) as tracker: ```` ````` +## Enabling Data Flow Tracking for Distributed Applications + +For distributed (multi-fragment) applications, a separate tracker object is used for each Fragment so the API is slightly different than in the single fragment case. + + +`````{tab-set} +````{tab-item} C++ +```{code-block} cpp +:emphasize-lines: 2 +:name: holoscan-enable-data-flow-tracking-cpp +auto app = holoscan::make_application(); +auto trackers = app->track_distributed(); // Enable data flow tracking for a distributed app +// Change tracker and application configurations +... +app->run(); +``` +Note that instead of a returning a single `DataFlowTracker*` like `track`, the `track_distributed` method returns a `std::unordered_map` where the keys are the names of the fragments. +```` +````{tab-item} Python +```{code-block} python +with Tracker(app) as trackers: + app.run() +``` +The `Tracker` context manager detects whether the app is distributed and returns a `dict[str, DataFlowTracker]` as `trackers` in the distributed case. For a single fragment application, the returned value is just a single `DataFlowTracker` object. +```` +````` + ## Retrieving Data Flow Tracking Results After an application has been run, data flow tracking results can be accessed by -various functions: +various methods on the DataFlowTracker ({cpp:class}`C++ `/{py:class}`python `) class. 1. `print()` ({cpp:func}`C++ `/{py:func}`python `) - Prints all data flow tracking results including end-to-end latencies and the number of @@ -117,7 +138,7 @@ tracker.print(); from holoscan.core import Tracker ... app = MyPingApp() -with Tracker(app) as tracker: +with Tracker(app) as trackers: # Change tracker and application configurations ... app.run() @@ -126,9 +147,49 @@ with Tracker(app) as tracker: ```` ````` +If this was a distributed application, there would instead be a separate `DataFlowTracker` for each fragment. The overall flow tracking results for all fragments can be printed as in the following: + +`````{tab-set} +````{tab-item} C++ +```{code-block} cpp +:emphasize-lines: 6-10 +:name: holoscan-enable-data-flow-tracking-results-cpp +auto app = holoscan::make_application(); +auto trackers = app->track_distributed(); // Enable data flow tracking for a distributed app +// Change application configurations +... +app->run(); +// print the data flow tracking results +for (const auto& [name, tracker] : trackers) { + std::cout << "Fragment: " << name << std::endl; + tracker->print(); +} +``` +```` +````{tab-item} Python +```{code-block} python +:emphasize-lines: 6-9 +:name: holoscan-one-operator-workflow-python +from holoscan.core import Tracker +... 
+app = MyPingApp() +with Tracker(app) as trackers: + # Change tracker and application configurations + ... + app.run() + # print the data flow tracking results + for fragment_name, tracker in trackers.items(): + print(f"Fragment: {fragment_name}") + tracker.print() +``` +```` +````` + ## Customizing Data Flow Tracking -Data flow tracking can be customized using a few, optional configuration parameters. The `track()` method ({cpp:func}`C++ `/{py:class}`Tracker class in python `) can be configured to skip a few messages at the beginning of an application's execution as a *warm-up* period. It is also possible to discard a few messages at the end of an application's run as a *wrap-up* period. Additionally, outlier end-to-end latencies can be ignored by setting a latency threshold value which is the minimum latency below which the observed latencies are ignored. +Data flow tracking can be customized using a few, optional configuration parameters. The `track()` method ({cpp:func}`C++ `//{py:func}`Python `) (or `track_distributed` method ({cpp:func}`C++ `/{py:func}`Python `)` for distributed apps) can be configured to skip a few messages at the beginning of an application's execution as a *warm-up* period. It is also possible to discard a few messages at the end of an application's run as a *wrap-up* period. Additionally, outlier end-to-end latencies can be ignored by setting a latency threshold value (in ms) which is the minimum latency below which the observed latencies are ignored. + +For Python, it is recommended to use the {py:class}`Tracker` context manager class instead of the `track` or `track_distributed` methods. This class will autodetect if the application is a single fragment or distributed app, using the appropriate method for each. :::{tip} For effective benchmarking, it is common practice to include warm-up and cool-down periods by skipping the initial and final messages. @@ -147,8 +208,8 @@ Fragment::track(uint64_t num_start_messages_to_skip = kDefaultNumStartMessagesTo ```{code-block} python :caption: Optional parameters to `Tracker` Tracker(num_start_messages_to_skip=num_start_messages_to_skip, - num_last_messages_to_discard=num_last_messages_to_discard, - latency_threshold=latency_threshold) + num_last_messages_to_discard=num_last_messages_to_discard, + latency_threshold=latency_threshold) ``` ```` ````` @@ -182,7 +243,7 @@ app->run(); ```` ````{tab-item} Python ```{code-block} python -:emphasize-lines: 2 +:emphasize-lines: 4 :name: holoscan-flow-tracking-logging-python from holoscan.core import Tracker ... diff --git a/docs/getting_started.md b/docs/getting_started.md index f423a909..48b01b61 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -23,12 +23,12 @@ The Holoscan SDK does not only provide a framework to build and run applications The list of existing operators is available [here](./holoscan_operators_extensions.md), which points to the C++ or Python API documentation for more details. Specific documentation is available for the [visualization](./visualization.md) (codename: HoloViz) and [inference](./inference.md) (codename: HoloInfer) operators. 
-Additionally, [HoloHub](https://nvidia-holoscan.github.io/holohub) is a central repository for users and developers to share reusable operators and sample applications with the Holoscan community, extending the capabilities of the SDK: +Additionally, [HoloHub](https://github.com/nvidia-holoscan/holohub) is a central repository for users and developers to share reusable operators and sample applications with the Holoscan community, extending the capabilities of the SDK: - Just like the SDK operators, the HoloHub operators can be used in your own Holoscan applications. - The HoloHub sample applications can be used as reference implementations to complete the examples available in the SDK. -Take a glance at [HoloHub](https://nvidia-holoscan.github.io/holohub) to find components you might want to leverage in your application, improve upon existing work, or contribute your own additions to the Holoscan platform. +Take a glance at [HoloHub](https://github.com/nvidia-holoscan/holohub) to find components you might want to leverage in your application, improve upon existing work, or contribute your own additions to the Holoscan platform. ## 5. Write and run your own application diff --git a/docs/holoscan_create_distributed_app.md b/docs/holoscan_create_distributed_app.md index bd1f82b1..dbcfcf4c 100644 --- a/docs/holoscan_create_distributed_app.md +++ b/docs/holoscan_create_distributed_app.md @@ -251,6 +251,8 @@ You can set environment variables to modify the default actions of services and - **HOLOSCAN_UCX_SERIALIZATION_BUFFER_SIZE** : can be used to override the default 7 kB serialization buffer size. This should typically not be needed as tensor types store only a small header in this buffer to avoid explicitly making a copy of their data. However, other data types do get directly copied to the serialization buffer and in some cases it may be necessary to increase it. +- **HOLOSCAN_UCX_ASYNCHRONOUS** : If set to true, asynchronous transmit of UCX messages between fragments is enabled (this is the default). Setting this to False, forces synchronous transmission. Synchronous mode makes it easier to use an allocator like `BlockMemoryPool` as additional tensors would not be queued before the prior one was sent. + - **HOLOSCAN_UCX_DEVICE_ID** : The GPU ID of the device that will be used by UCX transmitter/receivers in distributed applications. If unspecified, it defaults to 0. A list of discrete GPUs available in a system can be obtained via `nvidia-smi -L`. GPU data sent between fragments of a distributed application must be on this device. - **HOLOSCAN_UCX_PORTS** : This defines the preferred port numbers for the SDK when specific ports for UCX communication need to be predetermined, such as in a Kubernetes environment. If the distributed application requires three ports (UCX receivers) and the environment variable is unset, the SDK chooses three unused ports sequentially from the range 10000~32767. Specifying a value, for example, `HOLOSCAN_UCX_PORTS=10000`, results in the selection of ports 10000, 10001, and 10002. Multiple starting values can be comma-separated. The system increments from the last provided port if more ports are needed. Any unused specified ports are ignored. 
diff --git a/docs/hsdk_faq.md b/docs/hsdk_faq.md
index c67a78e2..3009864b 100644
--- a/docs/hsdk_faq.md
+++ b/docs/hsdk_faq.md
@@ -32,13 +32,13 @@ A1: There are multiple ways to install the Holoscan SDK:
* For **dGPU** (x86_64, IGX Orin dGPU, Clara AGX dGPU, GH200)
```
-docker pull nvcr.io/nvidia/clara-holoscan/holoscan:v2.5.0-dgpu
+docker pull nvcr.io/nvidia/clara-holoscan/holoscan:v2.6.0-dgpu
```
* For **iGPU** (Jetson, IGX Orin iGPU, Clara AGX iGPU)
```
-docker pull nvcr.io/nvidia/clara-holoscan/holoscan:v2.5.0-igpu
+docker pull nvcr.io/nvidia/clara-holoscan/holoscan:v2.6.0-igpu
```
For more information, please refer to details and usage instructions on [**NGC**](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/clara-holoscan/containers/holoscan).
@@ -637,6 +637,9 @@ A36: Yes, for linear inference pipelines or applications with minimal computatio
A37: Increasing the number of worker threads can improve performance up to a point, but it also increases CPU usage.
+**Q38: Is there any memory pool (allocator) that supports both host and device memory?**
+
+A38: Please use `RMMAllocator` for this purpose. It supports simultaneous memory pools for CUDA device memory and pinned host memory. A `BlockMemoryPool` can be used on either host or device memory, but cannot support both types at the same time. `UnboundedAllocator` also supports both host and device memory, but is not a memory pool (it allocates and frees new memory each time).
## Performance
@@ -1144,6 +1147,17 @@ A19:To resolve these errors, edit the `/etc/docker/daemon.json` file to include
You may need to consult your IT team and replace `IP-x` and `DNS-SERVER-x` with the provided values.
+**Q20: I am seeing the following error when trying to use the `RMMAllocator`**
+
+A20: If the application fails to start with an error like the following being logged:
+
+```cpp
+[error] [rmm_allocator.cpp:74] Unexpected error while initializing RMM Allocator rmm_allocator: std::bad_alloc: out_of_memory: RMM failure at:bazel-out/k8-opt/bin/external/rmm/_virtual_includes/rmm/rmm/mr/device/pool_memory_resource.hpp:424: Maximum pool size exceeded
+```
+
+This indicates that the requested memory sizes on the host and/or device exceed the available memory. Please make sure that your device supports the specified memory size. Also check that the values for `device_memory_initial_size`, `device_memory_max_size`, `host_memory_initial_size`, and `host_memory_max_size` were specified using the intended units (B, KB, MB, GB, or TB).
+
+
## Miscellaneous
**Q1: Can I use DLA cores with the Holoscan SDK?**
diff --git a/docs/inference.md b/docs/inference.md
index 293bf9ca..557bf1bb 100644
--- a/docs/inference.md
+++ b/docs/inference.md
@@ -108,6 +108,10 @@ Required parameters and related features available with the Holoscan Inference M
"model_2_unique_identifier": "torch"
"model_3_unique_identifier": "torch"
```
+ - `trt_opt_profile`: This parameter is optional and is only used with the TensorRT backend. It applies to models with dynamic input shapes.
+ - The parameter is specified as a vector of 3 integers: the first is the minimum batch size for the input, the second is the optimum batch size, and the third is the maximum batch size.
+ - Users can specify a batch profile for dynamic inputs; this profile is then used during engine creation. Users must clear the cache to apply an updated optimization profile.
+ - Default value: {1,1,1}
- Other features: The table below illustrates other features and supported values in the current release.
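Relating to the `RMMAllocator` FAQ entries above (Q38/Q20), the sketch below shows one way such a pool might be configured from Python, with explicit device and pinned-host sizes expressed in the units mentioned in the answer. The import path (`holoscan.resources.RMMAllocator`), the keyword names, and the use of `FormatConverterOp` as the consuming operator are assumptions to verify against the API reference for your SDK version.

```python
# Sketch: configure an RMM-backed allocator with both a CUDA device pool and a
# pinned host pool, then hand it to an operator that accepts a memory pool.
# Keep the sizes within what the system actually provides to avoid the
# "Maximum pool size exceeded" error discussed in the FAQ.
from holoscan.core import Application
from holoscan.operators import FormatConverterOp
from holoscan.resources import RMMAllocator


class RMMExampleApp(Application):
    def compose(self):
        rmm_pool = RMMAllocator(
            self,
            name="rmm_pool",
            device_memory_initial_size="16MB",
            device_memory_max_size="32MB",
            host_memory_initial_size="16MB",
            host_memory_max_size="32MB",
        )

        # FormatConverterOp is used only as an example of an operator with a
        # pool parameter; any operator that takes an allocator works the same way.
        converter = FormatConverterOp(self, name="converter", pool=rmm_pool, out_dtype="rgb888")
        # ... create the remaining operators and connect them with add_flow()
```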
diff --git a/docs/overview.md b/docs/overview.md index 18d09283..6448dc6a 100644 --- a/docs/overview.md +++ b/docs/overview.md @@ -22,7 +22,7 @@ The Holoscan SDK provides a list of examples to illustrate specific capabilities 5. **Repository of Operators and Applications** -[HoloHub](https://nvidia-holoscan.github.io/holohub) is a central repository for users and developers to share reusable operators and sample applications with the Holoscan community. Being open-source, these operators and applications can also be used as reference implementations to complete the built-in operators and examples available in the SDK. +[HoloHub](https://github.com/nvidia-holoscan/holohub) is a central repository for users and developers to share reusable operators and sample applications with the Holoscan community. Being open-source, these operators and applications can also be used as reference implementations to complete the built-in operators and examples available in the SDK. 6. **Tooling to Package and Deploy Applications** @@ -44,5 +44,5 @@ The Holoscan SDK documentation is composed of: - [Release notes](https://github.com/nvidia-holoscan/holoscan-sdk/releases) on Github :::{note} -In previous releases, the prefix [`Clara`](https://developer.nvidia.com/industries/healthcare) was used to define Holoscan as a platform designed initially for [medical devices](https://www.nvidia.com/en-us/clara/developer-kits/). Starting with version 0.4.0, the Holoscan SDK is built to be domain-agnostic and can be used to build sensor AI applications in multiple domains. Domain specific content will be hosted on the [HoloHub](https://nvidia-holoscan.github.io/holohub) repository. +In previous releases, the prefix [`Clara`](https://developer.nvidia.com/industries/healthcare) was used to define Holoscan as a platform designed initially for [medical devices](https://www.nvidia.com/en-us/clara/developer-kits/). Starting with version 0.4.0, the Holoscan SDK is built to be domain-agnostic and can be used to build sensor AI applications in multiple domains. Domain specific content will be hosted on the [HoloHub](https://github.com/nvidia-holoscan/holohub) repository. ::: diff --git a/docs/sdk_installation.md b/docs/sdk_installation.md index eccc1fa9..2dab76f9 100644 --- a/docs/sdk_installation.md +++ b/docs/sdk_installation.md @@ -19,10 +19,11 @@ Developer Kit | User Guide | OS | GPU Mode ------------- | ---------- | --- | --- [NVIDIA IGX Orin][igx] | [Guide][igx-guide] | [IGX Software][igx-sw] 1.0 Production Release | iGPU **or*** dGPU [NVIDIA Jetson AGX Orin and Orin Nano][jetson-orin] | [Guide][jetson-guide] | [JetPack][jp] 6.0 | iGPU -[NVIDIA Clara AGX][clara-agx]
_Only supporting the NGC container_ | [Guide][clara-guide] | [HoloPack][sdkm] 1.2 | iGPU **or*** dGPU +[NVIDIA Clara AGX][clara-agx]
_Only supporting the NGC container_ | [Guide][clara-guide] | [HoloPack][sdkm] 1.2
_[Upgrade to 535+ drivers required][cagx-upgrade]_ | dGPU [clara-agx]: https://www.nvidia.com/en-gb/clara/intelligent-medical-instruments [clara-guide]: https://github.com/nvidia-holoscan/holoscan-docs/blob/main/devkits/clara-agx/clara_agx_user_guide.md +[cagx-upgrade]: https://github.com/nvidia-holoscan/holoscan-docs/blob/main/devkits/clara-agx/clara_agx_user_guide.md#update-nvidia-drivers [sdkm]: https://developer.nvidia.com/drive/sdk-manager [igx]: https://www.nvidia.com/en-us/edge-computing/products/igx/ [igx-guide]: https://developer.nvidia.com/igx-orin-developer-kit-user-guide @@ -81,11 +82,11 @@ We provide multiple ways to install and run the Holoscan SDK: ````{tab-item} NGC Container - **dGPU** (x86_64, IGX Orin dGPU, Clara AGX dGPU, GH200) ```bash - docker pull nvcr.io/nvidia/clara-holoscan/holoscan:v2.5.0-dgpu + docker pull nvcr.io/nvidia/clara-holoscan/holoscan:v2.6.0-dgpu ``` - **iGPU** (Jetson, IGX Orin iGPU, Clara AGX iGPU) ```bash - docker pull nvcr.io/nvidia/clara-holoscan/holoscan:v2.5.0-igpu + docker pull nvcr.io/nvidia/clara-holoscan/holoscan:v2.6.0-igpu ``` See details and usage instructions on [NGC][container]. ```` @@ -198,7 +199,7 @@ For x86_64, ensure that the [CUDA Runtime is installed](https://developer.nvidia | | NGC dev Container | Debian Package | Python Wheels | |---|:---:|:---:|:---:| | Runtime libraries | **Included** | **Included** | **Included** | -| Python module | 3.10 | 3.10 | **3.8 to 3.11** | +| Python module | 3.10 | 3.10 | **3.9 to 3.12** | | C++ headers and
CMake config | **Included** | **Included** | N/A | | Examples (+ source) | **Included** | **Included** | [retrieve from
GitHub][examples] | | Sample datasets | **Included** | [retrieve from
NGC][data] | [retrieve from
NGC][data] | @@ -225,14 +226,14 @@ For x86_64, ensure that the [CUDA Runtime is installed](https://developer.nvidia [^1]: [CUDA 12](https://docs.nvidia.com/cuda/archive/12.1.1/cuda-installation-guide-linux/index.html) is required. Already installed on NVIDIA developer kits with IGX Software and JetPack. [^2]: Debian installation on x86_64 requires the [latest cuda-keyring package](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#network-repo-installation-for-ubuntu) to automatically install all dependencies. [^3]: NPP 12 needed for the FormatConverter and BayerDemosaic operators. Already installed on NVIDIA developer kits with IGX Software and JetPack. -[^4]: TensorRT 8.6.1+ and cuDNN needed for the Inference operator. Already installed on NVIDIA developer kits with IGX Software and JetPack. +[^4]: TensorRT 10.3+ needed for the Inference operator. Already installed on NVIDIA developer kits with IGX Software and JetPack. [^5]: Vulkan 1.3.204+ loader needed for the HoloViz operator (+ libegl1 for headless rendering). Already installed on NVIDIA developer kits with IGX Software and JetPack. [^6]: V4L2 1.22+ needed for the V4L2 operator. Already installed on NVIDIA developer kits with IGX Software and JetPack. V4L2 also requires libjpeg. -[^7]: Torch support requires LibTorch 2.1+, TorchVision 0.16+, OpenBLAS 0.3.20+, OpenMPI (aarch64 only), MKL 2021.1.1 (x86_64 only), libpng and libjpeg. +[^7]: Torch support requires LibTorch 2.5+, TorchVision 0.20+, OpenBLAS 0.3.20+, OpenMPI v4.1.7a1+, UCC 1.4+, MKL 2021.1.1 (x86_64 only), NVIDIA Performance Libraries (aarch64 dGPU only), libpng, and libjpeg. Note that container builds use OpenMPI and UCC originating from the NVIDIA HPC-X package bundle. [^8]: To install LibTorch and TorchVision, either build them from source, download our [pre-built packages](https://edge.urm.nvidia.com/artifactory/sw-holoscan-thirdparty-generic-local/), or copy them from the holoscan container (in `/opt`). -[^9]: ONNXRuntime 1.15.1+ needed for the Inference operator. Note that ONNX models are also supported through the TensoRT backend of the Inference Operator. +[^9]: ONNXRuntime 1.18.1+ needed for the Inference operator. Note that ONNX models are also supported through the TensorRT backend of the Inference Operator. [^10]: To install ONNXRuntime, either build it from source, download our [pre-built package](https://edge.urm.nvidia.com/artifactory/sw-holoscan-thirdparty-generic-local/) with CUDA 12 and TensoRT execution provider support, or copy it from the holoscan container (in `/opt/onnxruntime`). -[^11]: Tested with MOFED 23.10 +[^11]: Tested with MOFED 24.07 ### Need more control over the SDK? 
diff --git a/docs/set_up_gpudirect_rdma.md b/docs/set_up_gpudirect_rdma.md index 75823635..367ff8fa 100644 --- a/docs/set_up_gpudirect_rdma.md +++ b/docs/set_up_gpudirect_rdma.md @@ -30,7 +30,7 @@ If not installed, or an older version is installed, you should install the appro ```bash # You can choose different versions/OS or download directly from the # Download Center in the webpage linked above -MOFED_VERSION="23.10-2.1.3.1" +MOFED_VERSION="24.07-0.6.1.0" OS="ubuntu22.04" MOFED_PACKAGE="MLNX_OFED_LINUX-${MOFED_VERSION}-${OS}-$(uname -m)" wget --progress=dot:giga https://www.mellanox.com/downloads/ofed/MLNX_OFED-${MOFED_VERSION}/${MOFED_PACKAGE}.tgz diff --git a/docs/use_igpu_with_dgpu.md b/docs/use_igpu_with_dgpu.md index 67772584..de2c4a0e 100644 --- a/docs/use_igpu_with_dgpu.md +++ b/docs/use_igpu_with_dgpu.md @@ -11,11 +11,21 @@ We provide utilities to work around the second conflict: `````{tab-set} ````{tab-item} IGX SW 1.0 -Refer to the [IGX user guide](https://docs.nvidia.com/igx-orin/user-guide/latest/igpu-dgpu.html) to learn how to leverage the iGPU in containers while the IGX developer kit is flashed in dGPU mode. -To leverage both GPUs in Holoscan, you can create separate applications running concurrently per the IGX documentation above, where the iGPU application must run in the Holoscan iGPU container, and the dGPU application can run bare metal or in the Holoscan dGPU container. +1. From an IGX developer kit flashed for dGPU, run the following command to enable iGPU container support: -You can also create a single distributed application that leverages both the iGPU and dGPU by executing separate fragments on the iGPU and on the dGPU. + ```bash + sudo /opt/nvidia/l4t-igpu-container-on-dgpu-host-config/l4t-igpu-container-on-dgpu-host-config.sh configure + ``` + + Refer to the [IGX user guide][igx-igpu-dgpu] for details. + +2. To leverage both GPUs in Holoscan, you can either: + + 1. create separate Holoscan applications running concurrently, where the iGPU application must run in the Holoscan iGPU container, and the dGPU application can run bare metal or in the Holoscan dGPU container. Refer to the [IGX user guide][igx-igpu-dgpu] for details on how to launch a Holoscan container using the iGPU. + 2. create a single distributed application that leverages both the iGPU and dGPU by executing separate fragments on the iGPU and on the dGPU. + +[igx-igpu-dgpu]: https://docs.nvidia.com/igx-orin/user-guide/latest/igpu-dgpu.html The example below shows the ping distributed application between the iGPU and dGPU using Holoscan containers: @@ -24,7 +34,7 @@ COMMON_DOCKER_FLAGS="--rm -i --init --net=host --runtime=nvidia -e NVIDIA_DRIVER_CAPABILITIES=all --cap-add CAP_SYS_PTRACE --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 " -HOLOSCAN_VERSION=2.2.0 +HOLOSCAN_VERSION=2.6.0 HOLOSCAN_IMG="nvcr.io/nvidia/clara-holoscan/holoscan:v$HOLOSCAN_VERSION" HOLOSCAN_DGPU_IMG="$HOLOSCAN_IMG-dgpu" HOLOSCAN_IGPU_IMG="$HOLOSCAN_IMG-igpu" @@ -45,13 +55,13 @@ docker run \ docker run \ $COMMON_DOCKER_FLAGS \ -e NVIDIA_VISIBLE_DEVICES=nvidia.com/igpu=0 \ - $HOLOSCAN_IMG-igpu \ + $HOLOSCAN_IGPU_IMG \ bash -c "./examples/ping_distributed/cpp/ping_distributed --gpu --worker" ``` ```` ````{tab-item} HoloPack 1.2+ -The [L4T Compute Assist](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/clara-holoscan/containers/l4t-compute-assist) is a container on NGC which isolates the iGPU stack in order to enable iGPU compute on the developer kits configured for dGPU. 
Other applications can run concurrently on the dGPU, natively or in another container. +The [L4T Compute Assist](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/clara-holoscan/containers/l4t-compute-assist) is a container on NGC which isolates the iGPU stack by containing the L4T BSP packages in order to enable iGPU compute on the developer kits configured for dGPU. Other applications can run concurrently on the dGPU, natively or in another container. ```` ````` diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 41785517..e75207bc 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -12,42 +12,87 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +cmake_minimum_required(VERSION 3.20) +project(Holoscan-examples NONE) + +# If we build outside of the SDK main build we set the default value to true +if(NOT DEFINED HOLOSCAN_BUILD_PYTHON) + set(HOLOSCAN_BUILD_PYTHON ON) +endif() + +option(HOLOSCAN_CPP_EXAMPLES "Build C++ examples" ON) +option(HOLOSCAN_PYTHON_EXAMPLES "Build Python examples" ${HOLOSCAN_BUILD_PYTHON}) + +# Check if the data target exists otherwise define it +if(NOT TARGET racerx_data) + # RacerX sample data + add_library(racerx_data INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}/../data/racerx/racerx-small.mp4") + + # Setup the installation rule + install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/racerx DESTINATION data COMPONENT holoscan-data) + + # Download the datasets + list(APPEND CMAKE_PROGRAM_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../scripts) + include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/modules/HoloscanDownloadData.cmake) + + set(RACERX_DATA_VERSION "20231009") + set(RACERX_DATA_MD5 "86cd7e5477bb9eaa0cfc0547912d77b2") + + # Download the racerX sample data + holoscan_download_data(racerx + URL https://api.ngc.nvidia.com/v2/resources/nvidia/clara-holoscan/holoscan_racerx_video/versions/${RACERX_DATA_VERSION}/zip + URL_MD5 ${RACERX_DATA_MD5} + DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../data + BYPRODUCTS "${CMAKE_CURRENT_SOURCE_DIR}/../data/racerx/racerx-small.mp4" + ) + + # Add dependency to force the download + add_dependencies(racerx_data racerx_download) +endif() + +# Enable testing +include(CTest) + if(HOLOSCAN_BUILD_AJA) add_subdirectory(aja_capture) endif() + +# C++ only examples +if(HOLOSCAN_CPP_EXAMPLES) + add_subdirectory(wrap_operator_as_gxf_extension) + add_subdirectory(ping_any) + add_subdirectory(ping_conditional) + add_subdirectory(ping_cycle) +endif() + +# Python Only examples +if(HOLOSCAN_PYTHON_EXAMPLES) + add_subdirectory(cupy_native) + add_subdirectory(numpy_native) + add_subdirectory(python_decorator) +endif() + +# C++ and Python examples add_subdirectory(bring_your_own_model) add_subdirectory(cli_packager) add_subdirectory(conditions) -if(HOLOSCAN_BUILD_PYTHON) - add_subdirectory(cupy_native) -endif() add_subdirectory(flow_tracker) add_subdirectory(hello_world) add_subdirectory(holoviz) add_subdirectory(import_gxf_components) add_subdirectory(multithread) add_subdirectory(multi_branch_pipeline) -if(HOLOSCAN_BUILD_PYTHON) - add_subdirectory(numpy_native) -endif() -add_subdirectory(ping_any) -add_subdirectory(ping_conditional) -add_subdirectory(ping_cycle) add_subdirectory(ping_simple) add_subdirectory(ping_simple_run_async) add_subdirectory(ping_custom_op) add_subdirectory(ping_multi_port) add_subdirectory(ping_distributed) add_subdirectory(ping_vector) -if(HOLOSCAN_BUILD_PYTHON) - 
add_subdirectory(python_decorator) -endif() add_subdirectory(resources) add_subdirectory(tensor_interop) add_subdirectory(v4l2_camera) add_subdirectory(video_replayer) add_subdirectory(video_replayer_distributed) -add_subdirectory(wrap_operator_as_gxf_extension) if(HOLOSCAN_INSTALL_EXAMPLE_SOURCE) # Generates the install CMakeLists.txt to compile all the cpp examples diff --git a/examples/README.md b/examples/README.md index ca235981..23d45ed9 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,7 +1,7 @@ # Holoscan SDK Examples This directory contains examples to help users learn how to use the Holoscan SDK for development. -See [HoloHub](https://nvidia-holoscan.github.io/holohub) to find additional reference applications. +See [HoloHub](https://github.com/nvidia-holoscan/holohub) to find additional reference applications. ## Build instructions diff --git a/examples/aja_capture/CMakeLists.txt b/examples/aja_capture/CMakeLists.txt index be1771c0..85844c71 100644 --- a/examples/aja_capture/CMakeLists.txt +++ b/examples/aja_capture/CMakeLists.txt @@ -13,9 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_subdirectory(cpp) +if(HOLOSCAN_CPP_EXAMPLES) + add_subdirectory(cpp) +endif() -if(HOLOSCAN_BUILD_PYTHON) +if(HOLOSCAN_PYTHON_EXAMPLES) add_subdirectory(python) endif() diff --git a/examples/aja_capture/README.md b/examples/aja_capture/README.md index 3eaeb5b1..e508c7bb 100644 --- a/examples/aja_capture/README.md +++ b/examples/aja_capture/README.md @@ -49,7 +49,7 @@ Minimal example to demonstrate the use of the aja source operator to capture dev ``` ## Settings - To evaluate the AJA example using alternative resolutions, you may modify the aja_capture.yaml configuration file as needed. For instance, to test a resolution format of 1280 x 720 at 50 Hz, you can specify the following parameters in the aja section of the configuration : + To evaluate the AJA example using alternative resolutions, you may modify the aja_capture.yaml configuration file as needed. 
For instance, to test a resolution format of 1280 x 720 at 60 Hz, you can specify the following parameters in the aja section of the configuration : ```bash aja: diff --git a/examples/aja_capture/cpp/CMakeLists.txt b/examples/aja_capture/cpp/CMakeLists.txt index 43fd05b4..19837196 100644 --- a/examples/aja_capture/cpp/CMakeLists.txt +++ b/examples/aja_capture/cpp/CMakeLists.txt @@ -74,7 +74,7 @@ if(HOLOSCAN_BUILD_TESTS AND TEST_AJA) set(RECORDING_DIR ${CMAKE_CURRENT_BINARY_DIR}/recording_output) set(SOURCE_VIDEO_BASENAME video_replayer_output) - set(VALIDATION_FRAMES_DIR ${CMAKE_SOURCE_DIR}/tests/data/validation_frames/aja_capture/) + set(VALIDATION_FRAMES_DIR ${Holoscan-examples_SOURCE_DIR}/../tests/data/validation_frames/aja_capture/) file(MAKE_DIRECTORY ${RECORDING_DIR}) @@ -87,7 +87,7 @@ if(HOLOSCAN_BUILD_TESTS AND TEST_AJA) add_custom_command(OUTPUT aja_capture_test.cpp PRE_LINK COMMAND patch -u -o aja_capture_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/aja_capture.cpp - ${CMAKE_SOURCE_DIR}/tests/data/validation_frames/aja_capture/cpp_aja_capture.patch + ${Holoscan-examples_SOURCE_DIR}/../tests/data/validation_frames/aja_capture/cpp_aja_capture.patch ) # Create the test executable @@ -124,7 +124,7 @@ if(HOLOSCAN_BUILD_TESTS AND TEST_AJA) # Add a test to check the validity of the frames add_test(NAME EXAMPLE_CPP_AJA_CAPTURE_RENDER_TEST - COMMAND python3 ${CMAKE_SOURCE_DIR}/scripts/video_validation.py + COMMAND python3 ${Holoscan-examples_SOURCE_DIR}/../scripts/video_validation.py --source_video_dir ${RECORDING_DIR} --source_video_basename ${SOURCE_VIDEO_BASENAME} --output_dir ${RECORDING_DIR} diff --git a/examples/aja_capture/python/CMakeLists.txt b/examples/aja_capture/python/CMakeLists.txt index 4bb41493..bb39ba6a 100644 --- a/examples/aja_capture/python/CMakeLists.txt +++ b/examples/aja_capture/python/CMakeLists.txt @@ -43,7 +43,7 @@ if(HOLOSCAN_BUILD_TESTS AND TEST_AJA) set(RECORDING_DIR ${CMAKE_CURRENT_BINARY_DIR}/recording_output) set(SOURCE_VIDEO_BASENAME python_aja_capture_output) - set(VALIDATION_FRAMES_DIR ${CMAKE_SOURCE_DIR}/tests/data/validation_frames/aja_capture/) + set(VALIDATION_FRAMES_DIR ${Holoscan-examples_SOURCE_DIR}/../tests/data/validation_frames/aja_capture/) file(MAKE_DIRECTORY ${RECORDING_DIR}) @@ -57,7 +57,7 @@ if(HOLOSCAN_BUILD_TESTS AND TEST_AJA) add_custom_command(OUTPUT aja_capture_test.py PRE_LINK COMMAND patch -u -o aja_capture_test.py ${CMAKE_CURRENT_SOURCE_DIR}/aja_capture.py - ${CMAKE_SOURCE_DIR}/tests/data/validation_frames/aja_capture/python_aja_capture.patch + ${Holoscan-examples_SOURCE_DIR}/../tests/data/validation_frames/aja_capture/python_aja_capture.patch ) add_custom_target(python_aja_capture_test ALL @@ -76,7 +76,7 @@ if(HOLOSCAN_BUILD_TESTS AND TEST_AJA) # Add a test to check the validity of the frames add_test(NAME EXAMPLE_PYTHON_AJA_CAPTURE_RENDER_TEST - COMMAND python3 ${CMAKE_SOURCE_DIR}/scripts/video_validation.py + COMMAND python3 ${Holoscan-examples_SOURCE_DIR}/../scripts/video_validation.py --source_video_dir ${RECORDING_DIR} --source_video_basename ${SOURCE_VIDEO_BASENAME} --output_dir ${RECORDING_DIR} diff --git a/examples/bring_your_own_model/CMakeLists.txt b/examples/bring_your_own_model/CMakeLists.txt index c1b6d378..ed9358bc 100644 --- a/examples/bring_your_own_model/CMakeLists.txt +++ b/examples/bring_your_own_model/CMakeLists.txt @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-if(HOLOSCAN_BUILD_PYTHON) +if(HOLOSCAN_PYTHON_EXAMPLES) add_subdirectory(python) endif() add_subdirectory(model) diff --git a/examples/conditions/asynchronous/CMakeLists.txt b/examples/conditions/asynchronous/CMakeLists.txt index 15697f79..c9769ebc 100644 --- a/examples/conditions/asynchronous/CMakeLists.txt +++ b/examples/conditions/asynchronous/CMakeLists.txt @@ -13,8 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_subdirectory(cpp) -if(HOLOSCAN_BUILD_PYTHON) +if(HOLOSCAN_CPP_EXAMPLES) + add_subdirectory(cpp) +endif() + +if(HOLOSCAN_PYTHON_EXAMPLES) add_subdirectory(python) endif() diff --git a/examples/conditions/asynchronous/cpp/ping_async.cpp b/examples/conditions/asynchronous/cpp/ping_async.cpp index 0e9f1f65..a6ba0c4a 100644 --- a/examples/conditions/asynchronous/cpp/ping_async.cpp +++ b/examples/conditions/asynchronous/cpp/ping_async.cpp @@ -60,7 +60,7 @@ class App : public holoscan::Application { bool async_transmit_ = false; }; -int main(int argc, char** argv) { +int main([[maybe_unused]] int argc, char** argv) { auto app = holoscan::make_application(); // Get the configuration @@ -69,12 +69,12 @@ int main(int argc, char** argv) { app->config(config_path); // set customizable application parameters via the YAML - bool async_receive = app->from_config("async_receive").as(); - bool async_transmit = app->from_config("async_transmit").as(); + auto async_receive = app->from_config("async_receive").as(); + auto async_transmit = app->from_config("async_transmit").as(); app->set_async_receive(async_receive); app->set_async_transmit(async_transmit); - std::string scheduler = app->from_config("scheduler").as(); + auto scheduler = app->from_config("scheduler").as(); holoscan::ArgList scheduler_args{holoscan::Arg("stop_on_deadlock", true), holoscan::Arg("stop_on_deadlock_timeout", 500L)}; if (scheduler == "multi_thread") { diff --git a/examples/conditions/expiring_message/CMakeLists.txt b/examples/conditions/expiring_message/CMakeLists.txt index a3ac1334..0fa63c4d 100644 --- a/examples/conditions/expiring_message/CMakeLists.txt +++ b/examples/conditions/expiring_message/CMakeLists.txt @@ -13,8 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_subdirectory(cpp) -if(HOLOSCAN_BUILD_PYTHON) +if(HOLOSCAN_CPP_EXAMPLES) + add_subdirectory(cpp) +endif() + +if(HOLOSCAN_PYTHON_EXAMPLES) add_subdirectory(python) endif() diff --git a/examples/conditions/expiring_message/cpp/ping_expiring_message.cpp b/examples/conditions/expiring_message/cpp/ping_expiring_message.cpp index e1ee40c6..1b585604 100644 --- a/examples/conditions/expiring_message/cpp/ping_expiring_message.cpp +++ b/examples/conditions/expiring_message/cpp/ping_expiring_message.cpp @@ -32,7 +32,8 @@ class PingTxOp : public Operator { void setup(OperatorSpec& spec) override { spec.output>("out"); } - void compute(InputContext&, OutputContext& op_output, ExecutionContext&) override { + void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto value = std::make_shared(fmt::format("ExpiringMessageAvailable ping: {}", index_)); ++index_; @@ -40,9 +41,9 @@ class PingTxOp : public Operator { // retrieve the scheduler used for this application via it's fragment auto scheduler = fragment_->scheduler(); // To get the clock we currently have to cast the scheduler to gxf::GXFScheduler. 
- // TODO: Refactor C++ lib so the clock method is on Scheduler rather than GXFScheduler. - // That would allow us to avoid this dynamic_pointer_cast, but might require adding - // renaming Clock->GXFClock and then adding a new holoscan::Clock independent of GXF. + // TODO(unknown): Refactor C++ lib so the clock method is on Scheduler rather than + // GXFScheduler. That would allow us to avoid this dynamic_pointer_cast, but might require + // adding renaming Clock->GXFClock and then adding a new holoscan::Clock independent of GXF. auto gxf_scheduler = std::dynamic_pointer_cast(scheduler); auto clock = gxf_scheduler->clock(); auto timestamp = clock->timestamp(); @@ -72,19 +73,21 @@ class PingRxOp : public Operator { .condition(ConditionType::kExpiringMessageAvailable, expiring_message_arglist); } - void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override { - auto in_value = op_input.receive>("in"); - + void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { HOLOSCAN_LOG_INFO("PingRxOp::compute() called"); - while (in_value) { + while (true) { + auto in_value = op_input.receive>("in"); + + if (!in_value) { break; } + auto message = in_value.value(); if (message) { HOLOSCAN_LOG_INFO("Rx message received: {}", message->c_str()); } else { HOLOSCAN_LOG_INFO("Rx message received: nullptr"); } - in_value = op_input.receive>("in"); } }; }; @@ -111,7 +114,7 @@ class App : public holoscan::Application { } }; -int main(int argc, char** argv) { +int main([[maybe_unused]] int argc, char** argv) { auto app = holoscan::make_application(); // Get the configuration diff --git a/examples/conditions/periodic/CMakeLists.txt b/examples/conditions/periodic/CMakeLists.txt index 952a4baf..4e4e14d7 100644 --- a/examples/conditions/periodic/CMakeLists.txt +++ b/examples/conditions/periodic/CMakeLists.txt @@ -13,8 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-add_subdirectory(cpp) -if(HOLOSCAN_BUILD_PYTHON) +if(HOLOSCAN_CPP_EXAMPLES) + add_subdirectory(cpp) +endif() + +if(HOLOSCAN_PYTHON_EXAMPLES) add_subdirectory(python) endif() diff --git a/examples/conditions/periodic/cpp/ping_periodic.cpp b/examples/conditions/periodic/cpp/ping_periodic.cpp index 02e51002..81131e3e 100644 --- a/examples/conditions/periodic/cpp/ping_periodic.cpp +++ b/examples/conditions/periodic/cpp/ping_periodic.cpp @@ -30,7 +30,8 @@ class PingTxOp : public Operator { void setup(OperatorSpec& spec) override { spec.output>("out"); } - void compute(InputContext&, OutputContext& op_output, ExecutionContext&) override { + void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto value = std::make_shared("Periodic ping..."); op_output.emit(value, "out"); }; @@ -44,7 +45,8 @@ class PingRxOp : public Operator { void setup(OperatorSpec& spec) override { spec.input>("in"); } - void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override { + void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto in_value = op_input.receive>("in").value(); HOLOSCAN_LOG_INFO("Rx message received: {}", in_value->c_str()); @@ -77,7 +79,7 @@ class App : public holoscan::Application { } }; -int main(int argc, char** argv) { +int main([[maybe_unused]] int argc, char** argv) { auto app = holoscan::make_application(); // Get the configuration diff --git a/examples/flow_tracker/CMakeLists.txt b/examples/flow_tracker/CMakeLists.txt index 15697f79..c9769ebc 100644 --- a/examples/flow_tracker/CMakeLists.txt +++ b/examples/flow_tracker/CMakeLists.txt @@ -13,8 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-add_subdirectory(cpp) -if(HOLOSCAN_BUILD_PYTHON) +if(HOLOSCAN_CPP_EXAMPLES) + add_subdirectory(cpp) +endif() + +if(HOLOSCAN_PYTHON_EXAMPLES) add_subdirectory(python) endif() diff --git a/examples/flow_tracker/cpp/flow_tracker.cpp b/examples/flow_tracker/cpp/flow_tracker.cpp index b3859479..b97dc0df 100644 --- a/examples/flow_tracker/cpp/flow_tracker.cpp +++ b/examples/flow_tracker/cpp/flow_tracker.cpp @@ -28,9 +28,21 @@ class ValueData { } ~ValueData() { HOLOSCAN_LOG_TRACE("ValueData::~ValueData(): {}", data_); } + // Use default copy constructor + ValueData(const ValueData&) = default; + + // Use default move constructor + ValueData(ValueData&&) noexcept = default; + + // Use default copy assignment operator + ValueData& operator=(const ValueData&) = default; + + // Use default move assignment operator + ValueData& operator=(ValueData&&) noexcept = default; + void data(int value) { data_ = value; } - int data() const { return data_; } + [[nodiscard]] int data() const { return data_; } private: int data_; @@ -46,10 +58,13 @@ class PingTxOp : public Operator { void setup(OperatorSpec& spec) override { spec.output("out"); } - void compute(InputContext&, OutputContext& op_output, ExecutionContext&) override { + void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto value1 = ValueData(index_++); op_output.emit(value1, "out"); }; + + private: int index_ = 1; }; @@ -65,7 +80,8 @@ class PingMxOp : public Operator { spec.param(multiplier_, "multiplier", "Multiplier", "Multiply the input by this value", 2); } - void compute(InputContext& op_input, OutputContext& op_output, ExecutionContext&) override { + void compute(InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto value = op_input.receive("in").value(); HOLOSCAN_LOG_INFO("Middle message received (count: {})", count_++); @@ -97,7 +113,8 @@ class PingRxOp : public Operator { spec.input>("receivers", IOSpec::kAnySize); } - void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override { + void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto value_vector = op_input.receive>("receivers").value(); HOLOSCAN_LOG_INFO("Rx message received (count: {}, size: {})", count_++, value_vector.size()); @@ -138,7 +155,7 @@ class MyPingApp : public holoscan::Application { } }; -int main(int argc, char** argv) { +int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) { auto app = holoscan::make_application(); // Skip 2 messages at the start and 3 messages at the end auto& tracker = app->track(2, 3, 0); diff --git a/examples/hello_world/CMakeLists.txt b/examples/hello_world/CMakeLists.txt index 15697f79..c9769ebc 100644 --- a/examples/hello_world/CMakeLists.txt +++ b/examples/hello_world/CMakeLists.txt @@ -13,8 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-add_subdirectory(cpp) -if(HOLOSCAN_BUILD_PYTHON) +if(HOLOSCAN_CPP_EXAMPLES) + add_subdirectory(cpp) +endif() + +if(HOLOSCAN_PYTHON_EXAMPLES) add_subdirectory(python) endif() diff --git a/examples/hello_world/cpp/hello_world.cpp b/examples/hello_world/cpp/hello_world.cpp index 38e943b9..8c7189c5 100644 --- a/examples/hello_world/cpp/hello_world.cpp +++ b/examples/hello_world/cpp/hello_world.cpp @@ -28,8 +28,8 @@ class HelloWorldOp : public Operator { void setup(OperatorSpec& spec) override {} - void compute(InputContext& op_input, OutputContext& op_output, - ExecutionContext& context) override { + void compute([[maybe_unused]] InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { std::cout << std::endl; std::cout << "Hello World!" << std::endl; std::cout << std::endl; @@ -51,7 +51,7 @@ class HelloWorldApp : public holoscan::Application { } }; -int main(int argc, char** argv) { +int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) { auto app = holoscan::make_application(); app->run(); diff --git a/examples/holoviz/CMakeLists.txt b/examples/holoviz/CMakeLists.txt index 952a4baf..4e4e14d7 100644 --- a/examples/holoviz/CMakeLists.txt +++ b/examples/holoviz/CMakeLists.txt @@ -13,8 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_subdirectory(cpp) -if(HOLOSCAN_BUILD_PYTHON) +if(HOLOSCAN_CPP_EXAMPLES) + add_subdirectory(cpp) +endif() + +if(HOLOSCAN_PYTHON_EXAMPLES) add_subdirectory(python) endif() diff --git a/examples/holoviz/cpp/CMakeLists.txt b/examples/holoviz/cpp/CMakeLists.txt index 6c555483..9b45a344 100644 --- a/examples/holoviz/cpp/CMakeLists.txt +++ b/examples/holoviz/cpp/CMakeLists.txt @@ -70,12 +70,12 @@ if(HOLOSCAN_BUILD_TESTS) set(RECORDING_DIR ${CMAKE_CURRENT_BINARY_DIR}/recording_output) set(SOURCE_VIDEO_BASENAME holoviz_geometry_output) - set(VALIDATION_FRAMES_DIR ${CMAKE_SOURCE_DIR}/tests/data/validation_frames/holoviz_geometry/) + set(VALIDATION_FRAMES_DIR ${Holoscan-examples_SOURCE_DIR}/../tests/data/validation_frames/holoviz_geometry/) file(MAKE_DIRECTORY ${RECORDING_DIR}) # Patch the current example to enable recording the rendering window - set(_patch_file ${CMAKE_SOURCE_DIR}/tests/data/validation_frames/holoviz_geometry/cpp_holoviz_geometry.patch) + set(_patch_file ${Holoscan-examples_SOURCE_DIR}/../tests/data/validation_frames/holoviz_geometry/cpp_holoviz_geometry.patch) add_custom_command(OUTPUT holoviz_geometry_test.cpp PRE_LINK COMMAND patch -u -o holoviz_geometry_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/holoviz_geometry.cpp @@ -116,7 +116,7 @@ if(HOLOSCAN_BUILD_TESTS) # Add a test to check the validity of the frames add_test(NAME EXAMPLE_CPP_HOLOVIZ_GEOMETRY_RENDER_TEST - COMMAND python3 ${CMAKE_SOURCE_DIR}/scripts/video_validation.py + COMMAND python3 ${Holoscan-examples_SOURCE_DIR}/../scripts/video_validation.py --source_video_dir ${RECORDING_DIR} --source_video_basename ${SOURCE_VIDEO_BASENAME} --output_dir ${RECORDING_DIR} diff --git a/examples/holoviz/cpp/holoviz_camera.cpp b/examples/holoviz/cpp/holoviz_camera.cpp index 1c9d27f7..4b90238e 100644 --- a/examples/holoviz/cpp/holoviz_camera.cpp +++ b/examples/holoviz/cpp/holoviz_camera.cpp @@ -14,6 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +// NOLINTFILE(concurrency-mt-unsafe) + #include #include @@ -42,7 +44,8 @@ class CameraPoseRxOp : public Operator { void start() override { start_time_ = std::chrono::steady_clock::now(); } - void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override { + void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto value = op_input.receive>("input").value(); // print once every second @@ -106,7 +109,7 @@ class GeometrySourceOp : public Operator { std::memcpy(tensor->pointer(), data.data(), N * C * sizeof(float)); } - void compute(InputContext& op_input, OutputContext& op_output, + void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, ExecutionContext& context) override { auto entity = gxf::Entity::New(&context); auto specs = std::vector(); @@ -115,57 +118,57 @@ class GeometrySourceOp : public Operator { // Each triangle is defined by a set of 3 (x, y, z) coordinate pairs. add_data<6, 3>(entity, "back", - {{{-1.f, -1.f, -1.f}, - {1.f, -1.f, -1.f}, - {1.f, 1.f, -1.f}, - {1.f, 1.f, -1.f}, - {-1.f, 1.f, -1.f}, - {-1.f, -1.f, -1.f}}}, + {{{-1.F, -1.F, -1.F}, + {1.F, -1.F, -1.F}, + {1.F, 1.F, -1.F}, + {1.F, 1.F, -1.F}, + {-1.F, 1.F, -1.F}, + {-1.F, -1.F, -1.F}}}, context); add_data<6, 3>(entity, "front", - {{{-1.f, -1.f, 1.f}, - {1.f, -1.f, 1.f}, - {1.f, 1.f, 1.f}, - {1.f, 1.f, 1.f}, - {-1.f, 1.f, 1.f}, - {-1.f, -1.f, 1.f}}}, + {{{-1.F, -1.F, 1.F}, + {1.F, -1.F, 1.F}, + {1.F, 1.F, 1.F}, + {1.F, 1.F, 1.F}, + {-1.F, 1.F, 1.F}, + {-1.F, -1.F, 1.F}}}, context); add_data<6, 3>(entity, "right", - {{{1.f, -1.f, -1.f}, - {1.f, -1.f, 1.f}, - {1.f, 1.f, 1.f}, - {1.f, 1.f, 1.f}, - {1.f, 1.f, -1.f}, - {1.f, -1.f, -1.f}}}, + {{{1.F, -1.F, -1.F}, + {1.F, -1.F, 1.F}, + {1.F, 1.F, 1.F}, + {1.F, 1.F, 1.F}, + {1.F, 1.F, -1.F}, + {1.F, -1.F, -1.F}}}, context); add_data<6, 3>(entity, "left", - {{{-1.f, -1.f, -1.f}, - {-1.f, -1.f, 1.f}, - {-1.f, 1.f, 1.f}, - {-1.f, 1.f, 1.f}, - {-1.f, 1.f, -1.f}, - {-1.f, -1.f, -1.f}}}, + {{{-1.F, -1.F, -1.F}, + {-1.F, -1.F, 1.F}, + {-1.F, 1.F, 1.F}, + {-1.F, 1.F, 1.F}, + {-1.F, 1.F, -1.F}, + {-1.F, -1.F, -1.F}}}, context); add_data<6, 3>(entity, "top", - {{{-1.f, 1.f, -1.f}, - {-1.f, 1.f, 1.f}, - {1.f, 1.f, 1.f}, - {1.f, 1.f, 1.f}, - {1.f, 1.f, -1.f}, - {-1.f, 1.f, -1.f}}}, + {{{-1.F, 1.F, -1.F}, + {-1.F, 1.F, 1.F}, + {1.F, 1.F, 1.F}, + {1.F, 1.F, 1.F}, + {1.F, 1.F, -1.F}, + {-1.F, 1.F, -1.F}}}, context); add_data<6, 3>(entity, "bottom", - {{{-1.f, -1.f, -1.f}, - {-1.f, -1.f, 1.f}, - {1.f, -1.f, 1.f}, - {1.f, -1.f, 1.f}, - {1.f, -1.f, -1.f}, - {-1.f, -1.f, -1.f}}}, + {{{-1.F, -1.F, -1.F}, + {-1.F, -1.F, 1.F}, + {1.F, -1.F, 1.F}, + {1.F, -1.F, 1.F}, + {1.F, -1.F, -1.F}, + {-1.F, -1.F, -1.F}}}, context); // emit the tensors @@ -173,10 +176,17 @@ class GeometrySourceOp : public Operator { // every second, switch camera if (std::chrono::steady_clock::now() - start_time_ > std::chrono::seconds(1)) { + // NOLINTBEGIN(cert-msc30-c,cert-msc50-cpp,concurrency-mt-unsafe) + // NOLINTBEGIN(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) const int camera = std::rand() % sizeof(cameras_) / sizeof(cameras_[0]); + // NOLINTEND(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) + // NOLINTEND(cert-msc30-c,cert-msc50-cpp,concurrency-mt-unsafe) + + // NOLINTBEGIN(cppcoreguidelines-pro-bounds-constant-array-index) camera_eye_ = cameras_[camera][0]; camera_look_at_ = cameras_[camera][1]; camera_up_ = cameras_[camera][2]; + // 
NOLINTEND(cppcoreguidelines-pro-bounds-constant-array-index) op_output.emit(camera_eye_, "camera_eye_output"); op_output.emit(camera_look_at_, "camera_look_at_output"); @@ -195,13 +205,16 @@ class GeometrySourceOp : public Operator { std::chrono::steady_clock::time_point start_time_; + // NOLINTBEGIN(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays) + // define some cameras we switch between static constexpr std::array cameras_[4][3]{ - {{0.f, 0.f, 5.f}, {1.f, 1.f, 0.f}, {0.f, 1.f, 0.f}}, - {{1.f, 1.f, -3.f}, {0.f, 0.f, 0.f}, {0.f, 1.f, 0.f}}, - {{3.f, -4.f, 0.f}, {0.f, 1.f, 1.f}, {1.f, 0.f, 0.f}}, - {{-2.f, 0.f, -3.f}, {-1.f, 0.f, -1.f}, {0.f, 0.f, 1.f}}}; + {{0.F, 0.F, 5.F}, {1.F, 1.F, 0.F}, {0.F, 1.F, 0.F}}, + {{1.F, 1.F, -3.F}, {0.F, 0.F, 0.F}, {0.F, 1.F, 0.F}}, + {{3.F, -4.F, 0.F}, {0.F, 1.F, 1.F}, {1.F, 0.F, 0.F}}, + {{-2.F, 0.F, -3.F}, {-1.F, 0.F, -1.F}, {0.F, 0.F, 1.F}}}; + // NOLINTEND(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays) std::array camera_eye_ = cameras_[0][0]; std::array camera_look_at_ = cameras_[0][1]; std::array camera_up_ = cameras_[0][2]; @@ -250,17 +263,21 @@ class HolovizCameraApp : public holoscan::Application { // Parameters defining the triangle primitives const std::array spec_names{"back", "front", "left", "right", "top", "bottom"}; - for (int index = 0; index < spec_names.size(); ++index) { + unsigned int index = 0; + for (const auto* spec_name : spec_names) { auto& spec = input_spec.emplace_back( - ops::HolovizOp::InputSpec(spec_names[index], ops::HolovizOp::InputType::TRIANGLES_3D)); - spec.color_ = { - float((index + 1) & 1), float(((index + 1) / 2) & 1), float(((index + 1) / 4) & 1), 1.0f}; + ops::HolovizOp::InputSpec(spec_name, ops::HolovizOp::InputType::TRIANGLES_3D)); + spec.color_ = {static_cast((index + 1) & 1U), + static_cast(((index + 1) / 2) & 1U), + static_cast(((index + 1) / 4) & 1U), + 1.0F}; + index++; } auto visualizer = make_operator( "holoviz", - Arg("width", 1024u), - Arg("height", 1024u), + Arg("width", 1024U), + Arg("height", 1024U), Arg("tensors", input_spec), Arg("enable_camera_pose_output", true), Arg("camera_pose_output_type", std::string("extrinsics_model")), @@ -285,16 +302,21 @@ class HolovizCameraApp : public holoscan::Application { int main(int argc, char** argv) { // Parse args - struct option long_options[] = { - {"help", no_argument, 0, 'h'}, {"count", required_argument, 0, 'c'}, {0, 0, 0, 0}}; + // NOLINTBEGIN(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays) + struct option long_options[] = {{"help", no_argument, nullptr, 'h'}, + {"count", required_argument, nullptr, 'c'}, + {nullptr, 0, nullptr, 0}}; + // NOLINTEND(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays) uint64_t count = -1; while (true) { int option_index = 0; - const int c = getopt_long(argc, argv, "hc:", long_options, &option_index); + // NOLINTBEGIN(concurrency-mt-unsafe) + const int c = getopt_long(argc, argv, "hc:", static_cast(long_options), &option_index); + // NOLINTEND(concurrency-mt-unsafe) if (c == -1) { break; } - const std::string argument(optarg ? optarg : ""); + const std::string argument(optarg != nullptr ? 
optarg : ""); switch (c) { case 'h': case '?': @@ -313,7 +335,7 @@ int main(int argc, char** argv) { count = std::stoull(argument); break; default: - throw std::runtime_error(fmt::format("Unhandled option `{}`", char(c))); + throw std::runtime_error(fmt::format("Unhandled option `{}`", static_cast(c))); } } diff --git a/examples/holoviz/cpp/holoviz_geometry.cpp b/examples/holoviz/cpp/holoviz_geometry.cpp index 53fa7df9..83c9c860 100644 --- a/examples/holoviz/cpp/holoviz_geometry.cpp +++ b/examples/holoviz/cpp/holoviz_geometry.cpp @@ -74,7 +74,7 @@ class GeometrySourceOp : public Operator { std::memcpy(tensor->pointer(), data.data(), N * C * sizeof(float)); } - void compute(InputContext& op_input, OutputContext& op_output, + void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, ExecutionContext& context) override { auto entity = gxf::Entity::New(&context); auto specs = std::vector(); @@ -91,14 +91,14 @@ class GeometrySourceOp : public Operator { // box: (x1, y1), (x2, y2). add_data<8, 2>(entity, "boxes", - {{{0.1f, 0.2f}, - {0.8f, 0.5f}, - {0.2f, 0.4f}, - {0.3f, 0.6f}, - {0.3f, 0.5f}, - {0.4f, 0.7f}, - {0.5f, 0.7f}, - {0.6f, 0.9f}}}, + {{{0.1F, 0.2F}, + {0.8F, 0.5F}, + {0.2F, 0.4F}, + {0.3F, 0.6F}, + {0.3F, 0.5F}, + {0.4F, 0.7F}, + {0.5F, 0.7F}, + {0.6F, 0.9F}}}, context); ///////////////////////////////////////// @@ -107,19 +107,19 @@ class GeometrySourceOp : public Operator { // Each triangle is defined by a set of 3 (x, y) coordinate pairs. add_data<6, 2>(entity, "triangles", - {{{0.1f, 0.8f}, - {0.18f, 0.75f}, - {0.14f, 0.66f}, - {0.3f, 0.8f}, - {0.38f, 0.75f}, - {0.34f, 0.56f}}}, + {{{0.1F, 0.8F}, + {0.18F, 0.75F}, + {0.14F, 0.66F}, + {0.3F, 0.8F}, + {0.38F, 0.75F}, + {0.34F, 0.56F}}}, context); /////////////////////////////////////// // Create a tensor defining two crosses /////////////////////////////////////// // Each cross is defined by an (x, y, size) 3-tuple - add_data<2, 3>(entity, "crosses", {{{0.25f, 0.25f, 0.05f}, {0.75f, 0.25f, 0.10f}}}, context); + add_data<2, 3>(entity, "crosses", {{{0.25F, 0.25F, 0.05F}, {0.75F, 0.25F, 0.10F}}}, context); /////////////////////////////////////// // Create a tensor defining three ovals @@ -127,9 +127,9 @@ class GeometrySourceOp : public Operator { // Each oval is defined by an (x, y, size_x, size_y) 4-tuple add_data<3, 4>(entity, "ovals", - {{{0.25f, 0.65f, 0.10f, 0.05f}, - {0.25f, 0.65f, 0.10f, 0.05f}, - {0.75f, 0.65f, 0.05f, 0.10f}}}, + {{{0.25F, 0.65F, 0.10F, 0.05F}, + {0.25F, 0.65F, 0.10F, 0.05F}, + {0.75F, 0.65F, 0.05F, 0.10F}}}, context); //////////////////////////////////////// @@ -138,12 +138,14 @@ class GeometrySourceOp : public Operator { // Set of (x, y) points with 50 points equally spaced along x whose y // coordinate varies sinusoidally over time. 
constexpr uint32_t POINTS = 50; - constexpr float PI = 3.14f; - std::array, POINTS> point_coords; - for (uint32_t i = 0; i < POINTS; ++i) { - point_coords[i][0] = (1.f / POINTS) * i; - point_coords[i][1] = - 0.8f + 0.1f * std::sin(8.f * PI * point_coords[i][0] + count_ / 60.f * 2.f * PI); + constexpr float PI = 3.14F; + std::array, POINTS> point_coords{}; + uint32_t i = 0; + for (auto& point : point_coords) { + point[0] = static_cast(i) / static_cast(POINTS); + point[1] = 0.8F + 0.1F * std::sin(8.F * PI * point[0] + + static_cast(count_) / 60.F * 2.F * PI); + i++; } add_data(entity, "points", point_coords, context); @@ -152,13 +154,13 @@ class GeometrySourceOp : public Operator { // Create a tensor for "label_coords" ///////////////////////////////////// // Set of two (x, y) points marking the location of text labels - add_data<2, 2>(entity, "label_coords", {{{0.10f, 0.1f}, {0.70f, 0.1f}}}, context); + add_data<2, 2>(entity, "label_coords", {{{0.10F, 0.1F}, {0.70F, 0.1F}}}, context); ///////////////////////////////////// // Create a tensor for "dynamic_text" ///////////////////////////////////// // Set of two (x, y) points marking the location of text labels - add_data<2, 2>(entity, "dynamic_text", {{{0.f, 0.f}}}, context); + add_data<2, 2>(entity, "dynamic_text", {{{0.F, 0.F}}}, context); // emit the tensors op_output.emit(entity, "outputs"); @@ -217,7 +219,7 @@ class HolovizGeometryApp : public holoscan::Application { using namespace holoscan; ArgList args; - auto data_directory = std::getenv("HOLOSCAN_INPUT_PATH"); + auto* data_directory = std::getenv("HOLOSCAN_INPUT_PATH"); // NOLINT(*) if (data_directory != nullptr && data_directory[0] != '\0') { auto video_directory = std::filesystem::path(data_directory); video_directory /= "racerx"; @@ -230,7 +232,7 @@ class HolovizGeometryApp : public holoscan::Application { make_operator("replayer", Arg("directory", std::string("../data/racerx")), Arg("basename", std::string("racerx")), - Arg("frame_rate", 0.f), + Arg("frame_rate", 0.F), Arg("repeat", true), Arg("realtime", true), Arg("count", count_), @@ -244,15 +246,15 @@ class HolovizGeometryApp : public holoscan::Application { auto& video_spec = input_spec.emplace_back(ops::HolovizOp::InputSpec("", ops::HolovizOp::InputType::COLOR)); - video_spec.line_width_ = 2.f; - video_spec.opacity_ = 0.5f; + video_spec.line_width_ = 2.F; + video_spec.opacity_ = 0.5F; video_spec.priority_ = priority++; // Parameters defining the rectangle primitives auto& boxes_spec = input_spec.emplace_back( ops::HolovizOp::InputSpec("boxes", ops::HolovizOp::InputType::RECTANGLES)); - boxes_spec.line_width_ = 2.f; - boxes_spec.color_ = {1.0f, 0.0f, 1.0f, 0.5f}; + boxes_spec.line_width_ = 2.F; + boxes_spec.color_ = {1.0F, 0.0F, 1.0F, 0.5F}; boxes_spec.priority_ = priority++; // line strip reuses the rectangle coordinates. This will make @@ -260,55 +262,55 @@ class HolovizGeometryApp : public holoscan::Application { // each box. auto& line_strip_spec = input_spec.emplace_back( ops::HolovizOp::InputSpec("boxes", ops::HolovizOp::InputType::LINE_STRIP)); - line_strip_spec.line_width_ = 3.f; - line_strip_spec.color_ = {0.4f, 0.4f, 1.0f, 0.7f}; + line_strip_spec.line_width_ = 3.F; + line_strip_spec.color_ = {0.4F, 0.4F, 1.0F, 0.7F}; line_strip_spec.priority_ = priority++; // Lines also reuses the boxes coordinates so will plot a set of // disconnected line segments along the box diagonals. 
auto& lines_spec = input_spec.emplace_back( ops::HolovizOp::InputSpec("boxes", ops::HolovizOp::InputType::LINES)); - lines_spec.line_width_ = 3.f; - lines_spec.color_ = {0.4f, 1.0f, 0.4f, 0.7f}; + lines_spec.line_width_ = 3.F; + lines_spec.color_ = {0.4F, 1.0F, 0.4F, 0.7F}; lines_spec.priority_ = priority++; // Parameters defining the triangle primitives auto& triangles_spec = input_spec.emplace_back( ops::HolovizOp::InputSpec("triangles", ops::HolovizOp::InputType::TRIANGLES)); - triangles_spec.color_ = {1.0f, 0.0f, 0.0f, 0.5f}; + triangles_spec.color_ = {1.0F, 0.0F, 0.0F, 0.5F}; triangles_spec.priority_ = priority++; // Parameters defining the crosses primitives auto& crosses_spec = input_spec.emplace_back( ops::HolovizOp::InputSpec("crosses", ops::HolovizOp::InputType::CROSSES)); - crosses_spec.line_width_ = 3.f; - crosses_spec.color_ = {0.0f, 1.0f, 0.0f, 1.0f}; + crosses_spec.line_width_ = 3.F; + crosses_spec.color_ = {0.0F, 1.0F, 0.0F, 1.0F}; crosses_spec.priority_ = priority++; // Parameters defining the ovals primitives auto& ovals_spec = input_spec.emplace_back( ops::HolovizOp::InputSpec("ovals", ops::HolovizOp::InputType::OVALS)); - ovals_spec.opacity_ = 0.5f; - ovals_spec.line_width_ = 2.f; - ovals_spec.color_ = {1.0f, 1.0f, 1.0f, 1.0f}; + ovals_spec.opacity_ = 0.5F; + ovals_spec.line_width_ = 2.F; + ovals_spec.color_ = {1.0F, 1.0F, 1.0F, 1.0F}; ovals_spec.priority_ = priority++; // Parameters defining the points primitives auto& points_spec = input_spec.emplace_back( ops::HolovizOp::InputSpec("points", ops::HolovizOp::InputType::POINTS)); - points_spec.point_size_ = 4.f; - points_spec.color_ = {1.0f, 1.0f, 1.0f, 1.0f}; + points_spec.point_size_ = 4.F; + points_spec.color_ = {1.0F, 1.0F, 1.0F, 1.0F}; points_spec.priority_ = priority++; // Parameters defining the label_coords primitives auto& label_coords_spec = input_spec.emplace_back( ops::HolovizOp::InputSpec("label_coords", ops::HolovizOp::InputType::TEXT)); - label_coords_spec.color_ = {1.0f, 1.0f, 1.0f, 1.0f}; + label_coords_spec.color_ = {1.0F, 1.0F, 1.0F, 1.0F}; label_coords_spec.text_ = {"label_1", "label_2"}; label_coords_spec.priority_ = priority++; auto visualizer = make_operator( - "holoviz", Arg("width", 854u), Arg("height", 480u), Arg("tensors", input_spec)); + "holoviz", Arg("width", 854U), Arg("height", 480U), Arg("tensors", input_spec)); // Define the workflow: source -> holoviz add_flow(source, visualizer, {{"outputs", "receivers"}}); @@ -322,16 +324,21 @@ class HolovizGeometryApp : public holoscan::Application { int main(int argc, char** argv) { // Parse args - struct option long_options[] = { - {"help", no_argument, 0, 'h'}, {"count", required_argument, 0, 'c'}, {0, 0, 0, 0}}; + // NOLINTBEGIN(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays) + struct option long_options[] = {{"help", no_argument, nullptr, 'h'}, + {"count", required_argument, nullptr, 'c'}, + {nullptr, 0, nullptr, 0}}; + // NOLINTEND(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays) uint64_t count = 0; while (true) { int option_index = 0; - const int c = getopt_long(argc, argv, "hc:", long_options, &option_index); + // NOLINTBEGIN(concurrency-mt-unsafe) + const int c = getopt_long(argc, argv, "hc:", static_cast(long_options), &option_index); + // NOLINTEND(concurrency-mt-unsafe) if (c == -1) { break; } - const std::string argument(optarg ? optarg : ""); + const std::string argument(optarg != nullptr ? 
optarg : ""); switch (c) { case 'h': std::cout << "Usage: " << argv[0] << " [options]" << std::endl diff --git a/examples/holoviz/python/CMakeLists.txt b/examples/holoviz/python/CMakeLists.txt index ffb99c7f..05f69832 100644 --- a/examples/holoviz/python/CMakeLists.txt +++ b/examples/holoviz/python/CMakeLists.txt @@ -36,7 +36,7 @@ install(FILES if(HOLOSCAN_BUILD_TESTS) set(RECORDING_DIR ${CMAKE_CURRENT_BINARY_DIR}/recording_output) set(SOURCE_VIDEO_BASENAME python_holoviz_geometry_output) - set(VALIDATION_FRAMES_DIR ${CMAKE_SOURCE_DIR}/tests/data/validation_frames/holoviz_geometry/) + set(VALIDATION_FRAMES_DIR ${Holoscan-examples_SOURCE_DIR}/../tests/data/validation_frames/holoviz_geometry/) file(MAKE_DIRECTORY ${RECORDING_DIR}) @@ -44,7 +44,7 @@ if(HOLOSCAN_BUILD_TESTS) add_custom_command(OUTPUT holoviz_geometry_test.py PRE_LINK COMMAND patch -u -o holoviz_geometry_test.py ${CMAKE_CURRENT_SOURCE_DIR}/holoviz_geometry.py - ${CMAKE_SOURCE_DIR}/tests/data/validation_frames/holoviz_geometry/python_holoviz_geometry.patch + ${Holoscan-examples_SOURCE_DIR}/../tests/data/validation_frames/holoviz_geometry/python_holoviz_geometry.patch COMMAND sed -i "s#RECORDING_DIR#${RECORDING_DIR}#g" holoviz_geometry_test.py COMMAND sed -i "s#SOURCE_VIDEO_BASENAME#${SOURCE_VIDEO_BASENAME}#g" holoviz_geometry_test.py ) @@ -65,7 +65,7 @@ if(HOLOSCAN_BUILD_TESTS) # Add a test to check the validity of the frames add_test(NAME EXAMPLE_PYTHON_HOLOVIZ_GEOMETRY_RENDER_TEST - COMMAND python3 ${CMAKE_SOURCE_DIR}/scripts/video_validation.py + COMMAND python3 ${Holoscan-examples_SOURCE_DIR}/../scripts/video_validation.py --source_video_dir ${RECORDING_DIR} --source_video_basename ${SOURCE_VIDEO_BASENAME} --output_dir ${RECORDING_DIR} @@ -107,7 +107,7 @@ if(HOLOSCAN_BUILD_TESTS) set(RECORDING_DIR ${CMAKE_CURRENT_BINARY_DIR}/recording_output) set(SOURCE_VIDEO_BASENAME python_holoviz_geometry_3d_output) - set(VALIDATION_FRAMES_DIR ${CMAKE_SOURCE_DIR}/tests/data/validation_frames/holoviz_geometry_3d/) + set(VALIDATION_FRAMES_DIR ${Holoscan-examples_SOURCE_DIR}/../tests/data/validation_frames/holoviz_geometry_3d/) file(MAKE_DIRECTORY ${RECORDING_DIR}) @@ -115,7 +115,7 @@ if(HOLOSCAN_BUILD_TESTS) add_custom_command(OUTPUT holoviz_geometry_3d_test.py PRE_LINK COMMAND patch -u -o holoviz_geometry_3d_test.py ${CMAKE_CURRENT_SOURCE_DIR}/holoviz_geometry_3d.py - ${CMAKE_SOURCE_DIR}/tests/data/validation_frames/holoviz_geometry_3d/python_holoviz_geometry_3d.patch + ${Holoscan-examples_SOURCE_DIR}/../tests/data/validation_frames/holoviz_geometry_3d/python_holoviz_geometry_3d.patch COMMAND sed -i "s#RECORDING_DIR#${RECORDING_DIR}#g" holoviz_geometry_3d_test.py COMMAND sed -i "s#SOURCE_VIDEO_BASENAME#${SOURCE_VIDEO_BASENAME}#g" holoviz_geometry_3d_test.py ) @@ -135,7 +135,7 @@ if(HOLOSCAN_BUILD_TESTS) # Add a test to check the validity of the frames add_test(NAME EXAMPLE_PYTHON_HOLOVIZ_GEOMETRY_3D_RENDER_TEST - COMMAND python3 ${CMAKE_SOURCE_DIR}/scripts/video_validation.py + COMMAND python3 ${Holoscan-examples_SOURCE_DIR}/../scripts/video_validation.py --source_video_dir ${RECORDING_DIR} --source_video_basename ${SOURCE_VIDEO_BASENAME} --output_dir ${RECORDING_DIR} @@ -170,7 +170,7 @@ if(HOLOSCAN_BUILD_TESTS) set(RECORDING_DIR ${CMAKE_CURRENT_BINARY_DIR}/recording_output) set(SOURCE_VIDEO_BASENAME python_holoviz_views_output) - set(VALIDATION_FRAMES_DIR ${CMAKE_SOURCE_DIR}/tests/data/validation_frames/holoviz_views/) + set(VALIDATION_FRAMES_DIR ${Holoscan-examples_SOURCE_DIR}/../tests/data/validation_frames/holoviz_views/) 
file(MAKE_DIRECTORY ${RECORDING_DIR}) @@ -178,7 +178,7 @@ if(HOLOSCAN_BUILD_TESTS) add_custom_command(OUTPUT holoviz_views_test.py PRE_LINK COMMAND patch -u -o holoviz_views_test.py ${CMAKE_CURRENT_SOURCE_DIR}/holoviz_views.py - ${CMAKE_SOURCE_DIR}/tests/data/validation_frames/holoviz_views/python_holoviz_views.patch + ${Holoscan-examples_SOURCE_DIR}/../tests/data/validation_frames/holoviz_views/python_holoviz_views.patch COMMAND sed -i "s#RECORDING_DIR#${RECORDING_DIR}#g" holoviz_views_test.py COMMAND sed -i "s#SOURCE_VIDEO_BASENAME#${SOURCE_VIDEO_BASENAME}#g" holoviz_views_test.py ) @@ -198,7 +198,7 @@ if(HOLOSCAN_BUILD_TESTS) # Add a test to check the validity of the frames add_test(NAME EXAMPLE_PYTHON_HOLOVIZ_VIEWS_RENDER_TEST - COMMAND python3 ${CMAKE_SOURCE_DIR}/scripts/video_validation.py + COMMAND python3 ${Holoscan-examples_SOURCE_DIR}/../scripts/video_validation.py --source_video_dir ${RECORDING_DIR} --source_video_basename ${SOURCE_VIDEO_BASENAME} --output_dir ${RECORDING_DIR} diff --git a/examples/import_gxf_components/CMakeLists.txt b/examples/import_gxf_components/CMakeLists.txt index a3ac1334..0fa63c4d 100644 --- a/examples/import_gxf_components/CMakeLists.txt +++ b/examples/import_gxf_components/CMakeLists.txt @@ -13,8 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_subdirectory(cpp) -if(HOLOSCAN_BUILD_PYTHON) +if(HOLOSCAN_CPP_EXAMPLES) + add_subdirectory(cpp) +endif() + +if(HOLOSCAN_PYTHON_EXAMPLES) add_subdirectory(python) endif() diff --git a/examples/import_gxf_components/cpp/import_gxf_components.cpp b/examples/import_gxf_components/cpp/import_gxf_components.cpp index 5756c165..961d84ca 100644 --- a/examples/import_gxf_components/cpp/import_gxf_components.cpp +++ b/examples/import_gxf_components/cpp/import_gxf_components.cpp @@ -35,6 +35,7 @@ // macro. #include "holoscan/operators/gxf_codelet/gxf_codelet.hpp" +// NOLINTBEGIN(cppcoreguidelines-macro-usage) #ifdef CUDA_TRY #undef CUDA_TRY #define CUDA_TRY(stmt) \ @@ -50,6 +51,7 @@ } \ } #endif +// NOLINTEND(cppcoreguidelines-macro-usage) // Define an operator that wraps the GXF Codelet that sends a tensor // (`nvidia::gxf::test::SendTensor` class in send_tensor_gxf.hpp) @@ -129,14 +131,14 @@ class ProcessTensorOp : public holoscan::Operator { } void compute(holoscan::InputContext& op_input, holoscan::OutputContext& op_output, - holoscan::ExecutionContext& context) override { + [[maybe_unused]] holoscan::ExecutionContext& context) override { // The type of `in_message` is 'holoscan::TensorMap'. auto in_message = op_input.receive("in").value(); // The type of out_message is TensorMap. holoscan::TensorMap out_message; for (auto& [key, tensor] : in_message) { // Process with 'tensor' here. 
- cudaError_t cuda_status; + cudaError_t cuda_status{}; size_t data_size = tensor->nbytes(); std::vector in_data(data_size); CUDA_TRY(cudaMemcpy(in_data.data(), tensor->data(), data_size, cudaMemcpyDeviceToHost)); @@ -210,7 +212,7 @@ class App : public holoscan::Application { } }; -int main(int argc, char** argv) { +int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) { auto app = holoscan::make_application(); app->run(); diff --git a/examples/multi_branch_pipeline/CMakeLists.txt b/examples/multi_branch_pipeline/CMakeLists.txt index a3ac1334..0fa63c4d 100644 --- a/examples/multi_branch_pipeline/CMakeLists.txt +++ b/examples/multi_branch_pipeline/CMakeLists.txt @@ -13,8 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_subdirectory(cpp) -if(HOLOSCAN_BUILD_PYTHON) +if(HOLOSCAN_CPP_EXAMPLES) + add_subdirectory(cpp) +endif() + +if(HOLOSCAN_PYTHON_EXAMPLES) add_subdirectory(python) endif() diff --git a/examples/multi_branch_pipeline/cpp/multi_branch_pipeline.cpp b/examples/multi_branch_pipeline/cpp/multi_branch_pipeline.cpp index 7929d5f5..d260bc62 100644 --- a/examples/multi_branch_pipeline/cpp/multi_branch_pipeline.cpp +++ b/examples/multi_branch_pipeline/cpp/multi_branch_pipeline.cpp @@ -49,7 +49,8 @@ class PingTxOp : public Operator { static_cast(1)); } - void compute(InputContext&, OutputContext& op_output, ExecutionContext&) override { + void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto value = initial_value_.get() + count_ * increment_.get(); op_output.emit(value, "out"); count_ += 1; @@ -103,7 +104,8 @@ class IncrementOp : public Operator { static_cast(0)); } - void compute(InputContext& op_input, OutputContext& op_output, ExecutionContext&) override { + void compute(InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { int64_t value = op_input.receive("in").value(); // increment value by the specified increment int64_t new_value = value + increment_.get(); @@ -122,7 +124,8 @@ class PingRxOp : public Operator { void setup(OperatorSpec& spec) override { spec.input("in"); } - void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override { + void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto value = op_input.receive("in").value(); HOLOSCAN_LOG_INFO("receiver '{}' received value: {}", name(), value); }; @@ -177,7 +180,7 @@ class MultiRateApp : public holoscan::Application { } }; -int main(int argc, char** argv) { +int main([[maybe_unused]] int argc, char** argv) { auto app = holoscan::make_application(); // Get the configuration @@ -185,7 +188,7 @@ int main(int argc, char** argv) { config_path += "/multi_branch_pipeline.yaml"; app->config(config_path); - std::string scheduler = app->from_config("scheduler").as(); + auto scheduler = app->from_config("scheduler").as(); if (scheduler == "multi_thread") { // use MultiThreadScheduler instead of the default GreedyScheduler app->scheduler(app->make_scheduler( diff --git a/examples/multithread/CMakeLists.txt b/examples/multithread/CMakeLists.txt index 952a4baf..4e4e14d7 100644 --- a/examples/multithread/CMakeLists.txt +++ b/examples/multithread/CMakeLists.txt @@ -13,8 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
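
The `from_config(...).as()` calls in the hunks above (for example the scheduler lookup in multi_branch_pipeline.cpp) are also missing their template parameters in the patch text. A minimal sketch of the typed form, with the concrete types assumed rather than quoted:

```cpp
// Assumed types; the actual template arguments are not visible in the patch text above.
auto scheduler = app->from_config("scheduler").as<std::string>();
if (scheduler == "multi_thread") {
  // use MultiThreadScheduler instead of the default GreedyScheduler
  app->scheduler(app->make_scheduler<holoscan::MultiThreadScheduler>("multithread-scheduler"));
}
```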
-add_subdirectory(cpp) -if(HOLOSCAN_BUILD_PYTHON) +if(HOLOSCAN_CPP_EXAMPLES) + add_subdirectory(cpp) +endif() + +if(HOLOSCAN_PYTHON_EXAMPLES) add_subdirectory(python) endif() diff --git a/examples/multithread/cpp/multithread.cpp b/examples/multithread/cpp/multithread.cpp index f995464e..bc324769 100644 --- a/examples/multithread/cpp/multithread.cpp +++ b/examples/multithread/cpp/multithread.cpp @@ -35,7 +35,8 @@ class PingTxOp : public Operator { void setup(OperatorSpec& spec) override { spec.output>("out"); } - void compute(InputContext&, OutputContext& op_output, ExecutionContext&) override { + void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { int value = 0; op_output.emit(value, "out"); }; @@ -58,7 +59,8 @@ class DelayOp : public Operator { spec.param(silent_, "silent", "Silent mode?", "Whether to log info on receive", false); } - void compute(InputContext& op_input, OutputContext& op_output, ExecutionContext&) override { + void compute(InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto value = op_input.receive("in").value(); // increment value by the specified increment @@ -104,7 +106,8 @@ class PingRxOp : public Operator { spec.input>("values", IOSpec::kAnySize); } - void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override { + void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { std::vector value_vector; std::vector name_vector; value_vector = op_input.receive>("values").value(); @@ -114,7 +117,7 @@ class PingRxOp : public Operator { HOLOSCAN_LOG_INFO("number of received values: {}", value_vector.size()); } int total = 0; - for (auto vp : value_vector) { total += vp; } + for (const auto& vp : value_vector) { total += vp; } if (!silent_) { HOLOSCAN_LOG_INFO("sum of received values: {}", total); } }; @@ -159,7 +162,7 @@ class App : public holoscan::Application { bool silent_ = false; }; -int main(int argc, char** argv) { +int main([[maybe_unused]] int argc, char** argv) { auto app = holoscan::make_application(); // Get the configuration @@ -168,23 +171,23 @@ int main(int argc, char** argv) { app->config(config_path); // Turn on data flow tracking if it is specified in the YAML - bool tracking = app->from_config("tracking").as(); + auto tracking = app->from_config("tracking").as(); holoscan::DataFlowTracker* tracker = nullptr; if (tracking) { tracker = &app->track(0, 0, 0); } // set customizable application parameters via the YAML - int num_delay_ops = app->from_config("num_delay_ops").as(); - double delay = app->from_config("delay").as(); - double delay_step = app->from_config("delay_step").as(); - int count = app->from_config("count").as(); - bool silent = app->from_config("silent").as(); + auto num_delay_ops = app->from_config("num_delay_ops").as(); + auto delay = app->from_config("delay").as(); + auto delay_step = app->from_config("delay_step").as(); + auto count = app->from_config("count").as(); + auto silent = app->from_config("silent").as(); app->set_num_delays(num_delay_ops); app->set_delay(delay); app->set_delay_step(delay_step); app->set_count(count); app->set_silent(silent); - std::string scheduler = app->from_config("scheduler").as(); + auto scheduler = app->from_config("scheduler").as(); if (scheduler == "multi_thread") { // use MultiThreadScheduler instead of the default GreedyScheduler app->scheduler(app->make_scheduler( 
diff --git a/examples/ping_any/cpp/ping_any.cpp b/examples/ping_any/cpp/ping_any.cpp index cb92c78d..39c84044 100644 --- a/examples/ping_any/cpp/ping_any.cpp +++ b/examples/ping_any/cpp/ping_any.cpp @@ -30,10 +30,13 @@ class PingTxOp : public Operator { void setup(OperatorSpec& spec) override { spec.output("out"); } - void compute(InputContext&, OutputContext& op_output, ExecutionContext&) override { + void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto value = index_++; op_output.emit(value, "out"); }; + + private: int index_ = 1; }; @@ -49,7 +52,8 @@ class PingMxOp : public Operator { spec.param(multiplier_, "multiplier", "Multiplier", "Multiply the input by this value", 2); } - void compute(InputContext& op_input, OutputContext& op_output, ExecutionContext&) override { + void compute(InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto value = op_input.receive("in").value(); // Received value must be an int because MX's `in` is connected TX's `out` @@ -83,7 +87,8 @@ class PingRxOp : public Operator { spec.input>("receivers", IOSpec::kAnySize); } - void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override { + void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto value_vector = op_input.receive>("receivers").value(); HOLOSCAN_LOG_INFO("Rx message received (count: {}, size: {})", count_++, value_vector.size()); @@ -115,7 +120,7 @@ class MyPingApp : public holoscan::Application { } }; -int main(int argc, char** argv) { +int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) { auto app = holoscan::make_application(); app->run(); diff --git a/examples/ping_conditional/cpp/ping_conditional.cpp b/examples/ping_conditional/cpp/ping_conditional.cpp index 98175fda..2039bd16 100644 --- a/examples/ping_conditional/cpp/ping_conditional.cpp +++ b/examples/ping_conditional/cpp/ping_conditional.cpp @@ -1,6 +1,6 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -30,15 +30,19 @@ class PingTxOp : public Operator { void setup(OperatorSpec& spec) override { spec.output("out"); } - void compute(InputContext&, OutputContext& op_output, ExecutionContext&) override { + void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { HOLOSCAN_LOG_INFO("Tx message value: {}", ++index_); - if (index_ % 2) { - int* value = new int{index_}; + if (index_ % 2 != 0) { + int* value = new int{index_}; // NOLINT(*) op_output.emit(value, "out"); // emit only odd values } else { op_output.emit(nullptr, "out"); // emit nullptr for even values } - } + index_++; + } // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks) + + private: int index_ = 0; }; @@ -54,15 +58,16 @@ class PingMxOp : public Operator { spec.param(multiplier_, "multiplier", "Multiplier", "Multiply the input by this value", 2); } - void compute(InputContext& op_input, OutputContext& op_output, ExecutionContext&) override { - auto value = op_input.receive("in").value(); + void compute(InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { + auto* value = op_input.receive("in").value(); HOLOSCAN_LOG_INFO("Middle message received (count: {})", count_++); - if (value) { + if (value != nullptr) { HOLOSCAN_LOG_INFO("Middle message value: {}", *value); op_output.emit((*value) * multiplier_, "out"); - delete value; + delete value; // NOLINT(*) } }; @@ -81,7 +86,8 @@ class PingRxOp : public Operator { spec.input("in").condition(holoscan::ConditionType::kNone); } - void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override { + void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto received_value = op_input.receive("in"); HOLOSCAN_LOG_INFO("Rx message received (count: {})", count_++); @@ -115,7 +121,7 @@ class MyPingApp : public holoscan::Application { } }; -int main(int argc, char** argv) { +int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) { auto app = holoscan::make_application(); app->run(); diff --git a/examples/ping_custom_op/CMakeLists.txt b/examples/ping_custom_op/CMakeLists.txt index 15697f79..c9769ebc 100644 --- a/examples/ping_custom_op/CMakeLists.txt +++ b/examples/ping_custom_op/CMakeLists.txt @@ -13,8 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_subdirectory(cpp) -if(HOLOSCAN_BUILD_PYTHON) +if(HOLOSCAN_CPP_EXAMPLES) + add_subdirectory(cpp) +endif() + +if(HOLOSCAN_PYTHON_EXAMPLES) add_subdirectory(python) endif() diff --git a/examples/ping_custom_op/cpp/ping_custom_op.cpp b/examples/ping_custom_op/cpp/ping_custom_op.cpp index c7e7b03f..fd682fb3 100644 --- a/examples/ping_custom_op/cpp/ping_custom_op.cpp +++ b/examples/ping_custom_op/cpp/ping_custom_op.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -33,7 +33,8 @@ class PingMxOp : public Operator { spec.param(multiplier_, "multiplier", "Multiplier", "Multiply the input by this value", 2); } - void compute(InputContext& op_input, OutputContext& op_output, ExecutionContext&) override { + void compute(InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto value = op_input.receive("in").value(); std::cout << "Middle message value: " << value << std::endl; @@ -65,7 +66,7 @@ class MyPingApp : public holoscan::Application { } }; -int main(int argc, char** argv) { +int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) { auto app = holoscan::make_application(); app->run(); diff --git a/examples/ping_cycle/cpp/ping_cycle.cpp b/examples/ping_cycle/cpp/ping_cycle.cpp index 8f619197..d775fdb0 100644 --- a/examples/ping_cycle/cpp/ping_cycle.cpp +++ b/examples/ping_cycle/cpp/ping_cycle.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -31,7 +31,8 @@ class PingMxOp : public Operator { spec.param(multiplier_, "multiplier", "Multiplier", "Multiply the input by this value", 2); } - void compute(InputContext& op_input, OutputContext& op_output, ExecutionContext&) override { + void compute(InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto value = op_input.receive("in"); int out_value = 1; @@ -67,7 +68,7 @@ class MyPingApp : public holoscan::Application { } }; -int main(int argc, char** argv) { +int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) { auto app = holoscan::make_application(); app->run(); diff --git a/examples/ping_distributed/CMakeLists.txt b/examples/ping_distributed/CMakeLists.txt index 952a4baf..4e4e14d7 100644 --- a/examples/ping_distributed/CMakeLists.txt +++ b/examples/ping_distributed/CMakeLists.txt @@ -13,8 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_subdirectory(cpp) -if(HOLOSCAN_BUILD_PYTHON) +if(HOLOSCAN_CPP_EXAMPLES) + add_subdirectory(cpp) +endif() + +if(HOLOSCAN_PYTHON_EXAMPLES) add_subdirectory(python) endif() diff --git a/examples/ping_distributed/README.md b/examples/ping_distributed/README.md index 5e07f855..890970cf 100644 --- a/examples/ping_distributed/README.md +++ b/examples/ping_distributed/README.md @@ -132,3 +132,5 @@ python3 ${APP_DIR}/ping_distributed.py Add an additional `--gpu` to the command line to use a GPU tensor instead of a host one. Note that for this application "fragment1" sends the video frames and "fragment2" receives them (these fragment names were assigned during the `MyPingApp.compose` method for this application. In this case, "fragment2" has the receiver operator that logs messages to the terminal, so the process that runs that fragment will display the application output. We could omit the `--fragments` arguments altogether if we wanted to let holoscan automatically decide which nodes to run each fragment on. We chose to explicitly specify the fragments here so the user of the application knows which node to expect to see the output on. 
+ +The `--track` argument can be specified to enable the distributed data flow tracking feature (to measure timings along various paths in the computation graph). Currently this should only be used when the fragments are run on a single node as time synchronization across multiple nodes is not yet automatically handled. diff --git a/examples/ping_distributed/cpp/CMakeLists.min.txt b/examples/ping_distributed/cpp/CMakeLists.min.txt index d4f6d155..1fb4e91d 100644 --- a/examples/ping_distributed/cpp/CMakeLists.min.txt +++ b/examples/ping_distributed/cpp/CMakeLists.min.txt @@ -33,20 +33,49 @@ target_link_libraries(ping_distributed # Testing if(HOLOSCAN_BUILD_TESTS) - add_test(NAME EXAMPLE_CPP_PING_DISTRIBUTED_TEST + # For iGPU, a call to `ucp_init_version` with CUDA_VISIBLE_DEVICES="" seems to cause a segfault. + # Limit the following test case to x86_64 systems to avoid this. + if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64") + # emulate a x86_64 system without any GPUs by setting CUDA_VISIBLE_DEVICES="" + add_test(NAME EXAMPLE_CPP_PING_DISTRIBUTED_CPU_ONLY_TEST + COMMAND ${CMAKE_CURRENT_BINARY_DIR}/ping_distributed + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + ) + set_tests_properties(EXAMPLE_CPP_PING_DISTRIBUTED_CPU_ONLY_TEST PROPERTIES + ENVIRONMENT "CUDA_VISIBLE_DEVICES=" + PASS_REGULAR_EXPRESSION "no CUDA-capable device is detected" + PASS_REGULAR_EXPRESSION "rx received message 10: Tensor key: 'tensor', shape: \\(32, 64\\)" + FAIL_REGULAR_EXPRESSION "initialized independent of a parent entity" + ) + endif() + + # test with CPU tensors, but don't explicitly make the GPU not visible + add_test(NAME EXAMPLE_CPP_PING_DISTRIBUTED_CPU_TEST COMMAND ${CMAKE_CURRENT_BINARY_DIR}/ping_distributed WORKING_DIRECTORY ${CMAKE_BINARY_DIR} ) - set_tests_properties(EXAMPLE_CPP_PING_DISTRIBUTED_TEST PROPERTIES + set_tests_properties(EXAMPLE_CPP_PING_DISTRIBUTED_CPU_TEST PROPERTIES PASS_REGULAR_EXPRESSION "rx received message 10: Tensor key: 'tensor', shape: \\(32, 64\\)" ) + # test with GPU tensors add_test(NAME EXAMPLE_CPP_PING_DISTRIBUTED_GPU_TEST COMMAND ${CMAKE_CURRENT_BINARY_DIR}/ping_distributed --gpu WORKING_DIRECTORY ${CMAKE_BINARY_DIR} ) set_tests_properties(EXAMPLE_CPP_PING_DISTRIBUTED_GPU_TEST PROPERTIES PASS_REGULAR_EXPRESSION "rx received message 10: Tensor key: 'tensor', shape: \\(32, 64\\)" + FAIL_REGULAR_EXPRESSION "initialized independent of a parent entity" + ) + + add_test(NAME EXAMPLE_CPP_PING_DISTRIBUTED_FLOW_TRACKING_TEST + COMMAND ${CMAKE_CURRENT_BINARY_DIR}/ping_distributed --track + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + ) + set_tests_properties(EXAMPLE_CPP_PING_DISTRIBUTED_FLOW_TRACKING_TEST PROPERTIES + PASS_REGULAR_EXPRESSION "Data Flow Tracking Results" + PASS_REGULAR_EXPRESSION "Fragment: fragment2" + PASS_REGULAR_EXPRESSION "Fragment: fragment1" ) add_test(NAME EXAMPLE_CPP_PING_DISTRIBUTED_HELP_STRING_TEST diff --git a/examples/ping_distributed/cpp/CMakeLists.txt b/examples/ping_distributed/cpp/CMakeLists.txt index f50f5fa1..684ebe2f 100644 --- a/examples/ping_distributed/cpp/CMakeLists.txt +++ b/examples/ping_distributed/cpp/CMakeLists.txt @@ -59,15 +59,32 @@ install(TARGETS ping_distributed # Testing if(HOLOSCAN_BUILD_TESTS) - add_test(NAME EXAMPLE_CPP_PING_DISTRIBUTED_TEST + # For iGPU, a call to `ucp_init_version` with CUDA_VISIBLE_DEVICES="" seems to cause a segfault. + # Limit the following test case to x86_64 systems to avoid this. 
+ if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64") + # emulate a x86_64 system without any GPUs by setting CUDA_VISIBLE_DEVICES="" + add_test(NAME EXAMPLE_CPP_PING_DISTRIBUTED_CPU_ONLY_TEST + COMMAND ${CMAKE_CURRENT_BINARY_DIR}/ping_distributed + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + ) + set_tests_properties(EXAMPLE_CPP_PING_DISTRIBUTED_CPU_ONLY_TEST PROPERTIES + ENVIRONMENT "CUDA_VISIBLE_DEVICES=" + PASS_REGULAR_EXPRESSION "no CUDA-capable device is detected" + PASS_REGULAR_EXPRESSION "rx received message 10: Tensor key: 'tensor', shape: \\(32, 64\\)" + FAIL_REGULAR_EXPRESSION "initialized independent of a parent entity" + ) + endif() + + # test with CPU tensors, but don't explicitly make the GPU not visible + add_test(NAME EXAMPLE_CPP_PING_DISTRIBUTED_CPU_TEST COMMAND ${CMAKE_CURRENT_BINARY_DIR}/ping_distributed WORKING_DIRECTORY ${CMAKE_BINARY_DIR} ) - set_tests_properties(EXAMPLE_CPP_PING_DISTRIBUTED_TEST PROPERTIES + set_tests_properties(EXAMPLE_CPP_PING_DISTRIBUTED_CPU_TEST PROPERTIES PASS_REGULAR_EXPRESSION "rx received message 10: Tensor key: 'tensor', shape: \\(32, 64\\)" - FAIL_REGULAR_EXPRESSION "initialized independent of a parent entity" ) + # test with GPU tensors add_test(NAME EXAMPLE_CPP_PING_DISTRIBUTED_GPU_TEST COMMAND ${CMAKE_CURRENT_BINARY_DIR}/ping_distributed --gpu WORKING_DIRECTORY ${CMAKE_BINARY_DIR} @@ -77,6 +94,16 @@ if(HOLOSCAN_BUILD_TESTS) FAIL_REGULAR_EXPRESSION "initialized independent of a parent entity" ) + add_test(NAME EXAMPLE_CPP_PING_DISTRIBUTED_FLOW_TRACKING_TEST + COMMAND ${CMAKE_CURRENT_BINARY_DIR}/ping_distributed --track + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + ) + set_tests_properties(EXAMPLE_CPP_PING_DISTRIBUTED_FLOW_TRACKING_TEST PROPERTIES + PASS_REGULAR_EXPRESSION "Data Flow Tracking Results" + PASS_REGULAR_EXPRESSION "Fragment: fragment2" + PASS_REGULAR_EXPRESSION "Fragment: fragment1" + ) + add_test(NAME EXAMPLE_CPP_PING_DISTRIBUTED_HELP_STRING_TEST COMMAND ${CMAKE_CURRENT_BINARY_DIR}/ping_distributed --help WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} diff --git a/examples/ping_distributed/cpp/ping_distributed.cpp b/examples/ping_distributed/cpp/ping_distributed.cpp index 6529ac9b..0c78d37b 100644 --- a/examples/ping_distributed/cpp/ping_distributed.cpp +++ b/examples/ping_distributed/cpp/ping_distributed.cpp @@ -14,6 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + #include #include #include @@ -25,8 +26,9 @@ class Fragment1 : public holoscan::Fragment { public: - Fragment1(bool gpu_tensor = false, int64_t count = 10, int32_t batch_size = 0, int32_t rows = 32, - int32_t columns = 64, int32_t channels = 0, const std::string& data_type = "uint8_t") + // NOLINTNEXTLINE(modernize-pass-by-value,bugprone-easily-swappable-parameters) + Fragment1(bool gpu_tensor, int64_t count, int32_t batch_size, int32_t rows, int32_t columns, + int32_t channels, const std::string& data_type) : gpu_tensor_(gpu_tensor), batch_size_(batch_size), count_(count), @@ -76,6 +78,7 @@ class App : public holoscan::Application { // Inherit the constructor using Application::Application; + // NOLINTNEXTLINE(modernize-pass-by-value,bugprone-easily-swappable-parameters) void set_options(bool gpu_tensor = false, int64_t count = 10, int32_t batch_size = 0, int32_t rows = 32, int32_t columns = 1024, int32_t channels = 0, const std::string& data_type = "uint8_t") { @@ -88,6 +91,7 @@ class App : public holoscan::Application { channels_ = channels; data_type_ = data_type; } + // NOLINTEND(fuchsia-default-arguments-declarations) void compose() override { using namespace holoscan; @@ -161,6 +165,7 @@ int main() { << " {'int8_t', 'int16_t', 'int32_t', 'int64_t', \n" << " 'uint8_t', 'uint16_t', 'uint32_t', 'uint64_t', \n" << " 'float', 'double', 'complex', complex'}.\n" + << " --track If specified, data flow tracking will be enabled.\n" << std::endl; auto app = holoscan::make_application(); @@ -171,19 +176,33 @@ int main() { // Parse any additional supported arguments bool tensor_on_gpu = get_boolean_arg(remaining_args, "--gpu").value_or(false); int64_t count = get_int64_arg(remaining_args, "--count").value_or(10); - int64_t batch_size = get_int32_arg(remaining_args, "--batch_size").value_or(0); + int32_t batch_size = get_int32_arg(remaining_args, "--batch_size").value_or(0); int32_t rows = get_int32_arg(remaining_args, "--rows").value_or(32); int32_t columns = get_int32_arg(remaining_args, "--columns").value_or(64); int32_t channels = get_int32_arg(remaining_args, "--channels").value_or(0); std::string data_type = get_str_arg(remaining_args, "--data_type").value_or("uint8_t"); + bool data_flow_tracking_enabled = get_boolean_arg(remaining_args, "--track").value_or(false); HOLOSCAN_LOG_INFO("Running ping with tensors on {}.", tensor_on_gpu ? "GPU" : "host"); // configure tensor on host vs. 
GPU and set the count and shape app->set_options(tensor_on_gpu, count, batch_size, rows, columns, channels, data_type); - // run the application - app->run(); + if (data_flow_tracking_enabled) { + // enable data flow tracking for a distributed app + auto trackers = app->track_distributed(0, 0, 0); + + // run the application + app->run(); + + // print data flow tracking results + for (const auto& [name, tracker] : trackers) { + std::cout << "Fragment: " << name << std::endl; + tracker->print(); + } + } else { + app->run(); + } return 0; } diff --git a/examples/ping_distributed/python/CMakeLists.min.txt b/examples/ping_distributed/python/CMakeLists.min.txt index 5004b58e..62176088 100644 --- a/examples/ping_distributed/python/CMakeLists.min.txt +++ b/examples/ping_distributed/python/CMakeLists.min.txt @@ -37,6 +37,17 @@ if(BUILD_TESTING) FAIL_REGULAR_EXPRESSION "Unable to convert argument type" ) + add_test(NAME EXAMPLE_PYTHON_PING_DISTRIBUTED_FLOW_TRACKING_TEST + COMMAND python3 ping_distributed.py --track + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + ) + set_tests_properties(EXAMPLE_PYTHON_PING_DISTRIBUTED_FLOW_TRACKING_TEST PROPERTIES + PASS_REGULAR_EXPRESSION "rx received message 10: Tensor key: 'out', shape: \\(32, 64\\)" + PASS_REGULAR_EXPRESSION "Data Flow Tracking Results" + PASS_REGULAR_EXPRESSION "Fragment: fragment2" + PASS_REGULAR_EXPRESSION "Fragment: fragment1" + ) + add_test(NAME EXAMPLE_PYTHON_PING_DISTRIBUTED_HELP_STRING_TEST COMMAND python3 ping_distributed.py --help WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} diff --git a/examples/ping_distributed/python/CMakeLists.txt b/examples/ping_distributed/python/CMakeLists.txt index 122887a5..539a2d5c 100644 --- a/examples/ping_distributed/python/CMakeLists.txt +++ b/examples/ping_distributed/python/CMakeLists.txt @@ -39,26 +39,37 @@ install(FILES CMakeLists.min.txt # Testing if(HOLOSCAN_BUILD_TESTS) - add_test(NAME EXAMPLE_PYTHON_PING_DISTRIBUTED_TEST - COMMAND python3 ping_distributed.py +add_test(NAME EXAMPLE_PYTHON_PING_DISTRIBUTED_TEST +COMMAND python3 ping_distributed.py +WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} +) +set_tests_properties(EXAMPLE_PYTHON_PING_DISTRIBUTED_TEST PROPERTIES +PASS_REGULAR_EXPRESSION "rx received message 10: Tensor key: 'out', shape: \\(32, 64\\)" +FAIL_REGULAR_EXPRESSION "AssertionError:" +FAIL_REGULAR_EXPRESSION "initialized independent of a parent entity" +FAIL_REGULAR_EXPRESSION "Unable to convert argument type" +) + + add_test(NAME EXAMPLE_PYTHON_PING_DISTRIBUTED_GPU_TEST + COMMAND python3 ping_distributed.py --gpu WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) - set_tests_properties(EXAMPLE_PYTHON_PING_DISTRIBUTED_TEST PROPERTIES + set_tests_properties(EXAMPLE_PYTHON_PING_DISTRIBUTED_GPU_TEST PROPERTIES PASS_REGULAR_EXPRESSION "rx received message 10: Tensor key: 'out', shape: \\(32, 64\\)" FAIL_REGULAR_EXPRESSION "AssertionError:" FAIL_REGULAR_EXPRESSION "initialized independent of a parent entity" FAIL_REGULAR_EXPRESSION "Unable to convert argument type" ) - add_test(NAME EXAMPLE_PYTHON_PING_DISTRIBUTED_GPU_TEST - COMMAND python3 ping_distributed.py --gpu + add_test(NAME EXAMPLE_PYTHON_PING_DISTRIBUTED_FLOW_TRACKING_TEST + COMMAND python3 ping_distributed.py --track WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) - set_tests_properties(EXAMPLE_PYTHON_PING_DISTRIBUTED_GPU_TEST PROPERTIES + set_tests_properties(EXAMPLE_PYTHON_PING_DISTRIBUTED_FLOW_TRACKING_TEST PROPERTIES PASS_REGULAR_EXPRESSION "rx received message 10: Tensor key: 'out', shape: \\(32, 64\\)" - 
FAIL_REGULAR_EXPRESSION "AssertionError:" - FAIL_REGULAR_EXPRESSION "initialized independent of a parent entity" - FAIL_REGULAR_EXPRESSION "Unable to convert argument type" + PASS_REGULAR_EXPRESSION "Data Flow Tracking Results" + PASS_REGULAR_EXPRESSION "Fragment: fragment2" + PASS_REGULAR_EXPRESSION "Fragment: fragment1" ) add_test(NAME EXAMPLE_PYTHON_PING_DISTRIBUTED_HELP_STRING_TEST diff --git a/examples/ping_distributed/python/ping_distributed.py b/examples/ping_distributed/python/ping_distributed.py index eb42b378..460867c3 100644 --- a/examples/ping_distributed/python/ping_distributed.py +++ b/examples/ping_distributed/python/ping_distributed.py @@ -21,7 +21,7 @@ import numpy as np from holoscan.conditions import CountCondition -from holoscan.core import Application, Fragment +from holoscan.core import Application, Fragment, Tracker from holoscan.operators import PingTensorRxOp, PingTensorTxOp @@ -98,9 +98,23 @@ def compose(self): self.add_flow(fragment1, fragment2, {("tx", "rx")}) -def main(on_gpu=False, count=10, shape=(64, 32), dtype=np.uint8): +def main(on_gpu=False, count=10, shape=(64, 32), dtype=np.uint8, data_flow_tracking_enabled=False): app = MyPingApp(gpu_tensor=on_gpu, count=count, shape=shape, dtype=dtype) - app.run() + + if data_flow_tracking_enabled: + with Tracker( + app, + filename="logger.log", + num_start_messages_to_skip=2, + num_last_messages_to_discard=3, + ) as trackers: + app.run() + print(f"{type(trackers)=}, {trackers=}") + for fragment_name, tracker in trackers.items(): + print(f"Fragment: {fragment_name}") + tracker.print() + else: + app.run() if __name__ == "__main__": @@ -163,6 +177,11 @@ def main(on_gpu=False, count=10, shape=(64, 32), dtype=np.uint8): " 'uint64_t', 'float', 'double', 'complex', 'complex'}" ), ) + parser.add_argument( + "--track", + action="store_true", + help="Enable data flow tracking for the distributed app", + ) # use parse_known_args to ignore other CLI arguments that may be used by Application args, remaining = parser.parse_known_args() @@ -196,4 +215,5 @@ def main(on_gpu=False, count=10, shape=(64, 32), dtype=np.uint8): count=args.count, shape=shape, dtype=args.data_type, + data_flow_tracking_enabled=args.track, ) diff --git a/examples/ping_multi_port/CMakeLists.txt b/examples/ping_multi_port/CMakeLists.txt index 15697f79..c9769ebc 100644 --- a/examples/ping_multi_port/CMakeLists.txt +++ b/examples/ping_multi_port/CMakeLists.txt @@ -13,8 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-add_subdirectory(cpp) -if(HOLOSCAN_BUILD_PYTHON) +if(HOLOSCAN_CPP_EXAMPLES) + add_subdirectory(cpp) +endif() + +if(HOLOSCAN_PYTHON_EXAMPLES) add_subdirectory(python) endif() diff --git a/examples/ping_multi_port/cpp/ping_multi_port.cpp b/examples/ping_multi_port/cpp/ping_multi_port.cpp index 007749b7..1dea98cd 100644 --- a/examples/ping_multi_port/cpp/ping_multi_port.cpp +++ b/examples/ping_multi_port/cpp/ping_multi_port.cpp @@ -28,9 +28,21 @@ class ValueData { } ~ValueData() { HOLOSCAN_LOG_TRACE("ValueData::~ValueData(): {}", data_); } + // Use default copy constructor + ValueData(const ValueData&) = default; + + // Use default move constructor + ValueData(ValueData&&) noexcept = default; + + // Use default copy assignment operator + ValueData& operator=(const ValueData&) = default; + + // Use default move assignment operator + ValueData& operator=(ValueData&&) noexcept = default; + void data(int value) { data_ = value; } - int data() const { return data_; } + [[nodiscard]] int data() const { return data_; } private: int data_; @@ -49,13 +61,16 @@ class PingTxOp : public Operator { spec.output>("out2"); } - void compute(InputContext&, OutputContext& op_output, ExecutionContext&) override { + void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto value1 = std::make_shared(index_++); op_output.emit(value1, "out1"); auto value2 = std::make_shared(index_++); op_output.emit(value2, "out2"); }; + + private: int index_ = 1; }; @@ -109,7 +124,8 @@ class PingRxOp : public Operator { spec.input>>("receivers", IOSpec::kAnySize); } - void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override { + void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto value_vector = op_input.receive>>("receivers").value(); @@ -143,7 +159,7 @@ class MyPingApp : public holoscan::Application { } }; -int main(int argc, char** argv) { +int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) { auto app = holoscan::make_application(); app->run(); diff --git a/examples/ping_simple/CMakeLists.txt b/examples/ping_simple/CMakeLists.txt index 15697f79..c9769ebc 100644 --- a/examples/ping_simple/CMakeLists.txt +++ b/examples/ping_simple/CMakeLists.txt @@ -13,8 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_subdirectory(cpp) -if(HOLOSCAN_BUILD_PYTHON) +if(HOLOSCAN_CPP_EXAMPLES) + add_subdirectory(cpp) +endif() + +if(HOLOSCAN_PYTHON_EXAMPLES) add_subdirectory(python) endif() diff --git a/examples/ping_simple/cpp/ping_simple.cpp b/examples/ping_simple/cpp/ping_simple.cpp index beb7447c..ca0e1d6f 100644 --- a/examples/ping_simple/cpp/ping_simple.cpp +++ b/examples/ping_simple/cpp/ping_simple.cpp @@ -32,7 +32,7 @@ class MyPingApp : public holoscan::Application { } }; -int main(int argc, char** argv) { +int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) { auto app = holoscan::make_application(); app->run(); diff --git a/examples/ping_simple_run_async/CMakeLists.txt b/examples/ping_simple_run_async/CMakeLists.txt index 952a4baf..4e4e14d7 100644 --- a/examples/ping_simple_run_async/CMakeLists.txt +++ b/examples/ping_simple_run_async/CMakeLists.txt @@ -13,8 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
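
The ping_multi_port setup() calls above have likewise lost their angle-bracket contents. A hedged excerpt of how those port declarations are typically written in this example (the `std::shared_ptr<ValueData>` element type is an assumption based on the surrounding code):

```cpp
// Excerpt sketch; template arguments are assumed, not quoted from the patch.
class PingTxOp : public Operator {
 public:
  void setup(OperatorSpec& spec) override {
    spec.output<std::shared_ptr<ValueData>>("out1");
    spec.output<std::shared_ptr<ValueData>>("out2");
  }
};

class PingRxOp : public Operator {
 public:
  void setup(OperatorSpec& spec) override {
    spec.input<std::vector<std::shared_ptr<ValueData>>>("receivers", IOSpec::kAnySize);
  }
};
```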
-add_subdirectory(cpp) -if(HOLOSCAN_BUILD_PYTHON) +if(HOLOSCAN_CPP_EXAMPLES) + add_subdirectory(cpp) +endif() + +if(HOLOSCAN_PYTHON_EXAMPLES) add_subdirectory(python) endif() diff --git a/examples/ping_simple_run_async/cpp/ping_simple_run_async.cpp b/examples/ping_simple_run_async/cpp/ping_simple_run_async.cpp index d3a3fcb4..3b1fe9f6 100644 --- a/examples/ping_simple_run_async/cpp/ping_simple_run_async.cpp +++ b/examples/ping_simple_run_async/cpp/ping_simple_run_async.cpp @@ -33,12 +33,16 @@ class App : public holoscan::Application { add_flow(tx, rx); // Save a reference to the tx operator so we can access it later - target_op = tx; + target_op_ = tx; } - std::shared_ptr target_op; + + std::shared_ptr target_op() { return target_op_; } + + private: + std::shared_ptr target_op_; }; -int main(int argc, char** argv) { +int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) { auto app = holoscan::make_application(); auto future = app->run_async(); HOLOSCAN_LOG_INFO("Application is running asynchronously."); @@ -50,14 +54,13 @@ int main(int argc, char** argv) { if (status == std::future_status::ready) { HOLOSCAN_LOG_INFO("# Application finished"); return; + } + // Print the current index of the tx operator + auto tx = std::dynamic_pointer_cast(app->target_op()); + if (tx) { + HOLOSCAN_LOG_INFO("# Application still running... PingTxOp index: {}", tx->index()); } else { - // Print the current index of the tx operator - auto tx = std::dynamic_pointer_cast(app->target_op); - if (tx) { - HOLOSCAN_LOG_INFO("# Application still running... PingTxOp index: {}", tx->index()); - } else { - HOLOSCAN_LOG_INFO("# Application still running... PingTxOp index: {}", "N/A"); - } + HOLOSCAN_LOG_INFO("# Application still running... PingTxOp index: {}", "N/A"); } std::this_thread::yield(); } diff --git a/examples/ping_vector/CMakeLists.txt b/examples/ping_vector/CMakeLists.txt index 952a4baf..4e4e14d7 100644 --- a/examples/ping_vector/CMakeLists.txt +++ b/examples/ping_vector/CMakeLists.txt @@ -13,8 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-add_subdirectory(cpp) -if(HOLOSCAN_BUILD_PYTHON) +if(HOLOSCAN_CPP_EXAMPLES) + add_subdirectory(cpp) +endif() + +if(HOLOSCAN_PYTHON_EXAMPLES) add_subdirectory(python) endif() diff --git a/examples/ping_vector/cpp/ping_vector.cpp b/examples/ping_vector/cpp/ping_vector.cpp index 4bcd2631..748e0fc6 100644 --- a/examples/ping_vector/cpp/ping_vector.cpp +++ b/examples/ping_vector/cpp/ping_vector.cpp @@ -29,14 +29,18 @@ class PingTxOp : public Operator { void setup(OperatorSpec& spec) override { spec.output>("out"); } - void compute(InputContext&, OutputContext& op_output, ExecutionContext&) override { + void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto value1 = index_++; std::vector output; + output.reserve(5); for (int i = 0; i < 5; i++) { output.push_back(value1++); } op_output.emit(output, "out"); }; + + private: int index_ = 1; }; @@ -61,10 +65,10 @@ class PingMxOp : public Operator { std::vector values2; std::vector values3; - for (int i = 0; i < values1.size(); i++) { - HOLOSCAN_LOG_INFO("Middle message value: {}", values1[i]); - values2.push_back(values1[i] * multiplier_); - values3.push_back(values1[i] * multiplier_ * multiplier_); + for (const auto& val : values1) { + HOLOSCAN_LOG_INFO("Middle message value: {}", val); + values2.push_back(val * multiplier_); + values3.push_back(val * multiplier_ * multiplier_); } op_output.emit(values1, "out1"); @@ -93,7 +97,8 @@ class PingRxOp : public Operator { spec.input>>("receivers", IOSpec::kAnySize); } - void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override { + void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto receiver_vector = op_input.receive>>("receivers").value(); auto input_vector = op_input.receive>("in").value(); auto dup_input_vector = op_input.receive>("dup_in").value(); @@ -106,15 +111,15 @@ class PingRxOp : public Operator { dup_input_vector.size(), receiver_vector.size()); - for (int i = 0; i < input_vector.size(); i++) { + for (int i = 0; i < input_vector.size(); i++) { // NOLINT(*) HOLOSCAN_LOG_INFO("Rx message input value[{}]: {}", i, input_vector[i]); } - for (int i = 0; i < dup_input_vector.size(); i++) { + for (int i = 0; i < dup_input_vector.size(); i++) { // NOLINT(*) HOLOSCAN_LOG_INFO("Rx message duplicated input value[{}]: {}", i, dup_input_vector[i]); } - for (int i = 0; i < receiver_vector.size(); i++) { + for (int i = 0; i < receiver_vector.size(); i++) { // NOLINT(*) for (int j = 0; j < receiver_vector[i].size(); j++) { HOLOSCAN_LOG_INFO("Rx message receiver value[{}][{}]: {}", i, j, receiver_vector[i][j]); } @@ -146,7 +151,7 @@ class MyPingApp : public holoscan::Application { } }; -int main(int argc, char** argv) { +int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) { auto app = holoscan::make_application(); app->run(); diff --git a/examples/resources/clock/CMakeLists.txt b/examples/resources/clock/CMakeLists.txt index 952a4baf..4e4e14d7 100644 --- a/examples/resources/clock/CMakeLists.txt +++ b/examples/resources/clock/CMakeLists.txt @@ -13,8 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-add_subdirectory(cpp) -if(HOLOSCAN_BUILD_PYTHON) +if(HOLOSCAN_CPP_EXAMPLES) + add_subdirectory(cpp) +endif() + +if(HOLOSCAN_PYTHON_EXAMPLES) add_subdirectory(python) endif() diff --git a/examples/resources/clock/cpp/ping_clock.cpp b/examples/resources/clock/cpp/ping_clock.cpp index 74b2d33d..18d74864 100644 --- a/examples/resources/clock/cpp/ping_clock.cpp +++ b/examples/resources/clock/cpp/ping_clock.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -34,14 +34,16 @@ class TimedPingRxOp : public Operator { void setup(OperatorSpec& spec) override; - void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override; + void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override; }; void TimedPingRxOp::setup(OperatorSpec& spec) { spec.input("in"); } -void TimedPingRxOp::compute(InputContext& op_input, OutputContext&, ExecutionContext&) { +void TimedPingRxOp::compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) { auto value = op_input.receive("in").value(); HOLOSCAN_LOG_INFO("Rx message value: {}", value); @@ -108,7 +110,7 @@ class MyPingApp : public holoscan::Application { } }; -int main(int argc, char** argv) { +int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) { auto app = holoscan::make_application(); app->run(); diff --git a/examples/resources/native/CMakeLists.txt b/examples/resources/native/CMakeLists.txt index a3ac1334..0fa63c4d 100644 --- a/examples/resources/native/CMakeLists.txt +++ b/examples/resources/native/CMakeLists.txt @@ -13,8 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_subdirectory(cpp) -if(HOLOSCAN_BUILD_PYTHON) +if(HOLOSCAN_CPP_EXAMPLES) + add_subdirectory(cpp) +endif() + +if(HOLOSCAN_PYTHON_EXAMPLES) add_subdirectory(python) endif() diff --git a/examples/resources/native/cpp/native_resource.cpp b/examples/resources/native/cpp/native_resource.cpp index 720a4dcf..72f092bb 100644 --- a/examples/resources/native/cpp/native_resource.cpp +++ b/examples/resources/native/cpp/native_resource.cpp @@ -45,7 +45,8 @@ class MinimalNativeResourceOp : public Operator { MinimalNativeResourceOp() = default; - void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override { + void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto res = resource("string_native_resource"); if (res) { HOLOSCAN_LOG_INFO("MinimalNativeResource - string_native_resource.string_param: {}", @@ -85,7 +86,7 @@ class MinimalNativeResourceApp : public holoscan::Application { } // namespace holoscan -int main(int argc, char** argv) { +int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) { auto app = holoscan::make_application(); app->run(); diff --git a/examples/tensor_interop/CMakeLists.txt b/examples/tensor_interop/CMakeLists.txt index 15697f79..c9769ebc 100644 --- a/examples/tensor_interop/CMakeLists.txt +++ b/examples/tensor_interop/CMakeLists.txt @@ -13,8 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-add_subdirectory(cpp) -if(HOLOSCAN_BUILD_PYTHON) +if(HOLOSCAN_CPP_EXAMPLES) + add_subdirectory(cpp) +endif() + +if(HOLOSCAN_PYTHON_EXAMPLES) add_subdirectory(python) endif() diff --git a/examples/tensor_interop/cpp/tensor_interop.cpp b/examples/tensor_interop/cpp/tensor_interop.cpp index c5682bbf..8cb68f09 100644 --- a/examples/tensor_interop/cpp/tensor_interop.cpp +++ b/examples/tensor_interop/cpp/tensor_interop.cpp @@ -91,7 +91,7 @@ class ProcessTensorOp : public Operator { TensorMap out_message; for (auto& [key, tensor] : in_message) { // Process with 'tensor' here. - cudaError_t cuda_status; + cudaError_t cuda_status{}; size_t data_size = tensor->nbytes(); std::vector in_data(data_size); CUDA_TRY(cudaMemcpy(in_data.data(), tensor->data(), data_size, cudaMemcpyDeviceToHost)); @@ -172,7 +172,7 @@ class App : public holoscan::Application { } }; -int main(int argc, char** argv) { +int main([[maybe_unused]] int argc, [[maybe_unused]] char** argv) { auto app = holoscan::make_application(); app->run(); diff --git a/examples/tensor_interop/python/CMakeLists.txt b/examples/tensor_interop/python/CMakeLists.txt index 62fd082c..10ad80c0 100644 --- a/examples/tensor_interop/python/CMakeLists.txt +++ b/examples/tensor_interop/python/CMakeLists.txt @@ -51,7 +51,7 @@ if(HOLOSCAN_BUILD_TESTS) set(RECORDING_DIR ${CMAKE_CURRENT_BINARY_DIR}/recording_output) set(SOURCE_VIDEO_BASENAME python_tensor_interop_output) - set(VALIDATION_FRAMES_DIR ${CMAKE_SOURCE_DIR}/tests/data/validation_frames/tensor_interop/) + set(VALIDATION_FRAMES_DIR ${Holoscan-examples_SOURCE_DIR}/../tests/data/validation_frames/tensor_interop/) file(MAKE_DIRECTORY ${RECORDING_DIR}) @@ -65,7 +65,7 @@ if(HOLOSCAN_BUILD_TESTS) add_custom_command(OUTPUT tensor_interop_test.py PRE_LINK COMMAND patch -u -o tensor_interop_test.py ${CMAKE_CURRENT_SOURCE_DIR}/tensor_interop.py - ${CMAKE_SOURCE_DIR}/tests/data/validation_frames/tensor_interop/python_tensor_interop.patch + ${Holoscan-examples_SOURCE_DIR}/../tests/data/validation_frames/tensor_interop/python_tensor_interop.patch ) add_custom_target(python_tensor_interop_test ALL @@ -82,7 +82,7 @@ if(HOLOSCAN_BUILD_TESTS) # Add a test to check the validity of the frames add_test(NAME EXAMPLE_PYTHON_TENSOR_INTEROP_RENDER_TEST - COMMAND python3 ${CMAKE_SOURCE_DIR}/scripts/video_validation.py + COMMAND python3 ${Holoscan-examples_SOURCE_DIR}/../scripts/video_validation.py --source_video_dir ${RECORDING_DIR} --source_video_basename ${SOURCE_VIDEO_BASENAME} --output_dir ${RECORDING_DIR} diff --git a/examples/v4l2_camera/CMakeLists.txt b/examples/v4l2_camera/CMakeLists.txt index 15697f79..c9769ebc 100644 --- a/examples/v4l2_camera/CMakeLists.txt +++ b/examples/v4l2_camera/CMakeLists.txt @@ -13,8 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
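
The ProcessTensorOp::compute hunks above (import_gxf_components and tensor_interop) stage each tensor in a host buffer before processing it; the element type of that staging vector is missing from the patch text, so this sketch assumes `uint8_t`:

```cpp
// Host round-trip sketch for ProcessTensorOp::compute; uint8_t is an assumed element type.
for (auto& [key, tensor] : in_message) {
  cudaError_t cuda_status{};
  size_t data_size = tensor->nbytes();
  std::vector<uint8_t> in_data(data_size);
  CUDA_TRY(cudaMemcpy(in_data.data(), tensor->data(), data_size, cudaMemcpyDeviceToHost));
  // ... process in_data on the host, then copy the result back to the device ...
}
```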
-add_subdirectory(cpp) -if(HOLOSCAN_BUILD_PYTHON) +if(HOLOSCAN_CPP_EXAMPLES) + add_subdirectory(cpp) +endif() + +if(HOLOSCAN_PYTHON_EXAMPLES) add_subdirectory(python) endif() diff --git a/examples/v4l2_camera/cpp/v4l2_camera.cpp b/examples/v4l2_camera/cpp/v4l2_camera.cpp index c4f396b2..28907853 100644 --- a/examples/v4l2_camera/cpp/v4l2_camera.cpp +++ b/examples/v4l2_camera/cpp/v4l2_camera.cpp @@ -24,7 +24,7 @@ bool key_exists(const holoscan::ArgList& config, const std::string& key) { bool exists = false; - for (auto& arg : config) { + for (const auto& arg : config) { if (arg.name() == key) { exists = true; break; @@ -43,8 +43,8 @@ class App : public holoscan::Application { if (key_exists(from_config("source"), "width") && key_exists(from_config("source"), "height")) { // width and height given, use BlockMemoryPool (better latency) - const uint64_t width = from_config("source.width").as(); - const uint64_t height = from_config("source.height").as(); + auto width = from_config("source.width").as(); + auto height = from_config("source.height").as(); const uint8_t n_channels = 4; uint64_t block_size = width * height * n_channels; auto allocator = make_resource("pool", 0, block_size, 1); @@ -55,9 +55,11 @@ class App : public holoscan::Application { // Set Holoviz width and height from source resolution auto viz_args = from_config("visualizer"); for (auto& arg : from_config("source")) { - if (arg.name() == "width") viz_args.add(arg); - else if (arg.name() == "height") + if (arg.name() == "width") { viz_args.add(arg); + } else if (arg.name() == "height") { + viz_args.add(arg); + } } visualizer = make_operator("visualizer", viz_args, Arg("allocator") = allocator); diff --git a/examples/video_replayer/CMakeLists.txt b/examples/video_replayer/CMakeLists.txt index 15697f79..c9769ebc 100644 --- a/examples/video_replayer/CMakeLists.txt +++ b/examples/video_replayer/CMakeLists.txt @@ -13,8 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_subdirectory(cpp) -if(HOLOSCAN_BUILD_PYTHON) +if(HOLOSCAN_CPP_EXAMPLES) + add_subdirectory(cpp) +endif() + +if(HOLOSCAN_PYTHON_EXAMPLES) add_subdirectory(python) endif() diff --git a/examples/video_replayer/README.md b/examples/video_replayer/README.md index 470e25ff..72ef7292 100644 --- a/examples/video_replayer/README.md +++ b/examples/video_replayer/README.md @@ -4,7 +4,7 @@ Minimal example to demonstrate the use of the video stream replayer operator to The video frames need to have been converted to a gxf entity format to use as input. You can use the `convert_video_to_gxf_entities.py` script installed in `/opt/nvidia/holoscan/bin` or available [on GitHub](https://github.com/nvidia-holoscan/holoscan-sdk/tree/main/scripts#convert_video_to_gxf_entitiespy) (tensors will be loaded on the GPU). 
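
Returning to the v4l2_camera hunk above: the allocator type on `make_resource` and the `.as()` template parameters are not visible in the patch text. A hedged sketch of the block-pool sizing logic, assuming `BlockMemoryPool` and 64-bit config reads:

```cpp
// Assumptions: BlockMemoryPool as the allocator type, uint64_t for the config reads.
auto width = from_config("source.width").as<uint64_t>();
auto height = from_config("source.height").as<uint64_t>();
const uint8_t n_channels = 4;  // RGBA
uint64_t block_size = width * height * n_channels;
auto allocator = make_resource<holoscan::BlockMemoryPool>("pool", 0, block_size, 1);
```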
-> Note: Support for H264 stream support is in progress and can be found on [HoloHub](https://nvidia-holoscan.github.io/holohub) +> Note: Support for H264 stream support is in progress and can be found on [HoloHub](https://github.com/nvidia-holoscan/holohub) *Visit the [SDK User Guide](https://docs.nvidia.com/holoscan/sdk-user-guide/examples/video_replayer.html) for step-by-step documentation of this example.* diff --git a/examples/video_replayer/cpp/CMakeLists.txt b/examples/video_replayer/cpp/CMakeLists.txt index 681d76a4..7943fdbf 100644 --- a/examples/video_replayer/cpp/CMakeLists.txt +++ b/examples/video_replayer/cpp/CMakeLists.txt @@ -72,7 +72,7 @@ if(HOLOSCAN_BUILD_TESTS) set(RECORDING_DIR ${CMAKE_CURRENT_BINARY_DIR}/recording_output) set(SOURCE_VIDEO_BASENAME video_replayer_output) - set(VALIDATION_FRAMES_DIR ${CMAKE_SOURCE_DIR}/tests/data/validation_frames/video_replayer/) + set(VALIDATION_FRAMES_DIR ${Holoscan-examples_SOURCE_DIR}/../tests/data/validation_frames/video_replayer/) file(MAKE_DIRECTORY ${RECORDING_DIR}) @@ -91,7 +91,7 @@ if(HOLOSCAN_BUILD_TESTS) add_custom_command(OUTPUT video_replayer_test.cpp PRE_LINK COMMAND patch -u -o video_replayer_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/video_replayer.cpp - ${CMAKE_SOURCE_DIR}/tests/data/validation_frames/video_replayer/cpp_video_replayer.patch + ${Holoscan-examples_SOURCE_DIR}/../tests/data/validation_frames/video_replayer/cpp_video_replayer.patch ) # Create the test executable @@ -100,7 +100,7 @@ if(HOLOSCAN_BUILD_TESTS) ) target_include_directories(video_replayer_test - PRIVATE ${CMAKE_SOURCE_DIR}/tests) + PRIVATE ${Holoscan-examples_SOURCE_DIR}/../tests) target_compile_definitions(video_replayer_test PRIVATE RECORD_OUTPUT RECORDING_DIR="${RECORDING_DIR}" @@ -127,7 +127,7 @@ if(HOLOSCAN_BUILD_TESTS) # Add a test to check the validity of the frames add_test(NAME EXAMPLE_CPP_VIDEO_REPLAYER_RENDER_TEST - COMMAND python3 ${CMAKE_SOURCE_DIR}/scripts/video_validation.py + COMMAND python3 ${Holoscan-examples_SOURCE_DIR}/../scripts/video_validation.py --source_video_dir ${RECORDING_DIR} --source_video_basename ${SOURCE_VIDEO_BASENAME} --output_dir ${RECORDING_DIR} diff --git a/examples/video_replayer/cpp/video_replayer.cpp b/examples/video_replayer/cpp/video_replayer.cpp index 2cef29dd..19a190ae 100644 --- a/examples/video_replayer/cpp/video_replayer.cpp +++ b/examples/video_replayer/cpp/video_replayer.cpp @@ -27,12 +27,16 @@ class VideoReplayerApp : public holoscan::Application { // Sets the data directory to use from the environment variable if it is set ArgList args; - auto data_directory = std::getenv("HOLOSCAN_INPUT_PATH"); + auto* data_directory = std::getenv("HOLOSCAN_INPUT_PATH"); // NOLINT(*) if (data_directory != nullptr && data_directory[0] != '\0') { auto video_directory = std::filesystem::path(data_directory); video_directory /= "racerx"; args.add(Arg("directory", video_directory.string())); } + // create an allocator supporting both host and device memory pools + // (The video stream is copied to an intermediate host buffer before being copied to the GPU) + args.add(Arg("allocator", + make_resource("rmm_allocator", from_config("rmm_allocator")))); // Define the replayer and holoviz operators and configure using yaml configuration auto replayer = @@ -43,7 +47,7 @@ class VideoReplayerApp : public holoscan::Application { add_flow(replayer, visualizer, {{"output", "receivers"}}); // Check if the YAML dual_window parameter is set and add a second visualizer in that case - bool dual_window = from_config("dual_window").as(); + 
auto dual_window = from_config("dual_window").as(); if (dual_window) { auto visualizer2 = make_operator("holoviz2", from_config("holoviz")); add_flow(replayer, visualizer2, {{"output", "receivers"}}); diff --git a/examples/video_replayer/cpp/video_replayer.yaml b/examples/video_replayer/cpp/video_replayer.yaml index 5ec00f4c..45ce3e91 100644 --- a/examples/video_replayer/cpp/video_replayer.yaml +++ b/examples/video_replayer/cpp/video_replayer.yaml @@ -24,6 +24,15 @@ replayer: realtime: true # default: true count: 0 # default: 0 (no frame count restriction) +# Initial size below is set to 8 MB which is sufficient for +# a 1920 * 1080 RGBA image (uint8_t). +rmm_allocator: + device_memory_initial_size: "8 MB" + device_memory_max_size: "8 MB" + host_memory_initial_size: "8 MB" + host_memory_max_size: "8 MB" + dev_id: 0 + holoviz: width: 854 height: 480 diff --git a/examples/video_replayer/python/CMakeLists.txt b/examples/video_replayer/python/CMakeLists.txt index 25dd1e75..412bcb93 100644 --- a/examples/video_replayer/python/CMakeLists.txt +++ b/examples/video_replayer/python/CMakeLists.txt @@ -52,7 +52,7 @@ if(HOLOSCAN_BUILD_TESTS) set(RECORDING_DIR ${CMAKE_CURRENT_BINARY_DIR}/recording_output) set(SOURCE_VIDEO_BASENAME python_video_replayer_output) - set(VALIDATION_FRAMES_DIR ${CMAKE_SOURCE_DIR}/tests/data/validation_frames/video_replayer/) + set(VALIDATION_FRAMES_DIR ${Holoscan-examples_SOURCE_DIR}/../tests/data/validation_frames/video_replayer/) file(MAKE_DIRECTORY ${RECORDING_DIR}) @@ -73,7 +73,7 @@ if(HOLOSCAN_BUILD_TESTS) add_custom_command(OUTPUT video_replayer_test.py PRE_LINK COMMAND patch -u -o video_replayer_test.py ${CMAKE_CURRENT_SOURCE_DIR}/video_replayer.py - ${CMAKE_SOURCE_DIR}/tests/data/validation_frames/video_replayer/python_video_replayer.patch + ${Holoscan-examples_SOURCE_DIR}/../tests/data/validation_frames/video_replayer/python_video_replayer.patch ) add_custom_target(python_video_replayer_test ALL @@ -92,7 +92,7 @@ if(HOLOSCAN_BUILD_TESTS) # Add a test to check the validity of the frames add_test(NAME EXAMPLE_PYTHON_VIDEO_REPLAYER_RENDER_TEST - COMMAND python3 ${CMAKE_SOURCE_DIR}/scripts/video_validation.py + COMMAND python3 ${Holoscan-examples_SOURCE_DIR}/../scripts/video_validation.py --source_video_dir ${RECORDING_DIR} --source_video_basename ${SOURCE_VIDEO_BASENAME} --output_dir ${RECORDING_DIR} diff --git a/examples/video_replayer/python/video_replayer.py b/examples/video_replayer/python/video_replayer.py index 6433ea73..f402349e 100644 --- a/examples/video_replayer/python/video_replayer.py +++ b/examples/video_replayer/python/video_replayer.py @@ -19,6 +19,7 @@ from holoscan.core import Application from holoscan.operators import HolovizOp, VideoStreamReplayerOp +from holoscan.resources import RMMAllocator sample_data_path = os.environ.get("HOLOSCAN_INPUT_PATH", "../data") @@ -40,9 +41,17 @@ def compose(self): if not os.path.exists(video_dir): raise ValueError(f"Could not find video data: {video_dir=}") + # create an allocator supporting both host and device memory pools + # (The video stream is copied to an intermediate host buffer before being copied to the GPU) + rmm_allocator = RMMAllocator(self, name="rmm-allocator", **self.kwargs("rmm_allocator")) + # Define the replayer and holoviz operators replayer = VideoStreamReplayerOp( - self, name="replayer", directory=video_dir, **self.kwargs("replayer") + self, + name="replayer", + directory=video_dir, + **self.kwargs("replayer"), + allocator=rmm_allocator, ) visualizer = HolovizOp(self, name="holoviz", 
**self.kwargs("holoviz")) diff --git a/examples/video_replayer/python/video_replayer.yaml b/examples/video_replayer/python/video_replayer.yaml index 3dfa98d7..a6ccc49d 100644 --- a/examples/video_replayer/python/video_replayer.yaml +++ b/examples/video_replayer/python/video_replayer.yaml @@ -23,6 +23,15 @@ replayer: realtime: true # default: true count: 0 # default: 0 (no frame count restriction) +# Initial size below is set to 8 MB which is sufficient for +# a 1920 * 1080 RGBA image (uint8_t). +rmm_allocator: + device_memory_initial_size: "8MB" + device_memory_max_size: "8MB" + host_memory_initial_size: "8MB" + host_memory_max_size: "8MB" + dev_id: 0 + holoviz: width: 854 height: 480 diff --git a/examples/video_replayer_distributed/CMakeLists.txt b/examples/video_replayer_distributed/CMakeLists.txt index 952a4baf..4e4e14d7 100644 --- a/examples/video_replayer_distributed/CMakeLists.txt +++ b/examples/video_replayer_distributed/CMakeLists.txt @@ -13,8 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -add_subdirectory(cpp) -if(HOLOSCAN_BUILD_PYTHON) +if(HOLOSCAN_CPP_EXAMPLES) + add_subdirectory(cpp) +endif() + +if(HOLOSCAN_PYTHON_EXAMPLES) add_subdirectory(python) endif() diff --git a/examples/video_replayer_distributed/README.md b/examples/video_replayer_distributed/README.md index 3f19ab25..6150fa36 100644 --- a/examples/video_replayer_distributed/README.md +++ b/examples/video_replayer_distributed/README.md @@ -4,7 +4,7 @@ Minimal example to demonstrate the use of the video stream replayer operator to The video frames need to have been converted to a gxf entity format to use as input. You can use the `convert_video_to_gxf_entities.py` script installed in `/opt/nvidia/holoscan/bin` or available [on GitHub](https://github.com/nvidia-holoscan/holoscan-sdk/tree/main/scripts#convert_video_to_gxf_entitiespy) (tensors will be loaded on the GPU). 
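For orientation (this sketch is not part of the patch), the `rmm_allocator` section shown above is consumed in C++ roughly as follows. It assumes the `holoscan::RMMAllocator` resource and the `make_resource`/`from_config` calls used elsewhere in this change, with the template arguments spelled out; the YAML file name is the one from the example.

```cpp
#include <holoscan/holoscan.hpp>
#include <holoscan/operators/holoviz/holoviz.hpp>
#include <holoscan/operators/video_stream_replayer/video_stream_replayer.hpp>

// Minimal sketch: build an RMM allocator from the "rmm_allocator" YAML section and
// pass it to the video stream replayer (host and device pool sizes come from the config).
class VideoReplayerApp : public holoscan::Application {
 public:
  void compose() override {
    using namespace holoscan;
    auto rmm_allocator =
        make_resource<RMMAllocator>("rmm_allocator", from_config("rmm_allocator"));
    auto replayer = make_operator<ops::VideoStreamReplayerOp>(
        "replayer", from_config("replayer"), Arg("allocator", rmm_allocator));
    auto visualizer = make_operator<ops::HolovizOp>("holoviz", from_config("holoviz"));
    add_flow(replayer, visualizer, {{"output", "receivers"}});
  }
};

int main() {
  auto app = holoscan::make_application<VideoReplayerApp>();
  app->config("video_replayer.yaml");  // contains the rmm_allocator section shown above
  app->run();
  return 0;
}
```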
-> Note: Support for H264 stream support is in progress and can be found on [HoloHub](https://nvidia-holoscan.github.io/holohub) +> Note: Support for H264 stream support is in progress and can be found on [HoloHub](https://github.com/nvidia-holoscan/holohub) *Visit the [SDK User Guide](https://docs.nvidia.com/holoscan/sdk-user-guide/holoscan_create_distributed_app.html) to learn more about distributed applications.* diff --git a/examples/video_replayer_distributed/cpp/CMakeLists.txt b/examples/video_replayer_distributed/cpp/CMakeLists.txt index 47520466..875007b9 100644 --- a/examples/video_replayer_distributed/cpp/CMakeLists.txt +++ b/examples/video_replayer_distributed/cpp/CMakeLists.txt @@ -72,7 +72,7 @@ if(HOLOSCAN_BUILD_TESTS) set(RECORDING_DIR ${CMAKE_CURRENT_BINARY_DIR}/recording_output) set(SOURCE_VIDEO_BASENAME video_replayer_output) - set(VALIDATION_FRAMES_DIR ${CMAKE_SOURCE_DIR}/tests/data/validation_frames/video_replayer/) + set(VALIDATION_FRAMES_DIR ${Holoscan-examples_SOURCE_DIR}/../tests/data/validation_frames/video_replayer/) file(MAKE_DIRECTORY ${RECORDING_DIR}) @@ -85,7 +85,7 @@ if(HOLOSCAN_BUILD_TESTS) add_custom_command(OUTPUT video_replayer_distributed_test.cpp PRE_LINK COMMAND patch -u -o video_replayer_distributed_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/video_replayer_distributed.cpp - ${CMAKE_SOURCE_DIR}/tests/data/validation_frames/video_replayer/cpp_video_replayer_distributed.patch + ${Holoscan-examples_SOURCE_DIR}/../tests/data/validation_frames/video_replayer/cpp_video_replayer_distributed.patch ) # Create the test executable @@ -94,7 +94,7 @@ if(HOLOSCAN_BUILD_TESTS) ) target_include_directories(video_replayer_distributed_test - PRIVATE ${CMAKE_SOURCE_DIR}/tests) + PRIVATE ${Holoscan-examples_SOURCE_DIR}/../tests) target_compile_definitions(video_replayer_distributed_test PRIVATE RECORD_OUTPUT RECORDING_DIR="${RECORDING_DIR}" @@ -125,9 +125,22 @@ if(HOLOSCAN_BUILD_TESTS) FAIL_REGULAR_EXPRESSION "initialized independent of a parent entity" ) + # repeat the default test, but with HOLOSCAN_UCX_ASYNCHRONOUS=False to set + # enable_async=false for UcxContext + add_test(NAME EXAMPLE_CPP_VIDEO_REPLAYER_DISTRIBUTED_SYNCHRONOUS_MODE_TEST + COMMAND ${CMAKE_CURRENT_BINARY_DIR}/video_replayer_distributed_test --config ${CONFIG_FILE} + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + ) + + set_tests_properties(EXAMPLE_CPP_VIDEO_REPLAYER_DISTRIBUTED_SYNCHRONOUS_MODE_TEST PROPERTIES + ENVIRONMENT "HOLOSCAN_UCX_ASYNCHRONOUS=False" + PASS_REGULAR_EXPRESSION "Reach end of file or playback count reaches to the limit. Stop ticking." + FAIL_REGULAR_EXPRESSION "initialized independent of a parent entity" + ) + # Add a test to check the validity of the frames add_test(NAME EXAMPLE_CPP_VIDEO_REPLAYER_DISTRIBUTED_RENDER_TEST - COMMAND python3 ${CMAKE_SOURCE_DIR}/scripts/video_validation.py + COMMAND python3 ${Holoscan-examples_SOURCE_DIR}/../scripts/video_validation.py --source_video_dir ${RECORDING_DIR} --source_video_basename ${SOURCE_VIDEO_BASENAME} --output_dir ${RECORDING_DIR} diff --git a/examples/video_replayer_distributed/cpp/video_replayer_distributed.cpp b/examples/video_replayer_distributed/cpp/video_replayer_distributed.cpp index 804359e4..b4a9048f 100644 --- a/examples/video_replayer_distributed/cpp/video_replayer_distributed.cpp +++ b/examples/video_replayer_distributed/cpp/video_replayer_distributed.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -30,7 +30,7 @@ class Fragment1 : public holoscan::Fragment { using namespace holoscan; ArgList args; - auto data_directory = std::getenv("HOLOSCAN_INPUT_PATH"); + auto* data_directory = std::getenv("HOLOSCAN_INPUT_PATH"); // NOLINT(*) if (data_directory != nullptr && data_directory[0] != '\0') { auto video_directory = std::filesystem::path(data_directory); video_directory /= "racerx"; @@ -67,7 +67,7 @@ class DistributedVideoReplayerApp : public holoscan::Application { } }; -int main(int argc, char** argv) { +int main([[maybe_unused]] int argc, char** argv) { // Get the yaml configuration file auto config_path = std::filesystem::canonical(argv[0]).parent_path(); config_path /= std::filesystem::path("video_replayer_distributed.yaml"); diff --git a/examples/video_replayer_distributed/python/CMakeLists.txt b/examples/video_replayer_distributed/python/CMakeLists.txt index 04ad5953..7641d886 100644 --- a/examples/video_replayer_distributed/python/CMakeLists.txt +++ b/examples/video_replayer_distributed/python/CMakeLists.txt @@ -53,7 +53,7 @@ if(HOLOSCAN_BUILD_TESTS) set(RECORDING_DIR ${CMAKE_CURRENT_BINARY_DIR}/recording_output) set(SOURCE_VIDEO_BASENAME python_video_replayer_distributed_output) - set(VALIDATION_FRAMES_DIR ${CMAKE_SOURCE_DIR}/tests/data/validation_frames/video_replayer/) + set(VALIDATION_FRAMES_DIR ${Holoscan-examples_SOURCE_DIR}/../tests/data/validation_frames/video_replayer/) file(MAKE_DIRECTORY ${RECORDING_DIR}) @@ -67,7 +67,7 @@ if(HOLOSCAN_BUILD_TESTS) add_custom_command(OUTPUT video_replayer_distributed_test.py PRE_LINK COMMAND patch -u -o video_replayer_distributed_test.py ${CMAKE_CURRENT_SOURCE_DIR}/video_replayer_distributed.py - ${CMAKE_SOURCE_DIR}/tests/data/validation_frames/video_replayer/python_video_replayer_distributed.patch + ${Holoscan-examples_SOURCE_DIR}/../tests/data/validation_frames/video_replayer/python_video_replayer_distributed.patch ) add_custom_target(python_video_replayer_distributed_test ALL @@ -87,7 +87,7 @@ if(HOLOSCAN_BUILD_TESTS) # Add a test to check the validity of the frames add_test(NAME EXAMPLE_PYTHON_VIDEO_REPLAYER_DISTRIBUTED_RENDER_TEST - COMMAND python3 ${CMAKE_SOURCE_DIR}/scripts/video_validation.py + COMMAND python3 ${Holoscan-examples_SOURCE_DIR}/../scripts/video_validation.py --source_video_dir ${RECORDING_DIR} --source_video_basename ${SOURCE_VIDEO_BASENAME} --output_dir ${RECORDING_DIR} diff --git a/gxf_extensions/CMakeLists.txt b/gxf_extensions/CMakeLists.txt index f802807b..1700dd8a 100644 --- a/gxf_extensions/CMakeLists.txt +++ b/gxf_extensions/CMakeLists.txt @@ -14,7 +14,7 @@ # limitations under the License. 
# For install_gxf_extension -# TODO: move that function to its own CMake module/file +# TODO(unknown): move that function to its own CMake module/file include(GenerateGXEApp) list(APPEND CMAKE_INSTALL_RPATH_LIST diff --git a/gxf_extensions/gxf_holoscan_wrapper/CMakeLists.txt b/gxf_extensions/gxf_holoscan_wrapper/CMakeLists.txt index 7ddbfdfc..1022c014 100644 --- a/gxf_extensions/gxf_holoscan_wrapper/CMakeLists.txt +++ b/gxf_extensions/gxf_holoscan_wrapper/CMakeLists.txt @@ -42,7 +42,7 @@ target_include_directories(gxf_holoscan_wrapper_lib PUBLIC # Include headers from the parent directory (./gxf_extensions) # so that the headers can be included as - # TODO: need to find better way to do this + # TODO(unknown): need to find better way to do this $ $ ) diff --git a/gxf_extensions/gxf_holoscan_wrapper/operator_wrapper.cpp b/gxf_extensions/gxf_holoscan_wrapper/operator_wrapper.cpp index 4955b129..87eafd32 100644 --- a/gxf_extensions/gxf_holoscan_wrapper/operator_wrapper.cpp +++ b/gxf_extensions/gxf_holoscan_wrapper/operator_wrapper.cpp @@ -22,9 +22,9 @@ #include #include "gxf/cuda/cuda_stream_pool.hpp" +#include "gxf/std/block_memory_pool.hpp" #include "gxf/std/scheduling_term.hpp" #include "gxf/std/unbounded_allocator.hpp" -// #include "gxf/std/block_memory_pool.hpp" // TODO: uncomment when available in GXF SDK package #include "holoscan/core/common.hpp" #include "holoscan/core/conditions/gxf/asynchronous.hpp" diff --git a/gxf_extensions/ucx/ucx_holoscan_component_serializer.cpp b/gxf_extensions/ucx/ucx_holoscan_component_serializer.cpp index 1860bef3..33795f51 100644 --- a/gxf_extensions/ucx/ucx_holoscan_component_serializer.cpp +++ b/gxf_extensions/ucx/ucx_holoscan_component_serializer.cpp @@ -23,6 +23,8 @@ #include #include +#include + #include "holoscan/utils/timer.hpp" namespace nvidia { @@ -55,6 +57,9 @@ Expected UcxHoloscanComponentSerializer::configureSerializers() { return serializeMetadataDictionary(*static_cast(component), endpoint); }); + result &= setSerializer([this](void* component, Endpoint* endpoint) { + return serializeMessageLabel(*static_cast(component), endpoint); + }); return result; } @@ -69,6 +74,11 @@ Expected UcxHoloscanComponentSerializer::configureDeserializers() { return deserializeMetadataDictionary(endpoint).assign_to( *static_cast(component)); }); + + result &= setDeserializer([this](void* component, Endpoint* endpoint) { + return deserializeMessageLabel(endpoint).assign_to( + *static_cast(component)); + }); return result; } @@ -185,5 +195,110 @@ UcxHoloscanComponentSerializer::deserializeMetadataDictionary(Endpoint* endpoint return metadata; } +Expected UcxHoloscanComponentSerializer::serializeMessageLabel( + const holoscan::MessageLabel& messagelabel, Endpoint* endpoint) { + GXF_LOG_DEBUG("UcxHoloscanComponentSerializer::serializeMessageLabel"); + size_t total_size = 0; + + // Get the total number of paths in message label and write it first + int total_paths = messagelabel.num_paths(); + auto maybe_size = endpoint->writeTrivialType(&total_paths); + if (!maybe_size) { return ForwardError(maybe_size); } + total_size += maybe_size.value(); + + // for every path in message label, write the number of operators in the path first. 
+ // then, write all operator timestamps in a path + for (const auto& path : messagelabel.paths()) { + uint32_t num_operators = path.size(); + maybe_size = endpoint->writeTrivialType(&num_operators); + if (!maybe_size) { return ForwardError(maybe_size); } + total_size += maybe_size.value(); + for (const auto& optimestamp : path) { + maybe_size = serializeOperatorTimestampLabel(optimestamp, endpoint); + if (!maybe_size) { return ForwardError(maybe_size); } + total_size += maybe_size.value(); + } + } + // check the total_size <= 8KB - This is an UCX transfer limitation for non-tensor data + auto ucx_buf = dynamic_cast(endpoint); + if (!ucx_buf) { + GXF_LOG_ERROR("Dynamic cast of Endpoint* to nvidia::gxf::UcxSerializationBuffer* failed"); + return Unexpected{GXF_FAILURE}; + } + size_t buffer_capacity = ucx_buf->capacity(); + if (total_size > buffer_capacity) { + GXF_LOG_ERROR( + "MessageLabel size of %zu bytes exceeds the current UCX serialization buffer capacity of " + "%zu bytes. You can try to increase the buffer capacity by setting the " + "HOLOSCAN_UCX_SERIALIZATION_BUFFER_SIZE environment variable, but the limit it is " + "possible to set will depend on the maximum header size supported by the underlying " + "ucp_send_am_nbx function of UCX.", + total_size, + buffer_capacity); + return Unexpected{GXF_FAILURE}; + } + return total_size; +} + +Expected UcxHoloscanComponentSerializer::deserializeMessageLabel( + Endpoint* endpoint) { + GXF_LOG_DEBUG("UcxHoloscanComponentSerializer::deserializeHoloscanMessageLabel"); + holoscan::MessageLabel messagelabel; + + int total_paths; + auto size = endpoint->readTrivialType(&total_paths); + if (!size) { return ForwardError(size); } + for (int i = 0; i < total_paths; i++) { + uint32_t num_operators; + auto size = endpoint->readTrivialType(&num_operators); + if (!size) { return ForwardError(size); } + holoscan::MessageLabel::TimestampedPath path; + for (int j = 0; j < num_operators; j++) { + auto maybe_optimestamp = deserializeOperatorTimestampLabel(endpoint); + if (!maybe_optimestamp) { return ForwardError(maybe_optimestamp); } + path.push_back(maybe_optimestamp.value()); + } + messagelabel.add_new_path(path); + } + return messagelabel; +} + +Expected UcxHoloscanComponentSerializer::serializeOperatorTimestampLabel( + const holoscan::OperatorTimestampLabel& operatortimestamplabel, Endpoint* endpoint) { + GXF_LOG_DEBUG("UcxHoloscanComponentSerializer::serializeOperatorTimestampLabel"); + auto maybe_size = serialize_string(operatortimestamplabel.operator_name, endpoint); + if (!maybe_size) { return ForwardError(maybe_size); } + size_t total_size = maybe_size.value(); + maybe_size = endpoint->writeTrivialType(&operatortimestamplabel.rec_timestamp); + if (!maybe_size) { return ForwardError(maybe_size); } + total_size += maybe_size.value(); + maybe_size = endpoint->writeTrivialType(&operatortimestamplabel.pub_timestamp); + if (!maybe_size) { return ForwardError(maybe_size); } + total_size += maybe_size.value(); + return total_size; +} + +Expected +UcxHoloscanComponentSerializer::deserializeOperatorTimestampLabel(Endpoint* endpoint) { + GXF_LOG_DEBUG("UcxHoloscanComponentSerializer::deserializeOperatorTimestampLabel"); + holoscan::OperatorTimestampLabel operatortimestamplabel; + + auto maybe_operator_name = deserialize_string(endpoint); + if (!maybe_operator_name) { return ForwardError(maybe_operator_name); } + operatortimestamplabel.operator_name = maybe_operator_name.value(); + + int64_t maybe_rec_timestamp; + auto size = 
endpoint->readTrivialType(&maybe_rec_timestamp); + if (!size) { return ForwardError(size); } + operatortimestamplabel.rec_timestamp = maybe_rec_timestamp; + + int64_t maybe_pub_timestamp; + size = endpoint->readTrivialType(&maybe_pub_timestamp); + if (!size) { return ForwardError(size); } + operatortimestamplabel.pub_timestamp = maybe_pub_timestamp; + + return operatortimestamplabel; +} + } // namespace gxf } // namespace nvidia diff --git a/gxf_extensions/ucx/ucx_holoscan_component_serializer.hpp b/gxf_extensions/ucx/ucx_holoscan_component_serializer.hpp index 3125a732..0dfd6bfc 100644 --- a/gxf_extensions/ucx/ucx_holoscan_component_serializer.hpp +++ b/gxf_extensions/ucx/ucx_holoscan_component_serializer.hpp @@ -26,6 +26,7 @@ #include "gxf/std/tensor.hpp" #include "holoscan/core/codec_registry.hpp" #include "holoscan/core/message.hpp" +#include "holoscan/core/messagelabel.hpp" #include "holoscan/core/metadata.hpp" namespace nvidia { @@ -54,6 +55,16 @@ class UcxHoloscanComponentSerializer : public ComponentSerializer { Endpoint* endpoint); // Deserializes a holoscan::MetadataDictionary Expected deserializeMetadataDictionary(Endpoint* endpoint); + // Serializes a holoscan::MessageLabel + Expected serializeMessageLabel(const holoscan::MessageLabel& messagelabel, + Endpoint* endpoint); + // Deserializes a holoscan::MessageLabel + Expected deserializeMessageLabel(Endpoint* endpoint); + // Serializes a holoscan::OperatorTimestampLabel + Expected serializeOperatorTimestampLabel( + const holoscan::OperatorTimestampLabel& operatortimestamplabel, Endpoint* endpoint); + // Deserializes a holoscan::OperatorTimestampLabel + Expected deserializeOperatorTimestampLabel(Endpoint* endpoint); Parameter> allocator_; }; diff --git a/include/holoscan/core/application.hpp b/include/holoscan/core/application.hpp index b2f6f87b..083a8a17 100644 --- a/include/holoscan/core/application.hpp +++ b/include/holoscan/core/application.hpp @@ -18,15 +18,17 @@ #ifndef HOLOSCAN_CORE_APPLICATION_HPP #define HOLOSCAN_CORE_APPLICATION_HPP -#include // for std::cout -#include // for std::shared_ptr -#include // for std::set -#include // for std::string -#include // for std::enable_if_t, std::is_constructible -#include // for std::pair -#include // for std::vector +#include // for std::cout +#include // for std::shared_ptr +#include // for std::set +#include // for std::string +#include // for std::enable_if_t, std::is_constructible +#include // for std::unordered_map +#include // for std::pair +#include // for std::vector #include "./fragment.hpp" +#include "dataflow_tracker.hpp" #include "./app_worker.hpp" #include "./cli_parser.hpp" @@ -286,6 +288,22 @@ class Application : public Fragment { std::future run_async() override; + /** + * @brief Returns a map of fragment names to DataFlowTracker* corresponding to respective + * fragments. The trackers will store cumulatively progressive timestamps, meaning a fragment + * tracker will store the timestamps of operators in the previous fragments as well. + * + * @param num_start_messages_to_skip The number of start messages to skip. + * @param num_last_messages_to_discard The number of last messages to discard. + * @param latency_threshold The latency threshold. + * @return std::unordered_map Fragment name to DataFlowTracker* + * mapping. 
+ */ + std::unordered_map track_distributed( + uint64_t num_start_messages_to_skip = kDefaultNumStartMessagesToSkip, + uint64_t num_last_messages_to_discard = kDefaultNumLastMessagesToDiscard, + int latency_threshold = kDefaultLatencyThreshold); + protected: friend class AppDriver; friend class AppWorker; diff --git a/include/holoscan/core/arg.hpp b/include/holoscan/core/arg.hpp index ae3698eb..a04feeb5 100644 --- a/include/holoscan/core/arg.hpp +++ b/include/holoscan/core/arg.hpp @@ -469,7 +469,7 @@ class ArgList { * Example: * * ```cpp - * bool is_rdma = from_config("aja.rdma").as(); + * auto is_rdma = from_config("aja.rdma").as(); * ``` * * @tparam typeT The type to cast the argument to. diff --git a/include/holoscan/core/argument_setter.hpp b/include/holoscan/core/argument_setter.hpp index 6be8cb10..d9331a55 100644 --- a/include/holoscan/core/argument_setter.hpp +++ b/include/holoscan/core/argument_setter.hpp @@ -59,10 +59,8 @@ class ArgumentSetter { /** * @brief Default @ref SetterFunc for Arg. */ - inline static SetterFunc none_argument_setter = [](ParameterWrapper& param_wrap, Arg& arg) { - (void)param_wrap; - (void)arg; - + inline static SetterFunc none_argument_setter = []([[maybe_unused]] ParameterWrapper& param_wrap, + Arg& arg) { HOLOSCAN_LOG_ERROR("Unable to handle parameter: {}", arg.name()); }; @@ -266,7 +264,8 @@ class ArgumentSetter { typename holoscan::type_info::derived_type>(arg_value); // Initialize the condition in case the condition created by // Fragment::make_condition() is added to the operator as an argument. - // TODO: would like this to be assigned to the same entity as the operator + // TODO(unknown): would like this to be assigned to the same entity as the + // operator if (converted_value) { converted_value->initialize(); } param = converted_value; @@ -282,7 +281,8 @@ class ArgumentSetter { typename holoscan::type_info::derived_type>(arg_value); // Initialize the resource in case the resource created by // Fragment::make_resource() is added to the operator as an argument. - // TODO: would like this to be assigned to the same entity as the operator + // TODO(unknown): would like this to be assigned to the same entity as the + // operator if (converted_value) { converted_value->initialize(); } param = converted_value; @@ -391,7 +391,8 @@ class ArgumentSetter { // Initialize the condition in case the condition created by // Fragment::make_condition() is added to the operator as an argument. - // TODO: would like this to be assigned to the same entity as the operator + // TODO(unknown): would like this to be assigned to the same entity as the + // operator if (condition) { condition->initialize(); } converted_value.push_back(condition); @@ -414,7 +415,8 @@ class ArgumentSetter { // Initialize the resource in case the resource created by // Fragment::make_resource() is added to the operator as an argument. 
- // TODO: would like this to be assigned to the same entity as the operator + // TODO(unknown): would like this to be assigned to the same entity as the + // operator if (resource) { resource->initialize(); } converted_value.push_back(resource); diff --git a/include/holoscan/core/codecs.hpp b/include/holoscan/core/codecs.hpp index 5bccecd2..f8f1793c 100644 --- a/include/holoscan/core/codecs.hpp +++ b/include/holoscan/core/codecs.hpp @@ -70,7 +70,7 @@ static inline expected deserialize_trivial_type(Endpoint* e return encoded; } -// TODO: currently not handling integer types separately +// TODO(unknown): currently not handling integer types separately template struct codec { static expected serialize(const typeT& value, Endpoint* endpoint) { diff --git a/include/holoscan/core/component.hpp b/include/holoscan/core/component.hpp index 2ff26121..75879d93 100644 --- a/include/holoscan/core/component.hpp +++ b/include/holoscan/core/component.hpp @@ -41,12 +41,12 @@ std::is_same_v<::holoscan::ArgList, std::decay_t>)>> #define HOLOSCAN_COMPONENT_FORWARD_ARGS(class_name) \ HOLOSCAN_COMPONENT_FORWARD_TEMPLATE() \ - class_name(ArgT&& arg, ArgsT&&... args) \ + explicit class_name(ArgT&& arg, ArgsT&&... args) \ : Component(std::forward(arg), std::forward(args)...) #define HOLOSCAN_COMPONENT_FORWARD_ARGS_SUPER(class_name, super_class_name) \ HOLOSCAN_COMPONENT_FORWARD_TEMPLATE() \ - class_name(ArgT&& arg, ArgsT&&... args) \ + explicit class_name(ArgT&& arg, ArgsT&&... args) \ : super_class_name(std::forward(arg), std::forward(args)...) namespace holoscan { diff --git a/include/holoscan/core/condition.hpp b/include/holoscan/core/condition.hpp index accc6ba2..fea7eb49 100644 --- a/include/holoscan/core/condition.hpp +++ b/include/holoscan/core/condition.hpp @@ -55,7 +55,7 @@ */ #define HOLOSCAN_CONDITION_FORWARD_ARGS(class_name) \ HOLOSCAN_CONDITION_FORWARD_TEMPLATE() \ - class_name(ArgT&& arg, ArgsT&&... args) \ + explicit class_name(ArgT&& arg, ArgsT&&... args) \ : Condition(std::forward(arg), std::forward(args)...) {} /** @@ -91,7 +91,7 @@ */ #define HOLOSCAN_CONDITION_FORWARD_ARGS_SUPER(class_name, super_class_name) \ HOLOSCAN_CONDITION_FORWARD_TEMPLATE() \ - class_name(ArgT&& arg, ArgsT&&... args) \ + explicit class_name(ArgT&& arg, ArgsT&&... args) \ : super_class_name(std::forward(arg), std::forward(args)...) {} namespace holoscan { @@ -224,7 +224,7 @@ class Condition : public Component { * * @param spec The reference to the component specification. */ - virtual void setup(ComponentSpec& spec) { (void)spec; } + virtual void setup([[maybe_unused]] ComponentSpec& spec) {} /** * @brief Get a YAML representation of the condition. diff --git a/include/holoscan/core/conditions/gxf/asynchronous.hpp b/include/holoscan/core/conditions/gxf/asynchronous.hpp index a1bd17e1..1b4e5629 100644 --- a/include/holoscan/core/conditions/gxf/asynchronous.hpp +++ b/include/holoscan/core/conditions/gxf/asynchronous.hpp @@ -58,7 +58,6 @@ class AsynchronousCondition : public gxf::GXFCondition { AsynchronousCondition(const std::string& name, nvidia::gxf::AsynchronousSchedulingTerm* term); const char* gxf_typename() const override { return "nvidia::gxf::AsynchronousSchedulingTerm"; } - void setup(ComponentSpec& spec) override; /** * @brief Set the condition's asynchronous event state. 
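Regarding the new `Application::track_distributed()` API declared earlier in this change, a hedged usage sketch follows (the application class name is hypothetical): trackers are requested before the application runs, and each entry maps a fragment name to its `DataFlowTracker`, which accumulates timestamps from upstream fragments as described in the doc comment.

```cpp
#include <iostream>

#include <holoscan/holoscan.hpp>

// Hypothetical driver showing the intended call pattern for track_distributed():
// request the per-fragment trackers before run(), then inspect them afterwards.
// MyDistributedApp is a placeholder for a user-defined multi-fragment application.
int main() {
  auto app = holoscan::make_application<MyDistributedApp>();
  auto trackers = app->track_distributed();  // fragment name -> DataFlowTracker*
  app->run();
  for (const auto& [fragment_name, tracker] : trackers) {
    std::cout << "Fragment: " << fragment_name << std::endl;
    tracker->print();  // cumulative latency results, including upstream fragments
  }
  return 0;
}
```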
diff --git a/include/holoscan/core/conditions/gxf/cuda_buffer_available.hpp b/include/holoscan/core/conditions/gxf/cuda_buffer_available.hpp new file mode 100644 index 00000000..c7448de0 --- /dev/null +++ b/include/holoscan/core/conditions/gxf/cuda_buffer_available.hpp @@ -0,0 +1,70 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef HOLOSCAN_CORE_CONDITIONS_GXF_CUDA_BUFFER_AVAILABLE_HPP +#define HOLOSCAN_CORE_CONDITIONS_GXF_CUDA_BUFFER_AVAILABLE_HPP + +#include +#include + +#include + +#include "../../component_spec.hpp" +#include "../../gxf/gxf_condition.hpp" +#include "../../gxf/gxf_resource.hpp" + +namespace holoscan { + +/*TODO(Greg): Can only use CudaBufferAvailableCondition when the message Entity contains a + * CudaBuffer component. Would need to update HoloscanSDK to support a way to use this type of + * object via Holoscan APIs in a way that is convenient to combine with existing TensorMap + * functionality + * + * See, for example gxf/cuda/tests/test_cuda_helper.hpp's CudaAsyncBufferGenerator + */ + +/** + * @brief Condition based on data availability in a cuda buffer. + * + * A component which specifies the availability of data at the receiver based on the cuda buffers + * present in incoming messages. + */ +class CudaBufferAvailableCondition : public gxf::GXFCondition { + public: + HOLOSCAN_CONDITION_FORWARD_ARGS_SUPER(CudaBufferAvailableCondition, GXFCondition) + + CudaBufferAvailableCondition() = default; + CudaBufferAvailableCondition(const std::string& name, + nvidia::gxf::CudaBufferAvailableSchedulingTerm* term); + + const char* gxf_typename() const override { + return "nvidia::gxf::CudaBufferAvailableSchedulingTerm"; + } + void setup(ComponentSpec& spec) override; + + void receiver(std::shared_ptr receiver) { receiver_ = receiver; } + std::shared_ptr receiver() { return receiver_.get(); } + + nvidia::gxf::CudaBufferAvailableSchedulingTerm* get() const; + + private: + Parameter> receiver_; +}; + +} // namespace holoscan + +#endif /* HOLOSCAN_CORE_CONDITIONS_GXF_CUDA_BUFFER_AVAILABLE_HPP */ diff --git a/include/holoscan/core/conditions/gxf/cuda_event.hpp b/include/holoscan/core/conditions/gxf/cuda_event.hpp new file mode 100644 index 00000000..b3caed6a --- /dev/null +++ b/include/holoscan/core/conditions/gxf/cuda_event.hpp @@ -0,0 +1,76 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef HOLOSCAN_CORE_CONDITIONS_GXF_CUDA_EVENT_HPP +#define HOLOSCAN_CORE_CONDITIONS_GXF_CUDA_EVENT_HPP + +#include +#include + +#include + +#include "../../component_spec.hpp" +#include "../../gxf/gxf_condition.hpp" +#include "../../gxf/gxf_resource.hpp" + +namespace holoscan { + +/** + * TODO(Greg): This condition requires there be a CudaEvent component in the message Entity. + * + * e.g. it calls + * auto maybe_event = message->get(event_name_.get().c_str()); + * + * See StreamBasedOps codelet in gxf/cuda/tests/test_cuda_helper.hpp + * specifically the methods `addNewEvent` and `initOpsEvent` and how they are used in the + * operators that inherit from StreamBasedOps + * + * We have not yet exposed CudaEvent object from Holoscan. Need to provide a convenient way to use + * it. + */ + +/** + * @brief Condition class to indicate data availability on CUDA stream completion via an event. + * + * A condition which specifies the availability of data at the receiver on completion of the + * work on the provided cuda stream with the help of cuda event. + * This condition will keep polling on the event provided to check for data availability for + * consumption. + */ +class CudaEventCondition : public gxf::GXFCondition { + public: + HOLOSCAN_CONDITION_FORWARD_ARGS_SUPER(CudaEventCondition, GXFCondition) + + CudaEventCondition() = default; + CudaEventCondition(const std::string& name, nvidia::gxf::CudaEventSchedulingTerm* term); + + const char* gxf_typename() const override { return "nvidia::gxf::CudaEventSchedulingTerm"; } + void setup(ComponentSpec& spec) override; + + void receiver(std::shared_ptr receiver) { receiver_ = receiver; } + std::shared_ptr receiver() { return receiver_.get(); } + + nvidia::gxf::CudaEventSchedulingTerm* get() const; + + private: + Parameter> receiver_; + Parameter event_name_; +}; + +} // namespace holoscan + +#endif /* HOLOSCAN_CORE_CONDITIONS_GXF_CUDA_EVENT_HPP */ diff --git a/include/holoscan/core/conditions/gxf/cuda_stream.hpp b/include/holoscan/core/conditions/gxf/cuda_stream.hpp new file mode 100644 index 00000000..1f78208f --- /dev/null +++ b/include/holoscan/core/conditions/gxf/cuda_stream.hpp @@ -0,0 +1,68 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef HOLOSCAN_CORE_CONDITIONS_GXF_CUDA_STREAM_HPP +#define HOLOSCAN_CORE_CONDITIONS_GXF_CUDA_STREAM_HPP + +#include +#include + +#include + +#include "../../component_spec.hpp" +#include "../../gxf/gxf_condition.hpp" +#include "../../gxf/gxf_resource.hpp" + +namespace holoscan { + +/** + * TODO(Greg): This condition requires there be a CudaStreamId component in the message Entity. + * + * e.g., it calls + * auto stream_id = message->get(); + * + * Need to check if this works as-is with the existing Holoscan CudaStreamHandler utility + */ + +/** + * @brief Condition class to indicate data availability on CUDA stream completion. + * + * This condition will register a call back function which will be called once the work on the + * specified CUDA stream completes indicating that the data is available for consumption + */ +class CudaStreamCondition : public gxf::GXFCondition { + public: + HOLOSCAN_CONDITION_FORWARD_ARGS_SUPER(CudaStreamCondition, GXFCondition) + + CudaStreamCondition() = default; + CudaStreamCondition(const std::string& name, nvidia::gxf::CudaStreamSchedulingTerm* term); + + const char* gxf_typename() const override { return "nvidia::gxf::CudaStreamSchedulingTerm"; } + void setup(ComponentSpec& spec) override; + + void receiver(std::shared_ptr receiver) { receiver_ = receiver; } + std::shared_ptr receiver() { return receiver_.get(); } + + nvidia::gxf::CudaStreamSchedulingTerm* get() const; + + private: + Parameter> receiver_; +}; + +} // namespace holoscan + +#endif /* HOLOSCAN_CORE_CONDITIONS_GXF_CUDA_STREAM_HPP */ diff --git a/include/holoscan/core/dataflow_tracker.hpp b/include/holoscan/core/dataflow_tracker.hpp index 75f19ce2..cb6a530b 100644 --- a/include/holoscan/core/dataflow_tracker.hpp +++ b/include/holoscan/core/dataflow_tracker.hpp @@ -31,6 +31,7 @@ #include #include "./forward_def.hpp" +#include "holoscan/core/flow_tracking_annotation.hpp" namespace holoscan { @@ -200,6 +201,9 @@ class DataFlowTracker { // because the cyclic paths are updated from there, instead of DFFTCollector friend class AnnotatedDoubleBufferReceiver; + friend gxf_result_t deannotate_message(gxf_uid_t* uid, const gxf_context_t& context, Operator* op, + const char* name); + /** * @brief Update the tracker with the current latency for a given path. * diff --git a/include/holoscan/core/domain/tensor.hpp b/include/holoscan/core/domain/tensor.hpp index ca2afc60..e36113d6 100644 --- a/include/holoscan/core/domain/tensor.hpp +++ b/include/holoscan/core/domain/tensor.hpp @@ -30,7 +30,7 @@ namespace holoscan { -// TODO: keep old class name as an alias? +// TODO(unknown): keep old class name as an alias? 
// also differs in that DLManagedTensorContext has additional members dl_shape and dl_strides // using DLManagedTensorCtx = nvidia::gxf::DLManagedTensorContext; using DLManagedTensorContext = nvidia::gxf::DLManagedTensorContext; diff --git a/include/holoscan/core/executors/gxf/gxf_parameter_adaptor.hpp b/include/holoscan/core/executors/gxf/gxf_parameter_adaptor.hpp index b1d0ff94..031637e0 100644 --- a/include/holoscan/core/executors/gxf/gxf_parameter_adaptor.hpp +++ b/include/holoscan/core/executors/gxf/gxf_parameter_adaptor.hpp @@ -43,18 +43,13 @@ class GXFParameterAdaptor { std::function; - inline static AdaptFunc none_param_handler = [](gxf_context_t context, gxf_uid_t uid, - const char* key, const ArgType& arg_type, - const std::any& any_value) { - (void)context; - (void)uid; - (void)key; - (void)arg_type; - (void)any_value; - HOLOSCAN_LOG_ERROR("Unable to handle parameter: {}", key); - - return GXF_FAILURE; - }; + inline static AdaptFunc none_param_handler = + []([[maybe_unused]] gxf_context_t context, [[maybe_unused]] gxf_uid_t uid, const char* key, + [[maybe_unused]] const ArgType& arg_type, [[maybe_unused]] const std::any& any_value) { + HOLOSCAN_LOG_ERROR("Unable to handle parameter: {}", key); + + return GXF_FAILURE; + }; /** * @brief Get the instance of GXFParameterAdaptor. @@ -126,13 +121,11 @@ class GXFParameterAdaptor { template void add_param_handler() { - const AdaptFunc& func = [](gxf_context_t context, - gxf_uid_t uid, + const AdaptFunc& func = []([[maybe_unused]] gxf_context_t context, + [[maybe_unused]] gxf_uid_t uid, const char* key, const ArgType& arg_type, const std::any& any_value) { - (void)context; // avoid `-Werror=unused-but-set-parameter` due to `constexpr` - (void)uid; // avoid `-Werror=unused-but-set-parameter` due to `constexpr` try { auto& param = *std::any_cast*>(any_value); @@ -159,13 +152,11 @@ class GXFParameterAdaptor { return GXF_FAILURE; }; - const AdaptFunc& arg_func = [](gxf_context_t context, - gxf_uid_t uid, + const AdaptFunc& arg_func = []([[maybe_unused]] gxf_context_t context, + [[maybe_unused]] gxf_uid_t uid, const char* key, const ArgType& arg_type, const std::any& any_value) { - (void)context; // avoid `-Werror=unused-but-set-parameter` due to `constexpr` - (void)uid; // avoid `-Werror=unused-but-set-parameter` due to `constexpr` try { typeT value = std::any_cast(any_value); gxf_result_t result = set_gxf_parameter_value(context, uid, key, arg_type, value); diff --git a/include/holoscan/core/flow_tracking_annotation.hpp b/include/holoscan/core/flow_tracking_annotation.hpp new file mode 100644 index 00000000..2e9fc7bd --- /dev/null +++ b/include/holoscan/core/flow_tracking_annotation.hpp @@ -0,0 +1,52 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef HOLOSCAN_CORE_FLOW_TRACKING_ANNOTATION_HPP +#define HOLOSCAN_CORE_FLOW_TRACKING_ANNOTATION_HPP + +#include "holoscan/core/gxf/gxf_utils.hpp" +#include "holoscan/core/operator.hpp" + +namespace holoscan { + +/** + * @brief This function annotates a message with a MessageLabel timestamp. + * + * @param uid The entity ID of the message. + * @param context The GXF context. + * @param op The operator that is transmitting the message. + * @param transmitter_name The name of the transmitter from which the message is being published. + * @return gxf_result_t The result of the annotation. + */ +gxf_result_t annotate_message(gxf_uid_t uid, const gxf_context_t& context, Operator* op, + const char* transmitter_name); + +/** + * @brief This function de-annotates a message and extracts the MessageLabel timestamp. It then + * updates necessary data flow tracking information in DataFlowTracker object. + * + * @param uid The entity ID of the message. + * @param context The GXF context. + * @param op The operator that is receiving the message. + * @param receiver_name The name of the receiver which is receiving the message. + */ +gxf_result_t deannotate_message(gxf_uid_t* uid, const gxf_context_t& context, Operator* op, + const char* receiver_name); + +} // namespace holoscan + +#endif /* HOLOSCAN_CORE_FLOW_TRACKING_ANNOTATION_HPP */ diff --git a/include/holoscan/core/forward_def.hpp b/include/holoscan/core/forward_def.hpp index c7254046..91d35d6a 100644 --- a/include/holoscan/core/forward_def.hpp +++ b/include/holoscan/core/forward_def.hpp @@ -113,7 +113,11 @@ class GXFOperator; class AsynchronousCondition; class BooleanCondition; class CountCondition; +class CudaBufferAvailableCondition; +class CudaEventCondition; +class CudaStreamCondition; class DownstreamMessageAffordableCondition; +class ExpiringMessageAvailableCondition; class MessageAvailableCondition; class PeriodicCondition; @@ -122,16 +126,21 @@ class Allocator; class AnnotatedDoubleBufferReceiver; class AnnotatedDoubleBufferTransmitter; class Clock; +class CudaAllocator; class BlockMemoryPool; class CudaStreamPool; class DoubleBufferReceiver; class DoubleBufferTransmitter; +class HoloscanUcxReceiver; +class HoloscanUcxTransmitter; class ManualClock; class Receiver; class RealtimeClock; +class RMMAllocator; class SerializationBuffer; class StdComponentSerializer; class StdEntitySerializer; +class StreamOrderedAllocator; class Transmitter; class UcxComponentSerializer; class UcxEntitySerializer; diff --git a/include/holoscan/core/fragment.hpp b/include/holoscan/core/fragment.hpp index 89dfddff..f0a8e1d8 100644 --- a/include/holoscan/core/fragment.hpp +++ b/include/holoscan/core/fragment.hpp @@ -167,7 +167,7 @@ class Fragment { * @param prefix The prefix string that is prepended to the key of the configuration. (not * implemented yet) */ - void config(const std::string& config_file, const std::string& prefix = ""); + void config(const std::string& config_file, [[maybe_unused]] const std::string& prefix = ""); /** * @brief Set the configuration of the fragment. @@ -282,7 +282,7 @@ class Fragment { * converted to `bool` through `ArgList::as()` method: * * ```cpp - * bool is_rdma = from_config("aja.rdma").as(); + * auto is_rdma = from_config("aja.rdma").as(); * ``` * * @param key The key of the configuration. 
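The `from_config(...).as()` snippets in the documentation comments above require the target type as a template argument. A short sketch of the intended calls, with `bool` as the value type as in the surrounding docs and the video replayer example (the application class name is hypothetical):

```cpp
#include <holoscan/holoscan.hpp>

// Hypothetical application illustrating the documented usage:
// ArgList::as<T>() needs the value type spelled out explicitly.
class ConfigDemoApp : public holoscan::Application {
 public:
  void compose() override {
    auto is_rdma = from_config("aja.rdma").as<bool>();
    auto dual_window = from_config("dual_window").as<bool>();
    HOLOSCAN_LOG_INFO("rdma={}, dual_window={}", is_rdma, dual_window);
  }
};
```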
diff --git a/include/holoscan/core/fragment_scheduler.hpp b/include/holoscan/core/fragment_scheduler.hpp index e2079efc..cf228b11 100644 --- a/include/holoscan/core/fragment_scheduler.hpp +++ b/include/holoscan/core/fragment_scheduler.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -31,10 +31,10 @@ namespace holoscan { struct SystemResourceRequirement { std::string fragment_name; - float cpu = -1.0f; - float cpu_limit = -1.0f; - float gpu = -1.0f; - float gpu_limit = -1.0f; + float cpu = -1.0F; + float cpu_limit = -1.0F; + float gpu = -1.0F; + float gpu_limit = -1.0F; uint64_t memory = 0; uint64_t memory_limit = 0; uint64_t shared_memory = 0; diff --git a/include/holoscan/core/graphs/flow_graph.hpp b/include/holoscan/core/graphs/flow_graph.hpp index 0861220b..dcaed077 100644 --- a/include/holoscan/core/graphs/flow_graph.hpp +++ b/include/holoscan/core/graphs/flow_graph.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -60,7 +60,7 @@ class FlowGraph : public Graph { bool is_root(const NodeType& node) override; - bool is_user_defined_root(const NodeType& node) { + bool is_user_defined_root(const NodeType& node) override { return get_nodes().empty() ? false : get_nodes()[0] == node; } diff --git a/include/holoscan/core/gxf/entity.hpp b/include/holoscan/core/gxf/entity.hpp index 57399c85..bbae8072 100644 --- a/include/holoscan/core/gxf/entity.hpp +++ b/include/holoscan/core/gxf/entity.hpp @@ -130,7 +130,7 @@ class Entity : public nvidia::gxf::Entity { // Modified version of the Tensor version of gxf::Entity::get // Retrieves a VideoBuffer instead -// TODO: Support gxf::VideoBuffer natively in Holoscan +// TODO(unknown): Support gxf::VideoBuffer natively in Holoscan nvidia::gxf::Handle get_videobuffer(Entity entity, const char* name = nullptr); diff --git a/include/holoscan/core/gxf/gxf_component.hpp b/include/holoscan/core/gxf/gxf_component.hpp index 18f964ed..f0a296be 100644 --- a/include/holoscan/core/gxf/gxf_component.hpp +++ b/include/holoscan/core/gxf/gxf_component.hpp @@ -78,7 +78,7 @@ class GXFComponent { gxf_uid_t gxf_cid_ = 0; std::shared_ptr gxf_graph_entity_; std::string gxf_cname_; - // TODO: remove gxf_cptr_ and use the Component Handle everywhere instead? + // TODO(unknown): remove gxf_cptr_ and use the Component Handle everywhere instead? 
nvidia::gxf::Handle gxf_component_; void* gxf_cptr_ = nullptr; }; diff --git a/include/holoscan/core/io_context.hpp b/include/holoscan/core/io_context.hpp index a80381fc..42c81d95 100644 --- a/include/holoscan/core/io_context.hpp +++ b/include/holoscan/core/io_context.hpp @@ -183,7 +183,8 @@ class InputContext { * spec.input>("in"); * } * - * void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override { + * void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + * [[maybe_unused]] ExecutionContext& context) override { * auto value = op_input.receive>("in"); * if (value.has_value()) { * HOLOSCAN_LOG_INFO("Message received (value: {})", value->data()); @@ -323,29 +324,17 @@ class InputContext { inline bool populate_tensor_map(const holoscan::gxf::Entity& gxf_entity, holoscan::TensorMap& tensor_map) { - auto components_expected = gxf_entity.findAll(); - auto components = components_expected.value(); - - for (const auto& component : components) { - const auto component_name = component->name(); - - // Skip non-tensor components based on specific names - std::string_view component_name_view(component_name); - if (component_name_view == "metadata_" || component_name_view == "message_label" || - component_name_view == "cuda_stream_id_") { - continue; - } - - // Attempt to get the Tensor component - std::shared_ptr holoscan_tensor = - gxf_entity.get(component_name); - - if (holoscan_tensor) { - tensor_map.insert({component_name, holoscan_tensor}); - } else { - HOLOSCAN_LOG_DEBUG("Unable to get tensor component '{}'", component_name); + auto tensor_components_expected = gxf_entity.findAllHeap(); + for (const auto& gxf_tensor : tensor_components_expected.value()) { + // Do zero-copy conversion to holoscan::Tensor (as in gxf_entity.get()) + auto maybe_dl_ctx = (*gxf_tensor->get()).toDLManagedTensorContext(); + if (!maybe_dl_ctx) { + HOLOSCAN_LOG_ERROR( + "Failed to get std::shared_ptr from nvidia::gxf::Tensor"); return false; } + auto holoscan_tensor = std::make_shared(maybe_dl_ctx.value()); + tensor_map.insert({gxf_tensor->name(), holoscan_tensor}); } return true; } @@ -421,7 +410,7 @@ class InputContext { auto gxf_entity = std::any_cast(value); bool is_tensor_map_populated = populate_tensor_map(gxf_entity, tensor_map); if (!is_tensor_map_populated) { - auto error_message = fmt::format( + error_message = fmt::format( "Unable to populate the TensorMap from the received GXF Entity for input '{}:{}'", name, index); @@ -605,7 +594,8 @@ class OutputContext { * spec.output("out"); * } * - * void compute(InputContext&, OutputContext& op_output, ExecutionContext&) override { + * void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, + * [[maybe_unused]] ExecutionContext& context) override { * auto value = std::make_shared(7); * op_output.emit(value, "out"); * } @@ -649,8 +639,8 @@ class OutputContext { * spec.output("out"); * } * - * void compute(InputContext& op_input, OutputContext& op_output, ExecutionContext&) - * override + * void compute(InputContext& op_input, OutputContext& op_output, + * [[maybe_unused]] ExecutionContext& context) override * { * // The type of `in_message` is 'holoscan::gxf::Entity'. 
* auto in_message = op_input.receive("in"); @@ -713,7 +703,8 @@ class OutputContext { * spec.output("out"); * } * - * void compute(InputContext& op_input, OutputContext& op_output, ExecutionContext&) override + * void compute(InputContext& op_input, OutputContext& op_output, + * [[maybe_unused]] ExecutionContext& context) override * { * // The type of `in_message` is 'holoscan::gxf::Entity'. * auto in_message = op_input.receive("in"); diff --git a/include/holoscan/core/messagelabel.hpp b/include/holoscan/core/messagelabel.hpp index 96e9ece3..530d9c84 100644 --- a/include/holoscan/core/messagelabel.hpp +++ b/include/holoscan/core/messagelabel.hpp @@ -62,16 +62,17 @@ struct OperatorTimestampLabel { * @brief Construct a new OperatorTimestampLabel object from an Operator pointer with a receive * timestamp equal to the current time and publish timestamp equal to -1. * - * @param op The pointer to the operator for which the timestamp label is created. + * @param op_name The fully qualified name of the operator for which the timestamp label is + * created. */ - explicit OperatorTimestampLabel(Operator* op) - : operator_ptr(op), rec_timestamp(get_current_time_us()), pub_timestamp(-1) {} + explicit OperatorTimestampLabel(const std::string& op_name) + : operator_name(op_name), rec_timestamp(get_current_time_us()), pub_timestamp(-1) {} - OperatorTimestampLabel(Operator* op, int64_t rec_t, int64_t pub_t) - : operator_ptr(op), rec_timestamp(rec_t), pub_timestamp(pub_t) {} + OperatorTimestampLabel(const std::string& op_name, int64_t rec_t, int64_t pub_t) + : operator_name(op_name), rec_timestamp(rec_t), pub_timestamp(pub_t) {} OperatorTimestampLabel(const OperatorTimestampLabel& o) - : operator_ptr(o.operator_ptr), + : operator_name(o.operator_name), rec_timestamp(o.rec_timestamp), pub_timestamp(o.pub_timestamp) {} @@ -79,7 +80,8 @@ struct OperatorTimestampLabel { void set_pub_timestamp_to_current() { pub_timestamp = get_current_time_us(); } - Operator* operator_ptr = nullptr; + // Operator* operator_ptr = nullptr; + std::string operator_name = ""; // The timestamp when an Operator receives from an input // For a root Operator, it is the start of the compute call @@ -105,11 +107,26 @@ class MessageLabel { MessageLabel() { // By default, allocate DEFAULT_NUM_PATHS paths in the message_paths message_paths.reserve(DEFAULT_NUM_PATHS); + message_path_operators.reserve(DEFAULT_NUM_PATHS); } MessageLabel(const MessageLabel& m) : message_paths(m.message_paths), message_path_operators(m.message_path_operators) {} + /** + * @brief Construct a new Message Label object from a vector of TimestampedPaths. This constructor + * automatically fills the message_path_operators vector from the argument of the constructor. + * + * @param m_paths The vector of TimestampedPaths to create the MessageLabel from. + */ + explicit MessageLabel(const std::vector m_paths) : message_paths(m_paths) { + for (auto& path : m_paths) { + PathOperators new_path_operators; + for (auto& op : path) { new_path_operators.insert(op.operator_name); } + message_path_operators.push_back(new_path_operators); + } + } + MessageLabel& operator=(const MessageLabel& m) { if (this != &m) { this->message_paths = m.message_paths; @@ -123,7 +140,7 @@ class MessageLabel { * * @return The number of paths in a MessageLabel. 
*/ - int num_paths() { return message_paths.size(); } + int num_paths() const { return message_paths.size(); } /** * @brief Get all the names of the path in formatted string, which is comma-separated values of @@ -133,7 +150,7 @@ class MessageLabel { */ std::vector get_all_path_names(); - std::vector paths() { return message_paths; } + std::vector paths() const { return message_paths; } /** * @brief Get the current end-to-end latency of a path in microseconds. @@ -217,7 +234,7 @@ class MessageLabel { * @param op_name The name of the operator to check * @return List of path indexes where the operator is present */ - std::vector has_operator(std::string op_name); + std::vector has_operator(const std::string& op_name) const; /** * @brief Add a new Operator timestamp to all the paths in a message label. diff --git a/include/holoscan/core/network_context.hpp b/include/holoscan/core/network_context.hpp index 71661800..6bf3b57e 100644 --- a/include/holoscan/core/network_context.hpp +++ b/include/holoscan/core/network_context.hpp @@ -54,7 +54,7 @@ */ #define HOLOSCAN_NETWORK_CONTEXT_FORWARD_ARGS(class_name) \ HOLOSCAN_NETWORK_CONTEXT_FORWARD_TEMPLATE() \ - class_name(ArgT&& arg, ArgsT&&... args) \ + explicit class_name(ArgT&& arg, ArgsT&&... args) \ : NetworkContext(std::forward(arg), std::forward(args)...) {} /** @@ -71,12 +71,13 @@ */ #define HOLOSCAN_NETWORK_CONTEXT_FORWARD_ARGS_SUPER(class_name, super_class_name) \ HOLOSCAN_NETWORK_CONTEXT_FORWARD_TEMPLATE() \ - class_name(ArgT&& arg, ArgsT&&... args) \ + explicit class_name(ArgT&& arg, ArgsT&&... args) \ : super_class_name(std::forward(arg), std::forward(args)...) {} namespace holoscan { -// TODO: NetworkContext is identical in implementation to Scheduler, so put the functionality in +// TODO(unknown): NetworkContext is identical in implementation to Scheduler, so put the +// functionality in // a common base class. /** @@ -221,7 +222,7 @@ class NetworkContext : public Component { * * @param spec The reference to the component specification. */ - virtual void setup(ComponentSpec& spec) { (void)spec; } + virtual void setup([[maybe_unused]] ComponentSpec& spec) {} /** * @brief Initialize the network context. diff --git a/include/holoscan/core/network_contexts/gxf/ucx_context.hpp b/include/holoscan/core/network_contexts/gxf/ucx_context.hpp index 9e9c98aa..f1866cdb 100644 --- a/include/holoscan/core/network_contexts/gxf/ucx_context.hpp +++ b/include/holoscan/core/network_contexts/gxf/ucx_context.hpp @@ -52,6 +52,10 @@ class UcxContext : public gxf::GXFNetworkContext { private: Parameter> entity_serializer_; + Parameter reconnect_; ///< Try to reconnect if a connection is closed during run + Parameter cpu_data_only_; ///< Support CPU memory only for UCX communication + Parameter enable_async_; ///< Control whether UCX transmit/receive uses asynchronous mode + // TODO: support GPUDevice nvidia::gxf::Resource // nvidia::gxf::Resource> gpu_device_; }; diff --git a/include/holoscan/core/operator.hpp b/include/holoscan/core/operator.hpp index bf702623..edd63aba 100644 --- a/include/holoscan/core/operator.hpp +++ b/include/holoscan/core/operator.hpp @@ -36,6 +36,7 @@ #include "./component.hpp" #include "./condition.hpp" #include "./forward_def.hpp" +#include "./graph.hpp" #include "./messagelabel.hpp" #include "./metadata.hpp" #include "./operator_spec.hpp" @@ -83,7 +84,7 @@ */ #define HOLOSCAN_OPERATOR_FORWARD_ARGS(class_name) \ HOLOSCAN_OPERATOR_FORWARD_TEMPLATE() \ - class_name(ArgT&& arg, ArgsT&&... 
args) \ + explicit class_name(ArgT&& arg, ArgsT&&... args) \ : Operator(std::forward(arg), std::forward(args)...) {} /** @@ -117,7 +118,7 @@ */ #define HOLOSCAN_OPERATOR_FORWARD_ARGS_SUPER(class_name, super_class_name) \ HOLOSCAN_OPERATOR_FORWARD_TEMPLATE() \ - class_name(ArgT&& arg, ArgsT&&... args) \ + explicit class_name(ArgT&& arg, ArgsT&&... args) \ : super_class_name(std::forward(arg), std::forward(args)...) {} namespace holoscan { @@ -352,7 +353,7 @@ class Operator : public ComponentBase { * * @param spec The reference to the operator specification. */ - virtual void setup(OperatorSpec& spec) { (void)spec; } + virtual void setup([[maybe_unused]] OperatorSpec& spec) {} /** * @brief Returns whether the operator is a root operator based on its fragment's graph @@ -377,6 +378,32 @@ class Operator : public ComponentBase { */ bool is_leaf(); + /** + * @brief Returns whether all the successors of an operator are virtual operators + * + * @param op The shared_ptr to the operator for which the check is to be performed + * @param graph The graph of operators. fragment()->graph() can usually be used to get this graph. + * @return true if the operator has all virtual operator successors, false otherwise + */ + static bool is_all_operator_successor_virtual(OperatorNodeType op, OperatorGraph& graph); + + /** + * @brief Returns whether all the predecessors of an operator are virtual operators + * + * @param op The shared_ptr to the operator for which the check is to be performed + * @param graph The graph of operators. fragment()->graph() can usually be used to get this graph. + * @return true if the operator has all virtual operator predecessors, false otherwise + */ + static bool is_all_operator_predecessor_virtual(OperatorNodeType op, OperatorGraph& graph); + + /** + * @brief Returns the fully qualified name of the operator including the name of the fragment. + * + * @return std::string fully qualified name of the operator in the format: + * "." + */ + std::string qualified_name(); + /** * @brief Initialize the operator. * @@ -417,12 +444,9 @@ class Operator : public ComponentBase { * @param op_output The output context of the operator. * @param context The execution context of the operator. */ - virtual void compute(InputContext& op_input, OutputContext& op_output, - ExecutionContext& context) { - (void)op_input; - (void)op_output; - (void)context; - } + virtual void compute([[maybe_unused]] InputContext& op_input, + [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) {} /** * @brief Register the argument setter for the given type. @@ -616,6 +640,8 @@ class Operator : public ComponentBase { // externally by them friend class AnnotatedDoubleBufferReceiver; friend class AnnotatedDoubleBufferTransmitter; + friend class HoloscanUcxTransmitter; + friend class HoloscanUcxReceiver; friend class DFFTCollector; // Make GXFExecutor a friend class so it can call protected initialization methods @@ -623,6 +649,11 @@ class Operator : public ComponentBase { // Fragment should be able to call reset_graph_entities friend class Fragment; + friend gxf_result_t deannotate_message(gxf_uid_t* uid, const gxf_context_t& context, Operator* op, + const char* name); + friend gxf_result_t annotate_message(gxf_uid_t uid, const gxf_context_t& context, Operator* op, + const char* name); + /** * @brief This function creates a GraphEntity corresponding to the operator * @param context The GXF context. 
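To make the updated `Operator` conventions above concrete (explicit forwarding constructor, `[[maybe_unused]]` on unused `compute()` parameters, and the new `qualified_name()` accessor), a hypothetical minimal operator could look like this; it is illustrative only and not part of this change:

```cpp
#include <holoscan/holoscan.hpp>

// Hypothetical operator following the conventions introduced in this header.
class PingTxOp : public holoscan::Operator {
 public:
  HOLOSCAN_OPERATOR_FORWARD_ARGS(PingTxOp)
  PingTxOp() = default;

  void setup(holoscan::OperatorSpec& spec) override { spec.output<int>("out"); }

  void compute([[maybe_unused]] holoscan::InputContext& op_input,
               holoscan::OutputContext& op_output,
               [[maybe_unused]] holoscan::ExecutionContext& context) override {
    // qualified_name() (added in this change) returns "<fragment name>.<operator name>".
    HOLOSCAN_LOG_DEBUG("{} emitting value {}", qualified_name(), count_);
    int value = count_++;
    op_output.emit(value, "out");
  }

 private:
  int count_ = 0;
};
```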
@@ -734,7 +765,6 @@ class Operator : public ComponentBase { // Note that the type of any_param is Parameter*, not Parameter. auto& param = *std::any_cast*>(any_param); const auto& arg_type = arg.arg_type(); - (void)param; auto element_type = arg_type.element_type(); auto container_type = arg_type.container_type(); diff --git a/include/holoscan/core/operator_spec.hpp b/include/holoscan/core/operator_spec.hpp index 05041c59..60e9017a 100644 --- a/include/holoscan/core/operator_spec.hpp +++ b/include/holoscan/core/operator_spec.hpp @@ -73,7 +73,8 @@ class OperatorSpec : public ComponentSpec { * (DoubleBufferReceiver/UcxReceiver) and condition (MessageAvailableCondition) will use * the queue size for initialization ('capacity' for the connector and 'min_size' for * the condition) if they are not set. - * Please refer to the [Holoscan SDK User Guide](https://docs.nvidia.com/holoscan/sdk-user-guide/holoscan_create_operator.html#receiving-any-number-of-inputs-c) + * Please refer to the [Holoscan SDK User + * Guide](https://docs.nvidia.com/holoscan/sdk-user-guide/holoscan_create_operator.html#receiving-any-number-of-inputs-c) * to see how to receive any number of inputs in C++. * * @tparam DataT The type of the input data. @@ -175,8 +176,7 @@ class OperatorSpec : public ComponentSpec { * @param init_list The initializer list of the parameter. */ void param(Parameter& parameter, const char* key, const char* headline, - const char* description, std::initializer_list init_list) { - (void)init_list; + const char* description, [[maybe_unused]] std::initializer_list init_list) { parameter.key_ = key; parameter.headline_ = headline; parameter.description_ = description; diff --git a/include/holoscan/core/resource.hpp b/include/holoscan/core/resource.hpp index bc5f8c55..0a591885 100644 --- a/include/holoscan/core/resource.hpp +++ b/include/holoscan/core/resource.hpp @@ -47,7 +47,7 @@ */ #define HOLOSCAN_RESOURCE_FORWARD_ARGS(class_name) \ HOLOSCAN_RESOURCE_FORWARD_TEMPLATE() \ - class_name(ArgT&& arg, ArgsT&&... args) \ + explicit class_name(ArgT&& arg, ArgsT&&... args) \ : Resource(std::forward(arg), std::forward(args)...) {} /** @@ -78,7 +78,7 @@ */ #define HOLOSCAN_RESOURCE_FORWARD_ARGS_SUPER(class_name, super_class_name) \ HOLOSCAN_RESOURCE_FORWARD_TEMPLATE() \ - class_name(ArgT&& arg, ArgsT&&... args) \ + explicit class_name(ArgT&& arg, ArgsT&&... args) \ : super_class_name(std::forward(arg), std::forward(args)...) {} namespace holoscan { @@ -196,7 +196,7 @@ class Resource : public Component { * * @param spec The reference to the component specification. */ - virtual void setup(ComponentSpec& spec) { (void)spec; } + virtual void setup([[maybe_unused]] ComponentSpec& spec) {} void initialize() override; diff --git a/include/holoscan/core/resources/gxf/annotated_double_buffer_transmitter.hpp b/include/holoscan/core/resources/gxf/annotated_double_buffer_transmitter.hpp index d82dc12c..ebf1716d 100644 --- a/include/holoscan/core/resources/gxf/annotated_double_buffer_transmitter.hpp +++ b/include/holoscan/core/resources/gxf/annotated_double_buffer_transmitter.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -24,7 +24,7 @@ #include #include -#include "holoscan/core/resources/gxf/double_buffer_transmitter.hpp" +#include "gxf/std/double_buffer_transmitter.hpp" namespace holoscan { @@ -64,6 +64,7 @@ class AnnotatedDoubleBufferTransmitter : public nvidia::gxf::DoubleBufferTransmi /// The concatenated name of the operator and this transmitter. std::string op_transmitter_name_pair_; + int is_op_root_ = -1; ///< Indicates whether associated operator is a root operator. }; } // namespace holoscan diff --git a/include/holoscan/core/resources/gxf/cuda_allocator.hpp b/include/holoscan/core/resources/gxf/cuda_allocator.hpp new file mode 100644 index 00000000..41796e69 --- /dev/null +++ b/include/holoscan/core/resources/gxf/cuda_allocator.hpp @@ -0,0 +1,55 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef HOLOSCAN_CORE_RESOURCES_GXF_CUDA_ALLOCATOR_HPP +#define HOLOSCAN_CORE_RESOURCES_GXF_CUDA_ALLOCATOR_HPP + +#include +#include + +#include + +#include "../../gxf/gxf_resource.hpp" +#include "./allocator.hpp" + +namespace holoscan { + +/** + * @brief Base class for all CUDA allocators. + * + * CudaAllocators are allocators for CUDA memory that support asynchronous allocation. + */ +class CudaAllocator : public Allocator { + public: + HOLOSCAN_RESOURCE_FORWARD_ARGS_SUPER(CudaAllocator, Allocator) + + CudaAllocator() = default; + CudaAllocator(const std::string& name, nvidia::gxf::CudaAllocator* component); + + const char* gxf_typename() const override { return "nvidia::gxf::CudaAllocator"; } + + // the following async functions and get_pool_size are specific to CudaAllocator + nvidia::byte* allocate_async(uint64_t size, cudaStream_t stream); + void free_async(byte* pointer, cudaStream_t stream); + size_t pool_size(MemoryStorageType type) const; + + nvidia::gxf::CudaAllocator* get() const; +}; + +} // namespace holoscan + +#endif /* HOLOSCAN_CORE_RESOURCES_GXF_CUDA_ALLOCATOR_HPP */ diff --git a/include/holoscan/core/resources/gxf/holoscan_ucx_receiver.hpp b/include/holoscan/core/resources/gxf/holoscan_ucx_receiver.hpp new file mode 100644 index 00000000..c035fe1b --- /dev/null +++ b/include/holoscan/core/resources/gxf/holoscan_ucx_receiver.hpp @@ -0,0 +1,67 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef HOLOSCAN_CORE_RESOURCES_GXF_HOLOSCAN_UCX_RECEIVER_HPP +#define HOLOSCAN_CORE_RESOURCES_GXF_HOLOSCAN_UCX_RECEIVER_HPP + +#include + +#include +#include +#include + +namespace holoscan { + +// Forward declarations +class Operator; + +/** + * @brief HoloscanUcxReceiver class optionally tracks every received message with a MessageLabel + * timestamp if data flow tracking is enabled + * + */ +class HoloscanUcxReceiver : public nvidia::gxf::UcxReceiver { + public: + HoloscanUcxReceiver() = default; + + /** + * @brief This function overrides the UcxReceiver::receive_abi() function. It first calls + * the base class' receive_abi() function and extracts the MessageLabel from the received message. + * It then adds a new OperatorTimestampLabel to the MessageLabel and updates the Operator's input + * message label. + */ + gxf_result_t receive_abi(gxf_uid_t* uid); + + holoscan::Operator* op() { return op_; } + + /** + * @brief Sets the associated operator for this HoloscanUcxReceiver. It is set at + * the @see create_input_port() function. + * + * @param op The operator that this receiver is attached to. + */ + void op(holoscan::Operator* op) { this->op_ = op; } + void track() { tracking_ = true; } + + private: + holoscan::Operator* op_ = nullptr; ///< The operator that this receiver is attached to. + bool tracking_ = false; ///< Used to decide whether to use data flow tracking or not. +}; + +} // namespace holoscan + +#endif /* HOLOSCAN_CORE_RESOURCES_GXF_HOLOSCAN_UCX_RECEIVER_HPP */ diff --git a/include/holoscan/core/resources/gxf/holoscan_ucx_transmitter.hpp b/include/holoscan/core/resources/gxf/holoscan_ucx_transmitter.hpp new file mode 100644 index 00000000..d1ee72f2 --- /dev/null +++ b/include/holoscan/core/resources/gxf/holoscan_ucx_transmitter.hpp @@ -0,0 +1,72 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef HOLOSCAN_CORE_RESOURCES_GXF_HOLOSCAN_UCX_TRANSMITTER_HPP +#define HOLOSCAN_CORE_RESOURCES_GXF_HOLOSCAN_UCX_TRANSMITTER_HPP + +#include + +#include + +#include +#include +#include + +namespace holoscan { + +// Forward declarations +class Operator; + +/** + * @brief HoloscanUcxTransmitter class optionally adds a MessageLabel timestamp to every published + * message if data flow tracking is enabled + * + */ +class HoloscanUcxTransmitter : public nvidia::gxf::UcxTransmitter { + public: + HoloscanUcxTransmitter() = default; + + /** + * @brief This function overrides the UcxTransmitter::publish_abi() function. It first + * annotates the message with a MessageLabel timestamp if data flow tracking is enabled. It then + * calls the base class' publish_abi() function. Finally, if data flow tracking is enabled, it + * updates the Operator's number of published messages. + */ + gxf_result_t publish_abi(gxf_uid_t uid); + + holoscan::Operator* op() { return op_; } + + /** + * @brief Sets the associated operator for this HoloscanUcxTransmitter. It is set at + * the @see create_input_port() function. + * + * @param op The operator that this transmitter is attached to. + */ + void op(holoscan::Operator* op) { this->op_ = op; } + void track() { tracking_ = true; } + + private: + holoscan::Operator* op_ = nullptr; ///< The operator that this transmitter is attached to. + bool tracking_ = false; ///< Used to decide whether to use data flow tracking or not. + /// The concatenated name of the operator and this transmitter. + std::string op_transmitter_name_pair_; + int is_op_root = -1; ///< Indicates whether associated op is a root operator. +}; + +} // namespace holoscan + +#endif /* HOLOSCAN_CORE_RESOURCES_GXF_HOLOSCAN_UCX_TRANSMITTER_HPP */ diff --git a/include/holoscan/core/resources/gxf/rmm_allocator.hpp b/include/holoscan/core/resources/gxf/rmm_allocator.hpp new file mode 100644 index 00000000..cf2d7e59 --- /dev/null +++ b/include/holoscan/core/resources/gxf/rmm_allocator.hpp @@ -0,0 +1,70 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef HOLOSCAN_CORE_RESOURCES_GXF_RMM_ALLOCATOR_HPP +#define HOLOSCAN_CORE_RESOURCES_GXF_RMM_ALLOCATOR_HPP + +#include +#include + +#include "gxf/rmm/rmm_allocator.hpp" +#include "gxf/std/allocator.hpp" + +#include "./cuda_allocator.hpp" + +namespace holoscan { + +/** + * @brief RMM (RAPIDS memory manager) allocator. + * + * This is a memory pool which provides a user-specified number of equally sized blocks of memory.
+ */ +class RMMAllocator : public CudaAllocator { + public: + HOLOSCAN_RESOURCE_FORWARD_ARGS_SUPER(RMMAllocator, CudaAllocator) + + RMMAllocator() = default; + RMMAllocator(const std::string& device_memory_initial_size, + const std::string& device_memory_max_size, + const std::string& host_memory_initial_size, const std::string& host_memory_max_size, + int32_t dev_id = 0) + : device_memory_initial_size_(device_memory_initial_size), + device_memory_max_size_(device_memory_max_size), + host_memory_initial_size_(host_memory_initial_size), + host_memory_max_size_(host_memory_max_size), + dev_id_(dev_id) {} + RMMAllocator(const std::string& name, nvidia::gxf::RMMAllocator* component); + + const char* gxf_typename() const override { return "nvidia::gxf::RMMAllocator"; } + + void setup(ComponentSpec& spec) override; + + nvidia::gxf::RMMAllocator* get() const; + + // pool_size method implemented on the parent CudaAllocator class + + private: + Parameter device_memory_initial_size_; + Parameter device_memory_max_size_; + Parameter host_memory_initial_size_; + Parameter host_memory_max_size_; + Parameter dev_id_; +}; + +} // namespace holoscan + +#endif /* HOLOSCAN_CORE_RESOURCES_GXF_RMM_ALLOCATOR_HPP */ diff --git a/include/holoscan/core/resources/gxf/std_component_serializer.hpp b/include/holoscan/core/resources/gxf/std_component_serializer.hpp index 72a3cf5a..d2d6c404 100644 --- a/include/holoscan/core/resources/gxf/std_component_serializer.hpp +++ b/include/holoscan/core/resources/gxf/std_component_serializer.hpp @@ -21,7 +21,7 @@ #include #include -// TODO: provide get() method once upstream issue with missing GXF header is resolved +// TODO(unknown): provide get() method once upstream issue with missing GXF header is resolved // #include #include "../../gxf/gxf_resource.hpp" diff --git a/include/holoscan/core/resources/gxf/stream_ordered_allocator.hpp b/include/holoscan/core/resources/gxf/stream_ordered_allocator.hpp new file mode 100644 index 00000000..a0bbcb70 --- /dev/null +++ b/include/holoscan/core/resources/gxf/stream_ordered_allocator.hpp @@ -0,0 +1,58 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef HOLOSCAN_CORE_RESOURCES_GXF_STREAM_ORDERED_ALLOCATOR_HPP +#define HOLOSCAN_CORE_RESOURCES_GXF_STREAM_ORDERED_ALLOCATOR_HPP + +#include + +#include + +#include "../../gxf/gxf_resource.hpp" +#include "./cuda_allocator.hpp" + +namespace holoscan { + +/** + * @brief + * + * StreamOrderedAllocator uses cudaMallocFromPoolAsync / cudaFreeAsync dynamically without a pool. + * Does not provide bounded execution times. 
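As a usage sketch (not part of the patch; the size strings, device ID, and the chosen operators are placeholders, and the argument names mirror the parameter members shown above), the new RMM-backed pool is created like any other resource and handed to an operator as its allocator. Because `RMMAllocator` derives from `CudaAllocator`, the same pool can also serve `allocate_async()`/`free_async()` requests on a CUDA stream.

```cpp
#include <cstdint>
#include <string>

#include <holoscan/holoscan.hpp>
#include <holoscan/operators/ping_tensor_rx/ping_tensor_rx.hpp>
#include <holoscan/operators/ping_tensor_tx/ping_tensor_tx.hpp>

class MyApp : public holoscan::Application {
 public:
  void compose() override {
    using holoscan::Arg;

    // Hypothetical pool sizes; adjust to the payloads the pipeline actually allocates.
    auto rmm_pool = make_resource<holoscan::RMMAllocator>(
        "rmm_pool",
        Arg("device_memory_initial_size", std::string("16MB")),
        Arg("device_memory_max_size", std::string("32MB")),
        Arg("host_memory_initial_size", std::string("16MB")),
        Arg("host_memory_max_size", std::string("32MB")),
        Arg("dev_id", static_cast<int32_t>(0)));

    auto tx = make_operator<holoscan::ops::PingTensorTxOp>("tx", Arg("allocator", rmm_pool));
    auto rx = make_operator<holoscan::ops::PingTensorRxOp>("rx");
    add_flow(tx, rx);
  }
};
```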
+ */ +class StreamOrderedAllocator : public CudaAllocator { + public: + HOLOSCAN_RESOURCE_FORWARD_ARGS_SUPER(StreamOrderedAllocator, CudaAllocator) + + StreamOrderedAllocator() = default; + StreamOrderedAllocator(const std::string& name, nvidia::gxf::StreamOrderedAllocator* component); + + const char* gxf_typename() const override { return "nvidia::gxf::StreamOrderedAllocator"; } + + void setup(ComponentSpec& spec) override; + + nvidia::gxf::StreamOrderedAllocator* get() const; + + private: + Parameter release_threshold_; + Parameter device_memory_initial_size_; + Parameter device_memory_max_size_; + Parameter dev_id_; +}; + +} // namespace holoscan + +#endif /* HOLOSCAN_CORE_RESOURCES_GXF_STREAM_ORDERED_ALLOCATOR_HPP */ diff --git a/include/holoscan/core/resources/gxf/ucx_receiver.hpp b/include/holoscan/core/resources/gxf/ucx_receiver.hpp index c0672239..49a48ac0 100644 --- a/include/holoscan/core/resources/gxf/ucx_receiver.hpp +++ b/include/holoscan/core/resources/gxf/ucx_receiver.hpp @@ -42,7 +42,7 @@ class UcxReceiver : public Receiver { UcxReceiver() = default; UcxReceiver(const std::string& name, nvidia::gxf::Receiver* component); - const char* gxf_typename() const override { return "nvidia::gxf::UcxReceiver"; } + const char* gxf_typename() const override { return "holoscan::HoloscanUcxReceiver"; } void setup(ComponentSpec& spec) override; void initialize() override; @@ -58,12 +58,15 @@ class UcxReceiver : public Receiver { nvidia::gxf::UcxReceiver* get() const; + /// @brief Enable tracking in the underlying holoscan::HoloscanUcxReceiver class + void track(); + private: Parameter address_; Parameter port_; Parameter> buffer_; - // TODO: support GPUDevice nvidia::gxf::Resource - // nvidia::gxf::Resource> gpu_device_; + // TODO(unknown): support GPUDevice nvidia::gxf::Resource + // // nvidia::gxf::Resource> gpu_device_; }; } // namespace holoscan diff --git a/include/holoscan/core/resources/gxf/ucx_transmitter.hpp b/include/holoscan/core/resources/gxf/ucx_transmitter.hpp index f3b62c60..83583b34 100644 --- a/include/holoscan/core/resources/gxf/ucx_transmitter.hpp +++ b/include/holoscan/core/resources/gxf/ucx_transmitter.hpp @@ -48,7 +48,7 @@ class UcxTransmitter : public Transmitter { UcxTransmitter() = default; UcxTransmitter(const std::string& name, nvidia::gxf::Transmitter* component); - const char* gxf_typename() const override { return "nvidia::gxf::UcxTransmitter"; } + const char* gxf_typename() const override { return "holoscan::HoloscanUcxTransmitter"; } void setup(ComponentSpec& spec) override; void initialize() override; @@ -70,6 +70,9 @@ class UcxTransmitter : public Transmitter { nvidia::gxf::UcxTransmitter* get() const; + /// @brief Enable tracking in the underlying holoscan::HoloscanUcxTransmitter class + void track(); + private: Parameter receiver_address_; Parameter local_address_; @@ -77,7 +80,7 @@ class UcxTransmitter : public Transmitter { Parameter local_port_; Parameter maximum_connection_retries_; Parameter> buffer_; - // TODO: support GPUDevice nvidia::gxf::Resource + // TODO(unknown): support GPUDevice nvidia::gxf::Resource // nvidia::gxf::Resource> gpu_device_; }; diff --git a/include/holoscan/core/scheduler.hpp b/include/holoscan/core/scheduler.hpp index a6baa865..0860868b 100644 --- a/include/holoscan/core/scheduler.hpp +++ b/include/holoscan/core/scheduler.hpp @@ -54,7 +54,7 @@ */ #define HOLOSCAN_SCHEDULER_FORWARD_ARGS(class_name) \ HOLOSCAN_SCHEDULER_FORWARD_TEMPLATE() \ - class_name(ArgT&& arg, ArgsT&&... 
args) \ + explicit class_name(ArgT&& arg, ArgsT&&... args) \ : Scheduler(std::forward(arg), std::forward(args)...) {} /** @@ -71,7 +71,7 @@ */ #define HOLOSCAN_SCHEDULER_FORWARD_ARGS_SUPER(class_name, super_class_name) \ HOLOSCAN_SCHEDULER_FORWARD_TEMPLATE() \ - class_name(ArgT&& arg, ArgsT&&... args) \ + explicit class_name(ArgT&& arg, ArgsT&&... args) \ : super_class_name(std::forward(arg), std::forward(args)...) {} namespace holoscan { @@ -222,7 +222,7 @@ class Scheduler : public Component { * * @param spec The reference to the component specification. */ - virtual void setup(ComponentSpec& spec) { (void)spec; } + virtual void setup([[maybe_unused]] ComponentSpec& spec) {} /** * @brief Initialize the scheduler. diff --git a/include/holoscan/core/system/cpu_info.hpp b/include/holoscan/core/system/cpu_info.hpp index aab12977..67898f9e 100644 --- a/include/holoscan/core/system/cpu_info.hpp +++ b/include/holoscan/core/system/cpu_info.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -46,15 +46,15 @@ struct CPUInfo { int32_t num_cores = 0; ///< The number of cores int32_t num_cpus = 0; ///< The number of CPUs int32_t num_processors = 0; ///< The number of available processors - float cpu_usage = 0.0f; ///< The CPU usage (in percent) + float cpu_usage = 0.0F; ///< The CPU usage (in percent) uint64_t memory_total = 0; ///< The total memory (in bytes) uint64_t memory_free = 0; ///< The free memory (in bytes) uint64_t memory_available = 0; ///< The available memory (in bytes) - float memory_usage = 0.0f; ///< The memory usage (in percent) + float memory_usage = 0.0F; ///< The memory usage (in percent) uint64_t shared_memory_total = 0; ///< The total shared memory (in bytes) uint64_t shared_memory_free = 0; ///< The free shared memory (in bytes) uint64_t shared_memory_available = 0; ///< The available shared memory (in bytes) - float shared_memory_usage = 0.0f; ///< The shared memory usage (in percent) + float shared_memory_usage = 0.0F; ///< The shared memory usage (in percent) }; } // namespace holoscan diff --git a/include/holoscan/core/system/gpu_info.hpp b/include/holoscan/core/system/gpu_info.hpp index eedd10a2..eb1b1015 100644 --- a/include/holoscan/core/system/gpu_info.hpp +++ b/include/holoscan/core/system/gpu_info.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -59,7 +59,7 @@ struct GPUInfo { uint64_t memory_total = 0; ///< The total memory (in bytes) uint64_t memory_free = 0; ///< The free memory (in bytes) uint64_t memory_used = 0; ///< The used memory (in bytes) - float memory_usage = 0.0f; ///< The memory usage (in percent) + float memory_usage = 0.0F; ///< The memory usage (in percent) uint32_t power_limit = 0; ///< The power limit (in milliwatts) uint32_t power_usage = 0; ///< The power usage (in milliwatts) uint32_t temperature = 0; ///< The temperature (in degrees Celsius) diff --git a/include/holoscan/holoscan.hpp b/include/holoscan/holoscan.hpp index 5c7b60d7..523f4033 100644 --- a/include/holoscan/holoscan.hpp +++ b/include/holoscan/holoscan.hpp @@ -44,9 +44,13 @@ #include "./core/conditions/gxf/asynchronous.hpp" #include "./core/conditions/gxf/boolean.hpp" #include "./core/conditions/gxf/count.hpp" +#include "./core/conditions/gxf/cuda_buffer_available.hpp" +#include "./core/conditions/gxf/cuda_event.hpp" +#include "./core/conditions/gxf/cuda_stream.hpp" #include "./core/conditions/gxf/downstream_affordable.hpp" -#include "./core/conditions/gxf/periodic.hpp" +#include "./core/conditions/gxf/expiring_message.hpp" #include "./core/conditions/gxf/message_available.hpp" +#include "./core/conditions/gxf/periodic.hpp" // NetworkContexts #include "./core/network_contexts/gxf/ucx_context.hpp" @@ -60,9 +64,11 @@ #include "./core/resources/gxf/gxf_component_resource.hpp" #include "./core/resources/gxf/manual_clock.hpp" #include "./core/resources/gxf/realtime_clock.hpp" +#include "./core/resources/gxf/rmm_allocator.hpp" #include "./core/resources/gxf/serialization_buffer.hpp" #include "./core/resources/gxf/std_component_serializer.hpp" #include "./core/resources/gxf/std_entity_serializer.hpp" +#include "./core/resources/gxf/stream_ordered_allocator.hpp" #include "./core/resources/gxf/ucx_component_serializer.hpp" #include "./core/resources/gxf/ucx_entity_serializer.hpp" #include "./core/resources/gxf/ucx_holoscan_component_serializer.hpp" diff --git a/include/holoscan/operators/async_ping_rx/async_ping_rx.hpp b/include/holoscan/operators/async_ping_rx/async_ping_rx.hpp index 44cd78fd..0fb30e21 100644 --- a/include/holoscan/operators/async_ping_rx/async_ping_rx.hpp +++ b/include/holoscan/operators/async_ping_rx/async_ping_rx.hpp @@ -50,7 +50,8 @@ class AsyncPingRxOp : public Operator { void setup(OperatorSpec& spec) override; void initialize() override; void start() override; - void compute(InputContext&, OutputContext& op_output, ExecutionContext&) override; + void compute([[maybe_unused]] InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override; void stop() override; void async_ping(); diff --git a/include/holoscan/operators/async_ping_tx/async_ping_tx.hpp b/include/holoscan/operators/async_ping_tx/async_ping_tx.hpp index b4e70b0c..a1774a2b 100644 --- a/include/holoscan/operators/async_ping_tx/async_ping_tx.hpp +++ b/include/holoscan/operators/async_ping_tx/async_ping_tx.hpp @@ -52,7 +52,8 @@ class AsyncPingTxOp : public Operator { void setup(OperatorSpec& spec) override; void initialize() override; void start() override; - void compute(InputContext&, OutputContext& op_output, ExecutionContext&) override; + void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override; void stop() override; void 
async_ping(); diff --git a/include/holoscan/operators/holoviz/holoviz.hpp b/include/holoscan/operators/holoviz/holoviz.hpp index a8459b39..8aa9738b 100644 --- a/include/holoscan/operators/holoviz/holoviz.hpp +++ b/include/holoscan/operators/holoviz/holoviz.hpp @@ -167,7 +167,7 @@ struct BufferInfo; * - **image_format**: color image format, used if `type` is `color`, `color_lut` or * `depth_map_color`. (default: `auto_detect`). * - type: `std::string` - * - **color**: RGBA color of rendered geometry (default: `[1.f, 1.f, 1.f, 1.f]`) + * - **color**: RGBA color of rendered geometry (default: `[1.F, 1.F, 1.F, 1.F]`) * - type: `std::vector` * - **line_width**: line width for geometry made of lines (default: `1.0`) * - type: `float` @@ -749,7 +749,7 @@ class HolovizOp : public Operator { std::string tensor_name_; ///< name of the tensor/video buffer containing the input data InputType type_ = InputType::UNKNOWN; ///< input type - float opacity_ = 1.f; ///< layer opacity, 1.0 is fully opaque, 0.0 is fully transparent + float opacity_ = 1.F; ///< layer opacity, 1.0 is fully opaque, 0.0 is fully transparent int32_t priority_ = 0; ///< layer priority, determines the render order, layers with higher priority values are ///< rendered on top of layers with lower priority values @@ -763,9 +763,9 @@ class HolovizOp : public Operator { ChromaLocation y_chroma_location_ = ChromaLocation::COSITED_EVEN; ///< chroma location in y direction for formats which are ///< chroma downsampled in height (420) - std::vector color_{1.f, 1.f, 1.f, 1.f}; ///< color of rendered geometry - float line_width_ = 1.f; ///< line width for geometry made of lines - float point_size_ = 1.f; ///< point size for geometry made of points + std::vector color_{1.F, 1.F, 1.F, 1.F}; ///< color of rendered geometry + float line_width_ = 1.F; ///< line width for geometry made of lines + float point_size_ = 1.F; ///< point size for geometry made of points std::vector text_; ///< array of text strings, used when type_ is TEXT. DepthMapRenderMode depth_map_render_mode_ = DepthMapRenderMode::POINTS; ///< depth map render mode, used if type_ is @@ -790,12 +790,12 @@ class HolovizOp : public Operator { * viewport instead of the upper left corner. */ struct View { - float offset_x_ = 0.f, - offset_y_ = 0.f; ///< offset of top-left corner of the view. Top left coordinate of + float offset_x_ = 0.F, + offset_y_ = 0.F; ///< offset of top-left corner of the view. Top left coordinate of /// the window area is (0, 0) bottom right /// coordinate is (1, 1). - float width_ = 1.f, - height_ = 1.f; ///< width and height of the view in normalized range. 1.0 is full size. + float width_ = 1.F, + height_ = 1.F; ///< width and height of the view in normalized range. 1.0 is full size. 
std::optional> matrix_; ///< row major 4x4 transform matrix (optional, can be nullptr) }; diff --git a/include/holoscan/operators/inference/inference.hpp b/include/holoscan/operators/inference/inference.hpp index 5c3ec9f5..f72c66df 100644 --- a/include/holoscan/operators/inference/inference.hpp +++ b/include/holoscan/operators/inference/inference.hpp @@ -162,6 +162,9 @@ class InferenceOp : public holoscan::Operator { /// @brief Output tensor names Parameter> out_tensor_names_; + /// @brief Optimization profile for models with dynamic input shapes + Parameter> trt_opt_profile_; + /// @brief Memory allocator Parameter> allocator_; diff --git a/include/holoscan/operators/inference_processor/inference_processor.hpp b/include/holoscan/operators/inference_processor/inference_processor.hpp index 02be1f75..cd6c5832 100644 --- a/include/holoscan/operators/inference_processor/inference_processor.hpp +++ b/include/holoscan/operators/inference_processor/inference_processor.hpp @@ -89,6 +89,7 @@ class InferenceProcessorOp : public holoscan::Operator { void setup(OperatorSpec& spec) override; void initialize() override; void start() override; + void stop() override; void compute(InputContext& op_input, OutputContext& op_output, ExecutionContext& context) override; diff --git a/include/holoscan/operators/ping_rx/ping_rx.hpp b/include/holoscan/operators/ping_rx/ping_rx.hpp index 68ad8aea..66c6e48e 100644 --- a/include/holoscan/operators/ping_rx/ping_rx.hpp +++ b/include/holoscan/operators/ping_rx/ping_rx.hpp @@ -41,7 +41,8 @@ class PingRxOp : public Operator { void setup(OperatorSpec& spec) override; - void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override; + void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override; }; } // namespace holoscan::ops diff --git a/include/holoscan/operators/ping_tensor_rx/ping_tensor_rx.hpp b/include/holoscan/operators/ping_tensor_rx/ping_tensor_rx.hpp index 9fe7bbc8..e8573dd5 100644 --- a/include/holoscan/operators/ping_tensor_rx/ping_tensor_rx.hpp +++ b/include/holoscan/operators/ping_tensor_rx/ping_tensor_rx.hpp @@ -48,7 +48,8 @@ class PingTensorRxOp : public Operator { void setup(OperatorSpec& spec) override; - void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override; + void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override; private: size_t count_ = 1; diff --git a/include/holoscan/operators/ping_tensor_tx/ping_tensor_tx.hpp b/include/holoscan/operators/ping_tensor_tx/ping_tensor_tx.hpp index d34e80c9..4bb3b781 100644 --- a/include/holoscan/operators/ping_tensor_tx/ping_tensor_tx.hpp +++ b/include/holoscan/operators/ping_tensor_tx/ping_tensor_tx.hpp @@ -74,7 +74,8 @@ class PingTensorTxOp : public Operator { void initialize() override; void setup(OperatorSpec& spec) override; - void compute(InputContext&, OutputContext& op_output, ExecutionContext& context) override; + void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override; nvidia::gxf::PrimitiveType element_type() { if (element_type_.has_value()) { return element_type_.value(); } diff --git a/include/holoscan/operators/ping_tx/ping_tx.hpp b/include/holoscan/operators/ping_tx/ping_tx.hpp index 1a5e652e..f7c78724 100644 --- a/include/holoscan/operators/ping_tx/ping_tx.hpp +++ b/include/holoscan/operators/ping_tx/ping_tx.hpp @@ -40,7 +40,8 @@ 
class PingTxOp : public Operator { void setup(OperatorSpec& spec) override; - void compute(InputContext&, OutputContext& op_output, ExecutionContext&) override; + void compute(InputContext& op_input, OutputContext& op_output, + ExecutionContext& context) override; int index() const { return index_; } diff --git a/include/holoscan/operators/v4l2_video_capture/v4l2_video_capture.hpp b/include/holoscan/operators/v4l2_video_capture/v4l2_video_capture.hpp index c47314e1..63d5778c 100644 --- a/include/holoscan/operators/v4l2_video_capture/v4l2_video_capture.hpp +++ b/include/holoscan/operators/v4l2_video_capture/v4l2_video_capture.hpp @@ -15,8 +15,8 @@ * limitations under the License. */ -#ifndef HOLOSCAN_OPERATORS_V4L2_VIDEO_CAPTURE_HPP -#define HOLOSCAN_OPERATORS_V4L2_VIDEO_CAPTURE_HPP +#ifndef HOLOSCAN_OPERATORS_V4L2_VIDEO_CAPTURE_V4L2_VIDEO_CAPTURE_HPP +#define HOLOSCAN_OPERATORS_V4L2_VIDEO_CAPTURE_V4L2_VIDEO_CAPTURE_HPP #include #include @@ -103,7 +103,7 @@ class V4L2VideoCaptureOp : public Operator { void setup(OperatorSpec& spec) override; void start() override; void initialize() override; - void compute(InputContext& op_input, OutputContext& op_output, + void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, ExecutionContext& context) override; void stop() override; @@ -148,4 +148,4 @@ class V4L2VideoCaptureOp : public Operator { } // namespace holoscan::ops -#endif /* HOLOSCAN_OPERATORS_V4L2_VIDEO_CAPTURE_HPP */ +#endif /* HOLOSCAN_OPERATORS_V4L2_VIDEO_CAPTURE_V4L2_VIDEO_CAPTURE_HPP */ diff --git a/include/holoscan/operators/video_stream_recorder/video_stream_recorder.hpp b/include/holoscan/operators/video_stream_recorder/video_stream_recorder.hpp index b7e5d220..c3928465 100644 --- a/include/holoscan/operators/video_stream_recorder/video_stream_recorder.hpp +++ b/include/holoscan/operators/video_stream_recorder/video_stream_recorder.hpp @@ -59,7 +59,7 @@ class VideoStreamRecorderOp : public holoscan::Operator { void initialize() override; // void deinitialize() override; - void compute(InputContext& op_input, OutputContext& op_output, + void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, ExecutionContext& context) override; void stop() override; diff --git a/include/holoscan/operators/video_stream_replayer/video_stream_replayer.hpp b/include/holoscan/operators/video_stream_replayer/video_stream_replayer.hpp index fda43ec3..b7d45e4c 100644 --- a/include/holoscan/operators/video_stream_replayer/video_stream_replayer.hpp +++ b/include/holoscan/operators/video_stream_replayer/video_stream_replayer.hpp @@ -61,6 +61,27 @@ namespace holoscan::ops { * - **entity_serializer**: The entity serializer used for deserialization. The default is to use * a default-initialized ``holoscan::gxzf::StdEntitySerializer``. If this argument is * specified, then the `allocator` argument is ignored. + * + * ==Device Memory Requirements== + * + * This operator reads data from a file to an intermediate host buffer and then transfers the data + * to the GPU. Because both host and device memory is needed, an allocator supporting both memory + * types must be used. Options for this are `UnboundedAllocator` and the `RMMAllocator`. When using + * RMMAllocator, the following memory blocks are needed: + * 1. One block of host memory equal in size to a single uncompressed video frame + * is needed. 
Note that for RMMAllocator, the memory sizes should be specified in MiB, so the + * minimum value can be obtained by: + * + * ```cpp + * #include + * + * ceil(static_cast(height * width * channels * element_size_bytes) / (1024 * 1024)); + * ``` + * + * 2. One block of device memory equal in size to the host memory block. + * + * When declaring an RMMAllocator memory pool, `host_memory_initial_size` and + * `device_memory_initial_size` must be greater than or equal to the values discussed above. */ class VideoStreamReplayerOp : public holoscan::Operator { public: @@ -73,7 +94,7 @@ class VideoStreamReplayerOp : public holoscan::Operator { void setup(OperatorSpec& spec) override; void initialize() override; - void compute(InputContext& op_input, OutputContext& op_output, + void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, ExecutionContext& context) override; private: diff --git a/include/holoscan/utils/cuda_stream_handler.hpp b/include/holoscan/utils/cuda_stream_handler.hpp index fbb30a59..94905fa9 100644 --- a/include/holoscan/utils/cuda_stream_handler.hpp +++ b/include/holoscan/utils/cuda_stream_handler.hpp @@ -102,7 +102,18 @@ class CudaStreamHandler { const nvidia::gxf::Expected& message); /** - * Get the CUDA stream for the operation from the incoming messages + * Get the CUDA stream for the operation from the incoming messages (holoscan::gxf::Entity + * variant) + * + * @param context + * @param messages + * @return gxf_result_t + */ + gxf_result_t from_messages(gxf_context_t context, + const std::vector& messages); + + /** + * Get the CUDA stream for the operation from the incoming messages (nvidia::gxf::Entity variant) * * @param context * @param messages @@ -193,6 +204,9 @@ class CudaStreamHandler { */ gxf_result_t allocate_internal_stream(gxf_context_t context); + gxf_result_t from_messages(gxf_context_t context, size_t message_count, + const nvidia::gxf::Entity* messages); + /// if set then it's required that the CUDA stream pool is specified, if this is not the case /// an error is generated bool cuda_stream_pool_required_ = false; diff --git a/include/holoscan/utils/holoinfer_utils.hpp b/include/holoscan/utils/holoinfer_utils.hpp index dfce5c24..417aadb9 100644 --- a/include/holoscan/utils/holoinfer_utils.hpp +++ b/include/holoscan/utils/holoinfer_utils.hpp @@ -31,6 +31,34 @@ namespace HoloInfer = holoscan::inference; namespace holoscan::utils { +/** + * Buffer wrapping a GXF tensor + */ +class GxfTensorBuffer : public HoloInfer::Buffer { + public: + /** + * @brief Constructor + * + * @param entity GXF entity holding the tensor + * @param tensor GXF tensor + */ + explicit GxfTensorBuffer(const holoscan::gxf::Entity& entity, + const nvidia::gxf::Handle& tensor); + GxfTensorBuffer() = delete; + + /// Buffer class virtual members implemented by this class + ///@{ + void* data() override; + size_t size() const override; + size_t get_bytes() const override; + void resize(size_t number_of_elements) override; + ///@} + + private: + holoscan::gxf::Entity entity_; + nvidia::gxf::Handle tensor_; +}; + /** * Extracts data from GXF Receivers. 
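The sizing rule above can be wrapped in a small helper; the following is a self-contained sketch (the frame-geometry arguments and the returned "NNMB" string format are placeholders for whatever the recorded video and allocator configuration actually use): compute the size of one uncompressed frame, round up to whole MiB, and use the same value for both the host and the device block.

```cpp
#include <cmath>
#include <cstdint>
#include <string>

// One host block and one device block, each at least the size of an uncompressed frame.
std::string replayer_block_size_mib(int64_t height, int64_t width, int64_t channels,
                                    int64_t element_size_bytes) {
  const double frame_bytes =
      static_cast<double>(height * width * channels * element_size_bytes);
  const auto mib = static_cast<int64_t>(std::ceil(frame_bytes / (1024.0 * 1024.0)));
  // Suitable for both host_memory_initial_size and device_memory_initial_size of the RMMAllocator.
  return std::to_string(mib) + "MB";
}
```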
* @@ -42,7 +70,7 @@ namespace holoscan::utils { * @param cuda_buffer_out Flag defining the location of output memory (Device or Host) * @param module Module that called for data extraction * @param context GXF execution context - * @param cuda_stream_handler Cuda steam handler + * @param cuda_stream_handler Cuda stream handler * @return GXF result code */ gxf_result_t get_data_per_model(InputContext& op_input, const std::vector& in_tensors, @@ -66,7 +94,7 @@ gxf_result_t get_data_per_model(InputContext& op_input, const std::vector>(end_ - start_).count(); } - if (is_auto_output_) { print(); } + if (is_auto_output_) { + try { + print(); + } catch (const std::exception& e) {} // suppress exceptions from fmt + } } /** diff --git a/include/holoscan/utils/yaml_parser.hpp b/include/holoscan/utils/yaml_parser.hpp index c4f3a769..49268097 100644 --- a/include/holoscan/utils/yaml_parser.hpp +++ b/include/holoscan/utils/yaml_parser.hpp @@ -114,7 +114,7 @@ struct YAMLNodeParser> { std::vector result(node.size()); for (size_t i = 0; i < node.size(); i++) { const auto value = YAMLNodeParser::parse(node[i]); - // TODO: check if value is valid + // TODO(unknown): check if value is valid result[i] = std::move(value); } return result; @@ -140,7 +140,7 @@ struct YAMLNodeParser> { std::array result; for (size_t i = 0; i < node.size(); i++) { const auto value = YAMLNodeParser::parse(node[i]); - // TODO: check if value is valid + // TODO(unknown): check if value is valid result[i] = std::move(value); } return result; diff --git a/modules/holoinfer/src/CMakeLists.txt b/modules/holoinfer/src/CMakeLists.txt index 4dd564ef..f8ad6ae0 100644 --- a/modules/holoinfer/src/CMakeLists.txt +++ b/modules/holoinfer/src/CMakeLists.txt @@ -17,76 +17,16 @@ project(holoinfer VERSION ${HOLOSCAN_BUILD_VERSION} LANGUAGES CXX CUDA) -find_package(CUDAToolkit REQUIRED) - -if(HOLOSCAN_BUILD_LIBTORCH) - # It is necessary to set the TORCH_CUDA_ARCH_LIST explicitly for newer versions of Torch (circa v2.3.0) - set(TORCH_CUDA_ARCH_LIST "3.5 5.0 8.0 8.6 8.9 9.0") - find_package(Torch REQUIRED) - find_package(TorchVision REQUIRED) - add_library(holoinfer_torch SHARED infer/torch/core.cpp) - add_library(holoscan::infer::torch ALIAS holoinfer_torch) - set_target_properties(holoinfer_torch PROPERTIES - OUTPUT_NAME holoscan_infer_torch - EXPORT_NAME infer::torch - SOVERSION ${PROJECT_VERSION_MAJOR} - VERSION ${PROJECT_VERSION} - INSTALL_RPATH_USE_LINK_PATH true # find libtorch and torchvision in install tree - ) - target_link_libraries(holoinfer_torch - PRIVATE - torch - -Wl,--no-as-needed TorchVision::TorchVision # torch plugin - holoscan::logger - GXF::core - yaml-cpp - holoscan_security_flags - ) - target_include_directories(holoinfer_torch - PRIVATE - $ - PUBLIC - $ - $ - $ - $ - ) -endif() +add_subdirectory(infer) -if(HOLOSCAN_BUILD_ORT) - find_package(ONNXRuntime REQUIRED) - add_library(holoinfer_ort SHARED infer/onnx/core.cpp) - add_library(holoscan::infer::onnx_runtime ALIAS holoinfer_ort) - set_target_properties(holoinfer_ort PROPERTIES - OUTPUT_NAME holoscan_infer_onnx_runtime - EXPORT_NAME infer::onnx_runtime - SOVERSION ${PROJECT_VERSION_MAJOR} - VERSION ${PROJECT_VERSION} - INSTALL_RPATH_USE_LINK_PATH true # find onnxruntime in install tree - ) - target_link_libraries(holoinfer_ort - PRIVATE - ONNXRuntime::ONNXRuntime - CUDA::cudart - holoscan::logger - GXF::core - yaml-cpp - holoscan_security_flags - ) - target_include_directories(holoinfer_ort - PRIVATE - $ - PUBLIC - $ - $ - ) -endif() +find_package(CUDAToolkit REQUIRED) 
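The `GxfTensorBuffer` introduced in the holoinfer_utils hunk above is only declared here. To make the zero-copy idea concrete, the following is a hypothetical adapter (the class name, the stored raw pointer, and the float32 default are invented for illustration; this is not the SDK's actual `GxfTensorBuffer` implementation): every `HoloInfer::Buffer` virtual forwards to a wrapped GXF tensor, so no data is copied, and `resize()` is a no-op because the tensor owns its storage.

```cpp
#include <cstddef>

#include "gxf/std/tensor.hpp"
#include "holoinfer_buffer.hpp"  // include path depends on how the HoloInfer module is consumed

namespace HoloInfer = holoscan::inference;

class TensorViewBuffer : public HoloInfer::Buffer {
 public:
  explicit TensorViewBuffer(nvidia::gxf::Tensor* tensor)
      : HoloInfer::Buffer(HoloInfer::holoinfer_datatype::h_Float32), tensor_(tensor) {}

  void* data() override { return tensor_->pointer(); }           // raw bytes of the tensor
  size_t size() const override { return tensor_->element_count(); }
  size_t get_bytes() const override { return tensor_->size(); }  // GXF reports size in bytes
  void resize(size_t /*number_of_elements*/) override {}         // storage owned by the tensor

 private:
  nvidia::gxf::Tensor* tensor_ = nullptr;
};
```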
set(holoinfer_src infer/trt/core.cpp infer/trt/utils.cpp params/infer_param.cpp process/data_processor.cpp + process/data_processor.cu process/transforms/generate_boxes.cpp manager/infer_manager.cpp manager/process_manager.cpp @@ -119,6 +59,7 @@ target_link_libraries(${PROJECT_NAME} ${CMAKE_DL_LIBS} # to dlopen holoinfer_torch CUDA::cudart holoscan::logger + TensorRT::nvinfer_plugin TensorRT::nvonnxparser holoscan_security_flags ) diff --git a/modules/holoinfer/src/include/holoinfer.hpp b/modules/holoinfer/src/include/holoinfer.hpp index 45456e99..e8d66836 100644 --- a/modules/holoinfer/src/include/holoinfer.hpp +++ b/modules/holoinfer/src/include/holoinfer.hpp @@ -14,8 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef _HOLOSCAN_INFER_API_H -#define _HOLOSCAN_INFER_API_H +#ifndef MODULES_HOLOINFER_SRC_INCLUDE_HOLOINFER_HPP +#define MODULES_HOLOINFER_SRC_INCLUDE_HOLOINFER_HPP #include #include @@ -28,6 +28,8 @@ namespace holoscan { namespace inference { +class ManagerProcessor; + /** * Inference Context class */ @@ -46,13 +48,17 @@ class _HOLOSCAN_EXTERNAL_API_ InferContext { /** * Executes the inference - * Toolkit supports one input per model, in float32 type + * Toolkit supports one input per model, in float32 type. + * The provided CUDA stream is used to prepare the input data and will be used to operate on the + * output data; any execution of CUDA work should be in sync with this stream. * * @param inference_specs Pointer to inference specifications + * @param cuda_stream CUDA stream * * @return InferStatus with appropriate holoinfer_code and message. */ - InferStatus execute_inference(std::shared_ptr& inference_specs); + InferStatus execute_inference(std::shared_ptr& inference_specs, + cudaStream_t cuda_stream = 0); /** * Gets output dimension per model @@ -100,12 +106,15 @@ class _HOLOSCAN_EXTERNAL_API_ ProcessorContext { * output as a vector of float32 type * @param dimension_map Map is updated with model name as key mapped to dimension of processed * data as a vector + * @param process_with_cuda Flag defining if processing should be done with CUDA + * @param cuda_stream CUDA stream to use when processing is done with CUDA * * @return InferStatus with appropriate holoinfer_code and message. */ InferStatus process(const MultiMappings& tensor_oper_map, const MultiMappings& in_out_tensor_map, DataMap& processed_result_map, - const std::map>& dimension_map); + const std::map>& dimension_map, + bool process_with_cuda, cudaStream_t cuda_stream = 0); /** * Get output data per Tensor @@ -122,9 +131,13 @@ * @return Map of model as key mapped to the output dimension (of processed data) as a vector */ DimType get_processed_data_dims() const; + + private: + /// Pointer to manager class for multi data processing + std::shared_ptr process_manager_; }; } // namespace inference } // namespace holoscan -#endif +#endif /* MODULES_HOLOINFER_SRC_INCLUDE_HOLOINFER_HPP */ diff --git a/modules/holoinfer/src/include/holoinfer_buffer.hpp b/modules/holoinfer/src/include/holoinfer_buffer.hpp index adfd8565..8fc83736 100644 --- a/modules/holoinfer/src/include/holoinfer_buffer.hpp +++ b/modules/holoinfer/src/include/holoinfer_buffer.hpp @@ -14,8 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License.
*/ -#ifndef HOLOINFER_SRC_INCLUDE_HOLOINFER_BUFFER_HPP -#define HOLOINFER_SRC_INCLUDE_HOLOINFER_BUFFER_HPP +#ifndef MODULES_HOLOINFER_SRC_INCLUDE_HOLOINFER_BUFFER_HPP +#define MODULES_HOLOINFER_SRC_INCLUDE_HOLOINFER_BUFFER_HPP #include #include @@ -65,125 +65,187 @@ class DeviceFree { }; /** - * @brief Cuda Device Buffer Class + * Base class for a buffer containing typed data. */ -class DeviceBuffer { +class Buffer { public: /** * @brief Construction with default type * * @param type Data type, defaults to float32 + * @param device_id GPU device ID, defaults to 0 */ - explicit DeviceBuffer(holoinfer_datatype type = holoinfer_datatype::h_Float32); + explicit Buffer(holoinfer_datatype type = holoinfer_datatype::h_Float32, int device_id = 0) + : type_(type), device_id_(device_id) {} - /** - * @brief Construction with type and size - * - * @param size memory size to be allocated - * @param type Data type to be allocated - */ - DeviceBuffer(size_t size, holoinfer_datatype type); + virtual ~Buffer() = default; /** * @brief Get the data buffer * * @return Void pointer to the buffer */ - void* data(); + virtual void* data() = 0; /** - * @brief Get the size of the allocated buffer + * @brief Get the size of the allocated buffer in elements * - * @return size + * @return size in elements */ - size_t size() const; + virtual size_t size() const = 0; /** * @brief Get the bytes allocated * * @return allocated bytes */ - size_t get_bytes() const; + virtual size_t get_bytes() const = 0; /** - * @brief Resize the underlying buffer + * @brief Resize the underlying buffer, this is a no-op if the buffer is already large enough. * * @param number_of_elements Number of elements to be resized with */ - void resize(size_t number_of_elements); + virtual void resize(size_t number_of_elements) = 0; + + /** + * @brief Get the datatype + * + * @return datatype + */ + holoinfer_datatype get_datatype() const { return type_; } + + /** + * @brief Get the device ID + * + * @return device ID + */ + int get_device() const { return device_id_; } + + protected: + /// Datatype of the elements in the buffer + holoinfer_datatype type_; + /// Device ID + int device_id_; +}; + +/** + * @brief Cuda Device Buffer Class + */ +class DeviceBuffer : public Buffer { + public: + /** + * @brief Construction with default type + * + * @param type Data type, defaults to float32 + * @param device_id GPU device ID, defaults to 0 + */ + explicit DeviceBuffer(holoinfer_datatype type = holoinfer_datatype::h_Float32, int device_id = 0); + + /** + * @brief Construction with type and size + * + * @param size memory size to be allocated + * @param type Data type to be allocated + */ + DeviceBuffer(size_t size, holoinfer_datatype type); /** * @brief Destructor */ ~DeviceBuffer(); + /// Buffer class virtual members implemented by this class + ///@{ + void* data() override; + size_t size() const override; + size_t get_bytes() const override; + void resize(size_t number_of_elements) override; + ///@} + private: size_t size_{0}, capacity_{0}; - holoinfer_datatype type_ = holoinfer_datatype::h_Float32; void* buffer_ = nullptr; DeviceAllocator allocator_; DeviceFree free_; }; -class HostBuffer { +class HostBuffer : public Buffer { public: - /// @brief Constructor - /// @param data_type data type of the buffer + /** + * @brief Constructor + * + * @param data_type data type of the buffer + */ explicit HostBuffer(holoinfer_datatype data_type = holoinfer_datatype::h_Float32) - : type_(data_type) {} + : Buffer(data_type, -1) {} - /// @brief Get the buffer data on 
the host - /// @return void pointer to the buffer - void* data() { return static_cast(buffer_.data()); } - - /// @brief Get the number of elements in the buffer - /// @return size - size_t size() const { return number_of_elements_; } + /// Buffer class virtual members implemented by this class + ///@{ + void* data() override; + size_t size() const override; + size_t get_bytes() const override; + void resize(size_t number_of_elements) override; + ///@} /// @brief Set the data type and resize the buffer /// @param in_type input data type - void set_type(holoinfer_datatype in_type) { - type_ = in_type; - resize(size()); - } - - /// @brief Resize the underlying buffer on host - /// @param number_of_elements Number of elements to be resized with - void resize(size_t number_of_elements) { - buffer_.clear(); - number_of_elements_ = number_of_elements; - buffer_.resize(number_of_elements * get_element_size(type_)); - } + void set_type(holoinfer_datatype in_type); private: /// @brief Data buffer on host, stored as a vector of bytes std::vector buffer_; /// @brief Number of elements in the buffer size_t number_of_elements_{0}; - /// @brief Datatype of the elements in the buffer - holoinfer_datatype type_; }; /** - * @brief HoloInfer DataBuffer Class. Holds CPU based buffer as float32 vector and device buffer as - * a shared pointer. + * @brief HoloInfer DataBuffer Class. Holds CPU based buffer and device buffer. */ class DataBuffer { public: /** * @brief Constructor + * + * @param type Data type, defaults to float32 + * @param device_id GPU device ID, defaults to 0 */ explicit DataBuffer(holoinfer_datatype data_type = holoinfer_datatype::h_Float32, int device_id = 0); - std::shared_ptr device_buffer; - HostBuffer host_buffer; - holoinfer_datatype get_datatype() const { return type_; } - int get_device() const { return device_id_; } + std::shared_ptr device_buffer_; + std::shared_ptr host_buffer_; + + holoinfer_datatype get_datatype() const { return host_buffer_->get_datatype(); } private: - holoinfer_datatype type_; - int device_id_; + /// Helper class for backwards compatibility, forwards calls to other buffer class. 
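A small usage sketch of the member rename (the function name and the float-typed source are placeholders): the deprecated `device_buffer`/`host_buffer` members keep old call sites compiling through the forwarding helper described above, but new code should use the trailing-underscore members.

```cpp
#include <cstring>
#include <memory>

#include "holoinfer_buffer.hpp"  // include path depends on how the HoloInfer module is consumed

void fill_host_input(const std::shared_ptr<holoscan::inference::DataBuffer>& data_buffer,
                     const float* src, size_t elements) {
  // Deprecated since 2.6 (still works through the forwarding member):
  //   data_buffer->host_buffer.resize(elements);
  data_buffer->host_buffer_->resize(elements);
  std::memcpy(data_buffer->host_buffer_->data(), src, elements * sizeof(float));
}
```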
+ class BufferForward : public Buffer { + public: + explicit BufferForward(std::shared_ptr& buffer) : buffer_(buffer) {} + BufferForward() = delete; + + /// Buffer class virtual members implemented by this class + ///@{ + void* data() override { return buffer_->data(); } + size_t size() const override { return buffer_->size(); } + size_t get_bytes() const override { return buffer_->get_bytes(); } + void resize(size_t number_of_elements) override { buffer_->resize(number_of_elements); } + ///@} + + private: + const std::shared_ptr& buffer_; + }; + + public: + /// @deprecated since 2.6, use `device_buffer_` instead + const std::shared_ptr device_buffer{ + std::make_shared(device_buffer_)}; + /// @deprecated since 2.6, use `host_buffer_` instead + BufferForward host_buffer{host_buffer_}; + + private: + holoinfer_datatype data_type_; }; using DataMap = std::map>; @@ -208,6 +270,8 @@ struct InferenceSpecs { * @param device_map Map with model name as key, GPU ID for inference as value * @param temporal_map Map with model name as key, frame number to skip for inference as value * @param activation_map Map with key as model name and activation state for inference as value + * @param trt_opt_profile Vector of values for TensorRT optimization profile during engine + * creation * @param is_engine_path Input path to model is trt engine * @param oncpu Perform inference on CPU * @param parallel_proc Perform parallel inference of multiple models @@ -218,9 +282,9 @@ struct InferenceSpecs { InferenceSpecs(const std::string& backend, const Mappings& backend_map, const Mappings& model_path_map, const MultiMappings& pre_processor_map, const MultiMappings& inference_map, const Mappings& device_map, - const Mappings& temporal_map, const Mappings& activation_map, bool is_engine_path, - bool oncpu, bool parallel_proc, bool use_fp16, bool cuda_buffer_in, - bool cuda_buffer_out) + const Mappings& temporal_map, const Mappings& activation_map, + const std::vector& trt_opt_profile, bool is_engine_path, bool oncpu, + bool parallel_proc, bool use_fp16, bool cuda_buffer_in, bool cuda_buffer_out) : backend_type_(backend), backend_map_(backend_map), model_path_map_(model_path_map), @@ -229,6 +293,7 @@ struct InferenceSpecs { device_map_(device_map), temporal_map_(temporal_map), activation_map_(activation_map), + trt_opt_profile_(trt_opt_profile), is_engine_path_(is_engine_path), oncuda_(!oncpu), parallel_processing_(parallel_proc), @@ -272,7 +337,7 @@ struct InferenceSpecs { */ void set_activation_map(const Mappings& activation_map) { for (const auto& [key, value] : activation_map) { - if (activation_map_.find(key) != activation_map.end()) { activation_map_.at(key) = value; } + if (activation_map_.find(key) != activation_map_.end()) { activation_map_.at(key) = value; } } } @@ -300,6 +365,9 @@ struct InferenceSpecs { /// @brief Map with key as model name and activation state for inference as value Mappings activation_map_; + /// @brief TensorRT optimization profile during engine creation for dynamic inputs + std::vector trt_opt_profile_; + /// @brief Flag showing if input model path is path to engine files bool is_engine_path_ = false; @@ -343,4 +411,4 @@ InferStatus allocate_buffers(DataMap& buffers, std::vector& dims, } // namespace inference } // namespace holoscan -#endif /* HOLOINFER_SRC_INCLUDE_HOLOINFER_BUFFER_HPP */ +#endif /* MODULES_HOLOINFER_SRC_INCLUDE_HOLOINFER_BUFFER_HPP */ diff --git a/modules/holoinfer/src/infer/CMakeLists.txt b/modules/holoinfer/src/infer/CMakeLists.txt new file mode 100644 index 
00000000..278ab115 --- /dev/null +++ b/modules/holoinfer/src/infer/CMakeLists.txt @@ -0,0 +1,23 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +if(HOLOSCAN_BUILD_ORT) + add_subdirectory(onnx) +endif() +if(HOLOSCAN_BUILD_LIBTORCH) + add_subdirectory(torch) +endif() diff --git a/modules/holoinfer/src/infer/infer.hpp b/modules/holoinfer/src/infer/infer.hpp index 4501963f..d8b9a1be 100644 --- a/modules/holoinfer/src/infer/infer.hpp +++ b/modules/holoinfer/src/infer/infer.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -37,12 +37,19 @@ class InferBase { /** * @brief Does the Core inference + * The provided CUDA data event is used to prepare the input data any execution of CUDA work + * should be in sync with this event. If the inference is using CUDA it should record a CUDA + * event and pass it back in `cuda_event_inference`. + * * @param input_data Input DataBuffer * @param output_buffer Output DataBuffer, is populated with inferred results + * @param cuda_event_data CUDA event recorded after data transfer + * @param cuda_event_inference CUDA event recorded after inference * @return InferStatus * */ virtual InferStatus do_inference(const std::vector>& input_data, - std::vector>& output_buffer) { + std::vector>& output_buffer, + cudaEvent_t cuda_event_data, cudaEvent_t *cuda_event_inference) { return InferStatus(); } diff --git a/modules/holoinfer/src/infer/onnx/CMakeLists.txt b/modules/holoinfer/src/infer/onnx/CMakeLists.txt new file mode 100644 index 00000000..4d03ef1f --- /dev/null +++ b/modules/holoinfer/src/infer/onnx/CMakeLists.txt @@ -0,0 +1,45 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +find_package(ONNXRuntime REQUIRED) +add_library(holoinfer_ort SHARED core.cpp) +add_library(holoscan::infer::onnx_runtime ALIAS holoinfer_ort) +set_target_properties(holoinfer_ort PROPERTIES + OUTPUT_NAME holoscan_infer_onnx_runtime + EXPORT_NAME infer::onnx_runtime + SOVERSION ${PROJECT_VERSION_MAJOR} + VERSION ${PROJECT_VERSION} + INSTALL_RPATH_USE_LINK_PATH true # find onnxruntime in install tree +) +target_link_libraries(holoinfer_ort + PRIVATE + ONNXRuntime::ONNXRuntime + CUDA::cudart + holoscan::logger + GXF::core + yaml-cpp + holoscan_security_flags +) +target_include_directories(holoinfer_ort + PRIVATE + $ + $ + PUBLIC + $ + $ + $ + ) diff --git a/modules/holoinfer/src/infer/onnx/core.cpp b/modules/holoinfer/src/infer/onnx/core.cpp index 4e18715f..1eb28f2c 100644 --- a/modules/holoinfer/src/infer/onnx/core.cpp +++ b/modules/holoinfer/src/infer/onnx/core.cpp @@ -24,6 +24,8 @@ #include #include +#include + namespace holoscan { namespace inference { @@ -79,7 +81,8 @@ class OnnxInferImpl { // Wrapped Public APIs InferStatus do_inference(const std::vector>& input_buffer, - std::vector>& output_buffer); + std::vector>& output_buffer, + cudaEvent_t cuda_event_data, cudaEvent_t* cuda_event_inference); void populate_model_details(); void print_model_details(); int set_holoscan_inf_onnx_session_options(); @@ -96,7 +99,7 @@ Ort::Value create_tensor_core(const std::shared_ptr& input_buffer, size_t input_tensor_size = accumulate(dims.begin(), dims.end(), 1, std::multiplies()); return Ort::Value::CreateTensor(memory_info_, - static_cast(input_buffer->host_buffer.data()), + static_cast(input_buffer->host_buffer_->data()), input_tensor_size, dims.data(), dims.size()); @@ -105,7 +108,7 @@ Ort::Value create_tensor_core(const std::shared_ptr& input_buffer, template void transfer_to_host(std::shared_ptr& output_buffer, Ort::Value& output_tensor, const size_t& output_tensor_size) { - memcpy(output_buffer->host_buffer.data(), + memcpy(output_buffer->host_buffer_->data(), output_tensor.GetTensorMutableData(), output_tensor_size * sizeof(T)); } @@ -294,16 +297,21 @@ void OnnxInferImpl::transfer_to_output(std::vector>& } InferStatus OnnxInfer::do_inference(const std::vector>& input_buffer, - std::vector>& output_buffer) { - return impl_->do_inference(input_buffer, output_buffer); + std::vector>& output_buffer, + cudaEvent_t cuda_event_data, + cudaEvent_t* cuda_event_inference) { + return impl_->do_inference(input_buffer, output_buffer, cuda_event_data, cuda_event_inference); } InferStatus OnnxInferImpl::do_inference( const std::vector>& input_buffer, - std::vector>& output_buffer) { + std::vector>& output_buffer, cudaEvent_t cuda_event_data, + cudaEvent_t* cuda_event_inference) { InferStatus status = InferStatus(holoinfer_code::H_ERROR); try { + check_cuda(cudaEventSynchronize(cuda_event_data)); + input_tensors_.clear(); output_tensors_.clear(); @@ -317,7 +325,7 @@ InferStatus OnnxInferImpl::do_inference( } for (size_t a = 0; a < input_buffer.size(); a++) { - if (input_buffer[a]->host_buffer.size() == 0) { + if (input_buffer[a]->host_buffer_->size() == 0) { status.set_message("ONNX inference core: Input Host buffer empty."); return status; } @@ -332,7 +340,7 @@ InferStatus OnnxInferImpl::do_inference( } for (unsigned int a = 0; a < output_buffer.size(); a++) { - if (output_buffer[a]->host_buffer.size() == 0) { + if (output_buffer[a]->host_buffer_->size() == 0) { status.set_message("ONNX inference core: Output Host buffer empty."); return status; } diff --git 
a/modules/holoinfer/src/infer/onnx/core.hpp b/modules/holoinfer/src/infer/onnx/core.hpp index edcab077..5d5840ae 100644 --- a/modules/holoinfer/src/infer/onnx/core.hpp +++ b/modules/holoinfer/src/infer/onnx/core.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -55,12 +55,17 @@ class OnnxInfer : public InferBase { /** * @brief Does the Core inference using Onnxruntime. Input and output buffer are supported on * Host. Inference is supported on host and device. + * The provided CUDA data event is used to prepare the input data any execution of CUDA work + * should be in sync with this event. If the inference is using CUDA it should record a CUDA + * event and pass it back in `cuda_event_inference`. + * * @param input_data Input DataBuffer * @param output_buffer Output DataBuffer, is populated with inferred results * @return InferStatus * */ InferStatus do_inference(const std::vector>& input_data, - std::vector>& output_buffer); + std::vector>& output_buffer, + cudaEvent_t cuda_event_data, cudaEvent_t *cuda_event_inference); /** * @brief Populate class parameters with model details and values diff --git a/modules/holoinfer/src/infer/torch/CMakeLists.txt b/modules/holoinfer/src/infer/torch/CMakeLists.txt new file mode 100644 index 00000000..82ff29bd --- /dev/null +++ b/modules/holoinfer/src/infer/torch/CMakeLists.txt @@ -0,0 +1,50 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# It is necessary to set the TORCH_CUDA_ARCH_LIST explicitly for newer versions of Torch (circa v2.3.0) +# Note: `find_package(Torch)` is overwriting CUDA_ARCHITECTURES, make sure to pay attention to this +# when adding CUDA kernels to this the Torch backend library. 
+set(TORCH_CUDA_ARCH_LIST "5.0 8.0 8.6 8.9 9.0") +find_package(Torch REQUIRED) +find_package(TorchVision REQUIRED) +add_library(holoinfer_torch SHARED core.cpp) +add_library(holoscan::infer::torch ALIAS holoinfer_torch) +set_target_properties(holoinfer_torch PROPERTIES + OUTPUT_NAME holoscan_infer_torch + EXPORT_NAME infer::torch + SOVERSION ${PROJECT_VERSION_MAJOR} + VERSION ${PROJECT_VERSION} + INSTALL_RPATH_USE_LINK_PATH true # find libtorch and torchvision in install tree +) +target_link_libraries(holoinfer_torch + PRIVATE + torch + -Wl,--no-as-needed TorchVision::TorchVision # torch plugin + holoscan::logger + GXF::core + yaml-cpp + holoscan_security_flags +) +target_include_directories(holoinfer_torch + PRIVATE + $ + $ + PUBLIC + $ + $ + $ + ) diff --git a/modules/holoinfer/src/infer/torch/core.cpp b/modules/holoinfer/src/infer/torch/core.cpp index fee5395e..539b8f4e 100644 --- a/modules/holoinfer/src/infer/torch/core.cpp +++ b/modules/holoinfer/src/infer/torch/core.cpp @@ -35,6 +35,7 @@ class TorchInferImpl { public: TorchInferImpl(const std::string& model_file_path, bool cuda_flag, bool cuda_buf_in, bool cuda_buf_out); + ~TorchInferImpl(); std::string model_path_{""}; size_t input_nodes_{0}, output_nodes_{0}; @@ -58,8 +59,9 @@ class TorchInferImpl { torch::DeviceType input_device_; torch::DeviceType output_device_; - c10::cuda::CUDAStream infer_stream = c10::cuda::getStreamFromPool(); - std::unique_ptr stream_guard; + c10::cuda::CUDAStream infer_stream_ = c10::cuda::getStreamFromPool(); + std::unique_ptr stream_guard_; + cudaEvent_t cuda_event_ = nullptr; void print_model_details(); @@ -84,6 +86,16 @@ torch::Tensor create_tensor_core(const std::shared_ptr& input_buffer return torch::empty({0}); } + if (input_device == torch::kCPU) { + if (input_buffer->host_buffer_->size() != input_tensor_size) { + HOLOSCAN_LOG_ERROR("Torch: Input host buffer size mismatch."); + return torch::empty({0}); + } + } else if (input_buffer->device_buffer_->size() != input_tensor_size) { + HOLOSCAN_LOG_ERROR("Torch: Input device buffer size mismatch."); + return torch::empty({0}); + } + int64_t width = dims[dims.size() - 1], height = dims[dims.size() - 2], channels = dims[dims.size() - 3]; @@ -94,11 +106,11 @@ torch::Tensor create_tensor_core(const std::shared_ptr& input_buffer if (input_device == torch::kCPU) { if (infer_device == torch::kCPU) { std::memcpy(tensor.data_ptr(), - reinterpret_cast(input_buffer->host_buffer.data()), + reinterpret_cast(input_buffer->host_buffer_->data()), input_tensor_size * sizeof(T)); } else { auto cstatus = cudaMemcpyAsync(tensor.data_ptr(), - reinterpret_cast(input_buffer->host_buffer.data()), + reinterpret_cast(input_buffer->host_buffer_->data()), input_tensor_size * sizeof(T), cudaMemcpyHostToDevice, cstream); @@ -106,16 +118,14 @@ torch::Tensor create_tensor_core(const std::shared_ptr& input_buffer HOLOSCAN_LOG_ERROR("Torch: HtoD transfer failed: {}", cudaGetErrorString(cstatus)); return torch::empty({0}); } - cstatus = cudaStreamSynchronize(cstream); - if (cstatus != cudaSuccess) { - HOLOSCAN_LOG_ERROR("Cuda stream synchronization failed: {}", cudaGetErrorString(cstatus)); - return torch::empty({0}); - } + // When copying from pagable memory to device memory cudaMemcpyAsync() is copying the memory + // to staging memory first and therefore is synchronous with the host execution. No need to + // synchronize here. 
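The comment above (and its twins in the device-to-host and TRT paths below) relies on documented cudaMemcpyAsync() behavior: copies that involve pageable host memory are staged and complete with respect to the host before the call returns, so the cudaStreamSynchronize() calls that used to follow them are unnecessary. A small sketch of the distinction, with a hypothetical destination pointer and error checks omitted:

#include <cuda_runtime.h>
#include <vector>

void host_to_device_examples(void* device_dst, size_t bytes, cudaStream_t stream) {
  // Pageable host source: the data is staged into pinned memory before the
  // call returns, so the copy is synchronous with host execution and no
  // stream synchronization is needed to protect the source buffer.
  std::vector<char> pageable(bytes);
  cudaMemcpyAsync(device_dst, pageable.data(), bytes, cudaMemcpyHostToDevice, stream);

  // Pinned host source: the copy truly overlaps with the host, so the stream
  // must be synchronized (or an event used) before the buffer is reused or freed.
  void* pinned = nullptr;
  cudaMallocHost(&pinned, bytes);
  cudaMemcpyAsync(device_dst, pinned, bytes, cudaMemcpyHostToDevice, stream);
  cudaStreamSynchronize(stream);
  cudaFreeHost(pinned);
}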
} } else { if (infer_device == torch::kCPU) { auto cstatus = cudaMemcpyAsync(tensor.data_ptr(), - reinterpret_cast(input_buffer->device_buffer->data()), + reinterpret_cast(input_buffer->device_buffer_->data()), input_tensor_size * sizeof(T), cudaMemcpyDeviceToHost, cstream); @@ -123,14 +133,11 @@ torch::Tensor create_tensor_core(const std::shared_ptr& input_buffer HOLOSCAN_LOG_ERROR("Torch: DtoH transfer failed: {}", cudaGetErrorString(cstatus)); return torch::empty({0}); } - cstatus = cudaStreamSynchronize(cstream); - if (cstatus != cudaSuccess) { - HOLOSCAN_LOG_ERROR("Cuda stream synchronization failed: {}", cudaGetErrorString(cstatus)); - return torch::empty({0}); - } + // When copying from device memory to pagable memory the call is synchronous with the host + // execution. No need to synchronize here. } else { auto cstatus = cudaMemcpyAsync(tensor.data_ptr(), - input_buffer->device_buffer->data(), + input_buffer->device_buffer_->data(), input_tensor_size * sizeof(T), cudaMemcpyDeviceToDevice, cstream); @@ -149,7 +156,7 @@ torch::Tensor create_tensor_core(const std::shared_ptr& input_buffer torch::Tensor TorchInferImpl::create_tensor(const std::shared_ptr& input_buffer, const std::vector& dims) { auto data_type = input_buffer->get_datatype(); - auto cstream = infer_stream.stream(); + auto cstream = infer_stream_.stream(); switch (data_type) { case holoinfer_datatype::h_Float32: @@ -183,9 +190,9 @@ InferStatus transfer_from_tensor(std::shared_ptr& output_buffer, cudaStream_t cstream) { size_t output_tensor_size = output_tensor.numel(); if (output_device == torch::kCUDA) { - output_buffer->device_buffer->resize(output_tensor_size); + output_buffer->device_buffer_->resize(output_tensor_size); } else { - output_buffer->host_buffer.resize(output_tensor_size); + output_buffer->host_buffer_->resize(output_tensor_size); } // Populate dims for data transmission @@ -198,11 +205,11 @@ InferStatus transfer_from_tensor(std::shared_ptr& output_buffer, if (output_device == torch::kCPU) { if (infer_device == torch::kCPU) { - memcpy(output_buffer->host_buffer.data(), + memcpy(output_buffer->host_buffer_->data(), output_tensor.data_ptr(), output_tensor_size * sizeof(T)); } else { - auto cstatus = cudaMemcpyAsync(output_buffer->host_buffer.data(), + auto cstatus = cudaMemcpyAsync(output_buffer->host_buffer_->data(), output_tensor.data_ptr(), output_tensor_size * sizeof(T), cudaMemcpyDeviceToHost, @@ -211,16 +218,12 @@ InferStatus transfer_from_tensor(std::shared_ptr& output_buffer, HOLOSCAN_LOG_ERROR("Torch: DtoH transfer failed: {}", cudaGetErrorString(cstatus)); return InferStatus(holoinfer_code::H_ERROR, "Torch core, DtoH transfer."); } - cstatus = cudaStreamSynchronize(cstream); - if (cstatus != cudaSuccess) { - HOLOSCAN_LOG_ERROR("Torch: Cuda stream synchronization failed: {}", - cudaGetErrorString(cstatus)); - return InferStatus(holoinfer_code::H_ERROR, "Torch core, Stream synchronization."); - } + // When copying from device memory to pagable memory the call is synchronous with the host + // execution. No need to synchronize here. 
} } else { if (infer_device == torch::kCPU) { - auto cstatus = cudaMemcpyAsync(output_buffer->device_buffer->data(), + auto cstatus = cudaMemcpyAsync(output_buffer->device_buffer_->data(), output_tensor.data_ptr(), output_tensor_size * sizeof(T), cudaMemcpyHostToDevice, @@ -229,14 +232,11 @@ InferStatus transfer_from_tensor(std::shared_ptr& output_buffer, HOLOSCAN_LOG_ERROR("Torch: HtoD transfer failed: {}", cudaGetErrorString(cstatus)); return InferStatus(holoinfer_code::H_ERROR, "Torch core, HtoD transfer."); } - cstatus = cudaStreamSynchronize(cstream); - if (cstatus != cudaSuccess) { - HOLOSCAN_LOG_ERROR("Torch: Cuda stream synchronization failed: {}", - cudaGetErrorString(cstatus)); - return InferStatus(holoinfer_code::H_ERROR, "Torch core, Stream synchronization."); - } + // When copying from pagable memory to device memory cudaMemcpyAsync() is copying the memory + // to staging memory first and therefore is synchronous with the host execution. No need to + // synchronize here. } else { - auto cstatus = cudaMemcpyAsync(output_buffer->device_buffer->data(), + auto cstatus = cudaMemcpyAsync(output_buffer->device_buffer_->data(), output_tensor.data_ptr(), output_tensor_size * sizeof(T), cudaMemcpyDeviceToDevice, @@ -255,7 +255,7 @@ InferStatus TorchInferImpl::transfer_to_output( const size_t& index) { auto data_type = output_buffer[index]->get_datatype(); out_torch_tensor = out_torch_tensor.contiguous().flatten(); - auto cstream = infer_stream.stream(); + auto cstream = infer_stream_.stream(); switch (data_type) { case holoinfer_datatype::h_Float32: @@ -404,8 +404,9 @@ TorchInferImpl::TorchInferImpl(const std::string& model_file_path, bool cuda_fla bool cuda_buf_out) : model_path_(model_file_path) { try { - infer_stream = c10::cuda::getStreamFromPool(true); - stream_guard = std::make_unique(infer_stream); + infer_stream_ = c10::cuda::getStreamFromPool(true); + stream_guard_ = std::make_unique(infer_stream_); + check_cuda(cudaEventCreateWithFlags(&cuda_event_, cudaEventDisableTiming)); auto status = populate_model_details(); if (status.get_code() != holoinfer_code::H_SUCCESS) { @@ -437,11 +438,21 @@ TorchInferImpl::TorchInferImpl(const std::string& model_file_path, bool cuda_fla } catch (...) 
{ throw; } } +TorchInferImpl::~TorchInferImpl() { + if (cuda_event_) { cudaEventDestroy(cuda_event_); } +} + InferStatus TorchInfer::do_inference(const std::vector>& input_buffer, - std::vector>& output_buffer) { + std::vector>& output_buffer, + cudaEvent_t cuda_event_data, + cudaEvent_t* cuda_event_inference) { InferStatus status = InferStatus(holoinfer_code::H_ERROR); - impl_->stream_guard->reset_stream(impl_->infer_stream); + // synchronize the CUDA stream used for inference with the CUDA event recorded when preparing + // the input data + check_cuda(cudaStreamWaitEvent(impl_->infer_stream_.stream(), cuda_event_data)); + + impl_->stream_guard_->reset_stream(impl_->infer_stream_); if (impl_->input_nodes_ != input_buffer.size()) { status.set_message("Torch inference core: Input buffer size not equal to input nodes."); @@ -458,11 +469,6 @@ InferStatus TorchInfer::do_inference(const std::vectorinputs_.clear(); for (size_t a = 0; a < input_buffer.size(); a++) { - if (input_buffer[a]->host_buffer.size() == 0) { - status.set_message("Torch inference core: Input Host buffer empty."); - return status; - } - auto i_tensor = impl_->create_tensor(input_buffer[a], impl_->input_dims_[a]); if (i_tensor.numel() == 0) { @@ -514,7 +520,7 @@ InferStatus TorchInfer::do_inference(const std::vectoroutput_names_[a]).toTensor(); auto status = impl_->transfer_to_output(output_buffer, std::move(current_tensor), a); if (status.get_code() != holoinfer_code::H_SUCCESS) { - HOLOSCAN_LOG_ERROR("Transfer of Tensor {} failed in inferece core.", + HOLOSCAN_LOG_ERROR("Transfer of Tensor {} failed in inference core.", impl_->output_names_[a]); return status; } @@ -556,11 +562,17 @@ InferStatus TorchInfer::do_inference(const std::vectoroutput_tensors_[a]; auto status = impl_->transfer_to_output(output_buffer, std::move(current_tensor), a); - HOLOSCAN_LOG_ERROR("Transfer of Tensor {} failed in inferece core.", - impl_->output_names_[a]); - return status; + if (status.get_code() != holoinfer_code::H_SUCCESS) { + HOLOSCAN_LOG_ERROR("Transfer of Tensor {} failed in inference core.", + impl_->output_names_[a]); + return status; + } } } + + // record a CUDA event and pass it back to the caller + check_cuda(cudaEventRecord(impl_->cuda_event_, impl_->infer_stream_.stream())); + *cuda_event_inference = impl_->cuda_event_; } catch (const c10::Error& exception) { HOLOSCAN_LOG_ERROR(exception.what()); throw; diff --git a/modules/holoinfer/src/infer/torch/core.hpp b/modules/holoinfer/src/infer/torch/core.hpp index 76d3549b..f3166782 100644 --- a/modules/holoinfer/src/infer/torch/core.hpp +++ b/modules/holoinfer/src/infer/torch/core.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -60,12 +60,17 @@ class TorchInfer : public InferBase { /** * @brief Does the Core inference. + * The provided CUDA data event is used to prepare the input data any execution of CUDA work + * should be in sync with this event. If the inference is using CUDA it should record a CUDA + * event and pass it back in `cuda_event_inference`. 
+ * * @param input_data Vector of Input DataBuffer * @param output_buffer Vector of Output DataBuffer, is populated with inferred results * @return InferStatus * */ InferStatus do_inference(const std::vector>& input_data, - std::vector>& output_buffer); + std::vector>& output_buffer, + cudaEvent_t cuda_event_data, cudaEvent_t *cuda_event_inference); /** * @brief Populate class parameters with model details and values diff --git a/modules/holoinfer/src/infer/trt/core.cpp b/modules/holoinfer/src/infer/trt/core.cpp index 388d0ba3..b4c691d9 100644 --- a/modules/holoinfer/src/infer/trt/core.cpp +++ b/modules/holoinfer/src/infer/trt/core.cpp @@ -24,18 +24,35 @@ #include #include +#include "holoinfer_utils.hpp" + namespace holoscan { namespace inference { -TrtInfer::TrtInfer(const std::string& model_path, const std::string& model_name, int device_id, +TrtInfer::TrtInfer(const std::string& model_path, const std::string& model_name, + const std::vector& trt_opt_profile, int device_id, int device_id_dt, bool enable_fp16, bool is_engine_path, bool cuda_buf_in, bool cuda_buf_out) : model_path_(model_path), model_name_(model_name), + trt_opt_profile_(trt_opt_profile), device_id_(device_id), enable_fp16_(enable_fp16), is_engine_path_(is_engine_path), cuda_buf_in_(cuda_buf_in), cuda_buf_out_(cuda_buf_out) { + if (trt_opt_profile.size() != 3) { + HOLOSCAN_LOG_WARN( + "TRT Inference: Optimization profile must of of size 3. Size from inference parameters: " + "{}", + trt_opt_profile.size()); + HOLOSCAN_LOG_INFO("Input optimization profile ignored. Using default optimization profile"); + } else { + // set the network optimization profile for dynamic inputs + network_options_.batch_sizes[0] = trt_opt_profile_[0]; + network_options_.batch_sizes[1] = trt_opt_profile_[1]; + network_options_.batch_sizes[2] = trt_opt_profile_[2]; + } + // Set the device index network_options_.device_index = device_id_; @@ -61,6 +78,15 @@ TrtInfer::TrtInfer(const std::string& model_path, const std::string& model_name, engine_path_ = model_path_; } + check_cuda(cudaSetDevice(device_id_)); + // Create the CUDA stream with the non-blocking flags set. This is needed for CUDA stream + // capturing since capturing fails if another thread is scheduling work to stream '0' while + // we capture in this thread. We explicitly synchronize with the caller using events so stream + // '0' does not need to sync with us. 
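On the optimization-profile handling earlier in this hunk: the three trt_opt_profile_ values are the minimum, optimum and maximum batch sizes used when building an engine with a dynamic batch dimension. The engine-build code is not part of this diff, so the following is only a hedged sketch of how such a triple is typically attached to a TensorRT builder configuration; the builder, config, input name and base dims parameters are assumptions.

#include <NvInfer.h>
#include <cstdint>

// Hedged sketch: map a {min, opt, max} batch triple onto a TensorRT
// optimization profile for a dynamic-batch input.
bool add_batch_profile(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
                       const char* input_name, nvinfer1::Dims base_dims,
                       int32_t min_batch, int32_t opt_batch, int32_t max_batch) {
  nvinfer1::IOptimizationProfile* profile = builder->createOptimizationProfile();
  auto with_batch = [&base_dims](int32_t batch) {
    nvinfer1::Dims d = base_dims;
    d.d[0] = batch;  // assumes the first dimension is the dynamic batch
    return d;
  };
  profile->setDimensions(input_name, nvinfer1::OptProfileSelector::kMIN, with_batch(min_batch));
  profile->setDimensions(input_name, nvinfer1::OptProfileSelector::kOPT, with_batch(opt_batch));
  profile->setDimensions(input_name, nvinfer1::OptProfileSelector::kMAX, with_batch(max_batch));
  // addOptimizationProfile() returns the profile index, or -1 on failure.
  return config->addOptimizationProfile(profile) >= 0;
}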
+ check_cuda(cudaStreamCreateWithFlags(&cuda_stream_, cudaStreamNonBlocking)); + // create the CUDA event used to synchronize with the caller + check_cuda(cudaEventCreateWithFlags(&cuda_event_, cudaEventDisableTiming)); + bool status = load_engine(); if (!status) { throw std::runtime_error("TRT Inference: failed to load TRT engine file."); } @@ -72,6 +98,8 @@ TrtInfer::~TrtInfer() { if (context_) { context_.reset(); } if (engine_) { engine_.reset(); } if (cuda_stream_) { cudaStreamDestroy(cuda_stream_); } + if (cuda_event_) { cudaEventDestroy(cuda_event_); } + if (cuda_graph_instance_) { cudaGraphExecDestroy(cuda_graph_instance_); } if (infer_runtime_) { infer_runtime_.reset(); } } @@ -113,12 +141,6 @@ bool TrtInfer::load_engine() { return false; } - status = cudaStreamCreate(&cuda_stream_); - if (status != 0) { - HOLOSCAN_LOG_ERROR("Load Engine: Cuda stream creation failed."); - throw std::runtime_error("Unable to create cuda stream"); - } - HOLOSCAN_LOG_INFO("Engine loaded: {}", engine_path_); return true; } @@ -222,49 +244,50 @@ bool TrtInfer::initialize_parameters() { } InferStatus TrtInfer::do_inference(const std::vector>& input_buffers, - std::vector>& output_buffers) { + std::vector>& output_buffers, + cudaEvent_t cuda_event_data, cudaEvent_t* cuda_event_inference) { InferStatus status = InferStatus(holoinfer_code::H_ERROR); auto io_index = 0; + // synchronize the CUDA stream used for inference with the CUDA event recorded when preparing + // the input data + check_cuda(cudaStreamWaitEvent(cuda_stream_, cuda_event_data)); + for (auto& input_buffer : input_buffers) { - if (input_buffer->device_buffer == nullptr) { + if (input_buffer->device_buffer_ == nullptr) { status.set_message(" TRT inference core: Input Device buffer is null."); return status; } - if (input_buffer->device_buffer->data() == nullptr) { + if (input_buffer->device_buffer_->data() == nullptr) { status.set_message(" TRT inference core: Data in Input Device buffer is null."); return status; } - // Host to Device transfer + // Host to Device transfer if (!cuda_buf_in_) { - if (input_buffer->host_buffer.size() == 0) { + if (input_buffer->host_buffer_->size() == 0) { status.set_message(" TRT inference core: Empty input host buffer."); return status; } - if (input_buffer->device_buffer->size() != input_buffer->host_buffer.size()) { + + if (input_buffer->device_buffer_->size() != input_buffer->host_buffer_->size()) { status.set_message(" TRT inference core: Input Host and Device buffer size mismatch."); return status; } - auto cstatus = cudaMemcpyAsync(input_buffer->device_buffer->data(), - input_buffer->host_buffer.data(), - input_buffer->device_buffer->get_bytes(), + + auto cstatus = cudaMemcpyAsync(input_buffer->device_buffer_->data(), + input_buffer->host_buffer_->data(), + input_buffer->device_buffer_->get_bytes(), cudaMemcpyHostToDevice, cuda_stream_); if (cstatus != cudaSuccess) { status.set_message(" TRT inference core: Host to device transfer failed."); return status; } - cstatus = cudaStreamSynchronize(cuda_stream_); - if (cstatus != cudaSuccess) { - status.set_message(" TRT: Cuda stream synchronization failed"); - return status; - } - if (input_buffer->device_buffer->size() == 0) { - status.set_message(" TRT inference core: Input Device buffer size is 0."); - return status; - } + // When copying from pagable memory to device memory cudaMemcpyAsync() is copying the memory + // to staging memory first and therefore is synchronous with the host execution. No need to + // synchronize here. 
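check_cuda() (pulled in via holoinfer_utils.hpp at the top of this file) wraps every CUDA runtime call in these hunks. Its implementation is outside this diff; the helper below is only an assumed shape, shown for readers unfamiliar with the pattern.

#include <cuda_runtime.h>
#include <stdexcept>
#include <string>

// Assumed shape only: the real check_cuda() lives in holoinfer_utils.hpp and
// may differ. The idea is to turn any non-success cudaError_t into an error
// with a readable message instead of silently continuing.
inline void check_cuda_example(cudaError_t result) {
  if (result != cudaSuccess) {
    throw std::runtime_error(std::string("CUDA runtime call failed: ") +
                             cudaGetErrorString(result));
  }
}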
} auto tensor_name = engine_->getIOTensorName(io_index++); if (engine_->getTensorIOMode(tensor_name) != nvinfer1::TensorIOMode::kINPUT) { @@ -272,7 +295,7 @@ InferStatus TrtInfer::do_inference(const std::vector status.set_message(" TRT inference core: Incorrect input tensor name."); return status; } - auto set_flag = context_->setTensorAddress(tensor_name, input_buffer->device_buffer->data()); + auto set_flag = context_->setTensorAddress(tensor_name, input_buffer->device_buffer_->data()); if (!set_flag) { HOLOSCAN_LOG_ERROR("Buffer binding failed for {} in inference core.", tensor_name); @@ -282,16 +305,16 @@ InferStatus TrtInfer::do_inference(const std::vector } for (auto& output_buffer : output_buffers) { - if (output_buffer->device_buffer == nullptr) { + if (output_buffer->device_buffer_ == nullptr) { status.set_message(" TRT inference core: Output Device buffer is null."); return status; } - if (output_buffer->device_buffer->data() == nullptr) { + if (output_buffer->device_buffer_->data() == nullptr) { status.set_message(" TRT inference core: Data in Output Device buffer is null."); return status; } - if (output_buffer->device_buffer->size() == 0) { + if (output_buffer->device_buffer_->size() == 0) { status.set_message(" TRT inference core: Output Device buffer size is 0."); return status; } @@ -302,7 +325,7 @@ InferStatus TrtInfer::do_inference(const std::vector return status; } - auto set_flag = context_->setTensorAddress(tensor_name, output_buffer->device_buffer->data()); + auto set_flag = context_->setTensorAddress(tensor_name, output_buffer->device_buffer_->data()); if (!set_flag) { HOLOSCAN_LOG_ERROR("Buffer binding failed for {} in inference core.", tensor_name); @@ -311,47 +334,87 @@ InferStatus TrtInfer::do_inference(const std::vector } } - bool infer_status = false; - infer_status = context_->enqueueV3(cuda_stream_); + bool capturing_graph = false; + if (use_cuda_graph_) { + // TRT works in two phases, the first phase can't be capture. Start capturing after the + // first phase. + // See https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#cuda-graphs for + // more information. + if (!first_phase_) { + check_cuda(cudaStreamBeginCapture(cuda_stream_, cudaStreamCaptureModeThreadLocal)); + capturing_graph = true; + } + first_phase_ = false; + } + const bool infer_status = context_->enqueueV3(cuda_stream_); if (!infer_status) { + if (capturing_graph) { + // end the capture and destroy the graph when inference failed and we are capturing + cudaGraph_t cuda_graph = nullptr; + check_cuda(cudaStreamEndCapture(cuda_stream_, &cuda_graph)); + check_cuda(cudaGraphDestroy(cuda_graph)); + } status.set_message(" TRT inference core: Inference failure."); return status; } + if (capturing_graph) { + cudaGraph_t cuda_graph = nullptr; + check_cuda(cudaStreamEndCapture(cuda_stream_, &cuda_graph)); + + // If we've already instantiated the graph, try to update it directly and avoid the + // instantiation overhead + cudaGraphExecUpdateResultInfo update_result; + if (cuda_graph_instance_) { + check_cuda(cudaGraphExecUpdate(cuda_graph_instance_, cuda_graph, &update_result)); + } + + // Instantiate during the first iteration or whenever the update fails for any reason + if (!cuda_graph_instance_ || (update_result.result != cudaGraphExecUpdateSuccess)) { + // If a previous update failed, destroy the cudaGraphExec_t before re-instantiating it + if (cuda_graph_instance_ != NULL) { check_cuda(cudaGraphExecDestroy(cuda_graph_instance_)); } + + // Instantiate graphExec from graph. 
The error node and error message parameters are unused + // here. + check_cuda(cudaGraphInstantiate(&cuda_graph_instance_, cuda_graph, nullptr, nullptr, 0)); + } + + check_cuda(cudaGraphDestroy(cuda_graph)); + + // now launch the graph + check_cuda(cudaGraphLaunch(cuda_graph_instance_, cuda_stream_)); + } + if (!cuda_buf_out_) { for (auto& output_buffer : output_buffers) { - if (output_buffer->host_buffer.size() == 0) { + if (output_buffer->host_buffer_->size() == 0) { status.set_message(" TRT inference core: Empty output host buffer."); return status; } - if (output_buffer->device_buffer->size() != output_buffer->host_buffer.size()) { + if (output_buffer->device_buffer_->size() != output_buffer->host_buffer_->size()) { status.set_message(" TRT inference core: Output Host and Device buffer size mismatch."); return status; } // Copy the results back to CPU memory - auto cstatus = cudaMemcpyAsync(output_buffer->host_buffer.data(), - output_buffer->device_buffer->data(), - output_buffer->device_buffer->get_bytes(), + auto cstatus = cudaMemcpyAsync(output_buffer->host_buffer_->data(), + output_buffer->device_buffer_->data(), + output_buffer->device_buffer_->get_bytes(), cudaMemcpyDeviceToHost, cuda_stream_); if (cstatus != cudaSuccess) { status.set_message(" TRT: Device to host transfer failed"); return status; } - cstatus = cudaStreamSynchronize(cuda_stream_); - if (cstatus != cudaSuccess) { - status.set_message(" TRT: Cuda stream synchronization failed"); - return status; - } + // When copying from device memory to pagable memory the call is synchronous with the host + // execution. No need to synchronize here. } } - auto cstatus = cudaStreamSynchronize(cuda_stream_); - if (cstatus != cudaSuccess) { - status.set_message(" TRT: Cuda stream synchronization failed"); - return status; - } + // record a CUDA event and pass it back to the caller + check_cuda(cudaEventRecord(cuda_event_, cuda_stream_)); + *cuda_event_inference = cuda_event_; + return InferStatus(); } diff --git a/modules/holoinfer/src/infer/trt/core.hpp b/modules/holoinfer/src/infer/trt/core.hpp index 01013720..f8e9a1ae 100644 --- a/modules/holoinfer/src/infer/trt/core.hpp +++ b/modules/holoinfer/src/infer/trt/core.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -36,7 +36,8 @@ class TrtInfer : public InferBase { /** * @brief Constructor */ - TrtInfer(const std::string& model_path, const std::string& model_name, int device_id, + TrtInfer(const std::string& model_path, const std::string& model_name, + const std::vector& trt_opt_profile, int device_id, int device_id_dt, bool enable_fp16, bool is_engine_path, bool cuda_buf_in, bool cuda_buf_out); /** @@ -46,12 +47,19 @@ class TrtInfer : public InferBase { /** * @brief Does the Core inference with TRT backend + * The provided CUDA data event is used to prepare the input data any execution of CUDA work + * should be in sync with this event. If the inference is using CUDA it should record a CUDA + * event and pass it back in `cuda_event_inference`. 
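The capture logic above follows the standard CUDA Graphs replay cycle: skip TensorRT's first enqueue (its setup phase cannot be captured), capture subsequent enqueues, update the executable graph in place when possible, re-instantiate otherwise, and launch the instance. A condensed, self-contained sketch of that cycle, with a placeholder for the enqueued work and error handling omitted:

#include <cuda_runtime.h>

// 'enqueue_work' stands in for context_->enqueueV3(); the first-phase skip is
// omitted for brevity.
void run_with_cuda_graph(cudaStream_t stream, cudaGraphExec_t* graph_exec,
                         void (*enqueue_work)(cudaStream_t)) {
  cudaGraph_t graph = nullptr;

  // Capture the work enqueued on this stream into a graph.
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
  enqueue_work(stream);
  cudaStreamEndCapture(stream, &graph);

  // Prefer updating an existing executable graph in place; fall back to
  // re-instantiation on the first iteration or when the update fails.
  if (*graph_exec != nullptr) {
    cudaGraphExecUpdateResultInfo info{};
    if (cudaGraphExecUpdate(*graph_exec, graph, &info) != cudaSuccess) {
      cudaGraphExecDestroy(*graph_exec);
      *graph_exec = nullptr;
    }
  }
  if (*graph_exec == nullptr) {
    // Error node and log buffer parameters unused, as in the code above.
    cudaGraphInstantiate(graph_exec, graph, nullptr, nullptr, 0);
  }

  // The captured graph is no longer needed once an executable instance exists.
  cudaGraphDestroy(graph);

  // Replay the whole captured workload with a single launch.
  cudaGraphLaunch(*graph_exec, stream);
}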
+ * * @param input_data Input DataBuffer * @param output_buffer Output DataBuffer, is populated with inferred results + * @param cuda_event_data CUDA event recorded after data transfer + * @param cuda_event_inference CUDA event recorded after inference * @return InferStatus * */ InferStatus do_inference(const std::vector>& input_data, - std::vector>& output_buffer); + std::vector>& output_buffer, + cudaEvent_t cuda_event_data, cudaEvent_t *cuda_event_inference); /** * @brief Get input data dimensions to the model @@ -100,6 +108,9 @@ class TrtInfer : public InferBase { /// @brief Vector of output data types std::vector out_data_types_; + /// @brief Vector of trt optimization profile + std::vector trt_opt_profile_; + /// @brief Use FP16 in TRT engine file generation bool enable_fp16_; @@ -145,8 +156,17 @@ class TrtInfer : public InferBase { /// @brief Generated engine file path. The extension is unique per GPU model std::string engine_path_; - /// @brief Cuda stream + /// Cuda stream cudaStream_t cuda_stream_ = nullptr; + /// CUDA event for device + cudaEvent_t cuda_event_ = nullptr; + + /// Use CUDA graphs if set + bool use_cuda_graph_ = true; + /// This is set when the model is executed the first time, used for CUDA graph logic + bool first_phase_ = true; + /// CUDA graph instance + cudaGraphExec_t cuda_graph_instance_ = nullptr; /// @brief Inference runtime std::unique_ptr infer_runtime_; diff --git a/modules/holoinfer/src/infer/trt/utils.cpp b/modules/holoinfer/src/infer/trt/utils.cpp index f52b9313..4f0ca2a5 100644 --- a/modules/holoinfer/src/infer/trt/utils.cpp +++ b/modules/holoinfer/src/infer/trt/utils.cpp @@ -91,8 +91,7 @@ bool build_engine(const std::string& onnx_model_path, const std::string& engine_ auto builder = std::unique_ptr(nvinfer1::createInferBuilder(logger)); if (!builder) { return false; } - auto explicit_batch = - 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); + auto explicit_batch = 1; auto network = std::unique_ptr(builder->createNetworkV2(explicit_batch)); if (!network) { return false; } diff --git a/modules/holoinfer/src/manager/infer_manager.cpp b/modules/holoinfer/src/manager/infer_manager.cpp index 744eb969..6082eb2d 100644 --- a/modules/holoinfer/src/manager/infer_manager.cpp +++ b/modules/holoinfer/src/manager/infer_manager.cpp @@ -38,6 +38,7 @@ InferStatus ManagerInfer::set_inference_params(std::shared_ptr& auto temporal_map = inference_specs->get_temporal_map(); auto backend_type = inference_specs->backend_type_; auto backend_map = inference_specs->get_backend_map(); + auto trt_opt_profile = inference_specs->trt_opt_profile_; cuda_buffer_in_ = inference_specs->cuda_buffer_in_; cuda_buffer_out_ = inference_specs->cuda_buffer_out_; @@ -80,11 +81,11 @@ InferStatus ManagerInfer::set_inference_params(std::shared_ptr& try { if (device_map.find("gpu-dt") != device_map.end()) { auto dev_id = std::stoi(device_map.at("gpu-dt")); - device_gpu_dt = dev_id; - HOLOSCAN_LOG_INFO("ID of data transfer GPU updated to: {}", device_gpu_dt); + device_gpu_dt_ = dev_id; + HOLOSCAN_LOG_INFO("ID of data transfer GPU updated to: {}", device_gpu_dt_); } - unique_gpu_ids.insert(device_gpu_dt); + unique_gpu_ids.insert(device_gpu_dt_); for (auto const& [_, gpu_id] : device_map) { auto dev_id = std::stoi(gpu_id); @@ -113,48 +114,48 @@ InferStatus ManagerInfer::set_inference_params(std::shared_ptr& for (auto gid = 1; gid < vec_unique_gpu_ids.size(); ++gid) { int gpu_access_from_gpudt = 0, gpu_access_to_gpudt = 0; check_cuda( - 
cudaDeviceCanAccessPeer(&gpu_access_from_gpudt, device_gpu_dt, vec_unique_gpu_ids[gid])); + cudaDeviceCanAccessPeer(&gpu_access_from_gpudt, device_gpu_dt_, vec_unique_gpu_ids[gid])); check_cuda( - cudaDeviceCanAccessPeer(&gpu_access_to_gpudt, vec_unique_gpu_ids[gid], device_gpu_dt)); + cudaDeviceCanAccessPeer(&gpu_access_to_gpudt, vec_unique_gpu_ids[gid], device_gpu_dt_)); if (gpu_access_from_gpudt == 1 && gpu_access_to_gpudt == 1) { HOLOSCAN_LOG_INFO("Setting GPU P2P access between GPU {} and GPU {}", - device_gpu_dt, + device_gpu_dt_, vec_unique_gpu_ids[gid]); - check_cuda(cudaSetDevice(device_gpu_dt)); + check_cuda(cudaSetDevice(device_gpu_dt_)); cudaError_t cstatus = cudaDeviceEnablePeerAccess(vec_unique_gpu_ids[gid], 0); if (cstatus != cudaSuccess && cstatus != cudaErrorPeerAccessAlreadyEnabled) { HOLOSCAN_LOG_ERROR("Cuda error, {}", cudaGetErrorString(cstatus)); HOLOSCAN_LOG_ERROR("Error enabling P2P access from GPU {} and GPU {}.", - device_gpu_dt, + device_gpu_dt_, vec_unique_gpu_ids[gid]); status.set_message("Enabling P2P access failed."); return status; } check_cuda(cudaSetDevice(vec_unique_gpu_ids[gid])); - cstatus = cudaDeviceEnablePeerAccess(device_gpu_dt, 0); + cstatus = cudaDeviceEnablePeerAccess(device_gpu_dt_, 0); if (cstatus != cudaSuccess && cstatus != cudaErrorPeerAccessAlreadyEnabled) { HOLOSCAN_LOG_ERROR("Cuda error, {}", cudaGetErrorString(cstatus)); HOLOSCAN_LOG_ERROR("Error enabling P2P access from GPU {} and GPU {}.", vec_unique_gpu_ids[gid], - device_gpu_dt); + device_gpu_dt_); status.set_message("Enabling P2P access failed."); return status; } } else { HOLOSCAN_LOG_WARN("P2P access between GPU {} and GPU {} is not available.", - device_gpu_dt, + device_gpu_dt_, vec_unique_gpu_ids[gid]); HOLOSCAN_LOG_INFO( "There can be any reason related to GPU type, GPU family or system setup (PCIE " "configuration)."); HOLOSCAN_LOG_INFO("May be GPU {} and GPU {} are not in the same PCIE configuration.", - device_gpu_dt, + device_gpu_dt_, vec_unique_gpu_ids[gid]); HOLOSCAN_LOG_WARN( "Multi GPU inference feature will use Host (CPU memory) to transfer data across GPUs." 
"This may result in an additional latency."); - mgpu_p2p_transfer = false; + mgpu_p2p_transfer_ = false; } } } @@ -179,7 +180,7 @@ InferStatus ManagerInfer::set_inference_params(std::shared_ptr& return status; } - auto device_id = device_gpu_dt; + auto device_id = device_gpu_dt_; if (device_map.find(model_name) != device_map.end()) { device_id = std::stoi(device_map.at(model_name)); HOLOSCAN_LOG_INFO("Device id: {} for Model: {}", device_id, model_name); @@ -237,7 +238,9 @@ InferStatus ManagerInfer::set_inference_params(std::shared_ptr& holo_infer_context_.insert({model_name, std::make_unique(model_path, model_name, + trt_opt_profile, device_id, + device_gpu_dt_, inference_specs->use_fp16_, inference_specs->is_engine_path_, cuda_buffer_in_, @@ -337,7 +340,7 @@ InferStatus ManagerInfer::set_inference_params(std::shared_ptr& return status; } } - check_cuda(cudaSetDevice(device_gpu_dt)); + check_cuda(cudaSetDevice(device_gpu_dt_)); auto output_node_size = holo_infer_context_.at(model_name)->get_output_dims().size(); auto input_node_size = holo_infer_context_.at(model_name)->get_input_dims().size(); @@ -384,7 +387,7 @@ InferStatus ManagerInfer::set_inference_params(std::shared_ptr& } HOLOSCAN_LOG_INFO("HoloInfer buffer created for {}", out_tensor_names[d]); - if (device_id != device_gpu_dt) { + if (device_id != device_gpu_dt_) { check_cuda(cudaSetDevice(device_id)); auto astatus = @@ -395,23 +398,23 @@ InferStatus ManagerInfer::set_inference_params(std::shared_ptr& return status; } - check_cuda(cudaSetDevice(device_gpu_dt)); + check_cuda(cudaSetDevice(device_gpu_dt_)); } } mgpu_output_buffer_.insert({model_name, std::move(dm)}); - if (device_id != device_gpu_dt) { + if (device_id != device_gpu_dt_) { // For Multi-GPU feature: allocate input and output cuda streams - check_cuda(cudaSetDevice(device_gpu_dt)); + check_cuda(cudaSetDevice(device_gpu_dt_)); std::vector in_streams_gpudt(in_tensor_names.size()); std::map in_streams_map_gpudt, out_streams_map_gpudt; - // cuda stream creation per tensor and populating input_streams_gpudt map + // cuda stream creation per tensor and populating input_streams_gpudt_ map for (auto in = 0; in < in_tensor_names.size(); in++) { check_cuda(cudaStreamCreate(&in_streams_gpudt[in])); in_streams_map_gpudt.insert({in_tensor_names[in], in_streams_gpudt[in]}); } - input_streams_gpudt.insert({model_name, std::move(in_streams_map_gpudt)}); + input_streams_gpudt_.insert({model_name, std::move(in_streams_map_gpudt)}); std::vector out_streams_gpudt(out_tensor_names.size()); // cuda stream creation per output tensor and populating out_streams_map_gpudt map @@ -419,7 +422,7 @@ InferStatus ManagerInfer::set_inference_params(std::shared_ptr& check_cuda(cudaStreamCreate(&out_streams_gpudt[out])); out_streams_map_gpudt.insert({out_tensor_names[out], out_streams_gpudt[out]}); } - output_streams_gpudt.insert({model_name, std::move(out_streams_map_gpudt)}); + output_streams_gpudt_.insert({model_name, std::move(out_streams_map_gpudt)}); check_cuda(cudaSetDevice(device_id)); std::vector in_streams_dev(in_tensor_names.size()); @@ -430,7 +433,7 @@ InferStatus ManagerInfer::set_inference_params(std::shared_ptr& check_cuda(cudaStreamCreate(&in_streams_dev[in])); in_streams_map_dev.insert({in_tensor_names[in], in_streams_dev[in]}); } - input_streams_device.insert({model_name, std::move(in_streams_map_dev)}); + input_streams_device_.insert({model_name, std::move(in_streams_map_dev)}); std::vector out_streams(out_tensor_names.size()); @@ -440,7 +443,7 @@ InferStatus 
ManagerInfer::set_inference_params(std::shared_ptr& out_streams_map_dev.insert({out_tensor_names[out], out_streams[out]}); } - output_streams_device.insert({model_name, std::move(out_streams_map_dev)}); + output_streams_device_.insert({model_name, std::move(out_streams_map_dev)}); // stream allocation ends // allocate input buffers only for multi-gpu inference use case for allocation on GPUs other @@ -465,11 +468,25 @@ InferStatus ManagerInfer::set_inference_params(std::shared_ptr& } } mgpu_input_buffer_.insert({model_name, std::move(dm_in)}); - check_cuda(cudaSetDevice(device_gpu_dt)); + check_cuda(cudaSetDevice(device_gpu_dt_)); } models_input_dims_.insert({model_name, holo_infer_context_.at(model_name)->get_input_dims()}); + + if (vec_unique_gpu_ids.size() > 1) { + // create the CUDA event used to synchronize the streams + auto event_per_gpu = mgpu_cuda_event_.insert({model_name, {}}).first; + cudaEvent_t cuda_event; + for (auto&& gid : vec_unique_gpu_ids) { + check_cuda(cudaSetDevice(gid)); + check_cuda(cudaEventCreateWithFlags(&cuda_event, cudaEventDisableTiming)); + event_per_gpu->second.insert({gid, cuda_event}); + } + check_cuda(cudaSetDevice(device_gpu_dt_)); + } } + + check_cuda(cudaEventCreateWithFlags(&cuda_event_, cudaEventDisableTiming)); } catch (const std::runtime_error& rt) { raise_error("Inference Manager", "Setting Inference parameters: " + std::string(rt.what())); } catch (...) { @@ -487,6 +504,8 @@ void ManagerInfer::cleanup() { } for (auto& [_, infer_p] : infer_param_) { infer_p.reset(); } + + if (cuda_event_) { cudaEventDestroy(cuda_event_); } } ManagerInfer::~ManagerInfer() { @@ -494,8 +513,9 @@ ManagerInfer::~ManagerInfer() { } InferStatus ManagerInfer::run_core_inference(const std::string& model_name, - DataMap& input_preprocess_data, - DataMap& output_inferred_data) { + const DataMap& input_preprocess_data, + const DataMap& output_inferred_data, + cudaStream_t cuda_stream) { InferStatus status = InferStatus(holoinfer_code::H_ERROR); // Find if the current model exists in infer_param_ @@ -518,82 +538,80 @@ InferStatus ManagerInfer::run_core_inference(const std::string& model_name, return status; } - auto device_id = infer_param_.at(model_name)->get_device_id(); - check_cuda(cudaSetDevice(device_id)); + const auto device_id = infer_param_.at(model_name)->get_device_id(); // input and output buffer for current inference std::vector> indata, outdata; - DataMap in_preprocess_data; - if (device_id != device_gpu_dt) { + for (const auto& in_tensor : input_tensors) { + if (input_preprocess_data.find(in_tensor) == input_preprocess_data.end()) { + status.set_message("Inference manager, Preprocessed data for tensor " + in_tensor + + " does not exist."); + return status; + } + } + + // Transfer memory from data transfer GPU to inference device. This is using a separate stream + // for each tensor and synchronizes the copies with the CUDA stream passed in as a parameter. 
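The multi-GPU branch below replaces the old synchronize-per-stream pattern with an event chain: record on the caller's stream, wait on the per-tensor copy stream, copy (peer-to-peer, or staged through the host when P2P is unavailable), then record again and make the caller's stream wait. A reduced sketch of the P2P case, with hypothetical parameters and error checks omitted:

#include <cuda_runtime.h>
#include <cstddef>

// The real code keeps one copy stream and one event per tensor/GPU and never
// blocks the host.
void p2p_copy_ordered(void* dst, int dst_device, const void* src, int src_device,
                      size_t bytes, cudaStream_t caller_stream,
                      cudaStream_t copy_stream, cudaEvent_t event) {
  // The copy must start only after the work already enqueued on the caller's
  // stream (e.g. the preprocessing that produced 'src') has completed.
  cudaEventRecord(event, caller_stream);
  cudaStreamWaitEvent(copy_stream, event);

  // Cross-device copy; with peer access enabled this is a direct GPU-to-GPU
  // transfer, otherwise the non-P2P branch stages through host memory instead.
  cudaMemcpyPeerAsync(dst, dst_device, src, src_device, bytes, copy_stream);

  // Hand ordering back: the caller's stream waits for the copy to finish.
  cudaEventRecord(event, copy_stream);
  cudaStreamWaitEvent(caller_stream, event);
}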
+ if (device_id != device_gpu_dt_) { if (mgpu_input_buffer_.find(model_name) == mgpu_input_buffer_.end()) { HOLOSCAN_LOG_ERROR("Mapping for model {} not found on device {}.", model_name, device_id); status.set_message("Inference manager, Mapping not found for " + model_name + " in multi gpu inference."); return status; } - in_preprocess_data = mgpu_input_buffer_.at(model_name); - } - for (const auto& in_tensor : input_tensors) { - if (input_preprocess_data.find(in_tensor) == input_preprocess_data.end()) { - status.set_message("Inference manager, Preprocessed data for tensor " + in_tensor + - " does not exist."); - return status; - } + check_cuda(cudaSetDevice(device_gpu_dt_)); - // by default memory mapped for all backends - if (device_id != device_gpu_dt) { - check_cuda(cudaSetDevice(device_id)); - auto device_buff = in_preprocess_data.at(in_tensor)->device_buffer->data(); - auto buffsize = in_preprocess_data.at(in_tensor)->device_buffer->get_bytes(); + const DataMap& in_preprocess_data = mgpu_input_buffer_.at(model_name); - check_cuda(cudaSetDevice(device_gpu_dt)); - auto in_streams_gpudt = input_streams_gpudt.at(model_name); + const auto& input_streams_dev = input_streams_device_.at(model_name); + const auto& in_streams_gpudt = input_streams_gpudt_.at(model_name); + const cudaEvent_t cuda_event_d = mgpu_cuda_event_.at(model_name).at(device_id); + const cudaEvent_t cuda_event_dt = mgpu_cuda_event_.at(model_name).at(device_gpu_dt_); - auto device_gpu_dt_buff_in = input_preprocess_data.at(in_tensor)->device_buffer->data(); - auto stream = in_streams_gpudt.at(in_tensor); - if (mgpu_p2p_transfer) { + for (const auto& in_tensor : input_tensors) { + const auto device_buff = in_preprocess_data.at(in_tensor)->device_buffer_->data(); + const auto buffsize = in_preprocess_data.at(in_tensor)->device_buffer_->get_bytes(); + + const auto device_gpu_dt_buff_in = + input_preprocess_data.at(in_tensor)->device_buffer_->data(); + + const cudaStream_t stream_d = input_streams_dev.at(in_tensor); + const cudaStream_t stream_dt = in_streams_gpudt.at(in_tensor); + check_cuda(cudaEventRecord(cuda_event_dt, cuda_stream)); + check_cuda(cudaStreamWaitEvent(stream_dt, cuda_event_dt)); + + if (mgpu_p2p_transfer_) { + // direct p2p transfer check_cuda(cudaMemcpyPeerAsync( - device_buff, device_id, device_gpu_dt_buff_in, device_gpu_dt, buffsize, stream)); + device_buff, device_id, device_gpu_dt_buff_in, device_gpu_dt_, buffsize, stream_dt)); + check_cuda(cudaEventRecord(cuda_event_dt, stream_dt)); + check_cuda(cudaStreamWaitEvent(cuda_stream, cuda_event_dt)); } else { // transfer from gpu-dt to host - auto host_buff_in = input_preprocess_data.at(in_tensor)->host_buffer.data(); + /// @todo check if using pinned memory is faster + input_preprocess_data.at(in_tensor)->host_buffer_->resize(buffsize); + auto host_buff_in = input_preprocess_data.at(in_tensor)->host_buffer_->data(); check_cuda(cudaMemcpyAsync( - host_buff_in, device_gpu_dt_buff_in, buffsize, cudaMemcpyDeviceToHost, stream)); - } - } else { - indata.push_back(input_preprocess_data.at(in_tensor)); - } - } - - if (device_id != device_gpu_dt) { - check_cuda(cudaSetDevice(device_gpu_dt)); - auto in_streams_gpudt = input_streams_gpudt.at(model_name); - - for (auto& [_, stream] : in_streams_gpudt) { check_cuda(cudaStreamSynchronize(stream)); } - - // If P2P is disabled, transfer data from host to device_id - if (!mgpu_p2p_transfer) { - check_cuda(cudaSetDevice(device_id)); - - // transfer from host to device_id - auto input_streams_dev = 
input_streams_device.at(model_name); - for (const auto& in_tensor : input_tensors) { - auto device_buff = in_preprocess_data.at(in_tensor)->device_buffer->data(); - auto host_buff_in = input_preprocess_data.at(in_tensor)->host_buffer.data(); - auto buffsize = in_preprocess_data.at(in_tensor)->device_buffer->get_bytes(); - auto dstream = input_streams_dev.at(in_tensor); + host_buff_in, device_gpu_dt_buff_in, buffsize, cudaMemcpyDeviceToHost, stream_dt)); + check_cuda(cudaEventRecord(cuda_event_dt, stream_dt)); + // transfer from host to device_id + check_cuda(cudaSetDevice(device_id)); + check_cuda(cudaStreamWaitEvent(stream_d, cuda_event_dt)); check_cuda( - cudaMemcpyAsync(device_buff, host_buff_in, buffsize, cudaMemcpyHostToDevice, dstream)); + cudaMemcpyAsync(device_buff, host_buff_in, buffsize, cudaMemcpyHostToDevice, stream_d)); + check_cuda(cudaEventRecord(cuda_event_d, stream_d)); + check_cuda(cudaSetDevice(device_gpu_dt_)); + check_cuda(cudaStreamWaitEvent(cuda_stream, cuda_event_d)); } - for (auto& [_, dstream] : input_streams_dev) { check_cuda(cudaStreamSynchronize(dstream)); } + indata.push_back(in_preprocess_data.at(in_tensor)); } - + } else { for (const auto& in_tensor : input_tensors) { - indata.push_back(in_preprocess_data.at(in_tensor)); + indata.push_back(input_preprocess_data.at(in_tensor)); } } @@ -603,17 +621,21 @@ InferStatus ManagerInfer::run_core_inference(const std::string& model_name, return status; } - if (device_id != device_gpu_dt) { - check_cuda(cudaSetDevice(device_id)); - auto out_inferred_data = mgpu_output_buffer_.at(model_name); + if (device_id != device_gpu_dt_) { + const DataMap& out_inferred_data = mgpu_output_buffer_.at(model_name); outdata.push_back(out_inferred_data.at(out_tensor)); } else { outdata.push_back(output_inferred_data.at(out_tensor)); } } + check_cuda(cudaEventRecord(cuda_event_, cuda_stream)); + check_cuda(cudaSetDevice(device_id)); - auto i_status = holo_infer_context_.at(model_name)->do_inference(indata, outdata); + cudaEvent_t cuda_event_inference = nullptr; + auto i_status = holo_infer_context_.at(model_name) + ->do_inference(indata, outdata, cuda_event_, &cuda_event_inference); + check_cuda(cudaSetDevice(device_gpu_dt_)); if (i_status.get_code() == holoinfer_code::H_ERROR) { i_status.display_message(); @@ -621,70 +643,70 @@ InferStatus ManagerInfer::run_core_inference(const std::string& model_name, return status; } + if (cuda_event_inference) { check_cuda(cudaStreamWaitEvent(cuda_stream, cuda_event_inference)); } + // Output data setup after inference // by default memory mapped for all backends - if (device_id != device_gpu_dt && cuda_buffer_out_) { - auto out_inferred_data = mgpu_output_buffer_.at(model_name); - auto out_streams = output_streams_device.at(model_name); + if ((device_id != device_gpu_dt_) && cuda_buffer_out_) { + const DataMap& out_inferred_data = mgpu_output_buffer_.at(model_name); + const auto& out_streams = output_streams_device_.at(model_name); + const auto& out_streams_gpudt = output_streams_gpudt_.at(model_name); + const cudaEvent_t cuda_event_d = mgpu_cuda_event_.at(model_name).at(device_id); + const cudaEvent_t cuda_event_dt = mgpu_cuda_event_.at(model_name).at(device_gpu_dt_); for (auto& out_tensor : output_tensors) { - check_cuda(cudaSetDevice(device_id)); - auto buffsize = out_inferred_data.at(out_tensor)->device_buffer->get_bytes(); + auto buffsize = out_inferred_data.at(out_tensor)->device_buffer_->get_bytes(); - check_cuda(cudaSetDevice(device_gpu_dt)); - auto buffer_size_gpu_dt = 
output_inferred_data.at(out_tensor)->device_buffer->get_bytes(); + auto buffer_size_gpu_dt = output_inferred_data.at(out_tensor)->device_buffer_->get_bytes(); if (buffer_size_gpu_dt != buffsize) { - output_inferred_data.at(out_tensor)->device_buffer->resize(buffsize); + output_inferred_data.at(out_tensor)->device_buffer_->resize(buffsize); } - auto device_gpu_dt_buff = output_inferred_data.at(out_tensor)->device_buffer->data(); + auto device_gpu_dt_buff = output_inferred_data.at(out_tensor)->device_buffer_->data(); - check_cuda(cudaSetDevice(device_id)); - auto device_buff = out_inferred_data.at(out_tensor)->device_buffer->data(); - buffsize = out_inferred_data.at(out_tensor)->device_buffer->get_bytes(); + auto device_buff = out_inferred_data.at(out_tensor)->device_buffer_->data(); + buffsize = out_inferred_data.at(out_tensor)->device_buffer_->get_bytes(); - auto stream = out_streams.at(out_tensor); - if (mgpu_p2p_transfer) { + const cudaStream_t stream_d = out_streams.at(out_tensor); + const cudaStream_t stream_dt = out_streams_gpudt.at(out_tensor); + check_cuda(cudaEventRecord(cuda_event_dt, cuda_stream)); + if (mgpu_p2p_transfer_) { + // direct p2p transfer + check_cuda(cudaStreamWaitEvent(stream_dt, cuda_event_dt)); check_cuda(cudaMemcpyPeerAsync( - device_gpu_dt_buff, device_gpu_dt, device_buff, device_id, buffsize, stream)); + device_gpu_dt_buff, device_gpu_dt_, device_buff, device_id, buffsize, stream_dt)); + check_cuda(cudaEventRecord(cuda_event_dt, stream_dt)); + check_cuda(cudaStreamWaitEvent(cuda_stream, cuda_event_dt)); } else { // transfer from device to host - auto host_buff_out = out_inferred_data.at(out_tensor)->host_buffer.data(); - check_cuda( - cudaMemcpyAsync(host_buff_out, device_buff, buffsize, cudaMemcpyDeviceToHost, stream)); - } - } - - for (auto& [_, stream] : out_streams) { check_cuda(cudaStreamSynchronize(stream)); } - - // if p2p is disabled, then move the data from host to gpu-dt - if (!mgpu_p2p_transfer) { - check_cuda(cudaSetDevice(device_gpu_dt)); - auto out_streams_gpudt = output_streams_gpudt.at(model_name); - - // transfer from host to gpu-dt - for (auto& out_tensor : output_tensors) { - auto device_gpu_dt_buff = output_inferred_data.at(out_tensor)->device_buffer->data(); - auto host_buff_out = out_inferred_data.at(out_tensor)->host_buffer.data(); - auto buffsize = output_inferred_data.at(out_tensor)->device_buffer->get_bytes(); - auto stream = out_streams_gpudt.at(out_tensor); + /// @todo check if using pinned memory is faster + out_inferred_data.at(out_tensor)->host_buffer_->resize(buffsize); + auto host_buff_out = out_inferred_data.at(out_tensor)->host_buffer_->data(); + check_cuda(cudaSetDevice(device_id)); + check_cuda(cudaStreamWaitEvent(stream_d, cuda_event_dt)); + check_cuda(cudaMemcpyAsync( + host_buff_out, device_buff, buffsize, cudaMemcpyDeviceToHost, stream_d)); + check_cuda(cudaEventRecord(cuda_event_d, stream_d)); + // transfer from host to gpu-dt + check_cuda(cudaSetDevice(device_gpu_dt_)); + check_cuda(cudaStreamWaitEvent(stream_dt, cuda_event_d)); check_cuda(cudaMemcpyAsync( - device_gpu_dt_buff, host_buff_out, buffsize, cudaMemcpyHostToDevice, stream)); + device_buff, host_buff_out, buffsize, cudaMemcpyHostToDevice, stream_dt)); + check_cuda(cudaEventRecord(cuda_event_dt, stream_dt)); + check_cuda(cudaStreamWaitEvent(cuda_stream, cuda_event_dt)); } - - for (auto& [_, stream] : out_streams_gpudt) { check_cuda(cudaStreamSynchronize(stream)); } } } - check_cuda(cudaSetDevice(device_gpu_dt)); return InferStatus(); } -InferStatus 
ManagerInfer::execute_inference(std::shared_ptr& inference_specs) { +InferStatus ManagerInfer::execute_inference(std::shared_ptr& inference_specs, + cudaStream_t cuda_stream) { InferStatus status = InferStatus(); - auto permodel_preprocess_data = inference_specs->data_per_tensor_; - auto permodel_output_data = inference_specs->output_per_model_; + const auto& permodel_preprocess_data = inference_specs->data_per_tensor_; + const auto& permodel_output_data = inference_specs->output_per_model_; if (permodel_preprocess_data.size() == 0) { status.set_code(holoinfer_code::H_ERROR); @@ -715,7 +737,7 @@ InferStatus ManagerInfer::execute_inference(std::shared_ptr& inf if (activation_map.find(model_instance) != activation_map.end()) { try { auto activation_value = std::stoul(activation_map.at(model_instance)); - HOLOSCAN_LOG_INFO("Activation value: {} for Model: {}", activation_value, model_instance); + HOLOSCAN_LOG_DEBUG("Activation value: {} for Model: {}", activation_value, model_instance); if (activation_value > 1) { HOLOSCAN_LOG_WARN("Activation map can have either a value of 0 or 1 for a model."); HOLOSCAN_LOG_WARN("Activation map value is ignored for model {}", model_instance); @@ -733,8 +755,8 @@ InferStatus ManagerInfer::execute_inference(std::shared_ptr& inf auto temporal_id = infer_param_.at(model_instance)->get_temporal_id(); if (process_model && (frame_counter_ % temporal_id == 0)) { if (!parallel_processing_) { - InferStatus infer_status = - run_core_inference(model_instance, permodel_preprocess_data, permodel_output_data); + InferStatus infer_status = run_core_inference( + model_instance, permodel_preprocess_data, permodel_output_data, cuda_stream); if (infer_status.get_code() != holoinfer_code::H_SUCCESS) { status.set_code(holoinfer_code::H_ERROR); infer_status.display_message(); @@ -749,21 +771,26 @@ InferStatus ManagerInfer::execute_inference(std::shared_ptr& inf this, model_instance, permodel_preprocess_data, - permodel_output_data))}); + permodel_output_data, + cuda_stream))}); } } } if (parallel_processing_) { + std::string failed_models; for (auto& inf_fut : inference_futures) { InferStatus infer_status = inf_fut.second.get(); if (infer_status.get_code() != holoinfer_code::H_SUCCESS) { status.set_code(holoinfer_code::H_ERROR); infer_status.display_message(); - status.set_message("Inference manager, Inference failed in execution for " + inf_fut.first); - return status; + failed_models += " " + inf_fut.first; } } + if (status.get_code() != holoinfer_code::H_SUCCESS) { + status.set_message("Inference manager, Inference failed in execution for" + failed_models); + return status; + } } // update output dimensions here for dynamic outputs @@ -798,7 +825,8 @@ InferContext::InferContext() { } catch (const std::bad_alloc&) { throw; } } -InferStatus InferContext::execute_inference(std::shared_ptr& inference_specs) { +InferStatus InferContext::execute_inference(std::shared_ptr& inference_specs, + cudaStream_t cuda_stream) { InferStatus status = InferStatus(); if (g_managers.find(unique_id_) == g_managers.end()) { @@ -810,7 +838,7 @@ InferStatus InferContext::execute_inference(std::shared_ptr& inf try { g_manager = g_managers.at(unique_id_); - status = g_manager->execute_inference(inference_specs); + status = g_manager->execute_inference(inference_specs, cuda_stream); } catch (const std::exception& e) { status.set_code(holoinfer_code::H_ERROR); status.set_message(std::string("Inference manager, Error in inference setup: ") + e.what()); diff --git 
a/modules/holoinfer/src/manager/infer_manager.hpp b/modules/holoinfer/src/manager/infer_manager.hpp index 0ee3cce0..71cb5d8f 100644 --- a/modules/holoinfer/src/manager/infer_manager.hpp +++ b/modules/holoinfer/src/manager/infer_manager.hpp @@ -71,23 +71,32 @@ class ManagerInfer { /** * @brief Prepares and launches single/multiple inference * + * The provided CUDA stream is used to prepare the input data and will be used to operate on the + * output data, any execution of CUDA work should be in sync with this stream. + * * @param inference_specs specifications for inference + * @param cuda_stream CUDA stream * * @return InferStatus with appropriate code and message */ - InferStatus execute_inference(std::shared_ptr& inference_specs); + InferStatus execute_inference(std::shared_ptr& inference_specs, + cudaStream_t cuda_stream); /** * @brief Executes Core inference for a particular model and generates inferred data + * The provided CUDA stream is used to prepare the input data and will be used to operate on the + * output data, any execution of CUDA work should be in sync with this stream. * * @param model_name Input model to do the inference on * @param permodel_preprocess_data Input DataMap with model name as key and DataBuffer as value * @param permodel_output_data Output DataMap with tensor name as key and DataBuffer as value + * @param cuda_stream CUDA stream * * @return InferStatus with appropriate code and message */ - InferStatus run_core_inference(const std::string& model_name, DataMap& permodel_preprocess_data, - DataMap& permodel_output_data); + InferStatus run_core_inference(const std::string& model_name, + const DataMap& permodel_preprocess_data, + const DataMap& permodel_output_data, cudaStream_t cuda_stream); /** * @brief Cleans up internal context per model * @@ -119,23 +128,27 @@ class ManagerInfer { bool cuda_buffer_out_ = false; /// @brief Flag to demonstrate if multi-GPU feature has Peer to Peer transfer enabled. - bool mgpu_p2p_transfer = true; + bool mgpu_p2p_transfer_ = true; /// @brief Map to store cuda streams associated with each input tensor in each model on GPU-dt. /// Will be used with Multi-GPU feature. - std::map> input_streams_gpudt; + std::map> input_streams_gpudt_; /// @brief Map to store cuda streams associated with each output tensor in each model on GPU-dt. /// Will be used with Multi-GPU feature. - std::map> output_streams_gpudt; + std::map> output_streams_gpudt_; /// @brief Map to store cuda streams associated with each input tensor in each model on the /// inference device. Will be used with Multi-GPU feature. - std::map> input_streams_device; + std::map> input_streams_device_; /// @brief Map to store cuda streams associated with each output tensor in each model on the /// inference device. Will be used with Multi-GPU feature. - std::map> output_streams_device; + std::map> output_streams_device_; + + /// @brief Map to store a CUDA event for each device for each model. Will be used with Multi-GPU + /// feature. + std::map> mgpu_cuda_event_; /// Map storing parameters per model std::map> infer_param_; @@ -156,7 +169,10 @@ class ManagerInfer { unsigned int frame_counter_ = 0; /// Data transfer GPU. Default: 0. Not configurable in this release. - int device_gpu_dt = 0; + int device_gpu_dt_ = 0; + + /// CUDA event on data transfer GPU, used to synchronize inference execution with data transfer. 
+ cudaEvent_t cuda_event_ = nullptr; /// Map storing inferred output dimension per tensor DimType models_output_dims_; diff --git a/modules/holoinfer/src/manager/process_manager.cpp b/modules/holoinfer/src/manager/process_manager.cpp index c8e4acb5..8d43c385 100644 --- a/modules/holoinfer/src/manager/process_manager.cpp +++ b/modules/holoinfer/src/manager/process_manager.cpp @@ -63,7 +63,7 @@ InferStatus ManagerProcessor::process_multi_tensor_operation( HOLOSCAN_LOG_ERROR("Tensor {} not found in dimension map.", tensor); return status; } - void* input_data = inferred_result_map.at(tensor)->host_buffer.data(); + void* input_data = inferred_result_map.at(tensor)->host_buffer_->data(); const std::vector dimensions = dimension_map.at(tensor); all_tensor_data[tensor] = input_data; all_tensor_dims[tensor] = dimensions; @@ -79,9 +79,12 @@ InferStatus ManagerProcessor::process_multi_tensor_operation( return InferStatus(); } -InferStatus ManagerProcessor::process( - const MultiMappings& tensor_oper_map, const MultiMappings& in_out_tensor_map, - DataMap& inferred_result_map, const std::map>& dimension_map) { +InferStatus ManagerProcessor::process(const MultiMappings& tensor_oper_map, + const MultiMappings& in_out_tensor_map, + DataMap& inferred_result_map, + const std::map>& dimension_map, + bool process_with_cuda, + cudaStream_t cuda_stream) { for (const auto& current_tensor_operation : tensor_oper_map) { auto& tensor_name = current_tensor_operation.first; auto operations = current_tensor_operation.second; @@ -117,7 +120,13 @@ InferStatus ManagerProcessor::process( "Process manager, Dimension map does not contain results from " + tensor_name); } - void* input_data = inferred_result_map.at(tensor_name)->host_buffer.data(); + void* input_data; + if (process_with_cuda) { + input_data = inferred_result_map.at(tensor_name)->device_buffer_->data(); + } else { + input_data = inferred_result_map.at(tensor_name)->host_buffer_->data(); + } + const std::vector dimensions = dimension_map.at(tensor_name); for (auto& operation : operations) { @@ -183,7 +192,9 @@ InferStatus ManagerProcessor::process( processed_dims, processed_data_map_, out_tensor_names, - custom_strings); + custom_strings, + process_with_cuda, + cuda_stream); if (status.get_code() != holoinfer_code::H_SUCCESS) { status.display_message(); @@ -217,7 +228,7 @@ DimType ManagerProcessor::get_processed_data_dims() const { ProcessorContext::ProcessorContext() { try { - process_manager = std::make_unique(); + process_manager_ = std::make_shared(); } catch (const std::bad_alloc&) { HOLOSCAN_LOG_ERROR("Holoscan Outdata context: Memory allocation error."); throw; @@ -225,24 +236,30 @@ ProcessorContext::ProcessorContext() { } DimType ProcessorContext::get_processed_data_dims() const { - return process_manager->get_processed_data_dims(); + return process_manager_->get_processed_data_dims(); } DataMap ProcessorContext::get_processed_data() const { - return process_manager->get_processed_data(); + return process_manager_->get_processed_data(); } InferStatus ProcessorContext::process(const MultiMappings& tensor_to_oper_map, const MultiMappings& in_out_tensor_map, DataMap& inferred_result_map, - const std::map>& model_dims) { - return process_manager->process( - tensor_to_oper_map, in_out_tensor_map, inferred_result_map, model_dims); + const std::map>& model_dims, + bool process_with_cuda, + cudaStream_t cuda_stream) { + return process_manager_->process(tensor_to_oper_map, + in_out_tensor_map, + inferred_result_map, + model_dims, + process_with_cuda, + 
cuda_stream); } InferStatus ProcessorContext::initialize(const MultiMappings& process_operations, const std::string config_path = {}) { - return process_manager->initialize(process_operations, config_path); + return process_manager_->initialize(process_operations, config_path); } } // namespace inference diff --git a/modules/holoinfer/src/manager/process_manager.hpp b/modules/holoinfer/src/manager/process_manager.hpp index 009641f4..5f45c87b 100644 --- a/modules/holoinfer/src/manager/process_manager.hpp +++ b/modules/holoinfer/src/manager/process_manager.hpp @@ -65,11 +65,14 @@ class ManagerProcessor { * @param inferred_result_map Map with output tensor name as key, and related DataBuffer as * value * @param dimension_map Map with tensor name as key and related output dimension as value. + * @param process_with_cuda Flag defining if processing should be done with CUDA + * @param cuda_stream CUDA stream to use when processing is done with CUDA * @return InferStatus with appropriate code and message */ InferStatus process(const MultiMappings& tensor_oper_map, const MultiMappings& in_out_tensor_map, DataMap& inferred_result_map, - const std::map>& dimension_map); + const std::map>& dimension_map, + bool process_with_cuda, cudaStream_t cuda_stream); /* * @brief Executes post processing operations for multi tensor I/O @@ -110,9 +113,6 @@ class ManagerProcessor { DimType processed_dims_map_; }; -/// Pointer to manager class for multi data processing -std::unique_ptr process_manager; - } // namespace inference } // namespace holoscan diff --git a/modules/holoinfer/src/process/data_processor.cpp b/modules/holoinfer/src/process/data_processor.cpp index 1996a8fe..b7204a84 100644 --- a/modules/holoinfer/src/process/data_processor.cpp +++ b/modules/holoinfer/src/process/data_processor.cpp @@ -14,6 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + #include "data_processor.hpp" #include @@ -240,7 +241,7 @@ InferStatus DataProcessor::scale_intensity_cpu(const std::vector& dimension {out_tensor_name, std::make_shared(holoinfer_datatype::h_UInt8)}); // allocate memory for the first time - processed_data_map.at(out_tensor_name)->host_buffer.resize(dsize * channels); + processed_data_map.at(out_tensor_name)->host_buffer_->resize(dsize * channels); // Data in HWC format for Holoviz, input is CHW format processed_dims.push_back(static_cast(dimensions[1])); @@ -249,7 +250,7 @@ InferStatus DataProcessor::scale_intensity_cpu(const std::vector& dimension } auto processed_data = - static_cast(processed_data_map.at(out_tensor_name)->host_buffer.data()); + static_cast(processed_data_map.at(out_tensor_name)->host_buffer_->data()); auto input_data = static_cast(indata); float max = 0, min = 100000; @@ -294,17 +295,18 @@ InferStatus DataProcessor::scale_intensity_cpu(const std::vector& dimension return InferStatus(); } -InferStatus DataProcessor::compute_max_per_channel_cpu( +InferStatus DataProcessor::compute_max_per_channel_scaled( const std::vector& dimensions, const void* indata, std::vector& processed_dims, - DataMap& processed_data_map, const std::vector& output_tensors) { + DataMap& processed_data_map, const std::vector& output_tensors, + bool process_with_cuda, cudaStream_t cuda_stream) { if (output_tensors.size() == 0) { return InferStatus(holoinfer_code::H_ERROR, - "Data processor, Output tensor size 0 in compute_max_per_channel."); + "Data processor, Output tensor size 0 in compute_max_per_channel_scaled."); } if (dimensions.size() != 4) { - return InferStatus( - holoinfer_code::H_ERROR, - "Data processor, Input dimensions expected in NHWC format in compute_max_per_channel."); + return InferStatus(holoinfer_code::H_ERROR, + "Data processor, Input dimensions expected in NHWC format in " + "compute_max_per_channel_scaled."); } auto out_tensor_name = output_tensors[0]; // only one output tensor supported // Assuming NHWC format @@ -314,49 +316,64 @@ InferStatus DataProcessor::compute_max_per_channel_cpu( if (processed_data_map.find(out_tensor_name) == processed_data_map.end()) { // By default, create the float data type. - HOLOSCAN_LOG_INFO("Allocating memory for {} in compute_max_per_channel", out_tensor_name); - processed_data_map.insert({out_tensor_name, std::make_shared()}); - - // allocate memory for the first time - if (processed_data_map.at(out_tensor_name)->host_buffer.size() == 0) { - // this is custom allocation for max per channel. (x, y) - processed_data_map.at(out_tensor_name)->host_buffer.resize(2 * out_channels); + HOLOSCAN_LOG_INFO("Allocating memory for {} in compute_max_per_channel_scaled", + out_tensor_name); + const auto [db, success] = + processed_data_map.insert({out_tensor_name, std::make_shared()}); + + // this is custom allocation for max per channel. (x, y) + if (process_with_cuda) { + db->second->device_buffer_->resize(2 * out_channels); + } else { + db->second->host_buffer_->resize(2 * out_channels); } processed_dims.push_back(1); // CHECK: if disabled, get_data_from_tensor fails. 
processed_dims.push_back(static_cast(2 * out_channels)); } - auto outdata = processed_data_map.at(out_tensor_name)->host_buffer.data(); + if (process_with_cuda) { + void* outdata = processed_data_map.at(out_tensor_name)->device_buffer_->data(); - auto input_data = static_cast(indata); - auto processed_data = static_cast(outdata); - std::vector max_x_per_channel, max_y_per_channel; - - max_x_per_channel.resize(out_channels, 0); - max_y_per_channel.resize(out_channels, 0); - std::vector maxV(out_channels, -1999); - - for (unsigned int i = 0; i < rows; i++) { - for (unsigned int j = 0; j < cols; j++) { - for (unsigned int c = 1; c < out_channels; c++) { - unsigned int index = i * cols * out_channels + j * out_channels + c; - float v1 = input_data[index]; - if (maxV[c] < v1) { - maxV[c] = v1; - max_x_per_channel[c] = i; - max_y_per_channel[c] = j; + max_per_channel_scaled_cuda(rows, + cols, + out_channels, + reinterpret_cast(indata), + reinterpret_cast(outdata), + cuda_stream); + } else { + void* outdata = processed_data_map.at(out_tensor_name)->host_buffer_->data(); + + auto input_data = static_cast(indata); + auto processed_data = static_cast(outdata); + std::vector max_x_per_channel, max_y_per_channel; + + max_x_per_channel.resize(out_channels, 0); + max_y_per_channel.resize(out_channels, 0); + std::vector maxV(out_channels, -1999); + + for (unsigned int i = 0; i < rows; i++) { + for (unsigned int j = 0; j < cols; j++) { + for (unsigned int c = 1; c < out_channels; c++) { + unsigned int index = i * cols * out_channels + j * out_channels + c; + float v1 = input_data[index]; + if (maxV[c] < v1) { + maxV[c] = v1; + max_x_per_channel[c] = i; + max_y_per_channel[c] = j; + } } } } - } - for (unsigned int i = 0; i < out_channels; i++) { - processed_data[2 * i] = static_cast(max_x_per_channel[i]) / static_cast(rows); - processed_data[2 * i + 1] = static_cast(max_y_per_channel[i]) / static_cast(cols); - } + for (unsigned int i = 0; i < out_channels; i++) { + processed_data[2 * i] = static_cast(max_x_per_channel[i]) / static_cast(rows); + processed_data[2 * i + 1] = + static_cast(max_y_per_channel[i]) / static_cast(cols); + } - max_x_per_channel.clear(); - max_y_per_channel.clear(); + max_x_per_channel.clear(); + max_y_per_channel.clear(); + } return InferStatus(); } @@ -365,7 +382,8 @@ InferStatus DataProcessor::process_operation(const std::string& operation, std::vector& processed_dims, DataMap& processed_data_map, const std::vector& output_tensors, - const std::vector& custom_strings) { + const std::vector& custom_strings, + bool process_with_cuda, cudaStream_t cuda_stream) { if (indata == nullptr) { return InferStatus(holoinfer_code::H_ERROR, "Data processor, Operation " + operation + ", Invalid input buffer"); @@ -375,8 +393,14 @@ InferStatus DataProcessor::process_operation(const std::string& operation, return InferStatus(holoinfer_code::H_ERROR, "Data processor, Operation " + operation + " not found in map"); try { - return oper_to_fp_.at(operation)( - indims, indata, processed_dims, processed_data_map, output_tensors, custom_strings); + return oper_to_fp_.at(operation)(indims, + indata, + processed_dims, + processed_data_map, + output_tensors, + custom_strings, + process_with_cuda, + cuda_stream); } catch (...) 
{ return InferStatus(holoinfer_code::H_ERROR, "Data processor, Exception in running " + operation); diff --git a/modules/holoinfer/src/process/data_processor.cu b/modules/holoinfer/src/process/data_processor.cu new file mode 100644 index 00000000..5134407f --- /dev/null +++ b/modules/holoinfer/src/process/data_processor.cu @@ -0,0 +1,99 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "data_processor.hpp" + +#include + +#include + +namespace holoscan { +namespace inference { + +/** + * This class implements an iterator which skips `step` elements between each iteration. + */ +class step_iterator : public std::iterator { + pointer cur_; + size_t step_; + + public: + explicit step_iterator(pointer cur, size_t step) : cur_(cur), step_(step) {} + template + __host__ __device__ __forceinline__ reference operator[](Distance offset) const { + return cur_[offset * step_]; + } +}; + +/** + * CUDA kernel normalizing the coordinates stored in the `key` member. + * + * @param rows Number of rows in the input tensor + * @param cols Number of columns in the input tensor + * @param channels Number of channels in the input tensor + * @param d_argmax Per-channel argmax results, flat input index stored in the `key` member + * @param out Output buffer receiving one (row/rows, col/cols) pair per channel + */ +static __global__ void normalize(size_t rows, size_t cols, size_t channels, + cub::KeyValuePair* d_argmax, float* out) { + const uint index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= channels) { return; } + + const int src_index = d_argmax[index].key; + int row = src_index / cols; + int col = src_index - (row * cols); + out[index * 2 + 0] = (float)row / (float)rows; + out[index * 2 + 1] = (float)col / (float)cols; +} + +void DataProcessor::max_per_channel_scaled_cuda(size_t rows, size_t cols, size_t channels, + const float* indata, float* outdata, + cudaStream_t cuda_stream) { + /// @todo This algorithm needs temporary storage; currently data processors are just functions + /// without state. This should be an object with state so we can avoid re-allocating the temporary + /// storage at each invocation.
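Illustrative aside (a sketch, not SDK code): the @todo above notes that the CUB temporary storage is re-allocated on every call because data processors are stateless. One way this could be addressed, assuming a stateful processor object, is a small reusable scratch holder built on the stream-ordered allocator; the names below are hypothetical:

#include <cuda_runtime.h>
#include <cstddef>

// Hypothetical scratch holder: keeps the last stream-ordered allocation alive and
// only re-allocates when a larger size is requested (error checks omitted).
struct CudaScratch {
  void* ptr = nullptr;
  size_t bytes = 0;

  void ensure(size_t requested, cudaStream_t stream) {
    if (requested <= bytes) { return; }
    if (ptr != nullptr) { cudaFreeAsync(ptr, stream); }
    cudaMallocAsync(&ptr, requested, stream);
    bytes = requested;
  }

  void release(cudaStream_t stream) {
    if (ptr != nullptr) {
      cudaFreeAsync(ptr, stream);
      ptr = nullptr;
      bytes = 0;
    }
  }
};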
+ + // Allocate result storage + cub::KeyValuePair* d_argmax = nullptr; + check_cuda( + cudaMallocAsync(&d_argmax, sizeof(cub::KeyValuePair) * channels, cuda_stream)); + + // get temp storage size + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, indata, d_argmax, rows * cols); + + // Allocate temporary storage + check_cuda(cudaMallocAsync(&d_temp_storage, temp_storage_bytes, cuda_stream)); + + for (size_t channel = 0; channel < channels; ++channel) { + step_iterator iterator((float*)(indata + channel), channels); + cub::DeviceReduce::ArgMax( + d_temp_storage, temp_storage_bytes, iterator, &d_argmax[channel], rows * cols, cuda_stream); + } + + check_cuda(cudaFreeAsync(d_temp_storage, cuda_stream)); + + dim3 block(32, 1, 1); + dim3 grid((channels + block.x - 1) / block.x, 1, 1); + normalize<<>>(rows, cols, channels, d_argmax, outdata); + + check_cuda(cudaFreeAsync(d_argmax, cuda_stream)); +} + +} // namespace inference +} // namespace holoscan diff --git a/modules/holoinfer/src/process/data_processor.hpp b/modules/holoinfer/src/process/data_processor.hpp index be1e4417..82e9852e 100644 --- a/modules/holoinfer/src/process/data_processor.hpp +++ b/modules/holoinfer/src/process/data_processor.hpp @@ -14,8 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifndef _HOLOSCAN_DATA_PROCESSOR_H -#define _HOLOSCAN_DATA_PROCESSOR_H +#ifndef MODULES_HOLOINFER_SRC_PROCESS_DATA_PROCESSOR_HPP +#define MODULES_HOLOINFER_SRC_PROCESS_DATA_PROCESSOR_HPP #include #include @@ -39,10 +39,10 @@ namespace inference { /// Declaration of function callback used by DataProcessor. processor_FP is defined for operations /// with fixed (currently one) input and output size, and for operations that do not need any /// configuration -using processor_FP = - std::function&, const void*, std::vector&, DataMap&, - const std::vector& output_tensors, - const std::vector& custom_strings)>; +using processor_FP = std::function&, const void*, std::vector&, DataMap&, + const std::vector& output_tensors, const std::vector& custom_strings, + bool process_with_cuda, cudaStream_t cuda_stream)>; // Declaration of function callback for transforms that need configuration (via a yaml file). // Transforms additionally support multiple inputs and outputs from the processing. @@ -73,7 +73,7 @@ class DataProcessor { InferStatus initialize(const MultiMappings& process_operations, const std::string config_path); /** - * @brief Executes an operation via function callback. (Currently CPU based) + * @brief Executes an operation via function callback. * * @param operation Operation to perform. 
Refer to user docs for a list of supported operations * @param in_dims Dimension of the input tensor @@ -82,13 +82,16 @@ * @param processed_data_map Output data map, that will be populated * @param output_tensors Tensor names to be populated in the out_data_map * @param custom_strings Strings to display for custom print operations + * @param process_with_cuda Flag defining if processing should be done with CUDA + * @param cuda_stream CUDA stream to use when processing is done with CUDA * @return InferStatus with appropriate code and message */ InferStatus process_operation(const std::string& operation, const std::vector& in_dims, const void* in_data, std::vector& processed_dims, DataMap& processed_data_map, const std::vector& output_tensors, - const std::vector& custom_strings); + const std::vector& custom_strings, + bool process_with_cuda, cudaStream_t cuda_stream); /** * @brief Executes a transform via function callback. (Currently CPU based) @@ -107,7 +110,8 @@ DataMap& processed_data, DimType& processed_dims); /** - * @brief Computes max per channel in input data and scales it to [0, 1]. (CPU based) + * @brief Computes max per channel in input data and scales it to [0, 1]. (supports both GPU and + * CPU data) * * @param operation Operation to perform. Refer to user docs for a list of supported operations * @param in_dims Dimension of the input tensor @@ -115,10 +119,13 @@ * @param out_dims Dimension of the output tensor * @param out_data_map Output data buffer map * @param output_tensors Output tensor names, used to populate out_data_map + * @param process_with_cuda Flag defining if processing should be done with CUDA + * @param cuda_stream CUDA stream to use when processing is done with CUDA */ - InferStatus compute_max_per_channel_cpu(const std::vector& in_dims, const void* in_data, - std::vector& out_dims, DataMap& out_data_map, - const std::vector& output_tensors); + InferStatus compute_max_per_channel_scaled(const std::vector& in_dims, const void* in_data, + std::vector& out_dims, DataMap& out_data_map, + const std::vector& output_tensors, + bool process_with_cuda, cudaStream_t cuda_stream); /** * @brief Scales intensity using min-max values and histogram. (CPU based) @@ -182,7 +189,7 @@ /// Operation is the key and its related implementation platform as the value. /// Operations are defined with fixed number of input and outputs. Currently one for each. inline static const std::map supported_compute_operations_{ - {"max_per_channel_scaled", holoinfer_data_processor::h_HOST}, + {"max_per_channel_scaled", holoinfer_data_processor::h_CUDA_AND_HOST}, {"scale_intensity_cpu", holoinfer_data_processor::h_HOST}}; /// Map defining supported transforms by DataProcessor Class.
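Illustrative aside (simplified stand-in types, not the SDK's actual signatures): every entry in oper_to_fp_ shares the single processor_FP signature, which is why adding process_with_cuda and cuda_stream requires touching every stored lambda. The dispatch pattern itself reduces to a map of std::function objects keyed by operation name:

#include <cuda_runtime.h>
#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Simplified stand-in for processor_FP: all callbacks share one signature, so a new
// parameter ripples through every registered operation.
using op_fn = std::function<int(const std::vector<int>& dims, const void* data,
                                bool process_with_cuda, cudaStream_t stream)>;

int main() {
  std::map<std::string, op_fn> ops;
  ops["print_dims"] = [](const std::vector<int>& dims, const void*, bool with_cuda,
                         cudaStream_t /*stream*/) {
    std::cout << (with_cuda ? "CUDA path, " : "CPU path, ") << dims.size() << " dims\n";
    return 0;
  };
  const std::vector<int> dims{1, 16, 16, 3};
  return ops.at("print_dims")(dims, nullptr, /*process_with_cuda=*/false, /*stream=*/nullptr);
}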
@@ -210,29 +217,32 @@ class DataProcessor { /// Mapped function call for the function pointer of max_per_channel_scaled processor_FP max_per_channel_scaled_fp_ = [this](auto& in_dims, const void* in_data, std::vector& out_dims, DataMap& out_data, - auto& output_tensors, auto& custom_strings) { - return compute_max_per_channel_cpu(in_dims, in_data, out_dims, out_data, output_tensors); + auto& output_tensors, auto& custom_strings, bool process_with_cuda, + cudaStream_t cuda_stream) { + return compute_max_per_channel_scaled( + in_dims, in_data, out_dims, out_data, output_tensors, process_with_cuda, cuda_stream); }; /// Mapped function call for the function pointer of scale_intensity_cpu processor_FP scale_intensity_cpu_fp_ = [this](auto& in_dims, const void* in_data, std::vector& out_dims, DataMap& out_data, - auto& output_tensors, auto& custom_strings) { + auto& output_tensors, auto& custom_strings, + bool process_with_cuda, cudaStream_t cuda_stream) { return scale_intensity_cpu(in_dims, in_data, out_dims, out_data, output_tensors); }; /// Mapped function call for the function pointer of print - processor_FP print_results_fp_ = [this](auto& in_dims, const void* in_data, - std::vector& out_dims, DataMap& out_data, - auto& output_tensors, auto& custom_strings) { - return print_results(in_dims, in_data); - }; + processor_FP print_results_fp_ = + [this](auto& in_dims, const void* in_data, std::vector& out_dims, DataMap& out_data, + auto& output_tensors, auto& custom_strings, bool process_with_cuda, + cudaStream_t cuda_stream) { return print_results(in_dims, in_data); }; /// Mapped function call for the function pointer of printing custom binary classification /// results processor_FP print_custom_binary_classification_fp_ = [this](auto& in_dims, const void* in_data, std::vector& out_dims, DataMap& out_data, - auto& output_tensors, auto& custom_strings) { + auto& output_tensors, auto& custom_strings, bool process_with_cuda, + cudaStream_t cuda_stream) { return print_custom_binary_classification(in_dims, in_data, custom_strings); }; @@ -240,16 +250,16 @@ class DataProcessor { /// results to the CSV file using the Data Exporter API. 
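Illustrative aside (a minimal CPU reference, not the SDK implementation): the max_per_channel_scaled operation wired up above produces, for each channel of an HWC tensor, the location of that channel's maximum value scaled to [0, 1], laid out as (row/rows, col/cols) pairs. A self-contained sketch of that computation (ignoring the background-channel special casing in the SDK's host loop):

#include <cstdio>
#include <vector>

// Per-channel argmax over an HWC float tensor, scaled to [0, 1].
// Output layout: out[2*c] = row/rows, out[2*c + 1] = col/cols, one pair per channel.
void max_per_channel_scaled_ref(int rows, int cols, int channels,
                                const float* in, float* out) {
  for (int c = 0; c < channels; ++c) {
    int best_r = 0, best_c = 0;
    float best_v = in[c];  // element at (row 0, col 0, channel c)
    for (int r = 0; r < rows; ++r) {
      for (int col = 0; col < cols; ++col) {
        const float v = in[(r * cols + col) * channels + c];
        if (v > best_v) { best_v = v; best_r = r; best_c = col; }
      }
    }
    out[2 * c + 0] = static_cast<float>(best_r) / static_cast<float>(rows);
    out[2 * c + 1] = static_cast<float>(best_c) / static_cast<float>(cols);
  }
}

int main() {
  const int rows = 2, cols = 3, channels = 1;
  const std::vector<float> in{0.F, 1.F, 0.F, 0.F, 0.F, 5.F};  // maximum at (row 1, col 2)
  std::vector<float> out(2 * channels);
  max_per_channel_scaled_ref(rows, cols, channels, in.data(), out.data());
  std::printf("%.2f %.2f\n", out[0], out[1]);  // prints 0.50 0.67
  return 0;
}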
processor_FP export_binary_classification_to_csv_fp_ = [this](auto& in_dims, const void* in_data, std::vector& out_dims, DataMap& out_data, - auto& output_tensors, auto& custom_strings) { + auto& output_tensors, auto& custom_strings, bool process_with_cuda, + cudaStream_t cuda_stream) { return export_binary_classification_to_csv(in_dims, in_data, custom_strings); }; /// Mapped function call for the function pointer of print int32 - processor_FP print_results_i32_fp_ = [this](auto& in_dims, const void* in_data, - std::vector& out_dims, DataMap& out_data, - auto& output_tensors, auto& custom_strings) { - return print_results_int32(in_dims, in_data); - }; + processor_FP print_results_i32_fp_ = + [this](auto& in_dims, const void* in_data, std::vector& out_dims, DataMap& out_data, + auto& output_tensors, auto& custom_strings, bool process_with_cuda, + cudaStream_t cuda_stream) { return print_results_int32(in_dims, in_data); }; /// Map with supported operation as the key and related function pointer as value const std::map oper_to_fp_{ @@ -272,6 +282,19 @@ const std::map transform_to_fp_{ {"generate_boxes", generate_boxes_fp_}}; + /** + * @brief Computes max per channel in input data and scales it to [0, 1], CUDA version + * + * @param rows Number of rows in the input tensor + * @param cols Number of columns in the input tensor + * @param channels Number of channels in the input tensor + * @param indata Input data buffer + * @param outdata Output data buffer + * @param cuda_stream CUDA stream to use when processing is done with CUDA + */ + void max_per_channel_scaled_cuda(size_t rows, size_t cols, size_t channels, const float* indata, + float* outdata, cudaStream_t cuda_stream); + /// Configuration path std::string config_path_ = {}; @@ -281,4 +304,4 @@ } // namespace inference } // namespace holoscan -#endif +#endif /* MODULES_HOLOINFER_SRC_PROCESS_DATA_PROCESSOR_HPP */ diff --git a/modules/holoinfer/src/process/transforms/generate_boxes.cpp b/modules/holoinfer/src/process/transforms/generate_boxes.cpp index 2462294f..584e5323 100644 --- a/modules/holoinfer/src/process/transforms/generate_boxes.cpp +++ b/modules/holoinfer/src/process/transforms/generate_boxes.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -199,7 +199,7 @@ InferStatus GenerateBoxes::execute_mask(const std::map& inda processed_data.insert({key, std::make_shared()}); processed_dims.insert({key, {{height, width, 4}}}); } - processed_data.at(key)->host_buffer.resize(height * width * 4); + processed_data.at(key)->host_buffer_->resize(height * width * 4); float* scores = static_cast(indata.at(tensor_to_output_map.at("scores"))); float* masks = static_cast(indata.at(tensor_to_output_map.at("masks"))); @@ -209,7 +209,7 @@ InferStatus GenerateBoxes::execute_mask(const std::map& inda size_t size_scores = accumulate(dims_scores.begin(), dims_scores.end(), 1, std::multiplies()); - auto buffer = reinterpret_cast(processed_data.at(key)->host_buffer.data()); + auto buffer = reinterpret_cast(processed_data.at(key)->host_buffer_->data()); for (int i = 0; i < size_scores; i++) { if (scores[i] > threshold) { @@ -283,21 +283,22 @@ InferStatus GenerateBoxes::execute(const std::map& indata, if (processed_data.find(key) == processed_data.end()) { processed_data.insert({key, std::make_shared()}); - processed_data.at(key)->host_buffer.resize(4); + processed_data.at(key)->host_buffer_->resize(4); processed_dims.insert({key, {{1, 2, 2}}}); processed_data.insert({key_text, std::make_shared()}); - processed_data.at(key_text)->host_buffer.resize(3); + processed_data.at(key_text)->host_buffer_->resize(3); processed_dims.insert({key_text, {{1, 1, 3}}}); } - auto current_data = static_cast(processed_data.at(key)->host_buffer.data()); + auto current_data = static_cast(processed_data.at(key)->host_buffer_->data()); current_data[0] = 0; current_data[1] = 0; current_data[2] = 0; current_data[3] = 0; - auto current_data_text = static_cast(processed_data.at(key_text)->host_buffer.data()); + auto current_data_text = + static_cast(processed_data.at(key_text)->host_buffer_->data()); current_data_text[0] = 1.1; current_data_text[1] = 1.1; current_data_text[2] = 0.05; @@ -351,12 +352,12 @@ InferStatus GenerateBoxes::execute(const std::map& indata, for (int i = 0; i < size_of_item; i++) { auto key = fmt::format("{}{}", obj_name, i); - std::memcpy(processed_data.at(key)->host_buffer.data(), + std::memcpy(processed_data.at(key)->host_buffer_->data(), obj_boxes[i].data(), obj_boxes[i].size() * sizeof(float)); key = fmt::format("{}text{}", obj_name, i); - auto current_data_text = static_cast(processed_data.at(key)->host_buffer.data()); + auto current_data_text = static_cast(processed_data.at(key)->host_buffer_->data()); current_data_text[0] = obj_boxes[i][2]; current_data_text[1] = obj_boxes[i][1]; } diff --git a/modules/holoinfer/src/utils/infer_buffer.cpp b/modules/holoinfer/src/utils/infer_buffer.cpp index adb925b6..3c3305f7 100644 --- a/modules/holoinfer/src/utils/infer_buffer.cpp +++ b/modules/holoinfer/src/utils/infer_buffer.cpp @@ -61,8 +61,8 @@ InferStatus allocate_buffers(DataMap& buffers, std::vector& dims, fmt::format("Data buffer creation failed for {} with error {}", keyname, e.what())); return status; } - data_buffer->host_buffer.resize(buffer_size); - if (allocate_cuda) { data_buffer->device_buffer->resize(buffer_size); } + data_buffer->host_buffer_->resize(buffer_size); + if (allocate_cuda) { data_buffer->device_buffer_->resize(buffer_size); } buffers.insert({keyname, std::move(data_buffer)}); return InferStatus(); } @@ -72,25 +72,30 @@ bool DeviceAllocator::operator()(void** ptr, size_t size) const { } void DeviceFree::operator()(void* ptr) const { 
- cudaFree(ptr); + if (ptr) { cudaFree(ptr); } } DataBuffer::DataBuffer(holoinfer_datatype data_type, int device_id) - : type_(data_type), device_id_(device_id) { + : data_type_(data_type) { try { - device_buffer = std::make_shared(type_); + device_buffer_ = std::make_shared(data_type_, device_id); } catch (std::exception& e) { throw std::runtime_error( fmt::format("Device buffer creation failed in DataBuffer constructor with {}", e.what())); } - host_buffer.set_type(type_); + try { + host_buffer_ = std::make_shared(data_type_); + } catch (std::exception& e) { + throw std::runtime_error( + fmt::format("Host buffer creation failed in DataBuffer constructor with {}", e.what())); + } } -DeviceBuffer::DeviceBuffer(holoinfer_datatype type) - : size_(0), capacity_(0), type_(type), buffer_(nullptr) {} +DeviceBuffer::DeviceBuffer(holoinfer_datatype type, int device_id) + : Buffer(type, device_id), size_(0), capacity_(0), buffer_(nullptr) {} DeviceBuffer::DeviceBuffer(size_t size, holoinfer_datatype type) - : size_(size), capacity_(size), type_(type) { + : Buffer(type), size_(size), capacity_(size) { if (!allocator_(&buffer_, this->get_bytes())) { throw std::bad_alloc(); } } @@ -119,5 +124,30 @@ DeviceBuffer::~DeviceBuffer() { free_(buffer_); } +void* HostBuffer::data() { + return static_cast(buffer_.data()); +} + +size_t HostBuffer::size() const { + return number_of_elements_; +} + +size_t HostBuffer::get_bytes() const { + return buffer_.size(); +} + +void HostBuffer::set_type(holoinfer_datatype in_type) { + type_ = in_type; + resize(size()); +} + +void HostBuffer::resize(size_t number_of_elements) { + if (number_of_elements != number_of_elements_) { + buffer_.clear(); + number_of_elements_ = number_of_elements; + buffer_.resize(number_of_elements * get_element_size(type_)); + } +} + } // namespace inference } // namespace holoscan diff --git a/modules/holoviz/examples/demo/Main.cpp b/modules/holoviz/examples/demo/Main.cpp index c0e209ff..1486d1e3 100644 --- a/modules/holoviz/examples/demo/Main.cpp +++ b/modules/holoviz/examples/demo/Main.cpp @@ -75,15 +75,15 @@ bool show_ui = true; bool show_image_layer = true; Source current_source = Source::DEVICE; uint32_t current_format_index = 2; -float image_layer_opacity = 1.f; +float image_layer_opacity = 1.F; int image_layer_priority = 0; bool show_geometry_layer = true; -float geometry_layer_opacity = 1.f; +float geometry_layer_opacity = 1.F; int geometry_layer_priority = 1; bool show_geometry_3d_layer = true; -float geometry_3d_layer_opacity = 1.f; +float geometry_3d_layer_opacity = 1.F; int geometry_3d_layer_priority = 2; uint32_t width = 1920; @@ -93,7 +93,7 @@ uint32_t height = 1080; std::chrono::steady_clock::time_point start; std::chrono::milliseconds elapsed; uint32_t iterations = 0; -float fps = 0.f; +float fps = 0.F; // cuda CUcontext cuda_context = nullptr; @@ -131,7 +131,7 @@ void tick() { reinterpret_cast(¤t_format_index), format_items, IM_ARRAYSIZE(format_items)); - ImGui::SliderFloat("Opacity##image", &image_layer_opacity, 0.f, 1.f); + ImGui::SliderFloat("Opacity##image", &image_layer_opacity, 0.F, 1.F); ImGui::SliderInt("Priority##image", &image_layer_priority, -10, 10); // color picker for first item of LUT @@ -140,29 +140,29 @@ void tick() { ImGui::SliderInt("LUT index", &color_index, 0, palette.size() - 1); uint32_t& item = palette[color_index]; - float color[]{(item & 0xFF) / 255.f, - ((item >> 8) & 0xFF) / 255.f, - ((item >> 16) & 0xFF) / 255.f, - ((item >> 24) & 0xFF) / 255.f}; + float color[]{(item & 0xFF) / 255.F, + ((item >> 8) & 
0xFF) / 255.F, + ((item >> 16) & 0xFF) / 255.F, + ((item >> 24) & 0xFF) / 255.F}; ImGui::ColorEdit4("##color", color, ImGuiColorEditFlags_DefaultOptions_); - item = static_cast((color[0] * 255.f) + 0.5f) + - (static_cast((color[1] * 255.f) + 0.5f) << 8) + - (static_cast((color[2] * 255.f) + 0.5f) << 16) + - (static_cast((color[3] * 255.f) + 0.5f) << 24); + item = static_cast((color[0] * 255.F) + 0.5F) + + (static_cast((color[1] * 255.F) + 0.5F) << 8) + + (static_cast((color[2] * 255.F) + 0.5F) << 16) + + (static_cast((color[3] * 255.F) + 0.5F) << 24); } } ImGui::Separator(); ImGui::Checkbox("Geometry layer", &show_geometry_layer); if (show_geometry_layer) { - ImGui::SliderFloat("Opacity##geom", &geometry_layer_opacity, 0.f, 1.f); + ImGui::SliderFloat("Opacity##geom", &geometry_layer_opacity, 0.F, 1.F); ImGui::SliderInt("Priority##geom", &geometry_layer_priority, -10, 10); } ImGui::Separator(); ImGui::Checkbox("3D Geometry layer", &show_geometry_3d_layer); if (show_geometry_3d_layer) { - ImGui::SliderFloat("Opacity##geom3d", &geometry_3d_layer_opacity, 0.f, 1.f); + ImGui::SliderFloat("Opacity##geom3d", &geometry_3d_layer_opacity, 0.F, 1.F); ImGui::SliderInt("Priority##geom3d", &geometry_3d_layer_priority, -10, 10); } @@ -216,61 +216,61 @@ void tick() { viz::LayerOpacity(geometry_layer_opacity); viz::LayerPriority(geometry_layer_priority); - const float text_size = 0.05f; + const float text_size = 0.05F; - viz::Color(1.f, 0.f, 0.f, 1.f); - viz::Text(0.65f, 0.05f, text_size, "POINT_LIST"); + viz::Color(1.F, 0.F, 0.F, 1.F); + viz::Text(0.65F, 0.05F, text_size, "POINT_LIST"); { - const float data[]{0.9f, 0.1f, 0.95f, 0.05f}; - viz::PointSize(5.f); + const float data[]{0.9F, 0.1F, 0.95F, 0.05F}; + viz::PointSize(5.F); viz::Primitive(viz::PrimitiveTopology::POINT_LIST, 2, sizeof(data) / sizeof(data[0]), data); } - viz::Color(0.f, 1.f, 0.f, 1.f); - viz::Text(0.65f, 0.2f, text_size, "LINE_LIST"); + viz::Color(0.F, 1.F, 0.F, 1.F); + viz::Text(0.65F, 0.2F, text_size, "LINE_LIST"); { - const float data[]{0.9f, 0.25f, 0.95f, 0.2f, 0.92f, 0.27f, 0.93f, 0.23f}; - viz::LineWidth(2.f); + const float data[]{0.9F, 0.25F, 0.95F, 0.2F, 0.92F, 0.27F, 0.93F, 0.23F}; + viz::LineWidth(2.F); viz::Primitive(viz::PrimitiveTopology::LINE_LIST, 2, sizeof(data) / sizeof(data[0]), data); } - viz::Color(1.f, 1.f, 0.f, 1.f); - viz::Text(0.65f, 0.35f, text_size, "LINE_STRIP"); + viz::Color(1.F, 1.F, 0.F, 1.F); + viz::Text(0.65F, 0.35F, text_size, "LINE_STRIP"); { - const float data[]{0.9f, 0.35f, 0.95f, 0.3f, 0.97f, 0.37f, 0.93f, 0.35f}; - viz::LineWidth(1.f); + const float data[]{0.9F, 0.35F, 0.95F, 0.3F, 0.97F, 0.37F, 0.93F, 0.35F}; + viz::LineWidth(1.F); viz::Primitive(viz::PrimitiveTopology::LINE_STRIP, 3, sizeof(data) / sizeof(data[0]), data); } - viz::Color(0.f, 0.f, 1.f, 1.f); - viz::Text(0.65f, 0.5f, text_size, "TRIANGLE_LIST"); + viz::Color(0.F, 0.F, 1.F, 1.F); + viz::Text(0.65F, 0.5F, text_size, "TRIANGLE_LIST"); { const float data[]{ - 0.9f, 0.45f, 0.92f, 0.45f, 0.91f, 0.5f, 0.95f, 0.45f, 0.95f, 0.55f, 0.975f, 0.50f}; + 0.9F, 0.45F, 0.92F, 0.45F, 0.91F, 0.5F, 0.95F, 0.45F, 0.95F, 0.55F, 0.975F, 0.50F}; viz::Primitive( viz::PrimitiveTopology::TRIANGLE_LIST, 2, sizeof(data) / sizeof(data[0]), data); } - viz::Color(1.f, 0.f, 1.f, 1.f); - viz::Text(0.65f, 0.65f, text_size, "CROSS_LIST"); + viz::Color(1.F, 0.F, 1.F, 1.F); + viz::Text(0.65F, 0.65F, text_size, "CROSS_LIST"); { - const float data[]{0.9f, 0.7f, 0.08f, 0.95f, 0.65f, 0.05f}; + const float data[]{0.9F, 0.7F, 0.08F, 0.95F, 0.65F, 0.05F}; 
viz::Primitive(viz::PrimitiveTopology::CROSS_LIST, 2, sizeof(data) / sizeof(data[0]), data); } - viz::Color(0.f, 1.f, 1.f, 1.f); - viz::Text(0.65f, 0.8f, text_size, "RECTANGLE_LIST"); + viz::Color(0.F, 1.F, 1.F, 1.F); + viz::Text(0.65F, 0.8F, text_size, "RECTANGLE_LIST"); { - const float data[]{0.9f, 0.75f, 0.98f, 0.85f, 0.95f, 0.8f, 0.97f, 0.83f}; + const float data[]{0.9F, 0.75F, 0.98F, 0.85F, 0.95F, 0.8F, 0.97F, 0.83F}; viz::Primitive( viz::PrimitiveTopology::RECTANGLE_LIST, 2, sizeof(data) / sizeof(data[0]), data); } - viz::Color(1.f, 1.f, 1.f, 1.f); - viz::Text(0.65f, 0.95f, text_size, "OVAL_LIST"); + viz::Color(1.F, 1.F, 1.F, 1.F); + viz::Text(0.65F, 0.95F, text_size, "OVAL_LIST"); { - const float data[]{0.9f, 0.95f, 0.1f, 0.1f, 0.95f, 0.975f, 0.05f, 0.1f}; - viz::LineWidth(3.f); + const float data[]{0.9F, 0.95F, 0.1F, 0.1F, 0.95F, 0.975F, 0.05F, 0.1F}; + viz::LineWidth(3.F); viz::Primitive(viz::PrimitiveTopology::OVAL_LIST, 2, sizeof(data) / sizeof(data[0]), data); } @@ -283,12 +283,12 @@ void tick() { viz::LayerPriority(geometry_3d_layer_priority); { - const float x_min = -0.25f; - const float x_max = 0.25f; - const float y_min = -0.25f; - const float y_max = 0.25f; - const float z_min = -0.25f; - const float z_max = 0.25f; + const float x_min = -0.25F; + const float x_max = 0.25F; + const float y_min = -0.25F; + const float y_max = 0.25F; + const float z_min = -0.25F; + const float z_max = 0.25F; const float data[]{ x_min, y_min, z_min, x_max, y_min, z_min, x_min, y_max, z_min, x_max, y_max, z_min, x_min, y_min, z_max, x_max, y_min, z_max, x_min, y_max, z_max, x_max, y_max, z_max, @@ -296,20 +296,20 @@ void tick() { x_min, y_max, z_max, x_min, y_min, z_max, x_max, y_max, z_max, x_max, y_min, z_max, x_min, y_max, z_min, x_min, y_max, z_max, x_min, y_min, z_min, x_min, y_min, z_max, x_max, y_max, z_min, x_max, y_max, z_max, x_max, y_min, z_min, x_max, y_min, z_max}; - viz::Color(0.75f, 0.f, 0.25f, 1.f); - viz::LineWidth(4.f); + viz::Color(0.75F, 0.F, 0.25F, 1.F); + viz::LineWidth(4.F); viz::Primitive( viz::PrimitiveTopology::LINE_LIST_3D, 12, sizeof(data) / sizeof(data[0]), data); - viz::Color(0.f, 1.f, 0.f, 1.f); - viz::PointSize(6.f); + viz::Color(0.F, 1.F, 0.F, 1.F); + viz::PointSize(6.F); viz::Primitive( viz::PrimitiveTopology::POINT_LIST_3D, 24, sizeof(data) / sizeof(data[0]), data); } { - const float data[]{-0.125f, -0.125f, 0.f, 0.125f, -0.125f, 0.f, 0.125f, 0.125f, 0.f}; + const float data[]{-0.125F, -0.125F, 0.F, 0.125F, -0.125F, 0.F, 0.125F, 0.125F, 0.F}; - viz::Color(0.f, 0.f, 1.f, 1.f); + viz::Color(0.F, 0.F, 1.F, 1.F); viz::Primitive( viz::PrimitiveTopology::TRIANGLE_LIST_3D, 1, sizeof(data) / sizeof(data[0]), data); } @@ -324,7 +324,7 @@ void tick() { elapsed = std::chrono::duration_cast(std::chrono::steady_clock::now() - start); if (!benchmark_mode && (elapsed.count() > 1000)) { - fps = static_cast(iterations) / (static_cast(elapsed.count()) / 1000.f); + fps = static_cast(iterations) / (static_cast(elapsed.count()) / 1000.F); start = std::chrono::steady_clock::now(); iterations = 0; } @@ -420,12 +420,12 @@ void loadImage() { dst_r8 += 1; // BT.601 full range RGB -> YUV - dst_y8[0] = (0.f + (0.299f * r) + (0.587f * g) + (0.114 * b)) + 0.5f; + dst_y8[0] = (0.F + (0.299F * r) + (0.587F * g) + (0.114F * b)) + 0.5F; dst_y8 += 1; if (!(x & 1) && !(y & 1)) { - dst_u8v8[0] = (128.f - (0.168736f * r) - (0.331264f * g) + (0.5f * b)) + 0.5f; - dst_u8v8[1] = (128.f + (0.5f * r) - (0.418688f * g) - (0.081312f * b)) + 0.5f; + dst_u8v8[0] = (128.F - (0.168736F * r) - (0.331264F * g) 
+ (0.5F * b)) + 0.5F; + dst_u8v8[1] = (128.F + (0.5F * r) - (0.418688F * g) - (0.081312F * b)) + 0.5F; dst_u8v8 += 2; } @@ -497,7 +497,8 @@ int main(int argc, char** argv) { // parse options while (true) { int option_index = 0; - const int c = getopt_long(argc, argv, "hblfed:p:c:", long_options, &option_index); + const int c = + getopt_long(argc, argv, "hblfed:p:c:", static_cast(long_options), &option_index); if (c == -1) { break; } @@ -610,7 +611,7 @@ int main(int argc, char** argv) { if (fullscreen) { flags = viz::InitFlags::FULLSCREEN; } display_width = 1024; - display_height = uint32_t(static_cast(height) / static_cast(width) * 1024.f); + display_height = uint32_t(static_cast(height) / static_cast(width) * 1024.F); viz::Init(display_width, display_height, "Holoviz Example", @@ -644,7 +645,7 @@ int main(int argc, char** argv) { start = std::chrono::steady_clock::time_point(); do { tick(); } while (elapsed.count() < 2000); std::cout << current_source << " " << format_items[current_format_index] << " " - << float(iterations) / (float(elapsed.count()) / 1000.f) << " fps" << std::endl; + << float(iterations) / (float(elapsed.count()) / 1000.F) << " fps" << std::endl; } } } else if (headless_mode) { diff --git a/modules/holoviz/examples/depth_map/Main.cpp b/modules/holoviz/examples/depth_map/Main.cpp index fccaa278..463b3f26 100644 --- a/modules/holoviz/examples/depth_map/Main.cpp +++ b/modules/holoviz/examples/depth_map/Main.cpp @@ -63,10 +63,10 @@ uint32_t current_format_index = 0; bool show_ui = true; bool unlimited_fps = false; -float fps = 15.f; +float fps = 15.F; int color_index = 0; -float line_width = 1.f; -float point_size = 1.f; +float line_width = 1.F; +float point_size = 1.F; // cuda CUcontext cuda_context = nullptr; @@ -85,9 +85,9 @@ void loadImage(const std::string& filename, CUdeviceptr* cu_device_mem, int* wid tmp_image_data.reserve(*width * *height); for (int index = 0; index < *width * *height; ++index) { const uint8_t* src = &file_image_data[index * *components]; - tmp_image_data.push_back(static_cast(0.2126f * static_cast(src[0]) + - 0.7152f * static_cast(src[1]) + - 0.0722f * static_cast(src[2]) + 0.5f)); + tmp_image_data.push_back(static_cast(0.2126F * static_cast(src[0]) + + 0.7152F * static_cast(src[1]) + + 0.0722F * static_cast(src[2]) + 0.5F)); } image_data = tmp_image_data.data(); *components = 1; @@ -176,13 +176,13 @@ void generateSourceData() { case viz::ImageFormat::R8_UNORM: element_size = sizeof(uint8_t); write_depth = [](void* data, size_t index, float value) { - reinterpret_cast(data)[index] = value * 63.f; + reinterpret_cast(data)[index] = value * 63.F; }; break; case viz::ImageFormat::D32_SFLOAT: element_size = sizeof(float); write_depth = [](void* data, size_t index, float value) { - reinterpret_cast(data)[index] = value / 4.f; + reinterpret_cast(data)[index] = value / 4.F; }; break; default: @@ -213,13 +213,13 @@ void generateSourceData() { for (uint32_t y = 0; y < map_height; ++y) { for (uint32_t x = 0; x < map_width; ++x) { - const float depth = (std::sin((float(x) / float(map_width)) * 3.14f * 4.f) * - std::cos((float(y) / float(map_height)) * 3.14f * 3.f) + - 1.f) * + const float depth = (std::sin((float(x) / float(map_width)) * 3.14F * 4.F) * + std::cos((float(y) / float(map_height)) * 3.14F * 3.F) + + 1.F) * offset; write_depth(depth_data.get(), y * map_width + x, depth); - const uint8_t color = depth * 63.f; + const uint8_t color = depth * 63.F; color_data[y * map_width + x] = color | ((color << (8 + (x & 1))) & 0xFF00) | ((color << (16 + (y & 1) 
* 2)) & 0xFF0000) | 0xFF204060; } @@ -283,23 +283,23 @@ void tick() { { uint32_t& item = palette[color_index]; - float color[]{(item & 0xFF) / 255.f, - ((item >> 8) & 0xFF) / 255.f, - ((item >> 16) & 0xFF) / 255.f, - ((item >> 24) & 0xFF) / 255.f}; + float color[]{(item & 0xFF) / 255.F, + ((item >> 8) & 0xFF) / 255.F, + ((item >> 16) & 0xFF) / 255.F, + ((item >> 24) & 0xFF) / 255.F}; ImGui::ColorEdit4("##color", color, ImGuiColorEditFlags_DefaultOptions_); - item = static_cast((color[0] * 255.f) + 0.5f) + - (static_cast((color[1] * 255.f) + 0.5f) << 8) + - (static_cast((color[2] * 255.f) + 0.5f) << 16) + - (static_cast((color[3] * 255.f) + 0.5f) << 24); + item = static_cast((color[0] * 255.F) + 0.5F) + + (static_cast((color[1] * 255.F) + 0.5F) << 8) + + (static_cast((color[2] * 255.F) + 0.5F) << 16) + + (static_cast((color[3] * 255.F) + 0.5F) << 24); } } break; case RenderMode::LINES: - ImGui::SliderFloat("Line width", &line_width, 1.f, 20.f); + ImGui::SliderFloat("Line width", &line_width, 1.F, 20.F); break; case RenderMode::POINTS: - ImGui::SliderFloat("Point size", &point_size, 1.f, 20.f); + ImGui::SliderFloat("Point size", &point_size, 1.F, 20.F); break; } @@ -435,7 +435,8 @@ int main(int argc, char** argv) { // parse options while (true) { int option_index = 0; - const int c = getopt_long(argc, argv, "hd:c:bl", long_options, &option_index); + const int c = + getopt_long(argc, argv, "hd:c:bl", static_cast(long_options), &option_index); if (c == -1) { break; } @@ -501,7 +502,7 @@ int main(int argc, char** argv) { std::cout << size << " " << format_items[current_format_index] << " " << render_mode_items[int(current_render_mode)] << " " - << float(iterations) / (float(elapsed.count()) / 1000.f) << " fps" + << float(iterations) / (float(elapsed.count()) / 1000.F) << " fps" << std::endl; } } @@ -511,7 +512,7 @@ int main(int argc, char** argv) { if (!viz::WindowIsMinimized()) { tick(); if (!unlimited_fps) { - std::this_thread::sleep_for(std::chrono::duration(1000.f / fps)); + std::this_thread::sleep_for(std::chrono::duration(1000.F / fps)); } } } diff --git a/modules/holoviz/src/context.cpp b/modules/holoviz/src/context.cpp index dc589f9c..3e1173ba 100644 --- a/modules/holoviz/src/context.cpp +++ b/modules/holoviz/src/context.cpp @@ -149,7 +149,7 @@ class Context::Impl { std::optional surface_format_; CUstream cuda_stream_ = 0; std::string font_path_; - float font_size_in_pixels_ = 0.f; + float font_size_in_pixels_ = 0.F; PresentMode present_mode_ = PresentMode::AUTO; std::unique_ptr window_; diff --git a/modules/holoviz/src/cuda/gen_depth_map.cu b/modules/holoviz/src/cuda/gen_depth_map.cu index 5414243a..b77ce4fe 100644 --- a/modules/holoviz/src/cuda/gen_depth_map.cu +++ b/modules/holoviz/src/cuda/gen_depth_map.cu @@ -36,10 +36,10 @@ __global__ void GenDepthMapCoordsKernel(uint32_t width, uint32_t height, float i dst += offset * 3; - dst[0] = float(launch_index.x) * inv_width - 0.5f; - dst[1] = float(launch_index.y) * inv_height - 0.5f; + dst[0] = float(launch_index.x) * inv_width - 0.5F; + dst[1] = float(launch_index.y) * inv_height - 0.5F; if constexpr (std::is_same::value) { - dst[2] = float(src[offset]) / 255.f; + dst[2] = float(src[offset]) / 255.F; } else if constexpr (std::is_same::value) { dst[2] = src[offset]; } @@ -96,8 +96,8 @@ void GenDepthMapCoords(ImageFormat depth_format, uint32_t width, uint32_t height const dim3 launch_grid((width + (block_dim.x - 1)) / block_dim.x, (height + (block_dim.y - 1)) / block_dim.y); - const float inv_width = 1.f / float(width); - const float 
inv_height = 1.f / float(height); + const float inv_width = 1.F / float(width); + const float inv_height = 1.F / float(height); switch (depth_format) { case ImageFormat::R8_UNORM: diff --git a/modules/holoviz/src/exclusive_window.cpp b/modules/holoviz/src/exclusive_window.cpp index 8ac9a307..e1f396b0 100644 --- a/modules/holoviz/src/exclusive_window.cpp +++ b/modules/holoviz/src/exclusive_window.cpp @@ -155,12 +155,12 @@ vk::SurfaceKHR ExclusiveWindow::create_surface(vk::PhysicalDevice physical_devic HOLOSCAN_LOG_WARN("Did not find a display mode with the desired properties {}x{} {:.3f} Hz", impl_->desired_width_, impl_->desired_height_, - static_cast(impl_->desired_refresh_rate_) / 1000.f); + static_cast(impl_->desired_refresh_rate_) / 1000.F); } HOLOSCAN_LOG_INFO("Using display mode {}x{} {:.3f} Hz", mode_properties.parameters.visibleRegion.width, mode_properties.parameters.visibleRegion.height, - static_cast(mode_properties.parameters.refreshRate) / 1000.f); + static_cast(mode_properties.parameters.refreshRate) / 1000.F); impl_->width_ = mode_properties.parameters.visibleRegion.width; impl_->height_ = mode_properties.parameters.visibleRegion.height; @@ -213,7 +213,7 @@ vk::SurfaceKHR ExclusiveWindow::create_surface(vk::PhysicalDevice physical_devic surface_create_info.planeIndex = plane_index; surface_create_info.planeStackIndex = planes[plane_index].currentStackIndex; surface_create_info.transform = vk::SurfaceTransformFlagBitsKHR::eIdentity; - surface_create_info.globalAlpha = 1.0f; + surface_create_info.globalAlpha = 1.0F; surface_create_info.alphaMode = selected_alpha_mode; surface_create_info.imageExtent = vk::Extent2D{mode_properties.parameters.visibleRegion.width, mode_properties.parameters.visibleRegion.height}; @@ -232,7 +232,7 @@ bool ExclusiveWindow::is_minimized() { void ExclusiveWindow::im_gui_new_frame() { ImGuiIO& io = ImGui::GetIO(); io.DisplaySize = ImVec2(static_cast(impl_->width_), static_cast(impl_->height_)); - io.DisplayFramebufferScale = ImVec2(1.f, 1.f); + io.DisplayFramebufferScale = ImVec2(1.F, 1.F); ImGui::NewFrame(); } diff --git a/modules/holoviz/src/glfw_window.cpp b/modules/holoviz/src/glfw_window.cpp index 38b2ee1a..8bbab6f9 100644 --- a/modules/holoviz/src/glfw_window.cpp +++ b/modules/holoviz/src/glfw_window.cpp @@ -539,7 +539,7 @@ float GLFWWindow::get_aspect_ratio() { if (impl_->framebuffer_height_) { return float(impl_->framebuffer_width_) / float(impl_->framebuffer_height_); } else { - return 1.f; + return 1.F; } } diff --git a/modules/holoviz/src/headless_window.cpp b/modules/holoviz/src/headless_window.cpp index 278f6b6b..1ccf3d57 100644 --- a/modules/holoviz/src/headless_window.cpp +++ b/modules/holoviz/src/headless_window.cpp @@ -83,7 +83,7 @@ bool HeadlessWindow::is_minimized() { void HeadlessWindow::im_gui_new_frame() { ImGuiIO& io = ImGui::GetIO(); io.DisplaySize = ImVec2(static_cast(impl_->width_), static_cast(impl_->height_)); - io.DisplayFramebufferScale = ImVec2(1.f, 1.f); + io.DisplayFramebufferScale = ImVec2(1.F, 1.F); ImGui::NewFrame(); } diff --git a/modules/holoviz/src/holoviz/color_space.hpp b/modules/holoviz/src/holoviz/color_space.hpp index 775c7d43..0015f429 100644 --- a/modules/holoviz/src/holoviz/color_space.hpp +++ b/modules/holoviz/src/holoviz/color_space.hpp @@ -20,7 +20,6 @@ #include -#include "holoviz/color_space.hpp" #include "holoviz/image_format.hpp" namespace holoscan::viz { diff --git a/modules/holoviz/src/layers/geometry_layer.cpp b/modules/holoviz/src/layers/geometry_layer.cpp index 196814c4..50ebacfe 100644 --- 
a/modules/holoviz/src/layers/geometry_layer.cpp +++ b/modules/holoviz/src/layers/geometry_layer.cpp @@ -43,7 +43,7 @@ constexpr uint32_t CIRCLE_SEGMENTS = 32; class Attributes { public: - Attributes() : color_({1.f, 1.f, 1.f, 1.f}), line_width_(1.f), point_size_(1.f) {} + Attributes() : color_({1.F, 1.F, 1.F, 1.F}), line_width_(1.F), point_size_(1.F) {} bool operator==(const Attributes& rhs) const { return ((color_ == rhs.color_) && (line_width_ == rhs.line_width_) && @@ -187,7 +187,7 @@ class GeometryLayer::Impl { std::list depth_maps_; // internal state - float aspect_ratio_ = 1.f; + float aspect_ratio_ = 1.F; size_t vertex_count_ = 0; std::unique_ptr vertex_buffer_; @@ -385,7 +385,7 @@ void GeometryLayer::end(Vulkan* vulkan) { for (uint32_t index = 0; index < primitive.data_.size() / 2; ++index) { vertices.insert( vertices.end(), - {primitive.data_[index * 2 + 0], primitive.data_[index * 2 + 1], 0.f}); + {primitive.data_[index * 2 + 0], primitive.data_[index * 2 + 1], 0.F}); } break; case PrimitiveTopology::CROSS_LIST: @@ -393,10 +393,10 @@ void GeometryLayer::end(Vulkan* vulkan) { for (uint32_t index = 0; index < primitive.primitive_count_; ++index) { const float x = primitive.data_[index * 3 + 0]; const float y = primitive.data_[index * 3 + 1]; - const float sy = primitive.data_[index * 3 + 2] * 0.5f; + const float sy = primitive.data_[index * 3 + 2] * 0.5F; const float sx = sy / impl_->aspect_ratio_; vertices.insert(vertices.end(), - {x - sx, y, 0.f, x + sx, y, 0.f, x, y - sy, 0.f, x, y + sy, 0.f}); + {x - sx, y, 0.F, x + sx, y, 0.F, x, y - sy, 0.F, x, y + sy, 0.F}); } break; case PrimitiveTopology::RECTANGLE_LIST: @@ -407,20 +407,20 @@ void GeometryLayer::end(Vulkan* vulkan) { const float x1 = primitive.data_[index * 4 + 2]; const float y1 = primitive.data_[index * 4 + 3]; vertices.insert(vertices.end(), - {x0, y0, 0.f, x1, y0, 0.f, x1, y1, 0.f, x0, y1, 0.f, x0, y0, 0.f}); + {x0, y0, 0.F, x1, y0, 0.F, x1, y1, 0.F, x0, y1, 0.F, x0, y0, 0.F}); } break; case PrimitiveTopology::OVAL_LIST: for (uint32_t index = 0; index < primitive.primitive_count_; ++index) { const float x = primitive.data_[index * 4 + 0]; const float y = primitive.data_[index * 4 + 1]; - const float rx = primitive.data_[index * 4 + 2] * 0.5f; - const float ry = primitive.data_[index * 4 + 3] * 0.5f; + const float rx = primitive.data_[index * 4 + 2] * 0.5F; + const float ry = primitive.data_[index * 4 + 3] * 0.5F; for (uint32_t segment = 0; segment <= CIRCLE_SEGMENTS; ++segment) { - const float rad = (2.f * M_PI) / CIRCLE_SEGMENTS * segment; + const float rad = (2.F * M_PI) / CIRCLE_SEGMENTS * segment; const float px = x + std::cos(rad) * rx; const float py = y + std::sin(rad) * ry; - vertices.insert(vertices.end(), {px, py, 0.f}); + vertices.insert(vertices.end(), {px, py, 0.F}); } } break; @@ -445,9 +445,9 @@ void GeometryLayer::end(Vulkan* vulkan) { // ImGui is using integer coordinates for the text position, we use the 0...1 range. // Therefore generate vertices in larger scale and scale them down afterwards. 
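Illustrative aside (standalone sketch, not the layer's internal API): the OVAL_LIST branch above approximates each oval as a closed line strip of CIRCLE_SEGMENTS + 1 points on an ellipse; the tessellation itself is a plain parametric sweep (the SDK additionally appends z = 0 and corrects the x radius by the aspect ratio):

#include <cmath>
#include <vector>

// Tessellate an axis-aligned ellipse (center cx, cy; half-extents rx, ry) into a
// closed 2D line strip: `segments` + 1 points, the last repeating the first.
std::vector<float> tessellate_oval(float cx, float cy, float rx, float ry,
                                   unsigned int segments = 32) {
  const float two_pi = 6.2831853F;
  std::vector<float> vertices;
  vertices.reserve((segments + 1) * 2);
  for (unsigned int s = 0; s <= segments; ++s) {
    const float rad = two_pi / static_cast<float>(segments) * static_cast<float>(s);
    vertices.push_back(cx + std::cos(rad) * rx);
    vertices.push_back(cy + std::sin(rad) * ry);
  }
  return vertices;
}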
- const float scale = 16384.f; - const ImVec4 clip_rect(0.f, 0.f, scale * std::max(1.f, impl_->aspect_ratio_), scale); - const float inv_scale = 1.f / scale; + const float scale = 16384.F; + const ImVec4 clip_rect(0.F, 0.F, scale * std::max(1.F, impl_->aspect_ratio_), scale); + const float inv_scale = 1.F / scale; ImDrawVert *vertex_base = nullptr, *vertex = nullptr; for (auto&& text : impl_->texts_) { @@ -471,7 +471,7 @@ void GeometryLayer::end(Vulkan* vulkan) { } while (vertex < impl_->text_draw_list_->_VtxWritePtr) { vertex->pos.x = - (vertex->pos.x * inv_scale - text.x_) * (1.f / impl_->aspect_ratio_) + text.x_; + (vertex->pos.x * inv_scale - text.x_) * (1.F / impl_->aspect_ratio_) + text.x_; vertex->pos.y *= inv_scale; ++vertex; } @@ -609,8 +609,8 @@ void GeometryLayer::render(Vulkan* vulkan) { // setup the 2D view matrix in a way that geometry coordinates are in the range [0...1] nvmath::mat4f view_matrix_2d_base; view_matrix_2d_base.identity(); - view_matrix_2d_base.scale({2.f, 2.f, 1.f}); - view_matrix_2d_base.translate({-.5f, -.5f, 0.f}); + view_matrix_2d_base.scale({2.F, 2.F, 1.F}); + view_matrix_2d_base.translate({-.5F, -.5F, 0.F}); std::vector views = get_views(); if (views.empty()) { views.push_back(Layer::View()); } diff --git a/modules/holoviz/src/layers/im_gui_layer.cpp b/modules/holoviz/src/layers/im_gui_layer.cpp index de675bec..fa2f64be 100644 --- a/modules/holoviz/src/layers/im_gui_layer.cpp +++ b/modules/holoviz/src/layers/im_gui_layer.cpp @@ -94,9 +94,9 @@ void ImGuiLayer::render(Vulkan* vulkan) { // setup the base view matrix in a way that coordinates are in the range [0...1] nvmath::mat4f view_matrix_base; view_matrix_base.identity(); - view_matrix_base.translate({-1.f, -1.f, 0.f}); + view_matrix_base.translate({-1.F, -1.F, 0.F}); view_matrix_base.scale( - {2.f / impl_->draw_data_->DisplaySize.x, 2.f / impl_->draw_data_->DisplaySize.y, 1.f}); + {2.F / impl_->draw_data_->DisplaySize.x, 2.F / impl_->draw_data_->DisplaySize.y, 1.F}); std::vector views = get_views(); if (views.empty()) { views.push_back(Layer::View()); } diff --git a/modules/holoviz/src/layers/layer.cpp b/modules/holoviz/src/layers/layer.cpp index 7f8b5b5d..b2c3de67 100644 --- a/modules/holoviz/src/layers/layer.cpp +++ b/modules/holoviz/src/layers/layer.cpp @@ -24,7 +24,7 @@ namespace holoscan::viz { struct Layer::Impl { int32_t priority_ = 0; - float opacity_ = 1.f; + float opacity_ = 1.F; std::vector views_; }; @@ -53,7 +53,7 @@ float Layer::get_opacity() const { } void Layer::set_opacity(float opacity) { - if ((opacity < 0.f) || (opacity > 1.f)) { + if ((opacity < 0.F) || (opacity > 1.F)) { throw std::invalid_argument("Layer opacity should be in the range [0.0 ... 1.0]"); } impl_->opacity_ = opacity; diff --git a/modules/holoviz/src/layers/layer.hpp b/modules/holoviz/src/layers/layer.hpp index e99794b8..77abf06c 100644 --- a/modules/holoviz/src/layers/layer.hpp +++ b/modules/holoviz/src/layers/layer.hpp @@ -90,9 +90,9 @@ class Layer { */ struct View { /// offset of top-left corner. Top left coordinate is (0, 0) bottom right coordinate is (1, 1). - float offset_x = 0.f, offset_y = 0.f; + float offset_x = 0.F, offset_y = 0.F; /// width and height of the layer in normalized range. 1.0 is full size. 
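Illustrative aside (hypothetical helper, not Holoviz API): the View offsets and sizes defined below are normalized to [0, 1], so a consumer has to map them to framebuffer pixels before use; a hedged sketch of that mapping, assuming simple rounding:

#include <cstdint>

// Convert a normalized view rectangle (offset/size in [0, 1], as in Layer::View)
// into integer pixel coordinates for a framebuffer of the given size.
struct PixelRect { int32_t x; int32_t y; uint32_t width; uint32_t height; };

PixelRect to_pixels(float offset_x, float offset_y, float width, float height,
                    uint32_t fb_width, uint32_t fb_height) {
  PixelRect rect{};
  rect.x = static_cast<int32_t>(offset_x * static_cast<float>(fb_width) + 0.5F);
  rect.y = static_cast<int32_t>(offset_y * static_cast<float>(fb_height) + 0.5F);
  rect.width = static_cast<uint32_t>(width * static_cast<float>(fb_width) + 0.5F);
  rect.height = static_cast<uint32_t>(height * static_cast<float>(fb_height) + 0.5F);
  return rect;
}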
- float width = 1.f, height = 1.f; + float width = 1.F, height = 1.F; /// transform matrix std::optional matrix; diff --git a/modules/holoviz/src/vulkan/format_util.cpp b/modules/holoviz/src/vulkan/format_util.cpp index 4e7968d6..172358fc 100644 --- a/modules/holoviz/src/vulkan/format_util.cpp +++ b/modules/holoviz/src/vulkan/format_util.cpp @@ -30,7 +30,7 @@ void format_info(ImageFormat format, uint32_t* channels, uint32_t* hw_channels, case ImageFormat::R8_UNORM: case ImageFormat::R8_SNORM: case ImageFormat::R8_SRGB: - *channels = *hw_channels = 1u; + *channels = *hw_channels = 1U; *component_size = sizeof(uint8_t); break; case ImageFormat::R16_UINT: @@ -38,7 +38,7 @@ void format_info(ImageFormat format, uint32_t* channels, uint32_t* hw_channels, case ImageFormat::R16_UNORM: case ImageFormat::R16_SNORM: case ImageFormat::R16_SFLOAT: - *channels = *hw_channels = 1u; + *channels = *hw_channels = 1U; *component_size = sizeof(uint16_t); break; case ImageFormat::R32_UINT: @@ -46,18 +46,18 @@ void format_info(ImageFormat format, uint32_t* channels, uint32_t* hw_channels, // packed formats are treated as single component formats case ImageFormat::A2B10G10R10_UNORM_PACK32: case ImageFormat::A2R10G10B10_UNORM_PACK32: - *channels = *hw_channels = 1u; + *channels = *hw_channels = 1U; *component_size = sizeof(uint32_t); break; case ImageFormat::R32_SFLOAT: - *channels = *hw_channels = 1u; + *channels = *hw_channels = 1U; *component_size = sizeof(float); break; case ImageFormat::R8G8B8_UNORM: case ImageFormat::R8G8B8_SNORM: case ImageFormat::R8G8B8_SRGB: - *channels = 3u; - *hw_channels = 4u; + *channels = 3U; + *hw_channels = 4U; *component_size = sizeof(uint8_t); break; case ImageFormat::R8G8B8A8_UNORM: @@ -67,41 +67,41 @@ void format_info(ImageFormat format, uint32_t* channels, uint32_t* hw_channels, case ImageFormat::B8G8R8A8_SRGB: case ImageFormat::A8B8G8R8_UNORM_PACK32: case ImageFormat::A8B8G8R8_SRGB_PACK32: - *channels = *hw_channels = 4u; + *channels = *hw_channels = 4U; *component_size = sizeof(uint8_t); break; case ImageFormat::R16G16B16A16_UNORM: case ImageFormat::R16G16B16A16_SNORM: case ImageFormat::R16G16B16A16_SFLOAT: - *channels = *hw_channels = 4u; + *channels = *hw_channels = 4U; *component_size = sizeof(uint16_t); break; case ImageFormat::R32G32B32A32_SFLOAT: - *channels = *hw_channels = 4u; + *channels = *hw_channels = 4U; *component_size = sizeof(float); break; case ImageFormat::D16_UNORM: - *channels = *hw_channels = 1u; + *channels = *hw_channels = 1U; *component_size = sizeof(uint16_t); break; case ImageFormat::X8_D24_UNORM: - *channels = *hw_channels = 1u; + *channels = *hw_channels = 1U; *component_size = sizeof(uint32_t); break; case ImageFormat::D32_SFLOAT: - *channels = *hw_channels = 1u; + *channels = *hw_channels = 1U; *component_size = sizeof(uint32_t); break; case ImageFormat::Y8U8Y8V8_422_UNORM: case ImageFormat::U8Y8V8Y8_422_UNORM: - *channels = *hw_channels = 2u; + *channels = *hw_channels = 2U; *component_size = sizeof(uint8_t); break; case ImageFormat::Y8_U8V8_2PLANE_420_UNORM: if (plane == 0) { - *channels = *hw_channels = 1u; + *channels = *hw_channels = 1U; } else if (plane == 1) { - *channels = *hw_channels = 2u; + *channels = *hw_channels = 2U; if (width_divisor) { *width_divisor = 2; } if (height_divisor) { *height_divisor = 2; } } else { @@ -111,9 +111,9 @@ void format_info(ImageFormat format, uint32_t* channels, uint32_t* hw_channels, break; case ImageFormat::Y8_U8V8_2PLANE_422_UNORM: if (plane == 0) { - *channels = *hw_channels = 1u; + *channels = 
*hw_channels = 1U; } else if (plane == 1) { - *channels = *hw_channels = 2u; + *channels = *hw_channels = 2U; if (width_divisor) { *width_divisor = 2; } } else { throw std::invalid_argument("Unhandled plane index"); @@ -121,7 +121,7 @@ void format_info(ImageFormat format, uint32_t* channels, uint32_t* hw_channels, *component_size = sizeof(uint8_t); break; case ImageFormat::Y8_U8_V8_3PLANE_420_UNORM: - *channels = *hw_channels = 1u; + *channels = *hw_channels = 1U; *component_size = sizeof(uint8_t); if (plane == 0) { } else if ((plane == 1) || (plane == 2)) { @@ -132,7 +132,7 @@ void format_info(ImageFormat format, uint32_t* channels, uint32_t* hw_channels, } break; case ImageFormat::Y8_U8_V8_3PLANE_422_UNORM: - *channels = *hw_channels = 1u; + *channels = *hw_channels = 1U; *component_size = sizeof(uint8_t); if (plane == 0) { } else if ((plane == 1) || (plane == 2)) { @@ -143,9 +143,9 @@ void format_info(ImageFormat format, uint32_t* channels, uint32_t* hw_channels, break; case ImageFormat::Y16_U16V16_2PLANE_420_UNORM: if (plane == 0) { - *channels = *hw_channels = 1u; + *channels = *hw_channels = 1U; } else if (plane == 1) { - *channels = *hw_channels = 2u; + *channels = *hw_channels = 2U; if (width_divisor) { *width_divisor = 2; } if (height_divisor) { *height_divisor = 2; } } else { @@ -155,9 +155,9 @@ void format_info(ImageFormat format, uint32_t* channels, uint32_t* hw_channels, break; case ImageFormat::Y16_U16V16_2PLANE_422_UNORM: if (plane == 0) { - *channels = *hw_channels = 1u; + *channels = *hw_channels = 1U; } else if (plane == 1) { - *channels = *hw_channels = 2u; + *channels = *hw_channels = 2U; if (width_divisor) { *width_divisor = 2; } } else { throw std::invalid_argument("Unhandled plane index"); @@ -165,7 +165,7 @@ void format_info(ImageFormat format, uint32_t* channels, uint32_t* hw_channels, *component_size = sizeof(uint16_t); break; case ImageFormat::Y16_U16_V16_3PLANE_420_UNORM: - *channels = *hw_channels = 1u; + *channels = *hw_channels = 1U; *component_size = sizeof(uint16_t); if (plane == 0) { } else if ((plane == 1) || (plane == 2)) { @@ -176,7 +176,7 @@ void format_info(ImageFormat format, uint32_t* channels, uint32_t* hw_channels, } break; case ImageFormat::Y16_U16_V16_3PLANE_422_UNORM: - *channels = *hw_channels = 1u; + *channels = *hw_channels = 1U; *component_size = sizeof(uint16_t); if (plane == 0) { } else if ((plane == 1) || (plane == 2)) { diff --git a/modules/holoviz/src/vulkan/framebuffer_sequence.cpp b/modules/holoviz/src/vulkan/framebuffer_sequence.cpp index 39a42ad4..de55f934 100644 --- a/modules/holoviz/src/vulkan/framebuffer_sequence.cpp +++ b/modules/holoviz/src/vulkan/framebuffer_sequence.cpp @@ -242,15 +242,15 @@ void FramebufferSequence::update(uint32_t width, uint32_t height, PresentMode pr // everyone must support FIFO mode vk_present_mode = vk::PresentModeKHR::eFifo; // try to find a non-blocking alternative to FIFO - for (auto present_mode : present_modes) { - if (present_mode == vk::PresentModeKHR::eMailbox) { + for (auto mode : present_modes) { + if (mode == vk::PresentModeKHR::eMailbox) { // prefer mailbox due to no tearing - vk_present_mode = present_mode; + vk_present_mode = mode; break; } - if (present_mode == vk::PresentModeKHR::eImmediate) { + if (mode == vk::PresentModeKHR::eImmediate) { // immediate mode is non-blocking, but has tearing - vk_present_mode = present_mode; + vk_present_mode = mode; } } } else { diff --git a/modules/holoviz/src/vulkan/shaders/image_shader.glsl.frag 
b/modules/holoviz/src/vulkan/shaders/image_shader.glsl.frag index 5383f347..a1a0cfd0 100644 --- a/modules/holoviz/src/vulkan/shaders/image_shader.glsl.frag +++ b/modules/holoviz/src/vulkan/shaders/image_shader.glsl.frag @@ -50,20 +50,20 @@ void main() } else { if ((push_constants.fragment.flags & PUSH_CONSTANT_FRAGMENT_FLAG_LUT) != 0) { const float index = texture(colorSampler, i_texCoord).x; - color = textureLod(lutSampler, vec2(index, 0.f), 0); + color = textureLod(lutSampler, vec2(index, 0.F), 0); } else if ((push_constants.fragment.flags & PUSH_CONSTANT_FRAGMENT_FLAG_LUT_U) != 0) { const uint index = texture(coloruSampler, i_texCoord).x; - color = textureLod(lutSampler, vec2(float(index), 0.f), 0); + color = textureLod(lutSampler, vec2(float(index), 0.F), 0); } else if ((push_constants.fragment.flags & PUSH_CONSTANT_FRAGMENT_FLAG_LUT_S) != 0) { const uint index = texture(coloriSampler, i_texCoord).x; - color = textureLod(lutSampler, vec2(float(index), 0.f), 0); + color = textureLod(lutSampler, vec2(float(index), 0.F), 0); } } color.a *= push_constants.fragment.opacity; // discard transparent fragments - if (color.a == 0.f) + if (color.a == 0.F) discard; o_color = color; diff --git a/modules/holoviz/src/vulkan/shaders/image_shader.glsl.vert b/modules/holoviz/src/vulkan/shaders/image_shader.glsl.vert index 51e736b1..a3dde3c3 100644 --- a/modules/holoviz/src/vulkan/shaders/image_shader.glsl.vert +++ b/modules/holoviz/src/vulkan/shaders/image_shader.glsl.vert @@ -34,5 +34,5 @@ layout(push_constant) uniform constants { void main() { gl_Position = push_constants.vertex.matrix * vec4(i_position, 0.0, 1.0); - o_texCoord = (i_position + vec2(1.f)) * vec2(0.5f); + o_texCoord = (i_position + vec2(1.F)) * vec2(0.5F); } \ No newline at end of file diff --git a/modules/holoviz/src/vulkan/shaders/imgui_shader.glsl.frag b/modules/holoviz/src/vulkan/shaders/imgui_shader.glsl.frag index ea9fccc9..dd500a1d 100644 --- a/modules/holoviz/src/vulkan/shaders/imgui_shader.glsl.frag +++ b/modules/holoviz/src/vulkan/shaders/imgui_shader.glsl.frag @@ -44,7 +44,7 @@ void main() color.a *= push_constants.fragment.opacity; // discard transparent fragments - if (color.a == 0.f) + if (color.a == 0.F) discard; o_color = color; diff --git a/modules/holoviz/src/vulkan/shaders/imgui_shader.glsl.vert b/modules/holoviz/src/vulkan/shaders/imgui_shader.glsl.vert index f347ef8b..e6943398 100644 --- a/modules/holoviz/src/vulkan/shaders/imgui_shader.glsl.vert +++ b/modules/holoviz/src/vulkan/shaders/imgui_shader.glsl.vert @@ -44,5 +44,5 @@ void main() Out.color = i_color; Out.texCoord = i_texCoord; - gl_Position = push_constants.vertex.matrix * vec4(i_position.x, i_position.y, 0.f, 1.0); + gl_Position = push_constants.vertex.matrix * vec4(i_position.x, i_position.y, 0.F, 1.0); } \ No newline at end of file diff --git a/modules/holoviz/src/vulkan/texture.cpp b/modules/holoviz/src/vulkan/texture.cpp index 99ada3e5..8e89fa31 100644 --- a/modules/holoviz/src/vulkan/texture.cpp +++ b/modules/holoviz/src/vulkan/texture.cpp @@ -178,9 +178,9 @@ void Texture::upload(CUstream ext_stream, const std::array& devi // allocate temporary memory, note this is using the stream ordered memory allocator which // is not syncing globally like the normal `cuMemAlloc` tmp_device_ptr.reset([size = tmp_pitch * height, stream] { - CUdeviceptr device_ptr; - CudaCheck(cuMemAllocAsync(&device_ptr, size, stream)); - return std::pair(device_ptr, stream); + CUdeviceptr dev_ptr; + CudaCheck(cuMemAllocAsync(&dev_ptr, size, stream)); + return std::pair(dev_ptr, 
stream); }()); CUDA_MEMCPY2D memcpy_2d{}; diff --git a/modules/holoviz/src/vulkan/vulkan_app.cpp b/modules/holoviz/src/vulkan/vulkan_app.cpp index f1957971..4ad466a3 100644 --- a/modules/holoviz/src/vulkan/vulkan_app.cpp +++ b/modules/holoviz/src/vulkan/vulkan_app.cpp @@ -401,11 +401,11 @@ void Vulkan::Impl::setup(Window* window, const std::string& font_path, float fon physical_device_ .getProperties2(); - HOLOSCAN_LOG_INFO("Using device {}: {} (UUID {:x})", - device_index, - properties.get().properties.deviceName, - fmt::join(properties.get().deviceUUID, - "")); + HOLOSCAN_LOG_INFO( + "Using device {}: {} (UUID {:x})", + device_index, + std::string{properties.get().properties.deviceName}, + fmt::join(properties.get().deviceUUID, "")); // CUDA initialization CUuuid cuda_uuid; @@ -458,7 +458,7 @@ void Vulkan::Impl::setup(Window* window, const std::string& font_path, float fon vk::CommandBuffer cmd_buf = cmd_buf_get.createCommandBuffer(); const std::vector vertices{ - -1.0f, -1.0f, 0.f, 1.0f, -1.0f, 0.f, 1.0f, 1.0f, 0.f, -1.0f, 1.0f, 0.f}; + -1.0F, -1.0F, 0.F, 1.0F, -1.0F, 0.F, 1.0F, 1.0F, 0.F, -1.0F, 1.0F, 0.F}; nvvk_.vertex_buffer_ = nvvk_.alloc_.createBuffer(cmd_buf, vertices, vk::BufferUsageFlagBits::eVertexBuffer); const std::vector indices{0, 2, 1, 2, 0, 3}; @@ -518,7 +518,7 @@ void Vulkan::Impl::setup(Window* window, const std::string& font_path, float fon info.addressModeW = VK_SAMPLER_ADDRESS_MODE_REPEAT; info.minLod = -1000; info.maxLod = 1000; - info.maxAnisotropy = 1.0f; + info.maxAnisotropy = 1.0F; sampler_imgui_ = nvvk_.alloc_.acquireSampler(info); } nvvk::DescriptorSetBindings desc_set_layout_bind_imgui; @@ -682,7 +682,7 @@ void Vulkan::Impl::init_im_gui(const std::string& font_path, float font_size_in_ ImFontConfig font_config; font_config.FontDataOwnedByAtlas = false; // add the Roboto Bold fond as the default font - font_size_in_pixels = 25.f; + font_size_in_pixels = 25.F; font = io.Fonts->AddFontFromMemoryTTF( roboto_bold_ttf, sizeof(roboto_bold_ttf), font_size_in_pixels, &font_config); if (!font) { throw std::runtime_error("Failed to add default font."); } @@ -690,7 +690,7 @@ void Vulkan::Impl::init_im_gui(const std::string& font_path, float font_size_in_ // the size of the ImGui default font is 13 pixels, set the global font scale so that the // GUI text has the same size as with the default font. 
-  io.FontGlobalScale = 13.f / font_size_in_pixels;
+  io.FontGlobalScale = 13.F / font_size_in_pixels;
 
   // build the font atlast
   if (!io.Fonts->Build()) { throw std::runtime_error("Failed to build font atlas."); }
@@ -761,8 +761,8 @@ void Vulkan::Impl::begin_render_pass() {
 
   // Clearing values
   std::array<vk::ClearValue, 2> clear_values;
-  clear_values[0].color = vk::ClearColorValue(std::array<float, 4>({0.f, 0.f, 0.f, 0.f}));
-  clear_values[1].depthStencil = vk::ClearDepthStencilValue(1.0f, 0);
+  clear_values[0].color = vk::ClearColorValue(std::array<float, 4>({0.F, 0.F, 0.F, 0.F}));
+  clear_values[1].depthStencil = vk::ClearDepthStencilValue(1.0F, 0);
 
   // Begin rendering
   vk::RenderPassBeginInfo render_pass_begin_info;
@@ -775,7 +775,7 @@ void Vulkan::Impl::begin_render_pass() {
 
   // set the dynamic viewport
   vk::Viewport viewport{
-      0.0f, 0.0f, static_cast<float>(size_.width), static_cast<float>(size_.height), 0.0f, 1.0f};
+      0.0F, 0.0F, static_cast<float>(size_.width), static_cast<float>(size_.height), 0.0F, 1.0F};
   cmd_buf.setViewport(0, viewport);
 
   vk::Rect2D scissor{{0, 0}, {size_.width, size_.height}};
@@ -1226,7 +1226,7 @@ uint32_t Vulkan::Impl::get_memory_type(uint32_t typeBits,
   }
   std::string err = "Unable to find memory type " + vk::to_string(properties);
   HOLOSCAN_LOG_ERROR("{}", err.c_str());
-  return ~0u;
+  return ~0U;
 }
 
 vk::UniquePipeline Vulkan::Impl::create_pipeline(
@@ -1429,7 +1429,7 @@ void Vulkan::Impl::set_viewport(float x, float y, float width, float height) {
   const vk::CommandBuffer cmd_buf = command_buffers_[get_active_image_index()].get();
 
   vk::Viewport viewport{
-      x * fb_width, y * fb_height, width * fb_width, height * fb_height, 0.0f, 1.0f};
+      x * fb_width, y * fb_height, width * fb_width, height * fb_height, 0.0F, 1.0F};
   cmd_buf.setViewport(0, viewport);
 
   // height can be negative to flip the rendering, but scissor needs to be positive.
@@ -1437,10 +1437,10 @@ void Vulkan::Impl::set_viewport(float x, float y, float width, float height) {
     height = -height;
     y -= height;
   }
-  vk::Rect2D scissor{{std::max(0, static_cast<int32_t>(x * fb_width + .5f)),
-                      std::max(0, static_cast<int32_t>(y * fb_height + .5f))},
-                     {static_cast<uint32_t>(width * fb_width + .5f),
-                      static_cast<uint32_t>(height * fb_height + .5f)}};
+  vk::Rect2D scissor{{std::max(0, static_cast<int32_t>(x * fb_width + .5F)),
+                      std::max(0, static_cast<int32_t>(y * fb_height + .5F))},
+                     {static_cast<uint32_t>(width * fb_width + .5F),
+                      static_cast<uint32_t>(height * fb_height + .5F)}};
   cmd_buf.setScissor(0, scissor);
 }
 
@@ -1513,7 +1513,7 @@ void Vulkan::Impl::upload_to_texture(Texture* texture, const std::array(dst_pitch) * height;
     void* mapping = nvvk_.alloc_.getStaging()->cmdToImage(cmd_buf,
@@ -1942,9 +1942,9 @@ void Vulkan::Impl::draw_imgui(vk::DescriptorSet desc_set, Buffer* vertex_buffer,
                first_index,
                vertex_offset,
                opacity,
-               std::array<float, 4>({1.f, 1.f, 1.f, 1.f}),
-               1.f,
-               0.f,
+               std::array<float, 4>({1.F, 1.F, 1.F, 1.F}),
+               1.F,
+               0.F,
                view_matrix);
 }
 
@@ -1961,7 +1961,7 @@ void Vulkan::Impl::draw_indexed(vk::Pipeline pipeline, vk::PipelineLayout pipeli
     vertex_buffer->access_with_vulkan(nvvk_.batch_submission_);
   }
 
-  if (line_width > 0.f) { cmd_buf.setLineWidth(line_width); }
+  if (line_width > 0.F) { cmd_buf.setLineWidth(line_width); }
 
   if (desc_set) {
     cmd_buf.bindDescriptorSets(
diff --git a/modules/holoviz/src/window.cpp b/modules/holoviz/src/window.cpp
index d45de701..53ad4cec 100644
--- a/modules/holoviz/src/window.cpp
+++ b/modules/holoviz/src/window.cpp
@@ -24,7 +24,7 @@ namespace holoscan::viz {
 Window::Window() {
   // setup camera
   CameraManip.setLookat(
-      nvmath::vec3f(0.f, 0.f, 1.f), nvmath::vec3f(0.f, 0.f, 0.f), nvmath::vec3f(0.f, 1.f, 0.f));
+      nvmath::vec3f(0.F, 0.F, 1.F), nvmath::vec3f(0.F, 0.F, 0.F), nvmath::vec3f(0.F, 1.F, 0.F));
 }
 
 void Window::end() {
@@ -38,7 +38,7 @@ void Window::set_camera(const nvmath::vec3f& eye, const nvmath::vec3f& look_at,
 }
 
 void Window::get_view_matrix(nvmath::mat4f* view_matrix) {
-  *view_matrix = nvmath::perspectiveVK(CameraManip.getFov(), 1.f /*aspectRatio*/, 0.1f, 1000.0f) *
+  *view_matrix = nvmath::perspectiveVK(CameraManip.getFov(), 1.F /*aspectRatio*/, 0.1F, 1000.0F) *
                  CameraManip.getMatrix();
 }
 
diff --git a/modules/holoviz/tests/functional/camera_pose_test.cpp b/modules/holoviz/tests/functional/camera_pose_test.cpp
index 33b05c59..f6317c3f 100644
--- a/modules/holoviz/tests/functional/camera_pose_test.cpp
+++ b/modules/holoviz/tests/functional/camera_pose_test.cpp
@@ -27,7 +27,7 @@ namespace viz = holoscan::viz;
 class CameraPose : public TestHeadless {};
 
 TEST_F(CameraPose, Set) {
-  EXPECT_NO_THROW(viz::SetCamera(1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 0.f, 1.f, 0.f));
+  EXPECT_NO_THROW(viz::SetCamera(1.F, 2.F, 3.F, 4.F, 5.F, 6.F, 0.F, 1.F, 0.F));
 
   float rotation[9];
   float translation[3];
@@ -36,19 +36,19 @@ TEST_F(CameraPose, Set) {
   // There are test errors on some systems when using EXPECT_FLOAT_EQ() (includes a error margin of
   // 4 ULP, see https://google.github.io/googletest/reference/assertions.html#floating-point).
   // Use EXPECT_NEAR() with a higher epsilon.
- constexpr float epsilon = 1e-6f; - EXPECT_NEAR(rotation[0], -0.707106769f, epsilon); - EXPECT_NEAR(rotation[1], 0.f, epsilon); - EXPECT_NEAR(rotation[2], 0.707106769f, epsilon); - EXPECT_NEAR(rotation[3], -0.408248335f, epsilon); - EXPECT_NEAR(rotation[4], 0.81649667f, epsilon); - EXPECT_NEAR(rotation[5], -0.408248335f, epsilon); - EXPECT_NEAR(rotation[6], -0.577350259f, epsilon); - EXPECT_NEAR(rotation[7], -0.577350259f, epsilon); - EXPECT_NEAR(rotation[8], -0.577350259f, epsilon); - EXPECT_NEAR(translation[0], -1.41421342f, epsilon); - EXPECT_NEAR(translation[1], 0.f, epsilon); - EXPECT_NEAR(translation[2], 3.46410155f, epsilon); + constexpr float epsilon = 1e-6F; + EXPECT_NEAR(rotation[0], -0.707106769F, epsilon); + EXPECT_NEAR(rotation[1], 0.F, epsilon); + EXPECT_NEAR(rotation[2], 0.707106769F, epsilon); + EXPECT_NEAR(rotation[3], -0.408248335F, epsilon); + EXPECT_NEAR(rotation[4], 0.81649667F, epsilon); + EXPECT_NEAR(rotation[5], -0.408248335F, epsilon); + EXPECT_NEAR(rotation[6], -0.577350259F, epsilon); + EXPECT_NEAR(rotation[7], -0.577350259F, epsilon); + EXPECT_NEAR(rotation[8], -0.577350259F, epsilon); + EXPECT_NEAR(translation[0], -1.41421342F, epsilon); + EXPECT_NEAR(translation[1], 0.F, epsilon); + EXPECT_NEAR(translation[2], 3.46410155F, epsilon); } TEST_F(CameraPose, GetDefault) { @@ -58,12 +58,12 @@ TEST_F(CameraPose, GetDefault) { EXPECT_NO_THROW(viz::GetCameraPose(rotation, translation)); for (uint32_t row = 0; row < 3; ++row) { for (uint32_t col = 0; col < 3; ++col) { - EXPECT_FLOAT_EQ(rotation[row * 3 + col], ((row == col) ? 1.f : 0.f)); + EXPECT_FLOAT_EQ(rotation[row * 3 + col], ((row == col) ? 1.F : 0.F)); } } - EXPECT_FLOAT_EQ(translation[0], 0.f); - EXPECT_FLOAT_EQ(translation[1], 0.f); - EXPECT_FLOAT_EQ(translation[2], -1.f); + EXPECT_FLOAT_EQ(translation[0], 0.F); + EXPECT_FLOAT_EQ(translation[1], 0.F); + EXPECT_FLOAT_EQ(translation[2], -1.F); std::array pose; // it's an error to specify a size less than 16 @@ -72,29 +72,29 @@ TEST_F(CameraPose, GetDefault) { EXPECT_THROW(viz::GetCameraPose(16, nullptr), std::invalid_argument); // this is the default setup for the matrix, see Window class constructor - std::array expected_pose{1.73205066f, - 0.f, - 0.f, - 0.f, - 0.f, - -1.73205066f, - 0.f, - 0.f, - 0.f, - 0.f, - -1.00010002f, - 0.900090039f, - 0.f, - 0.f, - -1.f, - 1.f}; + std::array expected_pose{1.73205066F, + 0.F, + 0.F, + 0.F, + 0.F, + -1.73205066F, + 0.F, + 0.F, + 0.F, + 0.F, + -1.00010002F, + 0.900090039F, + 0.F, + 0.F, + -1.F, + 1.F}; EXPECT_NO_THROW(viz::GetCameraPose(pose.size(), pose.data())); for (int i = 0; i < 16; ++i) { EXPECT_FLOAT_EQ(pose[i], expected_pose[i]); } } TEST_F(CameraPose, Anim) { // move the camera in x direction - EXPECT_NO_THROW(viz::SetCamera(100.f, 0.f, 1.f, 100.f, 0.f, 0.f, 0.f, 1.f, 0.f, true)); + EXPECT_NO_THROW(viz::SetCamera(100.F, 0.F, 1.F, 100.F, 0.F, 0.F, 0.F, 1.F, 0.F, true)); // start animation (duration default is 500 ms) EXPECT_NO_THROW(viz::Begin()); @@ -106,9 +106,9 @@ TEST_F(CameraPose, Anim) { EXPECT_NO_THROW(viz::GetCameraPose(rotation, translation)); // translation has changed - EXPECT_NE(translation[0], 0.f); - EXPECT_FLOAT_EQ(translation[1], 0.f); - EXPECT_NE(translation[2], -1.f); + EXPECT_NE(translation[0], 0.F); + EXPECT_FLOAT_EQ(translation[1], 0.F); + EXPECT_NE(translation[2], -1.F); // wait for the end EXPECT_NO_THROW(viz::Begin()); @@ -118,11 +118,11 @@ TEST_F(CameraPose, Anim) { EXPECT_NO_THROW(viz::GetCameraPose(rotation, translation)); for (uint32_t row = 0; row < 3; ++row) { for (uint32_t col = 0; 
col < 3; ++col) { - EXPECT_FLOAT_EQ(rotation[row * 3 + col], ((row == col) ? 1.f : 0.f)); + EXPECT_FLOAT_EQ(rotation[row * 3 + col], ((row == col) ? 1.F : 0.F)); } } - EXPECT_FLOAT_EQ(translation[0], -100.f); - EXPECT_FLOAT_EQ(translation[1], 0.f); - EXPECT_FLOAT_EQ(translation[2], -1.f); + EXPECT_FLOAT_EQ(translation[0], -100.F); + EXPECT_FLOAT_EQ(translation[1], 0.F); + EXPECT_FLOAT_EQ(translation[2], -1.F); } diff --git a/modules/holoviz/tests/functional/geometry_layer_test.cpp b/modules/holoviz/tests/functional/geometry_layer_test.cpp index 4fd91fd4..b44ba8b7 100644 --- a/modules/holoviz/tests/functional/geometry_layer_test.cpp +++ b/modules/holoviz/tests/functional/geometry_layer_test.cpp @@ -71,22 +71,22 @@ TEST_P(PrimitiveTopology, Primitive) { switch (topology) { case viz::PrimitiveTopology::POINT_LIST: primitive_count = 1; - data.push_back(0.5f); - data.push_back(0.5f); + data.push_back(0.5F); + data.push_back(0.5F); color_crc = {0x3088e839}; depth_crc = {0x748e4c96}; break; case viz::PrimitiveTopology::LINE_LIST: primitive_count = 2; - data.push_back(0.1f); - data.push_back(0.1f); - data.push_back(0.9f); - data.push_back(0.9f); - - data.push_back(0.7f); - data.push_back(0.3f); - data.push_back(0.2f); - data.push_back(0.4f); + data.push_back(0.1F); + data.push_back(0.1F); + data.push_back(0.9F); + data.push_back(0.9F); + + data.push_back(0.7F); + data.push_back(0.3F); + data.push_back(0.2F); + data.push_back(0.4F); color_crc = { 0xe96c7246, // Quadro 0x5f7bf4d3 // non-Quadro @@ -98,13 +98,13 @@ TEST_P(PrimitiveTopology, Primitive) { break; case viz::PrimitiveTopology::LINE_STRIP: primitive_count = 2; - data.push_back(0.1f); - data.push_back(0.1f); - data.push_back(0.7f); - data.push_back(0.9f); + data.push_back(0.1F); + data.push_back(0.1F); + data.push_back(0.7F); + data.push_back(0.9F); - data.push_back(0.3f); - data.push_back(0.2f); + data.push_back(0.3F); + data.push_back(0.2F); color_crc = { 0x162496c0, // Quadro 0x9118f5cb // non-Quadro @@ -116,31 +116,31 @@ TEST_P(PrimitiveTopology, Primitive) { break; case viz::PrimitiveTopology::TRIANGLE_LIST: primitive_count = 2; - data.push_back(0.1f); - data.push_back(0.1f); - data.push_back(0.5f); - data.push_back(0.9f); - data.push_back(0.9f); - data.push_back(0.1f); - - data.push_back(0.05f); - data.push_back(0.7f); - data.push_back(0.15f); - data.push_back(0.8f); - data.push_back(0.25f); - data.push_back(0.6f); + data.push_back(0.1F); + data.push_back(0.1F); + data.push_back(0.5F); + data.push_back(0.9F); + data.push_back(0.9F); + data.push_back(0.1F); + + data.push_back(0.05F); + data.push_back(0.7F); + data.push_back(0.15F); + data.push_back(0.8F); + data.push_back(0.25F); + data.push_back(0.6F); color_crc = {0x9de9f5f3}; depth_crc = {0x101577b}; break; case viz::PrimitiveTopology::CROSS_LIST: primitive_count = 2; - data.push_back(0.5f); - data.push_back(0.5f); - data.push_back(0.1f); + data.push_back(0.5F); + data.push_back(0.5F); + data.push_back(0.1F); - data.push_back(0.1f); - data.push_back(0.3f); - data.push_back(0.01f); + data.push_back(0.1F); + data.push_back(0.3F); + data.push_back(0.01F); color_crc = { 0xb507fa88, // Quadro 0xf298654 // non-Quadro @@ -152,15 +152,15 @@ TEST_P(PrimitiveTopology, Primitive) { break; case viz::PrimitiveTopology::RECTANGLE_LIST: primitive_count = 2; - data.push_back(0.1f); - data.push_back(0.1f); - data.push_back(0.9f); - data.push_back(0.9f); - - data.push_back(0.3f); - data.push_back(0.2f); - data.push_back(0.5f); - data.push_back(0.3f); + data.push_back(0.1F); + data.push_back(0.1F); + 
data.push_back(0.9F); + data.push_back(0.9F); + + data.push_back(0.3F); + data.push_back(0.2F); + data.push_back(0.5F); + data.push_back(0.3F); color_crc = { 0x19a05481, // Quadro 0xf1f8f1b3 // non-Quadro @@ -172,15 +172,15 @@ TEST_P(PrimitiveTopology, Primitive) { break; case viz::PrimitiveTopology::OVAL_LIST: primitive_count = 2; - data.push_back(0.5f); - data.push_back(0.5f); - data.push_back(0.2f); - data.push_back(0.1f); - - data.push_back(0.6f); - data.push_back(0.4f); - data.push_back(0.05f); - data.push_back(0.07f); + data.push_back(0.5F); + data.push_back(0.5F); + data.push_back(0.2F); + data.push_back(0.1F); + + data.push_back(0.6F); + data.push_back(0.4F); + data.push_back(0.05F); + data.push_back(0.07F); color_crc = { 0x2341eef6, // Quadro 0xae3f0636 // non-Quadro @@ -192,27 +192,27 @@ TEST_P(PrimitiveTopology, Primitive) { break; case viz::PrimitiveTopology::POINT_LIST_3D: primitive_count = 1; - data.push_back(-0.5f); - data.push_back(0.5f); - data.push_back(0.8f); + data.push_back(-0.5F); + data.push_back(0.5F); + data.push_back(0.8F); color_crc = {0xd8f49994}; depth_crc = {0x4e371ba0}; break; case viz::PrimitiveTopology::LINE_LIST_3D: primitive_count = 2; - data.push_back(-0.1f); - data.push_back(-0.1f); - data.push_back(0.1f); - data.push_back(0.9f); - data.push_back(0.9f); - data.push_back(0.3f); - - data.push_back(-0.7f); - data.push_back(-0.3f); - data.push_back(0.2f); - data.push_back(0.2f); - data.push_back(0.4f); - data.push_back(0.5f); + data.push_back(-0.1F); + data.push_back(-0.1F); + data.push_back(0.1F); + data.push_back(0.9F); + data.push_back(0.9F); + data.push_back(0.3F); + + data.push_back(-0.7F); + data.push_back(-0.3F); + data.push_back(0.2F); + data.push_back(0.2F); + data.push_back(0.4F); + data.push_back(0.5F); color_crc = { 0xc7762cc5, // Quadro 0xe9f3dbc3 // non-Quadro @@ -224,16 +224,16 @@ TEST_P(PrimitiveTopology, Primitive) { break; case viz::PrimitiveTopology::LINE_STRIP_3D: primitive_count = 2; - data.push_back(-0.1f); - data.push_back(-0.1f); - data.push_back(0.1f); - data.push_back(0.7f); - data.push_back(0.9f); - data.push_back(0.3f); - - data.push_back(-0.3f); - data.push_back(-0.2f); - data.push_back(0.2f); + data.push_back(-0.1F); + data.push_back(-0.1F); + data.push_back(0.1F); + data.push_back(0.7F); + data.push_back(0.9F); + data.push_back(0.3F); + + data.push_back(-0.3F); + data.push_back(-0.2F); + data.push_back(0.2F); color_crc = { 0x135ba8af, // Quadro 0x322d3fdd // non-Quadro @@ -245,25 +245,25 @@ TEST_P(PrimitiveTopology, Primitive) { break; case viz::PrimitiveTopology::TRIANGLE_LIST_3D: primitive_count = 2; - data.push_back(-0.1f); - data.push_back(-0.1f); - data.push_back(0.f); - data.push_back(0.5f); - data.push_back(0.9f); - data.push_back(0.1f); - data.push_back(0.9f); - data.push_back(0.1f); - data.push_back(0.2f); - - data.push_back(-0.05f); - data.push_back(-0.7f); - data.push_back(0.3f); - data.push_back(0.15f); - data.push_back(0.8f); - data.push_back(0.2f); - data.push_back(0.25f); - data.push_back(0.6f); - data.push_back(0.5f); + data.push_back(-0.1F); + data.push_back(-0.1F); + data.push_back(0.F); + data.push_back(0.5F); + data.push_back(0.9F); + data.push_back(0.1F); + data.push_back(0.9F); + data.push_back(0.1F); + data.push_back(0.2F); + + data.push_back(-0.05F); + data.push_back(-0.7F); + data.push_back(0.3F); + data.push_back(0.15F); + data.push_back(0.8F); + data.push_back(0.2F); + data.push_back(0.25F); + data.push_back(0.6F); + data.push_back(0.5F); color_crc = {0xf372dff7}; depth_crc = {0x90e4e07d}; break; @@ 
-277,15 +277,15 @@ TEST_P(PrimitiveTopology, Primitive) { for (uint32_t i = 0; i < 3; ++i) { if (i == 1) { - EXPECT_NO_THROW(viz::Color(1.f, 0.5f, 0.25f, 0.75f)); + EXPECT_NO_THROW(viz::Color(1.F, 0.5F, 0.25F, 0.75F)); } else if (i == 2) { - EXPECT_NO_THROW(viz::PointSize(4.f)); - EXPECT_NO_THROW(viz::LineWidth(3.f)); + EXPECT_NO_THROW(viz::PointSize(4.F)); + EXPECT_NO_THROW(viz::LineWidth(3.F)); } EXPECT_NO_THROW(viz::Primitive(topology, primitive_count, data.size(), data.data())); - for (auto&& item : data) { item += 0.1f; } + for (auto&& item : data) { item += 0.1F; } } EXPECT_NO_THROW(viz::EndLayer()); @@ -311,9 +311,9 @@ TEST_F(GeometryLayer, Text) { EXPECT_NO_THROW(viz::Begin()); EXPECT_NO_THROW(viz::BeginGeometryLayer()); - EXPECT_NO_THROW(viz::Text(0.4f, 0.4f, 0.4f, "Text")); - EXPECT_NO_THROW(viz::Color(0.5f, 0.9f, 0.7f, 0.9f)); - EXPECT_NO_THROW(viz::Text(0.1f, 0.1f, 0.2f, "Colored")); + EXPECT_NO_THROW(viz::Text(0.4F, 0.4F, 0.4F, "Text")); + EXPECT_NO_THROW(viz::Color(0.5F, 0.9F, 0.7F, 0.9F)); + EXPECT_NO_THROW(viz::Text(0.1F, 0.1F, 0.2F, "Colored")); EXPECT_NO_THROW(viz::EndLayer()); EXPECT_NO_THROW(viz::End()); @@ -325,7 +325,7 @@ TEST_F(GeometryLayer, TextClipped) { EXPECT_NO_THROW(viz::Begin()); EXPECT_NO_THROW(viz::BeginGeometryLayer()); - EXPECT_NO_THROW(viz::Text(1.1f, 0.4f, 0.4f, "Text")); + EXPECT_NO_THROW(viz::Text(1.1F, 0.4F, 0.4F, "Text")); EXPECT_NO_THROW(viz::End()); @@ -335,7 +335,7 @@ TEST_F(GeometryLayer, TextClipped) { class GeometryLayerWithFont : public TestHeadless { protected: void SetUp() override { - ASSERT_NO_THROW(viz::SetFont("../modules/holoviz/src/fonts/Roboto-Bold.ttf", 12.f)); + ASSERT_NO_THROW(viz::SetFont("../modules/holoviz/src/fonts/Roboto-Bold.ttf", 12.F)); // call base class ::TestHeadless::SetUp(); @@ -345,7 +345,7 @@ class GeometryLayerWithFont : public TestHeadless { // call base class ::TestHeadless::TearDown(); - ASSERT_NO_THROW(viz::SetFont("", 0.f)); + ASSERT_NO_THROW(viz::SetFont("", 0.F)); } }; @@ -353,7 +353,7 @@ TEST_F(GeometryLayerWithFont, Text) { EXPECT_NO_THROW(viz::Begin()); EXPECT_NO_THROW(viz::BeginGeometryLayer()); - EXPECT_NO_THROW(viz::Text(0.1f, 0.1f, 0.7f, "Font")); + EXPECT_NO_THROW(viz::Text(0.1F, 0.1F, 0.7F, "Font")); EXPECT_NO_THROW(viz::EndLayer()); EXPECT_NO_THROW(viz::End()); @@ -407,7 +407,7 @@ TEST_P(DepthMapRenderMode, DepthMap) { case viz::ImageFormat::D32_SFLOAT: { std::vector depth_data(map_width * map_height); for (size_t index = 0; index < depth_data.size(); ++index) { - depth_data[index] = index * 4 / 255.f; + depth_data[index] = index * 4 / 255.F; } EXPECT_EQ(cuMemcpyHtoD( depth_ptr.get(), depth_data.data(), depth_data.size() * depth_component_size), @@ -471,18 +471,18 @@ INSTANTIATE_TEST_SUITE_P(GeometryLayer, DepthMapRenderMode, viz::ImageFormat::D32_SFLOAT))); TEST_F(GeometryLayer, Reuse) { - std::vector data{0.5f, 0.5f}; + std::vector data{0.5F, 0.5F}; for (uint32_t i = 0; i < 2; ++i) { EXPECT_NO_THROW(viz::Begin()); EXPECT_NO_THROW(viz::BeginGeometryLayer()); - EXPECT_NO_THROW(viz::Color(0.1f, 0.2f, 0.3f, 0.4f)); - EXPECT_NO_THROW(viz::LineWidth(2.f)); - EXPECT_NO_THROW(viz::PointSize(3.f)); + EXPECT_NO_THROW(viz::Color(0.1F, 0.2F, 0.3F, 0.4F)); + EXPECT_NO_THROW(viz::LineWidth(2.F)); + EXPECT_NO_THROW(viz::PointSize(3.F)); EXPECT_NO_THROW( viz::Primitive(viz::PrimitiveTopology::POINT_LIST, 1, data.size(), data.data())); - EXPECT_NO_THROW(viz::Text(0.4f, 0.4f, 0.1f, "Text")); + EXPECT_NO_THROW(viz::Text(0.4F, 0.4F, 0.1F, "Text")); EXPECT_NO_THROW(viz::EndLayer()); EXPECT_NO_THROW(viz::End()); @@ 
-490,17 +490,17 @@ TEST_F(GeometryLayer, Reuse) { } TEST_F(GeometryLayer, Errors) { - std::vector data{0.5f, 0.5f}; + std::vector data{0.5F, 0.5F}; EXPECT_NO_THROW(viz::Begin()); // it's an error to call geometry functions without calling BeginGeometryLayer first - EXPECT_THROW(viz::Color(0.f, 0.f, 0.f, 1.f), std::runtime_error); - EXPECT_THROW(viz::LineWidth(1.0f), std::runtime_error); - EXPECT_THROW(viz::PointSize(1.0f), std::runtime_error); + EXPECT_THROW(viz::Color(0.F, 0.F, 0.F, 1.F), std::runtime_error); + EXPECT_THROW(viz::LineWidth(1.0F), std::runtime_error); + EXPECT_THROW(viz::PointSize(1.0F), std::runtime_error); EXPECT_THROW(viz::Primitive(viz::PrimitiveTopology::POINT_LIST, 1, data.size(), data.data()), std::runtime_error); - EXPECT_THROW(viz::Text(0.5f, 0.5f, 0.1f, "Text"), std::runtime_error); + EXPECT_THROW(viz::Text(0.5F, 0.5F, 0.1F, "Text"), std::runtime_error); // it's an error to call BeginGeometryLayer again without calling EndLayer EXPECT_NO_THROW(viz::BeginGeometryLayer()); @@ -509,12 +509,12 @@ TEST_F(GeometryLayer, Errors) { // it's an error to call geometry functions when a different layer is active EXPECT_NO_THROW(viz::BeginImageLayer()); - EXPECT_THROW(viz::Color(0.f, 0.f, 0.f, 1.f), std::runtime_error); - EXPECT_THROW(viz::LineWidth(1.0f), std::runtime_error); - EXPECT_THROW(viz::PointSize(1.0f), std::runtime_error); + EXPECT_THROW(viz::Color(0.F, 0.F, 0.F, 1.F), std::runtime_error); + EXPECT_THROW(viz::LineWidth(1.0F), std::runtime_error); + EXPECT_THROW(viz::PointSize(1.0F), std::runtime_error); EXPECT_THROW(viz::Primitive(viz::PrimitiveTopology::POINT_LIST, 1, data.size(), data.data()), std::runtime_error); - EXPECT_THROW(viz::Text(0.5f, 0.5f, 0.1f, "Text"), std::runtime_error); + EXPECT_THROW(viz::Text(0.5F, 0.5F, 0.1F, "Text"), std::runtime_error); EXPECT_NO_THROW(viz::EndLayer()); EXPECT_NO_THROW(viz::BeginGeometryLayer()); @@ -537,7 +537,7 @@ TEST_F(GeometryLayer, Errors) { }; for (auto&& cur : required) { - std::vector data(cur.values, 0.f); + std::vector data(cur.values, 0.F); // Primitive function errors, first call the passing function EXPECT_NO_THROW(viz::Primitive(cur.topology, 1, data.size(), data.data())); // it's an error to call Primitive with a data size which is too small for the primitive count @@ -555,11 +555,11 @@ TEST_F(GeometryLayer, Errors) { std::invalid_argument); // Text function errors, first call the passing function - EXPECT_NO_THROW(viz::Text(0.5f, 0.5f, 0.1f, "Text")); + EXPECT_NO_THROW(viz::Text(0.5F, 0.5F, 0.1F, "Text")); // it's an error to call Text with a size of zero - EXPECT_THROW(viz::Text(0.5f, 0.5f, 0.0f, "Text"), std::invalid_argument); + EXPECT_THROW(viz::Text(0.5F, 0.5F, 0.0F, "Text"), std::invalid_argument); // it's an error to call Text with null text pointer - EXPECT_THROW(viz::Text(0.5f, 0.5f, 0.1f, nullptr), std::invalid_argument); + EXPECT_THROW(viz::Text(0.5F, 0.5F, 0.1F, nullptr), std::invalid_argument); // Depth map function errors, first call the passing function const uint32_t map_width = 8; diff --git a/modules/holoviz/tests/functional/im_gui_layer_test.cpp b/modules/holoviz/tests/functional/im_gui_layer_test.cpp index 4a0c1fc8..dff15386 100644 --- a/modules/holoviz/tests/functional/im_gui_layer_test.cpp +++ b/modules/holoviz/tests/functional/im_gui_layer_test.cpp @@ -52,7 +52,7 @@ TEST_F(ImGuiLayer, Window) { } TEST_F(ImGuiLayer, Errors) { - std::vector data{0.5f, 0.5f}; + std::vector data{0.5F, 0.5F}; EXPECT_NO_THROW(viz::Begin()); diff --git a/modules/holoviz/tests/functional/image_layer_test.cpp 
b/modules/holoviz/tests/functional/image_layer_test.cpp index b70f2e69..b2c63b16 100644 --- a/modules/holoviz/tests/functional/image_layer_test.cpp +++ b/modules/holoviz/tests/functional/image_layer_test.cpp @@ -296,43 +296,43 @@ TEST_P(ImageLayer, Image) { for (uint32_t y = 0; y < height_; ++y) { for (uint32_t x = 0; x < width_; ++x) { // RGB -> YUV conversion - const float r = color_data_[y * (width_ * 3) + x * 3 + 0] / 255.f; - const float g = color_data_[y * (width_ * 3) + x * 3 + 1] / 255.f; - const float b = color_data_[y * (width_ * 3) + x * 3 + 2] / 255.f; + const float r = color_data_[y * (width_ * 3) + x * 3 + 0] / 255.F; + const float g = color_data_[y * (width_ * 3) + x * 3 + 1] / 255.F; + const float b = color_data_[y * (width_ * 3) + x * 3 + 2] / 255.F; float Kr, Kg, Kb; switch (yuv_model_conversion) { case viz::YuvModelConversion::YUV_601: - Kr = 0.299f; - Kb = 0.114f; + Kr = 0.299F; + Kb = 0.114F; break; case viz::YuvModelConversion::YUV_709: - Kb = 0.0722f; - Kr = 0.2126f; + Kb = 0.0722F; + Kr = 0.2126F; break; case viz::YuvModelConversion::YUV_2020: - Kb = 0.0593f; - Kr = 0.2627f; + Kb = 0.0593F; + Kr = 0.2627F; break; default: ASSERT_TRUE(false) << "Unhandled yuv model conversion"; break; } - // since Kr + Kg + Kb = 1.f, calculate Kg - Kg = 1.f - Kb - Kr; + // since Kr + Kg + Kb = 1.F, calculate Kg + Kg = 1.F - Kb - Kr; float luma = Kr * r + Kg * g + Kb * b; // 0 ... 1 - float u = (b - luma) / (1.f - Kb); // -1 ... 1 - float v = (r - luma) / (1.f - Kr); // -1 ... 1 + float u = (b - luma) / (1.F - Kb); // -1 ... 1 + float v = (r - luma) / (1.F - Kr); // -1 ... 1 switch (yuv_range) { case viz::YuvRange::ITU_FULL: - u = u * 0.5f + 0.5f; - v = v * 0.5f + 0.5f; + u = u * 0.5F + 0.5F; + v = v * 0.5F + 0.5F; break; case viz::YuvRange::ITU_NARROW: - luma = 16.f / 255.f + luma * (219.f / 255.f); - u = 128.f / 255.f + u * 0.5f * (224.f / 255.f); - v = 128.f / 255.f + v * 0.5f * (224.f / 255.f); + luma = 16.F / 255.F + luma * (219.F / 255.F); + u = 128.F / 255.F + u * 0.5F * (224.F / 255.F); + v = 128.F / 255.F + v * 0.5F * (224.F / 255.F); break; default: ASSERT_TRUE(false) << "Unhandled yuv range"; @@ -341,105 +341,105 @@ TEST_P(ImageLayer, Image) { switch (image_format) { case viz::ImageFormat::Y8U8Y8V8_422_UNORM: - converted_data[y * (width_ * 2) + x * 2] = uint8_t(luma * 255.f + 0.5f); + converted_data[y * (width_ * 2) + x * 2] = uint8_t(luma * 255.F + 0.5F); if ((x & 1) == 0) { - converted_data[y * (width_ * 2) + (x * 2) + 1] = uint8_t(u * 255.f + 0.5f); - converted_data[y * (width_ * 2) + (x * 2) + 3] = uint8_t(v * 255.f + 0.5f); + converted_data[y * (width_ * 2) + (x * 2) + 1] = uint8_t(u * 255.F + 0.5F); + converted_data[y * (width_ * 2) + (x * 2) + 3] = uint8_t(v * 255.F + 0.5F); } break; case viz::ImageFormat::U8Y8V8Y8_422_UNORM: - converted_data[y * (width_ * 2) + (x * 2) + 1] = uint8_t(luma * 255.f + 0.5f); + converted_data[y * (width_ * 2) + (x * 2) + 1] = uint8_t(luma * 255.F + 0.5F); if ((x & 1) == 0) { - converted_data[y * (width_ * 2) + (x * 2) + 0] = uint8_t(u * 255.f + 0.5f); - converted_data[y * (width_ * 2) + (x * 2) + 2] = uint8_t(v * 255.f + 0.5f); + converted_data[y * (width_ * 2) + (x * 2) + 0] = uint8_t(u * 255.F + 0.5F); + converted_data[y * (width_ * 2) + (x * 2) + 2] = uint8_t(v * 255.F + 0.5F); } break; case viz::ImageFormat::Y8_U8V8_2PLANE_420_UNORM: - converted_data[y * width_ + x] = uint8_t(luma * 255.f + 0.5f); + converted_data[y * width_ + x] = uint8_t(luma * 255.F + 0.5F); if (((x & 1) == 0) && ((y & 1) == 0)) { converted_data[offset_plane_1 + 
((y / 2) * (width_ / 2) + (x / 2)) * 2 + 0] = - uint8_t(u * 255.f + 0.5f); + uint8_t(u * 255.F + 0.5F); converted_data[offset_plane_1 + ((y / 2) * (width_ / 2) + (x / 2)) * 2 + 1] = - uint8_t(v * 255.f + 0.5f); + uint8_t(v * 255.F + 0.5F); } break; case viz::ImageFormat::Y8_U8V8_2PLANE_422_UNORM: - converted_data[y * width_ + x] = uint8_t(luma * 255.f + 0.5f); + converted_data[y * width_ + x] = uint8_t(luma * 255.F + 0.5F); if ((x & 1) == 0) { converted_data[offset_plane_1 + (y * (width_ / 2) + (x / 2)) * 2 + 0] = - uint8_t(u * 255.f + 0.5f); + uint8_t(u * 255.F + 0.5F); converted_data[offset_plane_1 + (y * (width_ / 2) + (x / 2)) * 2 + 1] = - uint8_t(v * 255.f + 0.5f); + uint8_t(v * 255.F + 0.5F); } break; case viz::ImageFormat::Y8_U8_V8_3PLANE_420_UNORM: - converted_data[y * width_ + x] = uint8_t(luma * 255.f + 0.5f); + converted_data[y * width_ + x] = uint8_t(luma * 255.F + 0.5F); if (((x & 1) == 0) && ((y & 1) == 0)) { converted_data[offset_plane_1 + (y / 2) * (width_ / 2) + (x / 2)] = - uint8_t(u * 255.f + 0.5f); + uint8_t(u * 255.F + 0.5F); converted_data[offset_plane_2 + (y / 2) * (width_ / 2) + (x / 2)] = - uint8_t(v * 255.f + 0.5f); + uint8_t(v * 255.F + 0.5F); } break; case viz::ImageFormat::Y8_U8_V8_3PLANE_422_UNORM: - converted_data[y * width_ + x] = uint8_t(luma * 255.f + 0.5f); + converted_data[y * width_ + x] = uint8_t(luma * 255.F + 0.5F); if ((x & 1) == 0) { converted_data[offset_plane_1 + y * (width_ / 2) + (x / 2)] = - uint8_t(u * 255.f + 0.5f); + uint8_t(u * 255.F + 0.5F); converted_data[offset_plane_2 + y * (width_ / 2) + (x / 2)] = - uint8_t(v * 255.f + 0.5f); + uint8_t(v * 255.F + 0.5F); } break; case viz::ImageFormat::Y16_U16V16_2PLANE_420_UNORM: reinterpret_cast(converted_data.data())[y * width_ + x] = - uint16_t(luma * 65535.f + 0.5f); + uint16_t(luma * 65535.F + 0.5F); if (((x & 1) == 0) && ((y & 1) == 0)) { reinterpret_cast( converted_data.data())[offset_plane_1 / sizeof(uint16_t) + ((y / 2) * (width_ / 2) + (x / 2)) * 2 + 0] = - uint16_t(u * 65535.f + 0.5f); + uint16_t(u * 65535.F + 0.5F); reinterpret_cast( converted_data.data())[offset_plane_1 / sizeof(uint16_t) + ((y / 2) * (width_ / 2) + (x / 2)) * 2 + 1] = - uint16_t(v * 65535.f + 0.5f); + uint16_t(v * 65535.F + 0.5F); } break; case viz::ImageFormat::Y16_U16V16_2PLANE_422_UNORM: reinterpret_cast(converted_data.data())[y * width_ + x] = - uint16_t(luma * 65535.f + 0.5f); + uint16_t(luma * 65535.F + 0.5F); if ((x & 1) == 0) { reinterpret_cast( converted_data.data())[offset_plane_1 / sizeof(uint16_t) + (y * (width_ / 2) + (x / 2)) * 2 + 0] = - uint16_t(u * 65535.f + 0.5f); + uint16_t(u * 65535.F + 0.5F); reinterpret_cast( converted_data.data())[offset_plane_1 / sizeof(uint16_t) + (y * (width_ / 2) + (x / 2)) * 2 + 1] = - uint16_t(v * 65535.f + 0.5f); + uint16_t(v * 65535.F + 0.5F); } break; case viz::ImageFormat::Y16_U16_V16_3PLANE_420_UNORM: reinterpret_cast(converted_data.data())[y * width_ + x] = - uint16_t(luma * 65535.f + 0.5f); + uint16_t(luma * 65535.F + 0.5F); if (((x & 1) == 0) && ((y & 1) == 0)) { reinterpret_cast(converted_data.data())[offset_plane_1 / sizeof(uint16_t) + (y / 2) * (width_ / 2) + (x / 2)] = - uint16_t(u * 65535.f + 0.5f); + uint16_t(u * 65535.F + 0.5F); reinterpret_cast(converted_data.data())[offset_plane_2 / sizeof(uint16_t) + (y / 2) * (width_ / 2) + (x / 2)] = - uint16_t(v * 65535.f + 0.5f); + uint16_t(v * 65535.F + 0.5F); } break; case viz::ImageFormat::Y16_U16_V16_3PLANE_422_UNORM: reinterpret_cast(converted_data.data())[y * width_ + x] = - uint16_t(luma * 65535.f + 0.5f); + 
uint16_t(luma * 65535.F + 0.5F); if ((x & 1) == 0) { reinterpret_cast(converted_data.data())[offset_plane_1 / sizeof(uint16_t) + y * (width_ / 2) + (x / 2)] = - uint16_t(u * 65535.f + 0.5f); + uint16_t(u * 65535.F + 0.5F); reinterpret_cast(converted_data.data())[offset_plane_2 / sizeof(uint16_t) + y * (width_ / 2) + (x / 2)] = - uint16_t(v * 65535.f + 0.5f); + uint16_t(v * 65535.F + 0.5F); } break; default: @@ -452,12 +452,12 @@ TEST_P(ImageLayer, Image) { std::swap(color_data_, converted_data); depth_format = viz::ImageFormat::D32_SFLOAT; - depth_data_ = std::vector(width_ * height_ * 1 * sizeof(float), 0.f); + depth_data_ = std::vector(width_ * height_ * 1 * sizeof(float), 0.F); } else { color_format = image_format; depth_format = viz::ImageFormat::D32_SFLOAT; SetupData(image_format); - depth_data_ = std::vector(width_ * height_ * 1 * sizeof(float), 0.f); + depth_data_ = std::vector(width_ * height_ * 1 * sizeof(float), 0.F); } std::vector lut; @@ -522,7 +522,7 @@ TEST_P(ImageLayer, Image) { break; case viz::ImageFormat::R32_SFLOAT: lut_index = - static_cast(reinterpret_cast(color_data_.data())[index] + 0.5f); + static_cast(reinterpret_cast(color_data_.data())[index] + 0.5F); break; default: ASSERT_TRUE(false) << "Unhandled LUT image format"; @@ -539,19 +539,19 @@ TEST_P(ImageLayer, Image) { offset = color_data_[index]; break; case viz::ImageFormat::R8_UNORM: - offset = (static_cast(color_data_[index]) / 255.f) * (lut_size_ - 1); + offset = (static_cast(color_data_[index]) / 255.F) * (lut_size_ - 1); break; case viz::ImageFormat::R8_SNORM: - offset = (static_cast(color_data_[index]) / 127.f) * (lut_size_ - 1); + offset = (static_cast(color_data_[index]) / 127.F) * (lut_size_ - 1); break; case viz::ImageFormat::R16_UNORM: offset = (static_cast(reinterpret_cast(color_data_.data())[index]) / - 65535.f) * + 65535.F) * (lut_size_ - 1); break; case viz::ImageFormat::R16_SNORM: offset = (static_cast(reinterpret_cast(color_data_.data())[index]) / - 32767.f) * + 32767.F) * (lut_size_ - 1); break; case viz::ImageFormat::R32_SFLOAT: @@ -567,7 +567,7 @@ TEST_P(ImageLayer, Image) { const uint32_t val1 = lut[std::max( 0, std::min(int32_t(lut_size_) - 1, - int32_t((offset + (1.0f / float(lut_size_))) * (lut_size_ - 1))))]; + int32_t((offset + (1.0F / float(lut_size_))) * (lut_size_ - 1))))]; float dummy; const float frac = std::modf(offset, &dummy); @@ -587,8 +587,8 @@ TEST_P(ImageLayer, Image) { const float a = a0 + frac * (a1 - a0); reinterpret_cast(converted_data.data())[index] = - uint32_t(r + 0.5f) | (uint32_t(g + 0.5f) << 8) | (uint32_t(b + 0.5f) << 16) | - (uint32_t(a + 0.5f) << 24); + uint32_t(r + 0.5F) | (uint32_t(g + 0.5F) << 8) | (uint32_t(b + 0.5F) << 16) | + (uint32_t(a + 0.5F) << 24); } } } else if (convert_color) { @@ -630,23 +630,23 @@ TEST_P(ImageLayer, Image) { case viz::ImageFormat::R8G8B8_SNORM: case viz::ImageFormat::R8G8B8A8_SNORM: converted_data[index] = uint8_t( - (float(reinterpret_cast(color_data_.data())[index]) / 127.f) * 255.f + 0.5f); + (float(reinterpret_cast(color_data_.data())[index]) / 127.F) * 255.F + 0.5F); break; case viz::ImageFormat::R16_UNORM: case viz::ImageFormat::R16G16B16A16_UNORM: converted_data[index] = uint8_t( - (float(reinterpret_cast(color_data_.data())[index]) / 65535.f) * 255.f + - 0.5f); + (float(reinterpret_cast(color_data_.data())[index]) / 65535.F) * 255.F + + 0.5F); break; case viz::ImageFormat::R16_SNORM: case viz::ImageFormat::R16G16B16A16_SNORM: converted_data[index] = uint8_t( - (float(reinterpret_cast(color_data_.data())[index]) / 32767.f) * 
255.f + - 0.5f); + (float(reinterpret_cast(color_data_.data())[index]) / 32767.F) * 255.F + + 0.5F); break; case viz::ImageFormat::R32_SFLOAT: converted_data[index] = - uint8_t(reinterpret_cast(color_data_.data())[index] * 255.f + 0.5f); + uint8_t(reinterpret_cast(color_data_.data())[index] * 255.F + 0.5F); break; case viz::ImageFormat::R8_SRGB: case viz::ImageFormat::R8G8B8_SRGB: @@ -665,20 +665,20 @@ TEST_P(ImageLayer, Image) { const uint32_t value = reinterpret_cast(color_data_.data())[index / 4]; const uint32_t component = index % 4; if (component == 3) { - converted_data[index] = uint8_t((float(value >> 30) / 3.f) * 255.f + 0.5f); + converted_data[index] = uint8_t((float(value >> 30) / 3.F) * 255.F + 0.5F); } else { converted_data[index] = - uint8_t((float((value >> (component * 10)) & 0x3FF) / 1023.f) * 255.f + 0.5f); + uint8_t((float((value >> (component * 10)) & 0x3FF) / 1023.F) * 255.F + 0.5F); } } break; case viz::ImageFormat::A2R10G10B10_UNORM_PACK32: { const uint32_t value = reinterpret_cast(color_data_.data())[index / 4]; const uint32_t component = index % 4; if (component == 3) { - converted_data[index] = uint8_t((float(value >> 30) / 3.f) * 255.f + 0.5f); + converted_data[index] = uint8_t((float(value >> 30) / 3.F) * 255.F + 0.5F); } else { converted_data[index] = - uint8_t((float((value >> ((2 - component) * 10)) & 0x3FF) / 1023.f) * 255.f + 0.5f); + uint8_t((float((value >> ((2 - component) * 10)) & 0x3FF) / 1023.F) * 255.F + 0.5F); } } break; default: @@ -696,13 +696,13 @@ TEST_P(ImageLayer, Image) { case viz::ImageFormat::A8B8G8R8_SRGB_PACK32: for (size_t index = 0; index < elements; index += components) { for (size_t component = 0; component < components; ++component) { - float value = float(converted_data[index + component]) / 255.f; - if (value < 0.04045f) { - value /= 12.92f; + float value = float(converted_data[index + component]) / 255.F; + if (value < 0.04045F) { + value /= 12.92F; } else { - value = std::pow(((value + 0.055f) / 1.055f), 2.4f); + value = std::pow(((value + 0.055F) / 1.055F), 2.4F); } - converted_data[index + component] = uint8_t((value * 255.f) + 0.5f); + converted_data[index + component] = uint8_t((value * 255.F) + 0.5F); } } break; @@ -1207,10 +1207,10 @@ TEST_P(ImageLayerSwizzle, Swizzle) { } } if (pixel[3] != 0xFF) { - float alpha = float(pixel[3]) / 255.f; - color_data_[index * 4 + 0] = uint8_t(float(pixel[0]) * alpha + 0.5f); - color_data_[index * 4 + 1] = uint8_t(float(pixel[1]) * alpha + 0.5f); - color_data_[index * 4 + 2] = uint8_t(float(pixel[2]) * alpha + 0.5f); + float alpha = float(pixel[3]) / 255.F; + color_data_[index * 4 + 0] = uint8_t(float(pixel[0]) * alpha + 0.5F); + color_data_[index * 4 + 1] = uint8_t(float(pixel[1]) * alpha + 0.5F); + color_data_[index * 4 + 2] = uint8_t(float(pixel[2]) * alpha + 0.5F); color_data_[index * 4 + 3] = pixel[3]; } else { color_data_[index * 4 + 0] = pixel[0]; diff --git a/modules/holoviz/tests/functional/init_test.cpp b/modules/holoviz/tests/functional/init_test.cpp index c6b937aa..60632086 100644 --- a/modules/holoviz/tests/functional/init_test.cpp +++ b/modules/holoviz/tests/functional/init_test.cpp @@ -131,7 +131,7 @@ TEST(Init, VulkanLoaderFail) { TEST(Init, Errors) { // should thrown when specifying an invalid font file - EXPECT_NO_THROW(viz::SetFont("NonExistingFile.ttf", 12.f)); + EXPECT_NO_THROW(viz::SetFont("NonExistingFile.ttf", 12.F)); EXPECT_THROW(viz::Init(128, 64, "Holoviz test", viz::InitFlags::HEADLESS), std::runtime_error); EXPECT_NO_THROW(viz::Shutdown()); diff --git 
a/modules/holoviz/tests/functional/layer_test.cpp b/modules/holoviz/tests/functional/layer_test.cpp index 311a0db4..9f56972a 100644 --- a/modules/holoviz/tests/functional/layer_test.cpp +++ b/modules/holoviz/tests/functional/layer_test.cpp @@ -29,7 +29,7 @@ class Layer : public TestHeadless {}; TEST_F(Layer, Opacity) { const viz::ImageFormat kFormat = viz::ImageFormat::R8G8B8A8_UNORM; - const float opacity = 0.4f; + const float opacity = 0.4F; SetupData(kFormat); @@ -38,7 +38,7 @@ TEST_F(Layer, Opacity) { data_with_opacity.resize(width_ * height_ * sizeof(uint32_t)); for (size_t index = 0; index < width_ * height_ * 4; ++index) { data_with_opacity.data()[index] = - uint8_t((static_cast(color_data_[index]) / 255.f * opacity) * 255.f + 0.5f); + uint8_t((static_cast(color_data_[index]) / 255.F * opacity) * 255.F + 0.5F); } EXPECT_NO_THROW(viz::Begin()); @@ -88,7 +88,7 @@ TEST_F(Layer, Priority) { std::vector depth_data; ReadDepthData(depth_data); - EXPECT_EQ(depth_data.data()[0], 0.f); + EXPECT_EQ(depth_data.data()[0], 0.F); } } @@ -103,13 +103,13 @@ TEST_F(Layer, View) { // top left - red image EXPECT_NO_THROW(viz::BeginImageLayer()); - EXPECT_NO_THROW(viz::LayerAddView(0.f, 0.f, 0.5f, 0.5f)); + EXPECT_NO_THROW(viz::LayerAddView(0.F, 0.F, 0.5F, 0.5F)); EXPECT_NO_THROW(viz::ImageHost(1, 1, kFormat, reinterpret_cast(&red))); EXPECT_NO_THROW(viz::EndLayer()); // top right - green image EXPECT_NO_THROW(viz::BeginImageLayer()); - EXPECT_NO_THROW(viz::LayerAddView(0.5f, 0.0f, 0.5f, 0.5f)); + EXPECT_NO_THROW(viz::LayerAddView(0.5F, 0.0F, 0.5F, 0.5F)); EXPECT_NO_THROW(viz::ImageHost(1, 1, kFormat, reinterpret_cast(&green))); EXPECT_NO_THROW(viz::EndLayer()); @@ -118,11 +118,11 @@ TEST_F(Layer, View) { // - bottom right, half size - blue triangles constexpr uint32_t triangles = 2; std::array data{ - 0.f, 0.f, 1.f, 0.f, 1.f, 1.f, 0.f, 0.f, 1.f, 1.f, 0.f, 1.f}; + 0.F, 0.F, 1.F, 0.F, 1.F, 1.F, 0.F, 0.F, 1.F, 1.F, 0.F, 1.F}; EXPECT_NO_THROW(viz::BeginGeometryLayer()); - EXPECT_NO_THROW(viz::LayerAddView(0.f, 0.5f, 0.5f, .5f)); - EXPECT_NO_THROW(viz::LayerAddView(0.625f, 0.625f, 0.25f, .25f)); - EXPECT_NO_THROW(viz::Color(0.f, 0.f, 1.f, 1.f)); + EXPECT_NO_THROW(viz::LayerAddView(0.F, 0.5F, 0.5F, .5F)); + EXPECT_NO_THROW(viz::LayerAddView(0.625F, 0.625F, 0.25F, .25F)); + EXPECT_NO_THROW(viz::Color(0.F, 0.F, 1.F, 1.F)); EXPECT_NO_THROW( viz::Primitive(viz::PrimitiveTopology::TRIANGLE_LIST, triangles, data.size(), data.data())); EXPECT_NO_THROW(viz::EndLayer()); @@ -154,26 +154,26 @@ TEST_F(Layer, Errors) { EXPECT_THROW(viz::EndLayer(), std::runtime_error); // it's an error to call layer functions without an active layer - EXPECT_THROW(viz::LayerOpacity(1.f), std::runtime_error); + EXPECT_THROW(viz::LayerOpacity(1.F), std::runtime_error); EXPECT_THROW(viz::LayerPriority(0), std::runtime_error); - EXPECT_THROW(viz::LayerAddView(0.f, 0.f, 1.f, 1.f), std::runtime_error); + EXPECT_THROW(viz::LayerAddView(0.F, 0.F, 1.F, 1.F), std::runtime_error); EXPECT_NO_THROW(viz::Begin()); EXPECT_NO_THROW(viz::BeginImageLayer()); // passing case - EXPECT_NO_THROW(viz::LayerOpacity(1.0f)); + EXPECT_NO_THROW(viz::LayerOpacity(1.0F)); // it's an error to set negative opacity - EXPECT_THROW(viz::LayerOpacity(-0.1f), std::invalid_argument); + EXPECT_THROW(viz::LayerOpacity(-0.1F), std::invalid_argument); // it's an error to set opacity higher than 1.0 - EXPECT_THROW(viz::LayerOpacity(1.1f), std::invalid_argument); + EXPECT_THROW(viz::LayerOpacity(1.1F), std::invalid_argument); // passing case - 
EXPECT_NO_THROW(viz::LayerAddView(0.f, 0.f, 1.f, 1.f)); + EXPECT_NO_THROW(viz::LayerAddView(0.F, 0.F, 1.F, 1.F)); // it's an error to add a layer view with zero width - EXPECT_THROW(viz::LayerAddView(0.f, 0.f, 0.f, 1.f), std::invalid_argument); + EXPECT_THROW(viz::LayerAddView(0.F, 0.F, 0.F, 1.F), std::invalid_argument); // it's an error to add a layer view with zero height - EXPECT_THROW(viz::LayerAddView(0.f, 0.f, 1.f, 0.f), std::invalid_argument); + EXPECT_THROW(viz::LayerAddView(0.F, 0.F, 1.F, 0.F), std::invalid_argument); EXPECT_NO_THROW(viz::EndLayer()); EXPECT_NO_THROW(viz::End()); diff --git a/modules/holoviz/tests/functional/test_fixture.cpp b/modules/holoviz/tests/functional/test_fixture.cpp index 8e215599..43f2c881 100644 --- a/modules/holoviz/tests/functional/test_fixture.cpp +++ b/modules/holoviz/tests/functional/test_fixture.cpp @@ -138,7 +138,7 @@ void TestBase::SetupData(viz::ImageFormat format, uint32_t rand_seed) { channels = 1; component_size = sizeof(float); color_data_.resize(width_ * height_ * channels * component_size); - Fill(color_data_.data(), width_ * height_, 0.f, float(lut_size_ - 1)); + Fill(color_data_.data(), width_ * height_, 0.F, float(lut_size_ - 1)); break; case viz::ImageFormat::R8G8B8_UNORM: case viz::ImageFormat::R8G8B8_SRGB: @@ -199,7 +199,7 @@ void TestBase::SetupData(viz::ImageFormat format, uint32_t rand_seed) { channels = 1; component_size = sizeof(float); depth_data_.resize(width_ * height_ * channels * component_size); - Fill(depth_data_.data(), width_ * height_, 0.f, std::numeric_limits::max()); + Fill(depth_data_.data(), width_ * height_, 0.F, std::numeric_limits::max()); break; case viz::ImageFormat::A2B10G10R10_UNORM_PACK32: case viz::ImageFormat::A2R10G10B10_UNORM_PACK32: @@ -325,11 +325,11 @@ bool TestBase::CompareDepthResult() { // convert to single channel uint8_t assuming depth is between 0...1 std::vector image_data(depth_data_.size()); for (size_t index = 0; index < depth_data_.size(); ++index) { - image_data[index] = static_cast(depth_data_[index] * 255.f + 0.5f); + image_data[index] = static_cast(depth_data_[index] * 255.F + 0.5F); } stbi_write_png(ref_file_name.c_str(), width_, height_, 1, image_data.data(), 0); for (size_t index = 0; index < depth_data.size(); ++index) { - image_data[index] = static_cast(depth_data[index] * 255.f + 0.5f); + image_data[index] = static_cast(depth_data[index] * 255.F + 0.5F); } stbi_write_png(fail_file_name.c_str(), width_, height_, 1, image_data.data(), 0); @@ -415,7 +415,7 @@ bool TestBase::CompareDepthResultCRC32(const std::vector crc32) { // convert to single channel uint8_t assuming depth is between 0...1 std::vector image_data(read_data.size()); for (size_t index = 0; index < read_data.size(); ++index) { - image_data[index] = static_cast(read_data[index] * 255.f + 0.5f); + image_data[index] = static_cast(read_data[index] * 255.F + 0.5F); } stbi_write_png(image_file_name.c_str(), width_, height_, 1, image_data.data(), 0); diff --git a/modules/holoviz/tests/functional/vsync_test.cpp b/modules/holoviz/tests/functional/vsync_test.cpp index 0a825670..2feb8912 100644 --- a/modules/holoviz/tests/functional/vsync_test.cpp +++ b/modules/holoviz/tests/functional/vsync_test.cpp @@ -88,18 +88,18 @@ TEST_P(VSync, Modes) { auto elapsed = std::chrono::duration_cast( std::chrono::steady_clock::now() - start); - const float displayed_frames = float(video_mode->refreshRate) * elapsed.count() / 1000.f; + const float displayed_frames = float(video_mode->refreshRate) * elapsed.count() / 1000.F; switch 
(present_mode) { case viz::PresentMode::FIFO: // rendered frames should be within 10% of displayed frames - EXPECT_LE(std::abs((float(frames) / displayed_frames) - 1.f), 0.1f); + EXPECT_LE(std::abs((float(frames) / displayed_frames) - 1.F), 0.1F); break; case viz::PresentMode::AUTO: case viz::PresentMode::IMMEDIATE: case viz::PresentMode::MAILBOX: // no vsync, should render at least two times the refresh rate - EXPECT_GT(frames, displayed_frames * 2.f); + EXPECT_GT(frames, displayed_frames * 2.F); break; } diff --git a/modules/holoviz/thirdparty/nvpro_core/nvh/cameramanipulator.cpp b/modules/holoviz/thirdparty/nvpro_core/nvh/cameramanipulator.cpp index becee614..63e1c8d4 100644 --- a/modules/holoviz/thirdparty/nvpro_core/nvh/cameramanipulator.cpp +++ b/modules/holoviz/thirdparty/nvpro_core/nvh/cameramanipulator.cpp @@ -27,7 +27,7 @@ namespace nvh { inline float sign(float s) { - return (s < 0.f) ? -1.f : 1.f; + return (s < 0.F) ? -1.F : 1.F; } //-------------------------------------------------------------------------------------------------- @@ -78,7 +78,7 @@ void CameraManipulator::setLookat(const nvmath::vec3f& eye, const nvmath::vec3f& // over time. void CameraManipulator::updateAnim() { - auto elapse = static_cast(getSystemTime() - m_start_time) / 1000.f; + auto elapse = static_cast(getSystemTime() - m_start_time) / 1000.F; // Key animation if(m_key_vec != nvmath::vec3f(0, 0, 0)) @@ -94,10 +94,10 @@ void CameraManipulator::updateAnim() if(m_anim_done) return; - float t = std::min(elapse / float(m_duration), 1.0f); + float t = std::min(elapse / float(m_duration), 1.0F); // Evaluate polynomial (smoother step from Perlin) - t = t * t * t * (t * (t * 6.0f - 15.0f) + 10.0f); - if(t >= 1.0f) + t = t * t * t * (t * (t * 6.0F - 15.0F) + 10.0F); + if(t >= 1.0F) { m_current = m_goal; m_anim_done = true; @@ -233,8 +233,8 @@ void CameraManipulator::keyMotion(float dx, float dy, int action) } auto d = nvmath::normalize(m_current.ctr - m_current.eye); - dx *= m_speed * 2.f; - dy *= m_speed * 2.f; + dx *= m_speed * 2.F; + dy *= m_speed * 2.F; nvmath::vec3f key_vec; if(action == Dolly) @@ -321,12 +321,12 @@ void CameraManipulator::wheel(int value, const Inputs& inputs) // Set and clamp FOV between 0.01 and 179 degrees void CameraManipulator::setFov(float _fov) { - m_current.fov = std::min(std::max(_fov, 0.01f), 179.0f); + m_current.fov = std::min(std::max(_fov, 0.01F), 179.0F); } nvmath::vec3f CameraManipulator::computeBezier(float t, nvmath::vec3f& p0, nvmath::vec3f& p1, nvmath::vec3f& p2) { - float u = 1.f - t; + float u = 1.F - t; float tt = t * t; float uu = u * u; @@ -344,15 +344,15 @@ void CameraManipulator::findBezierPoints() nvmath::vec3f p1, pc; // point of interest - nvmath::vec3f pi = (m_goal.ctr + m_current.ctr) * 0.5f; + nvmath::vec3f pi = (m_goal.ctr + m_current.ctr) * 0.5F; - nvmath::vec3f p02 = (p0 + p2) * 0.5f; // mid p0-p2 - float radius = (length(p0 - pi) + length(p2 - pi)) * 0.5f; // Radius for p1 + nvmath::vec3f p02 = (p0 + p2) * 0.5F; // mid p0-p2 + float radius = (length(p0 - pi) + length(p2 - pi)) * 0.5F; // Radius for p1 nvmath::vec3f p02pi(p02 - pi); // Vector from interest to mid point p02pi.normalize(); p02pi *= radius; pc = pi + p02pi; // Calculated point to go through - p1 = 2.f * pc - p0 * 0.5f - p2 * 0.5f; // Computing p1 for t=0.5 + p1 = 2.F * pc - p0 * 0.5F - p2 * 0.5F; // Computing p1 for t=0.5 p1.y = p02.y; // Clamping the P1 to be in the same height as p0-p2 m_bezier[0] = p0; @@ -372,7 +372,7 @@ void CameraManipulator::pan(float dx, float dy) } nvmath::vec3f 
z(m_current.eye - m_current.ctr); - float length = static_cast(nvmath::length(z)) / 0.785f; // 45 degrees + float length = static_cast(nvmath::length(z)) / 0.785F; // 45 degrees z = nvmath::normalize(z); nvmath::vec3f x = nvmath::cross(m_current.up, z); x = nvmath::normalize(x); @@ -453,7 +453,7 @@ void CameraManipulator::dolly(float dx, float dy) float length = static_cast(nvmath::length(z)); // We are at the point of interest, and don't know any direction, so do nothing! - if(length < 0.000001f) + if(length < 0.000001F) return; // Use the larger movement. @@ -468,7 +468,7 @@ void CameraManipulator::dolly(float dx, float dy) if(m_mode == Examine) { // Don't move over the point of interest. - if(factor >= 1.0f) + if(factor >= 1.0F) return; z *= factor; @@ -476,7 +476,7 @@ void CameraManipulator::dolly(float dx, float dy) else { // Normalize the Z vector and make it faster - z *= factor / length * 10.0f; + z *= factor / length * 10.0F; } // Not going up @@ -533,7 +533,7 @@ const std::string& CameraManipulator::getHelp() // void CameraManipulator::fit(const nvmath::vec3f& boxMin, const nvmath::vec3f& boxMax, bool instantFit /*= true*/, bool tight /*=false*/, float aspect /*=1.0f*/) { - const nvmath::vec3f boxHalfSize = (boxMax - boxMin) * .5f; + const nvmath::vec3f boxHalfSize = (boxMax - boxMin) * .5F; const nvmath::vec3f boxCenter = boxMin + boxHalfSize; float offset = 0; @@ -544,10 +544,10 @@ void CameraManipulator::fit(const nvmath::vec3f& boxMin, const nvmath::vec3f& bo { // Using the bounding sphere float radius = nvmath::length(boxHalfSize); - if(aspect > 1.f) - offset = radius / sin(nv_to_rad * yfov * 0.5f); + if(aspect > 1.F) + offset = radius / sin(nv_to_rad * yfov * 0.5F); else - offset = radius / sin(nv_to_rad * xfov * 0.5f); + offset = radius / sin(nv_to_rad * xfov * 0.5F); } else { @@ -563,8 +563,8 @@ void CameraManipulator::fit(const nvmath::vec3f& boxMin, const nvmath::vec3f& bo if(vct.z < 0) // Take only points in front of the center { // Keep the largest offset to see that vertex - offset = std::max(fabs(vct.y) / tan(nv_to_rad * yfov * 0.5f) + fabs(vct.z), offset); - offset = std::max(fabs(vct.x) / tan(nv_to_rad * xfov * 0.5f) + fabs(vct.z), offset); + offset = std::max(fabs(vct.y) / tan(nv_to_rad * yfov * 0.5F) + fabs(vct.z), offset); + offset = std::max(fabs(vct.x) / tan(nv_to_rad * xfov * 0.5F) + fabs(vct.z), offset); } } } diff --git a/modules/holoviz/thirdparty/nvpro_core/nvh/cameramanipulator.hpp b/modules/holoviz/thirdparty/nvpro_core/nvh/cameramanipulator.hpp index fbd0c4f4..08b8330c 100644 --- a/modules/holoviz/thirdparty/nvpro_core/nvh/cameramanipulator.hpp +++ b/modules/holoviz/thirdparty/nvpro_core/nvh/cameramanipulator.hpp @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. * - * SPDX-FileCopyrightText: Copyright (c) 2018-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2018-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ //-------------------------------------------------------------------- @@ -90,7 +90,7 @@ class CameraManipulator nvmath::vec3f eye = nvmath::vec3f(10, 10, 10); nvmath::vec3f ctr = nvmath::vec3f(0, 0, 0); nvmath::vec3f up = nvmath::vec3f(0, 1, 0); - float fov = 60.0f; + float fov = 60.0F; bool operator!=(const Camera& rhr) const { @@ -140,7 +140,7 @@ class CameraManipulator // Set the position, interest from the matrix. 
// instantSet = true will not interpolate to the new position // centerDistance is the distance of the center from the eye - void setMatrix(const nvmath::mat4f& mat_, bool instantSet = true, float centerDistance = 1.f); + void setMatrix(const nvmath::mat4f& mat_, bool instantSet = true, float centerDistance = 1.F); // Changing the default speed movement void setSpeed(float speed) { m_speed = speed; } @@ -181,7 +181,7 @@ class CameraManipulator const std::string& getHelp(); // Fitting the camera position and interest to see the bounding box - void fit(const nvmath::vec3f& boxMin, const nvmath::vec3f& boxMax, bool instantFit = true, bool tight = false, float aspect = 1.0f); + void fit(const nvmath::vec3f& boxMin, const nvmath::vec3f& boxMax, bool instantFit = true, bool tight = false, float aspect = 1.0F); protected: CameraManipulator(); @@ -222,13 +222,13 @@ class CameraManipulator int m_height = 1; // Other - float m_speed = 3.f; - nvmath::vec2f m_mouse = nvmath::vec2f(0.f, 0.f); - nvmath::vec2f m_clipPlanes = nvmath::vec2f(0.001f, 100000000.f); + float m_speed = 3.F; + nvmath::vec2f m_mouse = nvmath::vec2f(0.F, 0.F); + nvmath::vec2f m_clipPlanes = nvmath::vec2f(0.001F, 100000000.F); bool m_button = false; // Button pressed bool m_moving = false; // Mouse is moving - float m_tbsize = 0.8f; // Trackball size; + float m_tbsize = 0.8F; // Trackball size; Modes m_mode = Examine; diff --git a/modules/holoviz/thirdparty/nvpro_core/nvmath/nvmath.inl b/modules/holoviz/thirdparty/nvpro_core/nvmath/nvmath.inl index 3dcd72b3..0efc1ba7 100644 --- a/modules/holoviz/thirdparty/nvpro_core/nvmath/nvmath.inl +++ b/modules/holoviz/thirdparty/nvpro_core/nvmath/nvmath.inl @@ -1347,8 +1347,8 @@ inline void quaternion::from_matrix(const matrix3& mat) int k = next[j]; T scale = sqrtf(mat(i, i) - mat(j, j) - mat(k, k) + 1); T* q[] = {&x, &y, &z}; - *q[i] = 0.5f * scale; - scale = 0.5f / scale; + *q[i] = 0.5F * scale; + scale = 0.5F / scale; w = scale * (mat(k, j) - mat(j, k)); *q[j] = scale * (mat(j, i) + mat(i, j)); *q[k] = scale * (mat(k, i) + mat(i, k)); @@ -1380,8 +1380,8 @@ inline void quaternion::from_matrix(const matrix4& mat) int k = next[j]; T scale = sqrtf(mat(i, i) - mat(j, j) - mat(k, k) + 1); T* q[] = {&x, &y, &z}; - *q[i] = 0.5f * scale; - scale = 0.5f / scale; + *q[i] = 0.5F * scale; + scale = 0.5F / scale; w = scale * (mat(k, j) - mat(j, k)); *q[j] = scale * (mat(j, i) + mat(i, j)); *q[k] = scale * (mat(k, i) + mat(i, k)); @@ -1432,19 +1432,19 @@ inline void quaternion::to_matrix(matrix4& mat) const mat(0, 0) = 1 - (yy + zz); mat(0, 1) = xy - wz; mat(0, 2) = xz + wy; - mat(0, 3) = 0.0f; + mat(0, 3) = 0.0F; mat(1, 0) = xy + wz; mat(1, 1) = 1 - (xx + zz); mat(1, 2) = yz - wx; - mat(1, 3) = 0.0f; + mat(1, 3) = 0.0F; mat(2, 0) = xz - wy; mat(2, 1) = yz + wx; mat(2, 2) = 1 - (xx + yy); - mat(2, 3) = 0.0f; - mat(3, 0) = 0.0f; - mat(3, 1) = 0.0f; - mat(3, 2) = 0.0f; - mat(3, 3) = 1.0f; + mat(2, 3) = 0.0F; + mat(3, 0) = 0.0F; + mat(3, 1) = 0.0F; + mat(3, 2) = 0.0F; + mat(3, 3) = 1.0F; } template @@ -1567,8 +1567,8 @@ inline quaternion slerp_quats(T s, const quaternion& q1, const quaternion< return p; } T sine = sinf(angle); - T sineInv = 1.0f / sine; - T c1 = sinf((1.0f - s) * angle) * sineInv; + T sineInv = 1.0F / sine; + T c1 = sinf((1.0F - s) * angle) * sineInv; T c2 = sinf(s * angle) * sineInv; p.x = c1 * q1.x + c2 * q2.x; p.y = c1 * q1.y + c2 * q2.y; @@ -2237,10 +2237,10 @@ template inline T get_angle(const vector3& v1, const vector3& v2) { float dp = dot(v1, v2); - if(dp > 1.0f) - dp = 1.0f; - else 
if(dp < -1.0f) - dp = -1.0f; + if(dp > 1.0F) + dp = 1.0F; + else if(dp < -1.0F) + dp = -1.0F; return acosf(dp); } @@ -2254,8 +2254,8 @@ inline vector3 rotate_by(const vector3& src, const quaternion& q) template inline void quaternion::to_euler_xyz(vector3& r) { - double a = 2.0f * (w * x + y * z); - double b = 1.0 - 2.0f * (x * x + y * y); + double a = 2.0F * (w * x + y * z); + double b = 1.0 - 2.0F * (x * x + y * y); r.x = (T)atan2(a, b); a = 2.0 * (w * y - z * x); @@ -2269,8 +2269,8 @@ inline void quaternion::to_euler_xyz(vector3& r) template inline void quaternion::to_euler_xyz(T* r) { - double a = 2.0f * (w * x + y * z); - double b = 1.0 - 2.0f * (x * x + y * y); + double a = 2.0F * (w * x + y * z); + double b = 1.0 - 2.0F * (x * x + y * y); r[0] = (T)atan2(a, b); a = 2.0 * (w * y - z * x); @@ -2291,7 +2291,7 @@ inline quaternion::quaternion(const vector3& eulerXYZ) template inline void quaternion::from_euler_xyz(vector3 r) { - r *= 0.5f; + r *= 0.5F; w = cosf(r.x) * cosf(r.y) * cosf(r.z) + sinf(r.x) * sinf(r.y) * sinf(r.z); x = sinf(r.x) * cosf(r.y) * cosf(r.z) - cosf(r.x) * sinf(r.y) * sinf(r.z); y = cosf(r.x) * sinf(r.y) * cosf(r.z) + sinf(r.x) * cosf(r.y) * sinf(r.z); @@ -2641,7 +2641,7 @@ inline vector3 project_point_on_plane(const vector3& point, const plane template inline void normalize_plane(plane& p) { - float inv_length = 1.0f / length(p.normal()); + float inv_length = 1.0F / length(p.normal()); p *= inv_length; } @@ -2712,17 +2712,17 @@ vector3 get_perpendicular_vec(const vector3& vec) // choose a basis vector roughly along the smallest component of the vector if(perp.x <= perp.y && perp.x <= perp.z) { - perp = vector3(1.0f, 0, 0); + perp = vector3(1.0F, 0, 0); } else { if(perp.y <= perp.x && perp.y <= perp.z) { - perp = vector3(0, 1.0f, 0); + perp = vector3(0, 1.0F, 0); } else { - perp = vector3(0, 0, 1.0f); + perp = vector3(0, 0, 1.0F); } } diff --git a/modules/holoviz/thirdparty/nvpro_core/nvmath/nvmath_types.h b/modules/holoviz/thirdparty/nvpro_core/nvmath/nvmath_types.h index e3182ec8..f2692f7f 100644 --- a/modules/holoviz/thirdparty/nvpro_core/nvmath/nvmath_types.h +++ b/modules/holoviz/thirdparty/nvpro_core/nvmath/nvmath_types.h @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. * - * SPDX-FileCopyrightText: Copyright (c) 2002-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2002-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -211,7 +211,7 @@ struct vector3 vector3(const vector2& u) : x(u.x) , y(u.y) - , z(1.0f) + , z(1.0F) { } vector3(const vector2& u, T v) @@ -391,15 +391,15 @@ struct vector4 vector4(const vector2& u) : x(u.x) , y(u.y) - , z(0.0f) - , w(1.0f) + , z(0.0F) + , w(1.0F) { } vector4(const vector2& u, const T zz) : x(u.x) , y(u.y) , z(zz) - , w(1.0f) + , w(1.0F) { } vector4(const vector2& u, const T zz, const T ww) @@ -413,7 +413,7 @@ struct vector4 : x(u.x) , y(u.y) , z(u.z) - , w(1.0f) + , w(1.0F) { } vector4(const vector3& u, const T w) @@ -715,11 +715,11 @@ struct matrix4 memcpy(mat_array + 4, M.mat_array + 3, sizeof(T) * 3); mat_array[7] = 0.0; memcpy(mat_array + 8, M.mat_array + 6, sizeof(T) * 3); - mat_array[11] = 0.0f; - mat_array[12] = 0.0f; - mat_array[13] = 0.0f; - mat_array[14] = 0.0f; - mat_array[15] = 1.0f; + mat_array[11] = 0.0F; + mat_array[12] = 0.0F; + mat_array[13] = 0.0F; + mat_array[14] = 0.0F; + mat_array[15] = 1.0F; } matrix4(const matrix4& M) { memcpy(mat_array, M.mat_array, sizeof(T) * 16); } @@ -858,10 +858,10 @@ struct matrix4 return *this; } - //TL: some additional methods that look like OpenGL... - // they behave the same as the OpenGL matrix system - // But: using vector3 class; rotation is in Radians - // TODO: optimize + // TL: some additional methods that look like OpenGL... + // they behave the same as the OpenGL matrix system + // But: using vector3 class; rotation is in Radians + // TODO(unknown): optimize matrix4& identity() { mat_array[0] = T(1); diff --git a/modules/holoviz/thirdparty/nvpro_core/nvp/NvFoundation.h b/modules/holoviz/thirdparty/nvpro_core/nvp/NvFoundation.h index dbc9c47c..c2ce367e 100644 --- a/modules/holoviz/thirdparty/nvpro_core/nvp/NvFoundation.h +++ b/modules/holoviz/thirdparty/nvpro_core/nvp/NvFoundation.h @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. * - * SPDX-FileCopyrightText: Copyright (c) 2018-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2018-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 */ //-------------------------------------------------------------------- @@ -474,7 +474,7 @@ NV_COMPILE_TIME_ASSERT(NV_OFFSET_OF(NvPackValidation, a) == 8); #define NV_MAX_REAL NV_MAX_F32 #define NV_EPS_REAL NV_EPS_F32 -#define NV_NORMALIZATION_EPSILON float(1e-20f) +#define NV_NORMALIZATION_EPSILON float(1e-20F) /** enum for empty constructor tag*/ enum NvEMPTY { NvEmpty }; diff --git a/modules/holoviz/thirdparty/nvpro_core/nvvk/context_vk.cpp b/modules/holoviz/thirdparty/nvpro_core/nvvk/context_vk.cpp index b014fe45..4038b7cd 100644 --- a/modules/holoviz/thirdparty/nvpro_core/nvvk/context_vk.cpp +++ b/modules/holoviz/thirdparty/nvpro_core/nvvk/context_vk.cpp @@ -338,7 +338,7 @@ void Context::initQueueList(QueueScoreList& list, const uint32_t* maxFamilyCount for(uint32_t qF = 0; qF < m_physicalInfo.queueProperties.size(); ++qF) { const auto& queueFamily = m_physicalInfo.queueProperties[qF]; - QueueScore score{0, qF, 0, 1.0f}; + QueueScore score{0, qF, 0, 1.0F}; for(uint32_t i = 0; i < 32; i++) { @@ -448,8 +448,8 @@ bool Context::initDevice(uint32_t deviceIndex, const ContextCreateInfo& info) // handle each request individually for(uint32_t i = 0; i < it.count; i++) { - // in this pass we don't care about the real priority yet, queueList is initialized with 1.0f - QueueScore queue = removeQueueListItem(queueScoresTemp, it.requiredFlags, 1.0f); + // in this pass we don't care about the real priority yet, queueList is initialized with 1.0F + QueueScore queue = removeQueueListItem(queueScoresTemp, it.requiredFlags, 1.0F); if(!queue.score) { // there were not enough queues left supporting the required flags @@ -848,7 +848,7 @@ void ContextCreateInfo::removeDeviceExtension(const char* name) } } -void ContextCreateInfo::addRequestedQueue(VkQueueFlags flags, uint32_t count /*= 1*/, float priority /*= 1.0f*/) +void ContextCreateInfo::addRequestedQueue(VkQueueFlags flags, uint32_t count /*= 1*/, float priority /*= 1.0F*/) { requestedQueues.push_back({flags, count, priority}); } diff --git a/modules/holoviz/thirdparty/nvpro_core/nvvk/context_vk.hpp b/modules/holoviz/thirdparty/nvpro_core/nvvk/context_vk.hpp index ae65e49e..ccfc4c0d 100644 --- a/modules/holoviz/thirdparty/nvpro_core/nvvk/context_vk.hpp +++ b/modules/holoviz/thirdparty/nvpro_core/nvvk/context_vk.hpp @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. * - * SPDX-FileCopyrightText: Copyright (c) 2014-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2014-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ @@ -107,7 +107,7 @@ struct ContextCreateInfo // by default-constructor three queues are requested, // if you want more/different setups manipulate the requestedQueues vector // or use this function. 
- void addRequestedQueue(VkQueueFlags flags, uint32_t count = 1, float priority = 1.0f); + void addRequestedQueue(VkQueueFlags flags, uint32_t count = 1, float priority = 1.0F); // Configure additional device creation with these variables and functions @@ -183,9 +183,9 @@ struct ContextCreateInfo VkQueueFlags defaultQueueGCT = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT; VkQueueFlags defaultQueueT = VK_QUEUE_TRANSFER_BIT; VkQueueFlags defaultQueueC = VK_QUEUE_COMPUTE_BIT; - float defaultPriorityGCT = 1.0f; - float defaultPriorityT = 1.0f; - float defaultPriorityC = 1.0f; + float defaultPriorityGCT = 1.0F; + float defaultPriorityT = 1.0F; + float defaultPriorityC = 1.0F; }; ////////////////////////////////////////////////////////////////////////// @@ -363,7 +363,7 @@ class Context VkQueue queue = VK_NULL_HANDLE; uint32_t familyIndex = ~0; uint32_t queueIndex = ~0; - float priority = 1.0f; + float priority = 1.0F; operator VkQueue() const { return queue; } operator uint32_t() const { return familyIndex; } @@ -384,7 +384,7 @@ class Context // additional queues must be created once through this function // returns new Queue and pops entry from available Queues that were requested via info.requestedQueues - Queue createQueue(VkQueueFlags requiredFlags, const std::string& debugName, float priority = 1.0f); + Queue createQueue(VkQueueFlags requiredFlags, const std::string& debugName, float priority = 1.0F); operator VkDevice() const { return m_device; } @@ -424,7 +424,7 @@ class Context uint32_t score = 0; // the lower the score, the more 'specialized' it is uint32_t familyIndex = ~0; uint32_t queueIndex = ~0; - float priority = 1.0f; + float priority = 1.0F; }; using QueueScoreList = std::vector; diff --git a/modules/holoviz/thirdparty/nvpro_core/nvvk/debug_util_vk.hpp b/modules/holoviz/thirdparty/nvpro_core/nvvk/debug_util_vk.hpp index 78ebf2a7..11b16eb8 100644 --- a/modules/holoviz/thirdparty/nvpro_core/nvvk/debug_util_vk.hpp +++ b/modules/holoviz/thirdparty/nvpro_core/nvvk/debug_util_vk.hpp @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. * - * SPDX-FileCopyrightText: Copyright (c) 2019-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2019-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -94,7 +94,7 @@ class DebugUtil { if(s_enabled) { - VkDebugUtilsLabelEXT s{VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, nullptr, label.c_str(), {1.0f, 1.0f, 1.0f, 1.0f}}; + VkDebugUtilsLabelEXT s{VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, nullptr, label.c_str(), {1.0F, 1.0F, 1.0F, 1.0F}}; vkCmdBeginDebugUtilsLabelEXT(cmdBuf, &s); } } @@ -109,7 +109,7 @@ class DebugUtil { if(s_enabled) { - VkDebugUtilsLabelEXT s{VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, nullptr, label.c_str(), {1.0f, 1.0f, 1.0f, 1.0f}}; + VkDebugUtilsLabelEXT s{VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, nullptr, label.c_str(), {1.0F, 1.0F, 1.0F, 1.0F}}; vkCmdInsertDebugUtilsLabelEXT(cmdBuf, &s); } } @@ -123,7 +123,7 @@ class DebugUtil { if(s_enabled) { - VkDebugUtilsLabelEXT s{VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, nullptr, label.c_str(), {1.0f, 1.0f, 1.0f, 1.0f}}; + VkDebugUtilsLabelEXT s{VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, nullptr, label.c_str(), {1.0F, 1.0F, 1.0F, 1.0F}}; vkCmdBeginDebugUtilsLabelEXT(cmdBuf, &s); } } @@ -138,7 +138,7 @@ class DebugUtil { if(s_enabled) { - VkDebugUtilsLabelEXT s{VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, nullptr, label.c_str(), {1.0f, 1.0f, 1.0f, 1.0f}}; + VkDebugUtilsLabelEXT s{VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, nullptr, label.c_str(), {1.0F, 1.0F, 1.0F, 1.0F}}; vkCmdInsertDebugUtilsLabelEXT(m_cmdBuf, &s); } } diff --git a/modules/holoviz/thirdparty/nvpro_core/nvvk/memallocator_vk.cpp b/modules/holoviz/thirdparty/nvpro_core/nvvk/memallocator_vk.cpp index 94935a34..138fe5e8 100644 --- a/modules/holoviz/thirdparty/nvpro_core/nvvk/memallocator_vk.cpp +++ b/modules/holoviz/thirdparty/nvpro_core/nvvk/memallocator_vk.cpp @@ -104,7 +104,7 @@ MemAllocateInfo& MemAllocateInfo::setTilingOptimal(bool isTilingOptimal) { return *this; } -MemAllocateInfo& MemAllocateInfo::setPriority(const float priority /*= 0.5f*/) { +MemAllocateInfo& MemAllocateInfo::setPriority(const float priority /*= 0.5F*/) { m_priority = priority; return *this; } @@ -122,7 +122,7 @@ uint32_t getMemoryType(const VkPhysicalDeviceMemoryProperties& memoryProperties, } } assert(0); - return ~0u; + return ~0U; } bool fillBakedAllocateInfo(const VkPhysicalDeviceMemoryProperties& physMemProps, diff --git a/modules/holoviz/thirdparty/nvpro_core/nvvk/memallocator_vk.hpp b/modules/holoviz/thirdparty/nvpro_core/nvvk/memallocator_vk.hpp index 24708ea5..0f2d645b 100644 --- a/modules/holoviz/thirdparty/nvpro_core/nvvk/memallocator_vk.hpp +++ b/modules/holoviz/thirdparty/nvpro_core/nvvk/memallocator_vk.hpp @@ -90,7 +90,7 @@ class MemAllocateInfo { // Make the allocation exportable MemAllocateInfo& setExportable(bool exportable); // Prioritize the allocation (values 0.0 - 1.0); this may guide eviction strategies - MemAllocateInfo& setPriority(const float priority = 0.5f); + MemAllocateInfo& setPriority(const float priority = 0.5F); VkImage getDedicatedImage() const { return m_dedicatedImage; } VkBuffer getDedicatedBuffer() const { return m_dedicatedBuffer; } @@ -110,7 +110,7 @@ class MemAllocateInfo { uint32_t m_deviceMask{0}; std::vector m_memReqs; VkMemoryPropertyFlags m_memProps{0}; - float m_priority{0.5f}; + float m_priority{0.5F}; std::string m_debugName; diff --git a/modules/holoviz/thirdparty/nvpro_core/nvvk/memorymanagement_vk.cpp b/modules/holoviz/thirdparty/nvpro_core/nvvk/memorymanagement_vk.cpp index c5ab9ae1..ac0c8fa8 100644 --- a/modules/holoviz/thirdparty/nvpro_core/nvvk/memorymanagement_vk.cpp +++ b/modules/holoviz/thirdparty/nvpro_core/nvvk/memorymanagement_vk.cpp @@ -259,7 +259,7 
@@ void DeviceMemoryAllocator::destroyID(AllocationID id) { m_freeAllocationIndex = id.index; } -const float DeviceMemoryAllocator::DEFAULT_PRIORITY = 0.5f; +const float DeviceMemoryAllocator::DEFAULT_PRIORITY = 0.5F; void DeviceMemoryAllocator::init(VkDevice device, VkPhysicalDevice physicalDevice, VkDeviceSize blockSize, VkDeviceSize maxSize) { diff --git a/modules/holoviz/thirdparty/nvpro_core/nvvk/memorymanagement_vk.hpp b/modules/holoviz/thirdparty/nvpro_core/nvvk/memorymanagement_vk.hpp index 07b495ba..6a6847c6 100644 --- a/modules/holoviz/thirdparty/nvpro_core/nvvk/memorymanagement_vk.hpp +++ b/modules/holoviz/thirdparty/nvpro_core/nvvk/memorymanagement_vk.hpp @@ -446,7 +446,7 @@ class DeviceMemoryAllocator : public MemAllocator bool isLinear = false; bool isDedicated = false; bool isFirst = false; // first memory block of a type - float priority = 0.0f; + float priority = 0.0F; VkMemoryAllocateFlags allocateFlags{}; uint32_t allocateDeviceMask = 0; diff --git a/modules/holoviz/thirdparty/nvpro_core/nvvk/pipeline_vk.hpp b/modules/holoviz/thirdparty/nvpro_core/nvvk/pipeline_vk.hpp index 0464202b..647b7d1f 100644 --- a/modules/holoviz/thirdparty/nvpro_core/nvvk/pipeline_vk.hpp +++ b/modules/holoviz/thirdparty/nvpro_core/nvvk/pipeline_vk.hpp @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. * - * SPDX-FileCopyrightText: Copyright (c) 2014-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2014-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ @@ -85,7 +85,7 @@ struct GraphicsPipelineState rasterizationState.depthBiasConstantFactor = {}; rasterizationState.depthBiasClamp = {}; rasterizationState.depthBiasSlopeFactor = {}; - rasterizationState.lineWidth = 1.f; + rasterizationState.lineWidth = 1.F; inputAssemblyState.flags = {}; setValue(inputAssemblyState.topology, VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST); @@ -99,7 +99,7 @@ struct GraphicsPipelineState colorBlendState.pAttachments = {}; for(int i = 0; i < 4; i++) { - colorBlendState.blendConstants[i] = 0.f; + colorBlendState.blendConstants[i] = 0.F; } diff --git a/modules/holoviz/thirdparty/nvpro_core/nvvk/resourceallocator_vk.cpp b/modules/holoviz/thirdparty/nvpro_core/nvvk/resourceallocator_vk.cpp index f05bf7a3..7183a91c 100644 --- a/modules/holoviz/thirdparty/nvpro_core/nvvk/resourceallocator_vk.cpp +++ b/modules/holoviz/thirdparty/nvpro_core/nvvk/resourceallocator_vk.cpp @@ -382,7 +382,7 @@ uint32_t ResourceAllocator::getMemoryType(uint32_t typeBits, } } assert(0); - return ~0u; + return ~0U; } AccelNV ResourceAllocator::createAcceleration(VkAccelerationStructureCreateInfoNV& accel_) { diff --git a/modules/holoviz/thirdparty/nvpro_core/nvvk/samplers_vk.hpp b/modules/holoviz/thirdparty/nvpro_core/nvvk/samplers_vk.hpp index 40b32ccf..29c26034 100644 --- a/modules/holoviz/thirdparty/nvpro_core/nvvk/samplers_vk.hpp +++ b/modules/holoviz/thirdparty/nvpro_core/nvvk/samplers_vk.hpp @@ -137,9 +137,9 @@ VkSamplerCreateInfo makeSamplerCreateInfo(VkFilter magFilter VkBool32 anisotropyEnable = VK_FALSE, float maxAnisotropy = 16, VkSamplerMipmapMode mipmapMode = VK_SAMPLER_MIPMAP_MODE_LINEAR, - float minLod = 0.0f, + float minLod = 0.0F, float maxLod = FLT_MAX, - float mipLodBias = 0.0f, + float mipLodBias = 0.0F, VkBool32 compareEnable = VK_FALSE, VkCompareOp compareOp = VK_COMPARE_OP_ALWAYS, VkBorderColor borderColor = VK_BORDER_COLOR_INT_OPAQUE_BLACK, @@ -154,9 +154,9 @@ inline 
vk::SamplerCreateInfo makeSamplerCreateInfo(vk::Filter magFil vk::Bool32 anisotropyEnable = VK_FALSE, float maxAnisotropy = 16, vk::SamplerMipmapMode mipmapMode = vk::SamplerMipmapMode::eLinear, - float minLod = 0.0f, + float minLod = 0.0F, float maxLod = FLT_MAX, - float mipLodBias = 0.0f, + float mipLodBias = 0.0F, vk::Bool32 compareEnable = VK_FALSE, vk::CompareOp compareOp = vk::CompareOp::eAlways, vk::BorderColor borderColor = vk::BorderColor::eIntOpaqueBlack, diff --git a/patches/README.md b/patches/README.md index 1595eeba..c48d8110 100644 --- a/patches/README.md +++ b/patches/README.md @@ -3,4 +3,10 @@ This folder contains patches that are either applied during the build process of Holoscan SDK or have been used to build artifacts used by the SDK -> NO PATCHES AT THIS TIME +- `libtorch`: + - Inline ([Dockerfile](../Dockerfile)): Remove unused `kineto` references to silence warning + - `libtorch.Caffe2.cmake.patch`: Patches `Caffe2/public/cuda.cmake` to address configuration warning ([GitHub Issue#129777](https://github.com/pytorch/pytorch/issues/129777)): +``` +CMake Warning at /opt/libtorch/2.5.0_24.08/share/cmake/Caffe2/public/cuda.cmake:143 (message): + Failed to compute shorthash for libnvrtc.so +``` diff --git a/patches/libtorch.Caffe2.cmake.patch b/patches/libtorch.Caffe2.cmake.patch new file mode 100644 index 00000000..e59532e0 --- /dev/null +++ b/patches/libtorch.Caffe2.cmake.patch @@ -0,0 +1,15 @@ +diff --git a/share/cmake/Caffe2/public/cuda.cmake b/share/cmake/Caffe2/public/cuda.cmake +index 229e8b7..d01db0f 100644 +--- a/share/cmake/Caffe2/public/cuda.cmake ++++ b/share/cmake/Caffe2/public/cuda.cmake +@@ -134,8 +134,9 @@ endif() + # find lbnvrtc.so + set(CUDA_NVRTC_LIB "${CUDA_nvrtc_LIBRARY}" CACHE FILEPATH "") + if(CUDA_NVRTC_LIB AND NOT CUDA_NVRTC_SHORTHASH) ++ find_package(Python COMPONENTS Interpreter REQUIRED) + execute_process( +- COMMAND Python::Interpreter -c ++ COMMAND ${Python_EXECUTABLE} -c + "import hashlib;hash=hashlib.sha256();hash.update(open('${CUDA_NVRTC_LIB}','rb').read());print(hash.hexdigest()[:8])" + RESULT_VARIABLE _retval + OUTPUT_VARIABLE CUDA_NVRTC_SHORTHASH) diff --git a/pyproject.toml b/pyproject.toml index e5ba532c..d682e36a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ [tool.codespell] skip = "_deps,build*,.cache,html,_build,_static,generated,latex,install*,.git,xml,vale" -ignore-words-list = "nd,bu,dne,unexpect" +ignore-words-list = "bu,dne,nd,thirdparty,unexpect" [tool.ruff] exclude = [ @@ -33,7 +33,7 @@ exclude = [ ] line-length = 100 fix = false # don't automatically apply fixes (can override with ruff --fix) -target-version = "py38" +target-version = "py39" [tool.ruff.lint] diff --git a/python/holoscan/cli/__main__.py b/python/holoscan/cli/__main__.py index 7891c195..7d1feb12 100644 --- a/python/holoscan/cli/__main__.py +++ b/python/holoscan/cli/__main__.py @@ -21,7 +21,7 @@ import logging.config import os from pathlib import Path -from typing import List, Optional, Union +from typing import Optional, Union from .common.enum_types import Platform, PlatformConfiguration @@ -34,7 +34,7 @@ LOG_CONFIG_FILENAME = "logging.json" -def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace: +def parse_args(argv: Optional[list[str]] = None) -> argparse.Namespace: from .packager.package_command import create_package_parser from .runner.run_command import create_run_parser @@ -127,7 +127,7 @@ def set_up_logging(level: Optional[str], config_path: Union[str, Path] = LOG_CON logging.config.dictConfig(config_dict) -def 
main(argv: Optional[List[str]] = None): +def main(argv: Optional[list[str]] = None): args = parse_args(argv) set_up_logging(args.log_level) diff --git a/python/holoscan/cli/common/argparse_types.py b/python/holoscan/cli/common/argparse_types.py index 440532ff..4238520c 100644 --- a/python/holoscan/cli/common/argparse_types.py +++ b/python/holoscan/cli/common/argparse_types.py @@ -18,7 +18,6 @@ import argparse import os from pathlib import Path -from typing import List from .constants import SDK from .enum_types import Platform, PlatformConfiguration, SdkType @@ -92,7 +91,7 @@ def valid_existing_path(path: str) -> Path: raise argparse.ArgumentTypeError(f"No such file/folder: '{file_path}'") -def valid_platforms(platforms_str: str) -> List[Platform]: +def valid_platforms(platforms_str: str) -> list[Platform]: """Helper type checking and type converting method for ArgumentParser.add_argument to convert platform strings to Platform enum if values are valid. diff --git a/python/holoscan/cli/common/artifact_sources.py b/python/holoscan/cli/common/artifact_sources.py index 266b12e8..e0693d57 100644 --- a/python/holoscan/cli/common/artifact_sources.py +++ b/python/holoscan/cli/common/artifact_sources.py @@ -17,7 +17,7 @@ import json import logging -from typing import Any, Dict, List, Optional +from typing import Any, Optional import requests from packaging.version import Version @@ -46,19 +46,20 @@ def __init__(self) -> None: ) ArtifactSources.ManifestFileUrl = f"https://edge.urm.nvidia.com/artifactory/sw-holoscan-cli-generic/{ArtifactSources.HoloscanVersion}/artifacts.json" self._logger = logging.getLogger("common") - self._supported_holoscan_versions = ["2.4.0", "2.5.0"] + self._supported_holoscan_versions = ["2.6.0"] @property - def holoscan_versions(self) -> List[str]: - return self._supported_holoscan_versions + def holoscan_versions(self) -> list[str]: + # logic to dynamically fetch the supported versions + return self._supported_holoscan_versions # for now, return the hardcoded value def base_image(self, version) -> str: return self._data[version][SdkType.Holoscan.value][ArtifactSources.SectionBaseImages] - def build_images(self, version) -> Dict[Any, str]: + def build_images(self, version) -> dict[Any, str]: return self._data[version][SdkType.Holoscan.value][ArtifactSources.SectionBuildImages] - def health_probe(self, version) -> Dict[Any, str]: + def health_probe(self, version) -> dict[Any, str]: return self._data[version][ArtifactSources.SectionHealthProbe] def load(self, uri: str): diff --git a/python/holoscan/cli/common/dockerutils.py b/python/holoscan/cli/common/dockerutils.py index 987aa030..dff8461f 100644 --- a/python/holoscan/cli/common/dockerutils.py +++ b/python/holoscan/cli/common/dockerutils.py @@ -22,7 +22,7 @@ import re import subprocess from pathlib import Path -from typing import List, Optional, Tuple +from typing import Optional from python_on_whales import docker @@ -35,7 +35,7 @@ logger = logging.getLogger("common") -def parse_docker_image_name_and_tag(image_name: str) -> Tuple[Optional[str], Optional[str]]: +def parse_docker_image_name_and_tag(image_name: str) -> tuple[Optional[str], Optional[str]]: """Parse a given Docker image name and tag. 
Args: @@ -157,7 +157,7 @@ def docker_run( app_info: dict, pkg_info: dict, quiet: bool, - commands: List[str], + commands: list[str], network: str, network_interface: Optional[str], use_all_nics: bool, @@ -166,7 +166,7 @@ def docker_run( render: bool, user: str, terminal: bool, - devices: List[str], + devices: list[str], platform_config: str, shared_memory_size: str = "1GB", is_root: bool = False, diff --git a/python/holoscan/cli/common/exceptions.py b/python/holoscan/cli/common/exceptions.py index 52bf8ff9..9ef94677 100644 --- a/python/holoscan/cli/common/exceptions.py +++ b/python/holoscan/cli/common/exceptions.py @@ -15,8 +15,6 @@ limitations under the License. """ # noqa: E501 -from typing import List - class HoloscanSdkError(Exception): """Base class for exceptions in this module.""" @@ -119,7 +117,7 @@ class UnmatchedDeviceError(HoloscanSdkError): """ Raise when the shared memory value is invalid.""" - def __init__(self, unmatched_devices: List[str], *args: object) -> None: + def __init__(self, unmatched_devices: list[str], *args: object) -> None: super().__init__( f"The following devices cannot be found in /dev/: {str.join(',', unmatched_devices)}" ) diff --git a/python/holoscan/cli/common/sdk_utils.py b/python/holoscan/cli/common/sdk_utils.py index 25de9066..a0fec094 100644 --- a/python/holoscan/cli/common/sdk_utils.py +++ b/python/holoscan/cli/common/sdk_utils.py @@ -19,7 +19,7 @@ import logging import sys from pathlib import Path -from typing import Optional, Tuple +from typing import Optional from packaging.version import Version @@ -59,7 +59,7 @@ def detect_sdk(sdk: Optional[SdkType] = None) -> SdkType: def detect_sdk_version( sdk: SdkType, artifact_sources: ArtifactSources, sdk_version: Optional[Version] = None -) -> Tuple[str, Optional[str]]: +) -> tuple[str, Optional[str]]: """ Detects SDK version to use based on installed PyPI package or user input. For Holoscan SDK(Type), detect only the Holoscan version with optional user-provided version. diff --git a/python/holoscan/cli/common/utils.py b/python/holoscan/cli/common/utils.py index e579d020..b14cfe5c 100644 --- a/python/holoscan/cli/common/utils.py +++ b/python/holoscan/cli/common/utils.py @@ -19,7 +19,6 @@ import logging import socket import subprocess -from typing import List, Tuple import psutil from packaging import version @@ -108,7 +107,7 @@ def compare_versions(version1, version2): return 0 -def get_host_ip_addresses() -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]: +def get_host_ip_addresses() -> tuple[list[tuple[str, str]], list[tuple[str, str]]]: """ Returns a tuple containing interface name and its IPv4 address as the first item and another item with interface name and its IPv6 address. 
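(Editorial aside on the CLI hunks above.) The changes in `__main__.py`, `argparse_types.py`, `dockerutils.py`, `exceptions.py`, `sdk_utils.py`, and `utils.py` all follow one pattern: drop `typing.List`/`Dict`/`Tuple` in favor of the built-in generics from PEP 585, which is exactly what the `target-version = "py39"` bump in `pyproject.toml` permits. A minimal sketch of the pattern, assuming nothing beyond the signatures visible in the hunks (the function bodies below are toy stand-ins, not SDK logic):

```python
from typing import Optional  # Optional/Union still come from typing on Python 3.9

# Python 3.8 spelling (needs `from typing import List, Tuple`):
#   def get_host_ip_addresses() -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]: ...
# Python 3.9+ spelling used throughout this change set (PEP 585 built-in generics):
def get_host_ip_addresses() -> tuple[list[tuple[str, str]], list[tuple[str, str]]]:
    """Toy stand-in that demonstrates the annotation style only."""
    ipv4: list[tuple[str, str]] = [("lo", "127.0.0.1")]
    ipv6: list[tuple[str, str]] = [("lo", "::1")]
    return ipv4, ipv6


def parse_image_tag(image_name: str) -> tuple[Optional[str], Optional[str]]:
    """Hypothetical helper showing that Optional[...] still requires the typing import."""
    name, _, tag = image_name.partition(":")
    return name or None, tag or None
```

Only the container types become subscriptable built-ins; wrappers such as `Optional` and `Union` keep their `typing` imports, which is why those imports survive in the hunks above.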
diff --git a/python/holoscan/cli/package-source.json b/python/holoscan/cli/package-source.json index 503c885e..5364825c 100644 --- a/python/holoscan/cli/package-source.json +++ b/python/holoscan/cli/package-source.json @@ -1,11 +1,11 @@ { - "2.4.0": { + "2.6.0": { "holoscan": { - "debian-version": "2.3.0.1-1", - "wheel-version": "2.3.0", + "debian-version": "2.6.0.1-1", + "wheel-version": "2.6.0", "base-images": { - "dgpu": "nvcr.io/nvidia/cuda:12.2.2-runtime-ubuntu22.04", - "igpu": "nvcr.io/nvidia/tensorrt:23.12-py3-igpu" + "dgpu": "nvcr.io/nvidia/cuda:12.6.0-runtime-ubuntu22.04", + "igpu": "nvcr.io/nvidia/tensorrt:24.08-py3-igpu" }, "build-images": { "igpu": { diff --git a/python/holoscan/cli/packager/arguments.py b/python/holoscan/cli/packager/arguments.py index c244adcf..63c9a2e0 100644 --- a/python/holoscan/cli/packager/arguments.py +++ b/python/holoscan/cli/packager/arguments.py @@ -18,7 +18,6 @@ import logging from argparse import Namespace from pathlib import Path -from typing import List from ..common.artifact_sources import ArtifactSources from ..common.constants import DefaultValues @@ -34,7 +33,7 @@ class PackagingArguments: """Processes input arguments for packager""" @property - def platforms(self) -> List[PlatformParameters]: + def platforms(self) -> list[PlatformParameters]: return self._platforms @property @@ -56,7 +55,7 @@ def __init__(self, args: Namespace, temp_dir: str) -> None: """ self._logger = logging.getLogger("packager") - self._platforms: List[PlatformParameters] + self._platforms: list[PlatformParameters] self._build_parameters = PackageBuildParameters() self._artifact_sources = ArtifactSources() diff --git a/python/holoscan/cli/packager/manifest_files.py b/python/holoscan/cli/packager/manifest_files.py index cca64ed0..eb3f24f3 100644 --- a/python/holoscan/cli/packager/manifest_files.py +++ b/python/holoscan/cli/packager/manifest_files.py @@ -16,7 +16,7 @@ """ # noqa: E501 from pathlib import Path -from typing import Any, Dict +from typing import Any class ApplicationManifest: @@ -52,19 +52,19 @@ def command(self, value: str): self._data["command"] = value @property - def environment(self) -> Dict[str, str]: + def environment(self) -> dict[str, str]: return self._data["environment"] @environment.setter - def environment(self, value: Dict[str, str]): + def environment(self, value: dict[str, str]): self._data["environment"] = value @property - def input(self) -> Dict[str, str]: # noqa: A003 + def input(self) -> dict[str, str]: # noqa: A003 return self._data["input"] @input.setter - def input(self, value: Dict[str, str]): # noqa: A003 + def input(self, value: dict[str, str]): # noqa: A003 self._data["input"] = value @property @@ -76,11 +76,11 @@ def liveness(self, value: Any): self._data["liveness"] = value @property - def output(self) -> Dict[str, str]: + def output(self) -> dict[str, str]: return self._data["output"] @output.setter - def output(self, value: Dict[str, str]): + def output(self, value: dict[str, str]): self._data["output"] = value @property @@ -135,7 +135,7 @@ def working_directory(self, value: str): self._data["workingDirectory"] = value @property - def data(self) -> Dict[str, Any]: + def data(self) -> dict[str, Any]: """Returns all values for serializing to JSON""" return self._data @@ -190,11 +190,11 @@ def model_root(self, value: str): self._data["modelRoot"] = value @property - def models(self) -> Dict[str, str]: + def models(self) -> dict[str, str]: return self._data["models"] @models.setter - def models(self, value: Dict[str, str]): + def 
models(self, value: dict[str, str]): self._data["models"] = value @property @@ -215,6 +215,6 @@ def version(self, value: str): self._data["version"] = value @property - def data(self) -> Dict[str, Any]: + def data(self) -> dict[str, Any]: """Returns all values for serializing to JSON""" return self._data diff --git a/python/holoscan/cli/packager/models.py b/python/holoscan/cli/packager/models.py index 7a4cde8e..f88f5db2 100644 --- a/python/holoscan/cli/packager/models.py +++ b/python/holoscan/cli/packager/models.py @@ -18,7 +18,7 @@ import logging import os from pathlib import Path -from typing import Dict, Optional +from typing import Optional logger = logging.getLogger("packager") @@ -30,7 +30,7 @@ class Models: subdirectory. """ - def build(self, models_path: Path) -> Optional[Dict[str, Path]]: + def build(self, models_path: Path) -> Optional[dict[str, Path]]: """Checks if the given path is a file or a directory. Args: @@ -42,7 +42,7 @@ def build(self, models_path: Path) -> Optional[Dict[str, Path]]: """ if models_path is not None: logger.info(f"Scanning for models in {models_path}...") - models: Dict[str, Path] = {} + models: dict[str, Path] = {} if models_path.is_file(): self._configure_model_file(models_path, models) elif models_path.is_dir(): @@ -52,7 +52,7 @@ def build(self, models_path: Path) -> Optional[Dict[str, Path]]: else: return None - def _configure_model_dir(self, models_path: Path, models: Dict[str, Path]): + def _configure_model_dir(self, models_path: Path, models: dict[str, Path]): """ Iterate through the given directory to scan for models. If files are found within the directory, we simply assume that all files within the given @@ -79,7 +79,7 @@ def _configure_model_dir(self, models_path: Path, models: Dict[str, Path]): models[model_name] = model_path logger.debug(f"Model {model_name}={model_path} added.") - def _configure_model_file(self, models_path: Path, models: Dict[str, Path]): + def _configure_model_file(self, models_path: Path, models: dict[str, Path]): """ Adds a new model to 'models' object where the model name is the name of the given file, and the value is the path to the given model file. 
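(Editorial aside on the `models.py` hunk above.) `Models.build()` now advertises a `dict[str, Path]` result mapping model names to their paths. A small hypothetical usage sketch follows; the module path and `build()` signature come from the diff, while the sample path is invented and `Models` is assumed to be default-constructible, which the hunk does not show:

```python
from pathlib import Path

# Module path follows the file location shown in the diff.
from holoscan.cli.packager.models import Models

models_path = Path("/workspace/my_app/models")  # hypothetical model location

models = Models().build(models_path)  # -> Optional[dict[str, Path]]
if models:
    for name, path in models.items():
        print(f"model '{name}' -> {path}")
else:
    print("no models found under", models_path)
```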
diff --git a/python/holoscan/cli/packager/package_command.py b/python/holoscan/cli/packager/package_command.py index 28db356f..67b06225 100644 --- a/python/holoscan/cli/packager/package_command.py +++ b/python/holoscan/cli/packager/package_command.py @@ -18,7 +18,6 @@ import argparse import logging from argparse import ArgumentParser, _SubParsersAction -from typing import List from packaging.version import Version @@ -36,7 +35,7 @@ def create_package_parser( - subparser: _SubParsersAction, command: str, parents: List[ArgumentParser] + subparser: _SubParsersAction, command: str, parents: list[ArgumentParser] ) -> ArgumentParser: parser: ArgumentParser = subparser.add_parser( command, formatter_class=argparse.HelpFormatter, parents=parents, add_help=False diff --git a/python/holoscan/cli/packager/packager.py b/python/holoscan/cli/packager/packager.py index 7b9f393b..2b57c8ff 100644 --- a/python/holoscan/cli/packager/packager.py +++ b/python/holoscan/cli/packager/packager.py @@ -20,7 +20,6 @@ import os import tempfile from argparse import Namespace -from typing import List from ..common.enum_types import ApplicationType from ..common.utils import print_manifest_json @@ -32,7 +31,7 @@ logger = logging.getLogger("packager") -def _build_image(args: PackagingArguments, temp_dir: str) -> List[PlatformBuildResults]: +def _build_image(args: PackagingArguments, temp_dir: str) -> list[PlatformBuildResults]: """Creates dockerfile and builds HAP/MONAI Application Package (HAP/MAP) image Args: args (dict): Input arguments for Packager diff --git a/python/holoscan/cli/packager/parameters.py b/python/holoscan/cli/packager/parameters.py index c08c873a..c5e49b97 100644 --- a/python/holoscan/cli/packager/parameters.py +++ b/python/holoscan/cli/packager/parameters.py @@ -19,7 +19,7 @@ import os import platform from pathlib import Path -from typing import Any, Dict, Optional +from typing import Any, Optional from ..common.constants import SDK, Constants, DefaultValues from ..common.dockerutils import parse_docker_image_name_and_tag @@ -46,7 +46,7 @@ def __init__( if self._version is None: self._version = version - self._data: Dict[str, Any] = {} + self._data: dict[str, Any] = {} self._data["tag"] = tag self._data["base_image"] = None self._data["build_image"] = None @@ -60,6 +60,7 @@ def __init__( self._data["target_arch"] = "aarch64" if self._arch == Arch.arm64 else "x86_64" self._data["cuda_deb_arch"] = "sbsa" if self._arch == Arch.arm64 else "x86_64" self._data["holoscan_deb_arch"] = "arm64" if self._arch == Arch.arm64 else "amd64" + self._data["gpu_type"] = self.platform_config.value @property def tag(self) -> str: @@ -166,7 +167,7 @@ def platform_config(self) -> PlatformConfiguration: return self._platform_config @property - def to_jinja(self) -> Dict[str, Any]: + def to_jinja(self) -> dict[str, Any]: return self._data @property @@ -365,11 +366,11 @@ def docs(self, value: Path): self._data["docs"] = value @property - def models(self) -> Dict[str, Path]: + def models(self) -> dict[str, Path]: return self._data.get("models", None) @models.setter - def models(self, value: Dict[str, Path]): + def models(self, value: dict[str, Path]): if value is not None: self._data["models"] = value @@ -519,7 +520,7 @@ def includes(self, value: str): self._data["includes"] = value @property - def to_jinja(self) -> Dict[str, Any]: + def to_jinja(self) -> dict[str, Any]: return self._data def _detect_application_type(self) -> ApplicationType: diff --git a/python/holoscan/cli/packager/platforms.py 
b/python/holoscan/cli/packager/platforms.py index c43c10fc..22170b49 100644 --- a/python/holoscan/cli/packager/platforms.py +++ b/python/holoscan/cli/packager/platforms.py @@ -18,7 +18,7 @@ import logging from argparse import Namespace from pathlib import Path -from typing import List, Optional, Tuple, Union +from typing import Optional, Union from ..common.artifact_sources import ArtifactSources from ..common.constants import Constants @@ -44,7 +44,7 @@ def configure_platforms( temp_dir: str, version: str, application_type: ApplicationType, - ) -> Tuple[SdkType, str, str, List[PlatformParameters]]: + ) -> tuple[SdkType, str, str, list[PlatformParameters]]: """Configures a list of platforms that need to be built. 1. Detect the SDK to use 2. Detect the version of the SDK to use @@ -146,7 +146,7 @@ def _find_base_image( platform_parameters: PlatformParameters, sdk_version: str, base_image: Optional[str] = None, - ) -> Tuple[bool, str]: + ) -> tuple[bool, str]: """ Ensure user provided base image exists in Docker or locate the base image to use based on request platform. @@ -233,9 +233,9 @@ def _select_sdk_file( application_type: ApplicationType, holoscan_sdk_file: Optional[Path] = None, monai_deploy_sdk_file: Optional[Path] = None, - ) -> Tuple[ - Tuple[bool, Union[Path, str]], - Tuple[Union[Optional[Path], Optional[str]], Union[Optional[Path], Optional[str]]], + ) -> tuple[ + tuple[bool, Union[Path, str]], + tuple[Union[Optional[Path], Optional[str]], Union[Optional[Path], Optional[str]]], ]: """ Detects the SDK distributable to use based on internal mapping or user input. @@ -297,7 +297,7 @@ def _get_holoscan_sdk( sdk_version: str, application_type: ApplicationType, sdk_file: Optional[Path] = None, - ) -> Tuple[bool, Union[Path, str]]: + ) -> tuple[bool, Union[Path, str]]: """ Validates Holoscan SDK redistributable file if specified. Otherwise, attempt to download the SDK file from internet. @@ -376,7 +376,7 @@ def _get_holoscan_sdk( def _get_monai_deploy_sdk( self, monai_deploy_app_sdk_version: Optional[str], sdk_file: Optional[Path] = None - ) -> Tuple[bool, Union[Optional[Path], Optional[str]]]: + ) -> tuple[bool, Union[Optional[Path], Optional[str]]]: """ Validates MONAI Deploy SDK redistributable file if specified. Otherwise, Docker build stage will install the SDK from PyPI. diff --git a/python/holoscan/cli/packager/templates/Dockerfile.jinja2 b/python/holoscan/cli/packager/templates/Dockerfile.jinja2 index 39c6321b..9a7fcbdc 100644 --- a/python/holoscan/cli/packager/templates/Dockerfile.jinja2 +++ b/python/holoscan/cli/packager/templates/Dockerfile.jinja2 @@ -56,8 +56,8 @@ RUN apt-get update \ FROM base AS torch-dependencies ARG GPU_TYPE -ARG TORCHVISION_VERSION=0.16.0_23.08 -ARG LIBTORCH_VERSION=2.1.0_23.08 +ARG TORCHVISION_VERSION=0.20.0_24.08 +ARG LIBTORCH_VERSION=2.5.0_24.08 # Install openmpi RUN apt update && \ @@ -92,6 +92,7 @@ RUN curl -S -# -o hpcx.tbz -L \ tar -xvjf hpcx.tbz hpcx-v2.15-gcc-inbox-ubuntu22.04-cuda12-gdrcopy2-nccl2.17-{{target_arch}}/ucc/lib/libucc.so.1.0.0 && \ rm -f hpcx.tbz && \ find . 
-name libucc.so.1.0.0 -exec mv -f {} /opt/hpcx/libucc.so.1 \; + # End collect torch dependencies {% endif %} @@ -100,16 +101,16 @@ RUN curl -S -# -o hpcx.tbz -L \ # Collect onnx dependencies FROM base AS onnx-dependencies ARG GPU_TYPE -ARG ONNX_RUNTIME_VERSION=1.15.1_23.08 +ARG ONNX_RUNTIME_VERSION=1.18.1_38712740_24.08-cuda-12.6 WORKDIR /opt/onnxruntime # Download onnx binaries -RUN curl -S -L -# -o ort.tgz \ - https://edge.urm.nvidia.com/artifactory/sw-holoscan-thirdparty-generic-local/onnxruntime/onnxruntime-${ONNX_RUNTIME_VERSION}-cuda-12.2-$(uname -m).tar.gz +RUN curl -S -L -# -o ort.tar.gz \ + https://edge.urm.nvidia.com/artifactory/sw-holoscan-thirdparty-generic-local/onnxruntime/onnxruntime-${ONNX_RUNTIME_VERSION}-$(uname -m).tar.gz RUN mkdir -p ${ONNX_RUNTIME_VERSION} -RUN tar -xf ort.tgz -C ${ONNX_RUNTIME_VERSION} --strip-components 2 && \ - rm -f ort.tgz5 +RUN ls -l && tar -xvzf ort.tar.gz -C ${ONNX_RUNTIME_VERSION} --strip-components 2 && \ + rm -f ort.tar.gz WORKDIR / # End collect onnx dependencies {% endif %} @@ -229,9 +230,23 @@ RUN apt update \ libopenblas0="0.3.20+ds-*" \ libevent-core-2.1-7 \ libevent-pthreads-2.1-7 \ - cuda-cupti-12-2 \ + cuda-cupti-12-6 \ + libcudnn9-cuda-12 \ && rm -rf /var/lib/apt/lists/* +# Install NVIDIA Performance Libraries on arm64 dGPU platform +# as a runtime requirement for the Holoinfer `libtorch` backend (2.5.0). +{% if target_arch == "aarch64" and gpu_type == "dgpu" %} +RUN curl -L https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/sbsa/cuda-keyring_1.1-1_all.deb -O \ + && dpkg -i cuda-keyring_1.1-1_all.deb \ + && apt-get update \ + && apt-get install --no-install-recommends -y \ + nvpl-blas=0.2.0.1-* \ + nvpl-lapack=0.2.2.1-* \ + && rm -rf /var/lib/apt/lists/* +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/lib/sbsa-linux-gnu/ +{% endif %} + # mkl - dependency for libtorch plugin on x86_64 (match pytorch container version) RUN if [ "{{ cuda_deb_arch }}" = "x86_64" ]; then \ python3 -m pip install --no-cache-dir \ @@ -244,12 +259,12 @@ RUN if [ "{{ cuda_deb_arch }}" = "x86_64" ]; then \ fi # Copy Libtorch -ARG LIBTORCH_VERSION=2.1.0_23.08 +ARG LIBTORCH_VERSION=2.5.0_24.08 ENV LIBTORCH=/opt/libtorch/${LIBTORCH_VERSION}/lib COPY --from=torch-dependencies ${LIBTORCH} ${LIBTORCH} # Copy TorchVision -ARG TORCHVISION_VERSION=0.16.0_23.08 +ARG TORCHVISION_VERSION=0.20.0_24.08 ENV TORCHVISION=/opt/torchvision/${TORCHVISION_VERSION}/lib COPY --from=torch-dependencies ${TORCHVISION} ${TORCHVISION} @@ -267,20 +282,24 @@ WORKDIR / {% if 'onnx' in includes %} # Install onnx dependencies -ARG ONNX_RUNTIME_VERSION=1.15.1_23.08 +ARG ONNX_RUNTIME_VERSION=1.18.1_38712740_24.08-cuda-12.6 ENV ONNX_RUNTIME=/opt/onnxruntime/${ONNX_RUNTIME_VERSION}/lib ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${ONNX_RUNTIME} # Copy ONNX Runtime COPY --from=onnx-dependencies ${ONNX_RUNTIME} ${ONNX_RUNTIME} -RUN if [ "${GPU_TYPE}" = "dgpu" ]; then apt-get update \ +{% if gpu_type == "dgpu" %} +RUN apt-get update \ && apt-get install --no-install-recommends --no-install-suggests --allow-downgrades -y \ - libnvinfer-plugin8="8.6.*+cuda12.0" \ - libnvonnxparsers8="8.6.*+cuda12.0" \ - ; fi \ + libnvinfer10="10.3.*+cuda12.5" \ + libnvinfer-plugin10="10.3.*+cuda12.5" \ + libnvonnxparsers10="10.3.*+cuda12.5" \ + libcusparselt0="0.6.2.3-*" \ + libcudnn9-cuda-12 \ && rm -rf /var/lib/apt/lists/* \ && rm -f /usr/lib/*/libcudnn*train.so* +{% endif %} ### End install onnx dependencies {% endif %} @@ -357,6 +376,7 @@ RUN apt-get install -y --no-install-recommends --no-install-suggests \ 
RUN apt-get install -y --no-install-recommends --no-install-suggests \ holoscan={{ holoscan_sdk_filename }} \ # && apt-get remove -y g++ g++-11 gcc gcc-11 gcc-11-base build-essential \ + && apt-get purge -y cuda-keyring \ && rm -rf /var/lib/apt/lists/* {% endif %} diff --git a/python/holoscan/cli/runner/resources.py b/python/holoscan/cli/runner/resources.py index 20412335..e6dd0c56 100644 --- a/python/holoscan/cli/runner/resources.py +++ b/python/holoscan/cli/runner/resources.py @@ -17,7 +17,7 @@ import logging import re -from typing import Dict, Optional, Union +from typing import Optional, Union from ..common.constants import Constants, DefaultValues from ..common.exceptions import InvalidSharedMemoryValueError @@ -91,7 +91,7 @@ def _read_shm_size_from_config(pkg_info: dict, worker: bool, driver: bool, fragm def _find_maximum_shared_memory_value_from_matching_fragments( - resources_fragments: Dict, fragments: str + resources_fragments: dict, fragments: str ) -> Optional[float]: """Scan matching fragments for the maximum shared memory value. @@ -116,7 +116,7 @@ def _find_maximum_shared_memory_value_from_matching_fragments( def _find_maximum_shared_memory_value_from_all_fragments( - resources_fragments: Dict, + resources_fragments: dict, ) -> Optional[float]: """Scan all fragments for the maximum shared memory value. diff --git a/python/holoscan/cli/runner/run_command.py b/python/holoscan/cli/runner/run_command.py index c62e1ec0..e7c2ddad 100644 --- a/python/holoscan/cli/runner/run_command.py +++ b/python/holoscan/cli/runner/run_command.py @@ -18,7 +18,6 @@ import logging import os from argparse import ArgumentParser, HelpFormatter, _SubParsersAction -from typing import List from ..common import argparse_types from ..common.argparse_types import valid_existing_path @@ -27,7 +26,7 @@ def create_run_parser( - subparser: _SubParsersAction, command: str, parents: List[ArgumentParser] + subparser: _SubParsersAction, command: str, parents: list[ArgumentParser] ) -> ArgumentParser: parser: ArgumentParser = subparser.add_parser( command, formatter_class=HelpFormatter, parents=parents, add_help=False diff --git a/python/holoscan/cli/runner/runner.py b/python/holoscan/cli/runner/runner.py index 2ceb1a0b..3345bbe5 100644 --- a/python/holoscan/cli/runner/runner.py +++ b/python/holoscan/cli/runner/runner.py @@ -25,7 +25,7 @@ from argparse import Namespace from glob import glob from pathlib import Path -from typing import List, Optional, Tuple +from typing import Optional from ..common.dockerutils import create_or_use_network, docker_run, image_exists from ..common.exceptions import ManifestReadError, UnmatchedDeviceError @@ -41,7 +41,7 @@ logger = logging.getLogger("runner") -def _fetch_map_manifest(map_name: str) -> Tuple[dict, dict]: +def _fetch_map_manifest(map_name: str) -> tuple[dict, dict]: """ Execute HAP/MAP and fetch the manifest files. @@ -164,7 +164,7 @@ def _run_app(args: Namespace, app_info: dict, pkg_info: dict): ) -def _lookup_devices(devices: List[str]) -> List[str]: +def _lookup_devices(devices: list[str]) -> list[str]: """ Looks up matching devices in /dev and returns a list of fully qualified device paths. 
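(Editorial aside on the `runner.py` hunk above.) `_lookup_devices` is only described by its docstring here ("Looks up matching devices in /dev and returns a list of fully qualified device paths"), and unmatched entries feed `UnmatchedDeviceError` from `exceptions.py`. The sketch below is written against that description only, not copied from the SDK implementation, and the matching rules are an assumption:

```python
import os
from glob import glob


def lookup_devices(devices: list[str]) -> list[str]:
    """Resolve user-supplied device names or globs to fully qualified /dev paths.

    Sketch only: the real _lookup_devices in runner.py may match differently.
    """
    matched: list[str] = []
    unmatched: list[str] = []
    for dev in devices:
        # Accept either a bare name ("video0", "ajantv2*") or a full /dev path.
        pattern = dev if dev.startswith("/dev/") else os.path.join("/dev", dev)
        hits = glob(pattern)
        if hits:
            matched.extend(sorted(hits))
        else:
            unmatched.append(dev)
    if unmatched:
        # The SDK raises UnmatchedDeviceError(unmatched) here (see exceptions.py earlier in this diff).
        raise RuntimeError(
            f"The following devices cannot be found in /dev/: {','.join(unmatched)}"
        )
    return matched
```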
diff --git a/python/holoscan/conditions/CMakeLists.txt b/python/holoscan/conditions/CMakeLists.txt index b799cffe..973bbd83 100644 --- a/python/holoscan/conditions/CMakeLists.txt +++ b/python/holoscan/conditions/CMakeLists.txt @@ -18,8 +18,11 @@ holoscan_pybind11_module(conditions boolean.cpp conditions.cpp count.cpp + cuda_buffer_available.cpp + cuda_event.cpp + cuda_stream.cpp downstream_message_affordable.cpp - message_available.cpp expiring_message.cpp + message_available.cpp periodic.cpp ) diff --git a/python/holoscan/conditions/__init__.py b/python/holoscan/conditions/__init__.py index 69f6f4d3..f4c70ebf 100644 --- a/python/holoscan/conditions/__init__.py +++ b/python/holoscan/conditions/__init__.py @@ -19,6 +19,9 @@ holoscan.conditions.AsynchronousCondition holoscan.conditions.BooleanCondition holoscan.conditions.CountCondition + holoscan.conditions.CudaBufferAvailableCondition + holoscan.conditions.CudaEventCondition + holoscan.conditions.CudaStreamCondition holoscan.conditions.DownstreamMessageAffordableCondition holoscan.conditions.ExpiringMessageAvailableCondition holoscan.conditions.MessageAvailableCondition @@ -30,6 +33,9 @@ AsynchronousEventState, BooleanCondition, CountCondition, + CudaBufferAvailableCondition, + CudaEventCondition, + CudaStreamCondition, DownstreamMessageAffordableCondition, ExpiringMessageAvailableCondition, MessageAvailableCondition, @@ -41,6 +47,9 @@ "AsynchronousEventState", "BooleanCondition", "CountCondition", + "CudaBufferAvailableCondition", + "CudaEventCondition", + "CudaStreamCondition", "DownstreamMessageAffordableCondition", "ExpiringMessageAvailableCondition", "MessageAvailableCondition", diff --git a/python/holoscan/conditions/asynchronous.cpp b/python/holoscan/conditions/asynchronous.cpp index 9f8e8292..37b61044 100644 --- a/python/holoscan/conditions/asynchronous.cpp +++ b/python/holoscan/conditions/asynchronous.cpp @@ -27,11 +27,8 @@ #include "holoscan/core/fragment.hpp" #include "holoscan/core/gxf/gxf_resource.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; - -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -54,12 +51,11 @@ class PyAsynchronousCondition : public AsynchronousCondition { // Define a constructor that fully initializes the object. explicit PyAsynchronousCondition(Fragment* fragment, - const std::string& name = "noname_async_condition") - : AsynchronousCondition() { + const std::string& name = "noname_async_condition") { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; diff --git a/python/holoscan/conditions/boolean.cpp b/python/holoscan/conditions/boolean.cpp index 8afb56c7..5f369c81 100644 --- a/python/holoscan/conditions/boolean.cpp +++ b/python/holoscan/conditions/boolean.cpp @@ -26,8 +26,8 @@ #include "holoscan/core/fragment.hpp" #include "holoscan/core/gxf/gxf_resource.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -49,13 +49,13 @@ class PyBooleanCondition : public BooleanCondition { using BooleanCondition::BooleanCondition; // Define a constructor that fully initializes the object. 
- PyBooleanCondition(Fragment* fragment, bool enable_tick = true, - const std::string& name = "noname_boolean_condition") + explicit PyBooleanCondition(Fragment* fragment, bool enable_tick = true, + const std::string& name = "noname_boolean_condition") : BooleanCondition(Arg{"enable_tick", enable_tick}) { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; diff --git a/python/holoscan/conditions/conditions.cpp b/python/holoscan/conditions/conditions.cpp index 3b620c08..5ff5a5ae 100644 --- a/python/holoscan/conditions/conditions.cpp +++ b/python/holoscan/conditions/conditions.cpp @@ -17,9 +17,6 @@ #include -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) - namespace py = pybind11; namespace holoscan { @@ -27,6 +24,9 @@ namespace holoscan { void init_asynchronous(py::module_&); void init_boolean(py::module_&); void init_count(py::module_&); +void init_cuda_buffer_available(py::module_&); +void init_cuda_event(py::module_&); +void init_cuda_stream(py::module_&); void init_periodic(py::module_&); void init_downstream_message_affordable(py::module_&); void init_message_available(py::module_&); @@ -42,6 +42,9 @@ PYBIND11_MODULE(_conditions, m) { init_asynchronous(m); init_boolean(m); init_count(m); + init_cuda_buffer_available(m); + init_cuda_event(m); + init_cuda_stream(m); init_periodic(m); init_downstream_message_affordable(m); init_message_available(m); diff --git a/python/holoscan/conditions/count.cpp b/python/holoscan/conditions/count.cpp index 2ff0be5f..bba528b2 100644 --- a/python/holoscan/conditions/count.cpp +++ b/python/holoscan/conditions/count.cpp @@ -27,11 +27,8 @@ #include "holoscan/core/fragment.hpp" #include "holoscan/core/gxf/gxf_resource.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; - -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -53,13 +50,13 @@ class PyCountCondition : public CountCondition { using CountCondition::CountCondition; // Define a constructor that fully initializes the object. - PyCountCondition(Fragment* fragment, int64_t count = 1L, - const std::string& name = "noname_count_condition") + explicit PyCountCondition(Fragment* fragment, int64_t count = 1L, + const std::string& name = "noname_count_condition") : CountCondition(Arg{"count", count}) { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; diff --git a/python/holoscan/conditions/cuda_buffer_available.cpp b/python/holoscan/conditions/cuda_buffer_available.cpp new file mode 100644 index 00000000..03662fa2 --- /dev/null +++ b/python/holoscan/conditions/cuda_buffer_available.cpp @@ -0,0 +1,80 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include "./cuda_buffer_available_pydoc.hpp" +#include "holoscan/core/component_spec.hpp" +#include "holoscan/core/conditions/gxf/cuda_buffer_available.hpp" +#include "holoscan/core/fragment.hpp" +#include "holoscan/core/gxf/gxf_resource.hpp" + +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) + +namespace py = pybind11; + +namespace holoscan { + +/* Trampoline classes for handling Python kwargs + * + * These add a constructor that takes a Fragment for which to initialize the condition. + * The explicit parameter list and default arguments take care of providing a Pythonic + * kwarg-based interface with appropriate default values matching the condition's + * default parameters in the C++ API `setup` method. + * + * The sequence of events in this constructor is based on Fragment::make_condition + */ + +class PyCudaBufferAvailableCondition : public CudaBufferAvailableCondition { + public: + /* Inherit the constructors */ + using CudaBufferAvailableCondition::CudaBufferAvailableCondition; + + // Define a constructor that fully initializes the object. + explicit PyCudaBufferAvailableCondition( + Fragment* fragment, const std::string& name = "noname_cuda_buffer_available_condition") { + name_ = name; + fragment_ = fragment; + spec_ = std::make_shared(fragment); + setup(*spec_); + } +}; + +void init_cuda_buffer_available(py::module_& m) { + py::class_>( + m, + "CudaBufferAvailableCondition", + doc::CudaBufferAvailableCondition::doc_CudaBufferAvailableCondition) + .def(py::init(), + "fragment"_a, + "name"_a = "noname_cuda_buffer_available_condition"s, + doc::CudaBufferAvailableCondition::doc_CudaBufferAvailableCondition) + .def_property("receiver", + py::overload_cast<>(&CudaBufferAvailableCondition::receiver), + py::overload_cast>( + &CudaBufferAvailableCondition::receiver), + doc::CudaBufferAvailableCondition::doc_receiver); +} +} // namespace holoscan diff --git a/python/holoscan/conditions/cuda_buffer_available_pydoc.hpp b/python/holoscan/conditions/cuda_buffer_available_pydoc.hpp new file mode 100644 index 00000000..393f455b --- /dev/null +++ b/python/holoscan/conditions/cuda_buffer_available_pydoc.hpp @@ -0,0 +1,51 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PYHOLOSCAN_CONDITIONS_CUDA_BUFFER_AVAILABLE_PYDOC_HPP +#define PYHOLOSCAN_CONDITIONS_CUDA_BUFFER_AVAILABLE_PYDOC_HPP + +#include + +#include "../macros.hpp" + +namespace holoscan::doc { + +namespace CudaBufferAvailableCondition { + +PYDOC(CudaBufferAvailableCondition, R"doc( +Condition based on data availability in a cuda buffer. 
+ +A component which specifies the availability of data at the receiver based on the cuda buffers +present in incoming messages. + +Parameters +---------- +fragment : holoscan.core.Fragment + The fragment the condition will be associated with. +name : str, optional + The name of the condition. +)doc") + +PYDOC(receiver, R"doc( +The receiver associated with the condition. +)doc") + +} // namespace CudaBufferAvailableCondition + +} // namespace holoscan::doc + +#endif /* PYHOLOSCAN_CONDITIONS_CUDA_BUFFER_AVAILABLE_PYDOC_HPP */ diff --git a/python/holoscan/conditions/cuda_event.cpp b/python/holoscan/conditions/cuda_event.cpp new file mode 100644 index 00000000..1e9797af --- /dev/null +++ b/python/holoscan/conditions/cuda_event.cpp @@ -0,0 +1,80 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include "./cuda_event_pydoc.hpp" +#include "holoscan/core/component_spec.hpp" +#include "holoscan/core/conditions/gxf/cuda_event.hpp" +#include "holoscan/core/fragment.hpp" +#include "holoscan/core/gxf/gxf_resource.hpp" + +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) + +namespace py = pybind11; + +namespace holoscan { + +/* Trampoline classes for handling Python kwargs + * + * These add a constructor that takes a Fragment for which to initialize the condition. + * The explicit parameter list and default arguments take care of providing a Pythonic + * kwarg-based interface with appropriate default values matching the condition's + * default parameters in the C++ API `setup` method. + * + * The sequence of events in this constructor is based on Fragment::make_condition + */ + +class PyCudaEventCondition : public CudaEventCondition { + public: + /* Inherit the constructors */ + using CudaEventCondition::CudaEventCondition; + + // Define a constructor that fully initializes the object. 
+ explicit PyCudaEventCondition(Fragment* fragment, const std::string& event_name = "", + const std::string& name = "noname_cuda_event_condition") + : CudaEventCondition(Arg("event_name", event_name)) { + name_ = name; + fragment_ = fragment; + spec_ = std::make_shared(fragment); + setup(*spec_); + } +}; + +void init_cuda_event(py::module_& m) { + py::class_>( + m, "CudaEventCondition", doc::CudaEventCondition::doc_CudaEventCondition) + .def(py::init(), + "fragment"_a, + "event_name"_a = ""s, + "name"_a = "noname_cuda_event_condition"s, + doc::CudaEventCondition::doc_CudaEventCondition) + .def_property( + "receiver", + py::overload_cast<>(&CudaEventCondition::receiver), + py::overload_cast>(&CudaEventCondition::receiver), + doc::CudaEventCondition::doc_receiver); +} +} // namespace holoscan diff --git a/python/holoscan/conditions/cuda_event_pydoc.hpp b/python/holoscan/conditions/cuda_event_pydoc.hpp new file mode 100644 index 00000000..7b2ddaaa --- /dev/null +++ b/python/holoscan/conditions/cuda_event_pydoc.hpp @@ -0,0 +1,54 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PYHOLOSCAN_CONDITIONS_CUDA_EVENT_PYDOC_HPP +#define PYHOLOSCAN_CONDITIONS_CUDA_EVENT_PYDOC_HPP + +#include + +#include "../macros.hpp" + +namespace holoscan::doc { + +namespace CudaEventCondition { + +PYDOC(CudaEventCondition, R"doc( +Condition class to indicate data availability on CUDA stream completion via an event. + +A condition which specifies the availability of data at the receiver upon completion of the work +on the provided CUDA stream, as signaled by a CUDA event. This condition keeps polling the +provided event to check whether data is available for consumption. + +Parameters +---------- +fragment : holoscan.core.Fragment + The fragment the condition will be associated with. +event_name : str, optional + The name of the CUDA event on which the cudaEventQuery API is called to get the completion status. +name : str, optional + The name of the condition. +)doc") + +PYDOC(receiver, R"doc( +The receiver associated with the condition. +)doc") + +} // namespace CudaEventCondition + +} // namespace holoscan::doc + +#endif /* PYHOLOSCAN_CONDITIONS_CUDA_EVENT_PYDOC_HPP */ diff --git a/python/holoscan/conditions/cuda_stream.cpp b/python/holoscan/conditions/cuda_stream.cpp new file mode 100644 index 00000000..e9523810 --- /dev/null +++ b/python/holoscan/conditions/cuda_stream.cpp @@ -0,0 +1,78 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include "./cuda_stream_pydoc.hpp" +#include "holoscan/core/component_spec.hpp" +#include "holoscan/core/conditions/gxf/cuda_stream.hpp" +#include "holoscan/core/fragment.hpp" +#include "holoscan/core/gxf/gxf_resource.hpp" + +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) + +namespace py = pybind11; + +namespace holoscan { + +/* Trampoline classes for handling Python kwargs + * + * These add a constructor that takes a Fragment for which to initialize the condition. + * The explicit parameter list and default arguments take care of providing a Pythonic + * kwarg-based interface with appropriate default values matching the condition's + * default parameters in the C++ API `setup` method. + * + * The sequence of events in this constructor is based on Fragment::make_condition + */ + +class PyCudaStreamCondition : public CudaStreamCondition { + public: + /* Inherit the constructors */ + using CudaStreamCondition::CudaStreamCondition; + + // Define a constructor that fully initializes the object. + explicit PyCudaStreamCondition(Fragment* fragment, + const std::string& name = "noname_cuda_stream_condition") { + name_ = name; + fragment_ = fragment; + spec_ = std::make_shared(fragment); + setup(*spec_); + } +}; + +void init_cuda_stream(py::module_& m) { + py::class_>( + m, "CudaStreamCondition", doc::CudaStreamCondition::doc_CudaStreamCondition) + .def(py::init(), + "fragment"_a, + "name"_a = "noname_cuda_stream_condition"s, + doc::CudaStreamCondition::doc_CudaStreamCondition) + .def_property( + "receiver", + py::overload_cast<>(&CudaStreamCondition::receiver), + py::overload_cast>(&CudaStreamCondition::receiver), + doc::CudaStreamCondition::doc_receiver); +} +} // namespace holoscan diff --git a/python/holoscan/conditions/cuda_stream_pydoc.hpp b/python/holoscan/conditions/cuda_stream_pydoc.hpp new file mode 100644 index 00000000..f2e9ef88 --- /dev/null +++ b/python/holoscan/conditions/cuda_stream_pydoc.hpp @@ -0,0 +1,51 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef PYHOLOSCAN_CONDITIONS_CUDA_STREAM_PYDOC_HPP +#define PYHOLOSCAN_CONDITIONS_CUDA_STREAM_PYDOC_HPP + +#include + +#include "../macros.hpp" + +namespace holoscan::doc { + +namespace CudaStreamCondition { + +PYDOC(CudaStreamCondition, R"doc( +Condition class to indicate data availability on CUDA stream completion. + +This condition will register a callback function which will be called once the work on the +specified CUDA stream completes indicating that the data is available for consumption. + +Parameters +---------- +fragment : holoscan.core.Fragment + The fragment the condition will be associated with. +name : str, optional + The name of the condition. +)doc") + +PYDOC(receiver, R"doc( +The receiver associated with the condition. +)doc") + +} // namespace CudaStreamCondition + +} // namespace holoscan::doc + +#endif /* PYHOLOSCAN_CONDITIONS_CUDA_STREAM_PYDOC_HPP */ diff --git a/python/holoscan/conditions/downstream_message_affordable.cpp b/python/holoscan/conditions/downstream_message_affordable.cpp index dede74ab..a45ea6cb 100644 --- a/python/holoscan/conditions/downstream_message_affordable.cpp +++ b/python/holoscan/conditions/downstream_message_affordable.cpp @@ -27,11 +27,8 @@ #include "holoscan/core/fragment.hpp" #include "holoscan/core/gxf/gxf_resource.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; - -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -53,7 +50,7 @@ class PyDownstreamMessageAffordableCondition : public DownstreamMessageAffordabl using DownstreamMessageAffordableCondition::DownstreamMessageAffordableCondition; // Define a constructor that fully initializes the object. - PyDownstreamMessageAffordableCondition( + explicit PyDownstreamMessageAffordableCondition( Fragment* fragment, // std::shared_ptr transmitter, // add transmitter here? gxf_uid_t eid, @@ -63,7 +60,7 @@ class PyDownstreamMessageAffordableCondition : public DownstreamMessageAffordabl fragment_ = fragment; spec_ = std::make_shared(fragment); // transmitter_ = transmitter; // e.g. DoubleBufferTransmitter - setup(*spec_.get()); + setup(*spec_); } }; diff --git a/python/holoscan/conditions/expiring_message.cpp b/python/holoscan/conditions/expiring_message.cpp index 21930780..71932868 100644 --- a/python/holoscan/conditions/expiring_message.cpp +++ b/python/holoscan/conditions/expiring_message.cpp @@ -30,11 +30,8 @@ #include "holoscan/core/gxf/gxf_resource.hpp" #include "holoscan/core/resources/gxf/realtime_clock.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; - -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -56,7 +53,7 @@ class PyExpiringMessageAvailableCondition : public ExpiringMessageAvailableCondi using ExpiringMessageAvailableCondition::ExpiringMessageAvailableCondition; // Define a constructor that fully initializes the object. 
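The three new condition bindings (CudaBufferAvailableCondition, CudaEventCondition, CudaStreamCondition) follow the same trampoline pattern, so from Python they are constructed like any other condition, and the receiver property bound above associates them with a specific input. A hedged sketch of how they might be used; MyCudaOperator is a placeholder for a user-defined operator whose input carries CUDA buffers or streams.

from holoscan.conditions import (
    CudaBufferAvailableCondition,
    CudaEventCondition,
    CudaStreamCondition,
)
from holoscan.core import Application


class MyCudaApp(Application):
    def compose(self):
        # Keyword defaults match the C++ setup() parameters of each condition.
        stream_cond = CudaStreamCondition(self, name="stream_cond")
        event_cond = CudaEventCondition(self, event_name="my_event", name="event_cond")
        buffer_cond = CudaBufferAvailableCondition(self, name="buffer_cond")

        # Passed positionally, like CountCondition or PeriodicCondition;
        # MyCudaOperator is a placeholder user-defined operator.
        consumer = MyCudaOperator(self, stream_cond, name="consumer")
        self.add_operator(consumer)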
- PyExpiringMessageAvailableCondition( + explicit PyExpiringMessageAvailableCondition( Fragment* fragment, // std::shared_ptr receiver, int64_t max_batch_size, int64_t max_delay_ns, std::shared_ptr clock = nullptr, @@ -71,7 +68,7 @@ class PyExpiringMessageAvailableCondition : public ExpiringMessageAvailableCondi } spec_ = std::make_shared(fragment); // receiver = receiver; // e.g. DoubleBufferReceiver - setup(*spec_.get()); + setup(*spec_); } template @@ -91,7 +88,7 @@ class PyExpiringMessageAvailableCondition : public ExpiringMessageAvailableCondi } spec_ = std::make_shared(fragment); // receiver = receiver; // e.g. DoubleBufferReceiver - setup(*spec_.get()); + setup(*spec_); } }; @@ -103,9 +100,9 @@ void init_expiring_message_available(py::module_& m) { m, "ExpiringMessageAvailableCondition", doc::ExpiringMessageAvailableCondition::doc_ExpiringMessageAvailableCondition) - // TODO: sphinx API doc build complains if more than one ExpiringMessageAvailableCondition - // init method has a docstring specified. For now just set the docstring for the - // overload using datetime.timedelta for the max_delay. + // TODO(unknown): sphinx API doc build complains if more than one + // ExpiringMessageAvailableCondition init method has a docstring specified. For now just set + // the docstring for the overload using datetime.timedelta for the max_delay. .def(py::init, const std::string&>(), "fragment"_a, "max_batch_size"_a, diff --git a/python/holoscan/conditions/message_available.cpp b/python/holoscan/conditions/message_available.cpp index 13ef15a9..fd90daf6 100644 --- a/python/holoscan/conditions/message_available.cpp +++ b/python/holoscan/conditions/message_available.cpp @@ -27,12 +27,8 @@ #include "holoscan/core/fragment.hpp" #include "holoscan/core/gxf/gxf_resource.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; - -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) - +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; namespace holoscan { @@ -53,17 +49,18 @@ class PyMessageAvailableCondition : public MessageAvailableCondition { using MessageAvailableCondition::MessageAvailableCondition; // Define a constructor that fully initializes the object. - PyMessageAvailableCondition(Fragment* fragment, - // std::shared_ptr receiver, - uint64_t min_size = 1UL, size_t front_stage_max_size = 1UL, - const std::string& name = "noname_message_available_condition") + explicit PyMessageAvailableCondition( + Fragment* fragment, + // std::shared_ptr receiver, + uint64_t min_size = 1UL, size_t front_stage_max_size = 1UL, + const std::string& name = "noname_message_available_condition") : MessageAvailableCondition( ArgList{Arg{"min_size", min_size}, Arg{"front_stage_max_size", front_stage_max_size}}) { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); // receiver = receiver; // e.g. 
DoubleBufferReceiver - setup(*spec_.get()); + setup(*spec_); } }; diff --git a/python/holoscan/conditions/periodic.cpp b/python/holoscan/conditions/periodic.cpp index ce86430f..33cf1a24 100644 --- a/python/holoscan/conditions/periodic.cpp +++ b/python/holoscan/conditions/periodic.cpp @@ -29,11 +29,8 @@ #include "holoscan/core/fragment.hpp" #include "holoscan/core/gxf/gxf_resource.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; - -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -61,7 +58,7 @@ class PyPeriodicCondition : public PeriodicCondition { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } template PyPeriodicCondition(Fragment* fragment, std::chrono::duration recess_period_duration, @@ -70,7 +67,7 @@ class PyPeriodicCondition : public PeriodicCondition { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; @@ -80,7 +77,7 @@ void init_periodic(py::module_& m) { gxf::GXFCondition, std::shared_ptr>( m, "PeriodicCondition", doc::PeriodicCondition::doc_PeriodicCondition) - // TODO: sphinx API doc build complains if more than one PeriodicCondition init + // TODO(unknown): sphinx API doc build complains if more than one PeriodicCondition init // method has a docstring specified. For now just set the docstring for the // overload using datetime.timedelta for the recess_period. .def(py::init(), diff --git a/python/holoscan/core/__init__.py b/python/holoscan/core/__init__.py index de96549b..ff868177 100644 --- a/python/holoscan/core/__init__.py +++ b/python/holoscan/core/__init__.py @@ -384,6 +384,12 @@ def __init__( latency metric calculations. """ self.app = app + + # Check the number of fragment nodes to see if it is a distributed app. + # Use compose_graph(), not compose() to protect against repeated compose() calls. 
+ self.app.compose_graph() + self.is_distributed_app = len(app.fragment_graph.get_nodes()) > 0 + self.enable_logging = filename is not None if self.enable_logging: self.logging_kwargs = dict( @@ -397,14 +403,25 @@ def __init__( ) def __enter__(self): - self.tracker = self.app.track(**self.tracker_kwargs) - if self.enable_logging: - self.tracker.enable_logging(**self.logging_kwargs) - return self.tracker + if self.is_distributed_app: + self.trackers = self.app.track_distributed(**self.tracker_kwargs) + for tracker in self.trackers.values(): + if self.enable_logging: + tracker.enable_logging(**self.logging_kwargs) + return self.trackers + else: + self.tracker = self.app.track(**self.tracker_kwargs) + if self.enable_logging: + self.tracker.enable_logging(**self.logging_kwargs) + return self.tracker def __exit__(self, exc_type, exc_value, exc_tb): if self.enable_logging: - self.tracker.end_logging() + if self.is_distributed_app: + for tracker in self.trackers.values(): + tracker.end_logging() + else: + self.tracker.end_logging() _registry_context = _RegistryContext() diff --git a/python/holoscan/core/application.cpp b/python/holoscan/core/application.cpp index 47b75b26..7ee15ed6 100644 --- a/python/holoscan/core/application.cpp +++ b/python/holoscan/core/application.cpp @@ -20,9 +20,11 @@ #include #include +#include #include #include #include +#include #include #include @@ -33,7 +35,7 @@ #include "holoscan/core/operator.hpp" #include "tensor.hpp" -using pybind11::literals::operator""_a; +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -48,14 +50,20 @@ void init_application(py::module_& m) { .def(py::init&>(), "argv"_a = std::vector(), doc::Application::doc_Application) - .def_property("description", - py::overload_cast<>(&Application::description), - (Application & (Application::*)(const std::string&)&)&Application::description, - doc::Application::doc_description) - .def_property("version", - py::overload_cast<>(&Application::version), - (Application & (Application::*)(const std::string&)&)&Application::version, - doc::Application::doc_version) + .def_property( + "description", + py::overload_cast<>(&Application::description), + [](Application& app, const std::string& name) -> Application& { + return app.description(name); + }, + doc::Application::doc_description) + .def_property( + "version", + py::overload_cast<>(&Application::version), + [](Application& app, const std::string& name) -> Application& { + return app.version(name); + }, + doc::Application::doc_version) .def_property_readonly( "argv", [](PyApplication& app) { return app.py_argv(); }, doc::Application::doc_argv) .def_property_readonly("options", @@ -72,9 +80,9 @@ void init_application(py::module_& m) { &Application::add_fragment, "frag"_a, doc::Application::doc_add_fragment) // note: virtual function - // TODO: sphinx API doc build complains if more than one overloaded add_flow method has a - // docstring specified. For now using the docstring defined for 3-argument - // Operator-based version and describing the other variants in the Notes section. + // TODO(unknown): sphinx API doc build complains if more than one overloaded add_flow method + // has a docstring specified. For now using the docstring defined for 3-argument + // Operator-based version and describing the other variants in the Notes section. 
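With the Tracker changes above, the context manager yields a dict of per-fragment DataFlowTracker objects when the application has a non-empty fragment graph, and a single tracker otherwise. A sketch of the resulting usage; MyDistributedApp is a placeholder multi-fragment application, and print_all_results is assumed to behave as in the existing single-fragment workflow.

from holoscan.core import Tracker

app = MyDistributedApp()  # placeholder multi-fragment Application
with Tracker(app, filename="tracker.log") as trackers:
    app.run()

# Distributed apps yield {fragment_name: DataFlowTracker};
# single-fragment apps still yield one DataFlowTracker.
if isinstance(trackers, dict):
    for fragment_name, tracker in trackers.items():
        print(f"--- {fragment_name} ---")
        tracker.print_all_results()
else:
    trackers.print_all_results()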
.def( // note: virtual function "add_flow", py::overload_cast&, const std::shared_ptr&>( @@ -101,10 +109,36 @@ void init_application(py::module_& m) { .def("compose", &Application::compose, doc::Application::doc_compose) // note: virtual function + .def("compose_graph", &Application::compose_graph, doc::Application::doc_compose_graph) .def("run", &Application::run, doc::Application::doc_run, py::call_guard()) // note: virtual function/should release GIL + .def( + "track_distributed", + // This version of `track_distributed differs from the C++ API only in return type, using + // std::unordered_map> + // instead of + // std::unordered_map + // to use the trackers from Python. + [](Application& app, + uint64_t num_start_messages_to_skip, + uint64_t num_last_messages_to_discard, + int latency_threshold) + -> std::unordered_map> { + auto tracker_pointers = app.track_distributed( + num_start_messages_to_skip, num_last_messages_to_discard, latency_threshold); + std::unordered_map> trackers; + for (const auto& [name, tracker_ptr] : tracker_pointers) { + trackers.emplace(name, std::ref(*tracker_ptr)); + } + return trackers; + }, + "num_start_messages_to_skip"_a = kDefaultNumStartMessagesToSkip, + "num_last_messages_to_discard"_a = kDefaultNumLastMessagesToDiscard, + "latency_threshold"_a = kDefaultLatencyThreshold, + // doc::Fragment::doc_track_distributed, + py::return_value_policy::reference_internal) .def( "__repr__", [](const py::object& obj) { @@ -181,7 +215,7 @@ void PyApplication::run() { py_profile_func_ = sys_module.attr("getprofile")(); py_trace_func_ = sys_module.attr("gettrace")(); - auto py_thread_state = _PyThreadState_UncheckedGet(); + auto* py_thread_state = _PyThreadState_UncheckedGet(); c_profilefunc_ = py_thread_state->c_profilefunc; c_profileobj_ = py_thread_state->c_profileobj; c_tracefunc_ = py_thread_state->c_tracefunc; diff --git a/python/holoscan/core/application_pydoc.hpp b/python/holoscan/core/application_pydoc.hpp index 42f4be1d..ec54c814 100644 --- a/python/holoscan/core/application_pydoc.hpp +++ b/python/holoscan/core/application_pydoc.hpp @@ -15,8 +15,8 @@ * limitations under the License. */ -#ifndef PYHOLOSCAN_CORE_APPLICATION_PYDOC_HPP -#define PYHOLOSCAN_CORE_APPLICATION_PYDOC_HPP +#ifndef HOLOSCAN_CORE_APPLICATION_PYDOC_HPP +#define HOLOSCAN_CORE_APPLICATION_PYDOC_HPP #include @@ -125,8 +125,13 @@ frag : holoscan.core.Fragment PYDOC(compose, R"doc( The compose method of the application. -This method should be called after `config`, but before `run` in order to -compose the computation graph. +This method should be called after ``config``, but before the graph starts running in order to +compose the computation graph. This method will be called automatically by ``Application.run``, so +it is not normally necessary to call it directly. +)doc") + +PYDOC(compose_graph, R"doc( +This is a wrapper around compose that only calls compose if the graph is not already composed. )doc") PYDOC(run, R"doc( @@ -136,12 +141,11 @@ This method runs the computation. It must have first been initialized via `config` and `compose`. )doc") -PYDOC(track, R"doc( -The track method of the application. +PYDOC(track_distributed, R"doc( +The track method of a distributed application. -This method enables data frame flow tracking and returns -a DataFlowTracker object which can be used to display metrics data -for profiling an application. +This method enables data frame flow tracking and returns a DataFlowTracker object which can be used +to display metrics data for profiling an application. 
Parameters ---------- @@ -152,9 +156,17 @@ num_last_messages_to_discard : int latency_threshold : int The minimum end-to-end latency in milliseconds to account for in the end-to-end latency metric calculations + +Returns +------- +trackers : dict[str, holoscan.core.DataFlowTracker] + A dictionary where the keys are the fragment names and the values are the corresponding data + flow tracker for that fragment. These can be used to display metrics data for profiling along + the different paths through the computation graph. )doc") + } // namespace Application } // namespace holoscan::doc -#endif // PYHOLOSCAN_CORE_APPLICATION_PYDOC_HPP +#endif /* HOLOSCAN_CORE_APPLICATION_PYDOC_HPP */ diff --git a/python/holoscan/core/arg.cpp b/python/holoscan/core/arg.cpp index 8e51c859..4ce52a4e 100644 --- a/python/holoscan/core/arg.cpp +++ b/python/holoscan/core/arg.cpp @@ -27,8 +27,7 @@ #include "holoscan/core/executors/gxf/gxf_parameter_adaptor.hpp" #include "kwarg_handling.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -109,9 +108,8 @@ void init_arg(py::module_& m) { [](Arg& arg) -> py::object { auto result = arg_to_py_object(arg); if (!result.is_none()) { - if (py::isinstance(result)) { - return result; - } else if (py::isinstance(result)) { + if (py::isinstance(result)) { return result; } + if (py::isinstance(result)) { return py::int_(static_cast(result.cast())); } } @@ -121,9 +119,8 @@ void init_arg(py::module_& m) { [](Arg& arg) -> py::object { auto result = arg_to_py_object(arg); if (!result.is_none()) { - if (py::isinstance(result)) { - return result; - } else if (py::isinstance(result)) { + if (py::isinstance(result)) { return result; } + if (py::isinstance(result)) { return py::float_(static_cast(result.cast())); } } @@ -138,11 +135,8 @@ void init_arg(py::module_& m) { .def("__str__", [](Arg& arg) -> py::object { auto result = arg_to_py_object(arg); - if (py::isinstance(result)) { - return result; - } else if (!result.is_none()) { - return py::str(result); - } + if (py::isinstance(result)) { return result; } + if (!result.is_none()) { return py::str(result); } return py::str(); }) .def_property_readonly("description", &Arg::description, doc::Arg::doc_description) diff --git a/python/holoscan/core/cli.cpp b/python/holoscan/core/cli.cpp index 02b5430d..8c697007 100644 --- a/python/holoscan/core/cli.cpp +++ b/python/holoscan/core/cli.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,8 +28,7 @@ #include "holoscan/core/executors/gxf/gxf_parameter_adaptor.hpp" #include "kwarg_handling.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; diff --git a/python/holoscan/core/component.cpp b/python/holoscan/core/component.cpp index 5db222ca..4e694f7b 100644 --- a/python/holoscan/core/component.cpp +++ b/python/holoscan/core/component.cpp @@ -29,7 +29,7 @@ #include "holoscan/core/component.hpp" #include "holoscan/core/fragment.hpp" -using pybind11::literals::operator""_a; +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -45,7 +45,7 @@ void PyComponentSpec::py_param(const std::string& name, const py::object& defaul std::string headline{""s}; std::string description{""s}; for (const auto& [nm, value] : kwargs) { - std::string param_name = nm.cast(); + auto param_name = nm.cast(); if (param_name == "headline") { headline = value.cast(); } else if (param_name == "description") { diff --git a/python/holoscan/core/condition.cpp b/python/holoscan/core/condition.cpp index da00914f..170c0f69 100644 --- a/python/holoscan/core/condition.cpp +++ b/python/holoscan/core/condition.cpp @@ -29,8 +29,6 @@ #include "holoscan/core/fragment.hpp" #include "kwarg_handling.hpp" -using pybind11::literals::operator""_a; - namespace py = pybind11; namespace holoscan { @@ -42,12 +40,12 @@ class PyCondition : public Condition { // Define a kwargs-based constructor that can create an ArgList // for passing on to the variadic-template based constructor. - PyCondition(const py::args& args, const py::kwargs& kwargs) : Condition() { + PyCondition(const py::args& args, const py::kwargs& kwargs) { using std::string_literals::operator""s; int n_fragments = 0; - for (auto& item : args) { - py::object arg_value = item.cast(); + for (const auto& item : args) { + auto arg_value = item.cast(); if (py::isinstance(arg_value)) { if (n_fragments > 0) { throw std::runtime_error("multiple Fragment objects provided"); } fragment_ = arg_value.cast(); @@ -57,8 +55,8 @@ class PyCondition : public Condition { } } for (const auto& [name, value] : kwargs) { - std::string kwarg_name = name.cast(); - py::object kwarg_value = value.cast(); + auto kwarg_name = name.cast(); + auto kwarg_value = value.cast(); if (kwarg_name == "name"s) { if (py::isinstance(kwarg_value)) { name_ = kwarg_value.cast(); @@ -107,10 +105,11 @@ void init_condition(py::module_& m) { m, "Condition", doc::Condition::doc_Condition) .def(py::init(), doc::Condition::doc_Condition_args_kwargs) - .def_property("name", - py::overload_cast<>(&Condition::name, py::const_), - (Condition & (Condition::*)(const std::string&)&)&Condition::name, - doc::Condition::doc_name) + .def_property( + "name", + py::overload_cast<>(&Condition::name, py::const_), + [](Condition& c, const std::string& name) -> Condition& { return c.name(name); }, + doc::Condition::doc_name) .def_property_readonly( "fragment", py::overload_cast<>(&Condition::fragment), doc::Condition::doc_fragment) .def_property("spec", diff --git a/python/holoscan/core/core.cpp b/python/holoscan/core/core.cpp index c0a2887c..3ffecba1 100644 --- a/python/holoscan/core/core.cpp +++ b/python/holoscan/core/core.cpp @@ -26,8 +26,6 @@ #include "tensor.hpp" #include "operator.hpp" -namespace py = pybind11; - namespace holoscan { 
PYBIND11_MODULE(_core, m) { diff --git a/python/holoscan/core/core.hpp b/python/holoscan/core/core.hpp index afc182e8..9a4e3f0f 100644 --- a/python/holoscan/core/core.hpp +++ b/python/holoscan/core/core.hpp @@ -27,9 +27,6 @@ #include "holoscan/core/domain/tensor.hpp" -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) - namespace py = pybind11; namespace holoscan { diff --git a/python/holoscan/core/dataflow_tracker.cpp b/python/holoscan/core/dataflow_tracker.cpp index ab888bce..3150f853 100644 --- a/python/holoscan/core/dataflow_tracker.cpp +++ b/python/holoscan/core/dataflow_tracker.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -24,7 +24,7 @@ #include "dataflow_tracker_pydoc.hpp" #include "holoscan/core/dataflow_tracker.hpp" -using pybind11::literals::operator""_a; +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -48,9 +48,9 @@ void init_data_flow_tracker(py::module_& m) { "num_buffered_messages"_a = kDefaultNumBufferedMessages, doc::DataFlowTracker::doc_enable_logging) .def("end_logging", &DataFlowTracker::end_logging, doc::DataFlowTracker::doc_end_logging) - // TODO: sphinx API doc build complains if more than one overloaded get_metric method has a - // docstring specified. For now using the docstring defined for 2-argument - // version and describe the single argument variant in the Notes section. + // TODO(unknown): sphinx API doc build complains if more than one overloaded get_metric method + // has a docstring specified. For now using the docstring defined for 2-argument + // version and describe the single argument variant in the Notes section. 
.def("get_metric", py::overload_cast(&DataFlowTracker::get_metric), "pathstring"_a, diff --git a/python/holoscan/core/dl_converter.cpp b/python/holoscan/core/dl_converter.cpp index 6f34633b..49f9d623 100644 --- a/python/holoscan/core/dl_converter.cpp +++ b/python/holoscan/core/dl_converter.cpp @@ -30,12 +30,15 @@ #include "holoscan/core/domain/tensor.hpp" #include "holoscan/utils/cuda_macros.hpp" +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) + namespace holoscan { -void set_array_interface(const py::object& obj, std::shared_ptr ctx) { +void set_array_interface(const py::object& obj, + const std::shared_ptr& ctx) { DLTensor& dl_tensor = ctx->tensor.dl_tensor; - if (dl_tensor.data) { + if (dl_tensor.data != nullptr) { // Prepare the array interface items // Main items @@ -46,11 +49,12 @@ void set_array_interface(const py::object& obj, std::shared_ptr(dl_tensor.shape, dl_tensor.ndim); py::str typestr = py::str(type_str); + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) py::tuple data = pybind11::make_tuple(py::int_(reinterpret_cast(dl_tensor.data)), py::bool_(false)); // Optional items py::object strides = py::none(); - if (dl_tensor.strides) { + if (dl_tensor.strides != nullptr) { const int32_t strides_length = dl_tensor.ndim; py::tuple strides_tuple(strides_length); // The array interface's stride is using bytes, not element size, so we need to multiply it by @@ -115,6 +119,7 @@ void set_array_interface(const py::object& obj, std::shared_ptr(stream_id); } } else { @@ -144,7 +151,7 @@ py::capsule py_dlpack(Tensor* tensor, py::object stream) { // Wait for the current stream to finish before the provided stream starts consuming the memory. if (stream_id >= 0 && curr_stream_ptr != stream_ptr) { - cudaEvent_t curr_stream_event; + cudaEvent_t curr_stream_event{}; HOLOSCAN_CUDA_CALL_THROW_ERROR( cudaEventCreateWithFlags(&curr_stream_event, cudaEventDisableTiming), "Failure during call to cudaEventCreateWithFlags"); @@ -165,10 +172,10 @@ py::capsule py_dlpack(Tensor* tensor, py::object stream) { // Should call `PyCapsule_IsValid` to check if the capsule is valid before calling // `PyCapsule_GetPointer`. Otherwise, it will raise a hard-to-debug exception. // (such as `SystemError: returned a result with an error set`) - if (PyCapsule_IsValid(ptr, "dltensor")) { + if (PyCapsule_IsValid(ptr, "dltensor") != 0) { // The destructor will be called when the capsule is deleted. // We need to call the deleter function to free the memory. - DLManagedTensor* dl_managed_tensor = + auto* dl_managed_tensor = static_cast(PyCapsule_GetPointer(ptr, "dltensor")); // Call deleter function to free the memory only if the capsule name is "dltensor". 
if (dl_managed_tensor != nullptr) { dl_managed_tensor->deleter(dl_managed_tensor); } @@ -177,6 +184,7 @@ py::capsule py_dlpack(Tensor* tensor, py::object stream) { return dlpack_capsule; } +// NOLINTEND(readability-function-cognitive-complexity) py::tuple py_dlpack_device(Tensor* tensor) { auto& dl_tensor = tensor->dl_ctx()->tensor.dl_tensor; diff --git a/python/holoscan/core/dl_converter.hpp b/python/holoscan/core/dl_converter.hpp index abfa9960..ec257fbd 100644 --- a/python/holoscan/core/dl_converter.hpp +++ b/python/holoscan/core/dl_converter.hpp @@ -28,12 +28,6 @@ #include "holoscan/core/domain/tensor.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; - -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) - namespace py = pybind11; namespace holoscan { @@ -58,7 +52,7 @@ struct ArrayInterfaceMemoryBuffer { * @param obj The Python object to set the array interface object. * @param ctx The context of the DLManagedTensor. */ -void set_array_interface(const py::object& obj, std::shared_ptr ctx); +void set_array_interface(const py::object& obj, const std::shared_ptr& ctx); /** * @brief Provide `__dlpack__` method diff --git a/python/holoscan/core/emitter_receiver_registry.cpp b/python/holoscan/core/emitter_receiver_registry.cpp index e627a020..06ba08ab 100644 --- a/python/holoscan/core/emitter_receiver_registry.cpp +++ b/python/holoscan/core/emitter_receiver_registry.cpp @@ -34,13 +34,13 @@ const EmitterReceiverRegistry::EmitterReceiver& EmitterReceiverRegistry::get_emi HOLOSCAN_LOG_WARN("No emitter_receiver for type '{}' exists", index.name()); return EmitterReceiverRegistry::none_emitter_receiver; } - auto& emitter_receiver = emitter_receiver_map_.at(maybe_name.value()); + const auto& emitter_receiver = emitter_receiver_map_.at(maybe_name.value()); return emitter_receiver; } bool EmitterReceiverRegistry::has_emitter_receiver(const std::type_index& index) const { auto maybe_name = index_to_name(index); - if (maybe_name) { return emitter_receiver_map_.count(maybe_name.value()) > 0 ? 
true : false; } + if (maybe_name) { return emitter_receiver_map_.count(maybe_name.value()) > 0; } return false; } @@ -51,7 +51,7 @@ const EmitterReceiverRegistry::EmitterReceiver& EmitterReceiverRegistry::get_emi HOLOSCAN_LOG_WARN("No emitter_receiver for name '{}' exists", name); return EmitterReceiverRegistry::none_emitter_receiver; } - auto& emitter_receiver = loc->second; + const auto& emitter_receiver = loc->second; return emitter_receiver; } @@ -62,7 +62,7 @@ const EmitterReceiverRegistry::EmitFunc& EmitterReceiverRegistry::get_emitter( HOLOSCAN_LOG_WARN("No emitter for name '{}' exists", name); return EmitterReceiverRegistry::none_emit; } - auto& emitter_receiver = loc->second; + const auto& emitter_receiver = loc->second; return emitter_receiver.first; } @@ -73,7 +73,7 @@ const EmitterReceiverRegistry::EmitFunc& EmitterReceiverRegistry::get_emitter( HOLOSCAN_LOG_WARN("No emitter for type '{}' exists", index.name()); return EmitterReceiverRegistry::none_emit; } - auto& emitter = emitter_receiver_map_.at(maybe_name.value()).first; + const auto& emitter = emitter_receiver_map_.at(maybe_name.value()).first; return emitter; } @@ -84,7 +84,7 @@ const EmitterReceiverRegistry::ReceiveFunc& EmitterReceiverRegistry::get_receive HOLOSCAN_LOG_WARN("No receiver for name '{}' exists", name); return EmitterReceiverRegistry::none_receive; } - auto& emitter_receiver = loc->second; + const auto& emitter_receiver = loc->second; return emitter_receiver.second; } @@ -95,7 +95,7 @@ const EmitterReceiverRegistry::ReceiveFunc& EmitterReceiverRegistry::get_receive HOLOSCAN_LOG_WARN("No receiver for type '{}' exists", index.name()); return EmitterReceiverRegistry::none_receive; } - auto& receiver = emitter_receiver_map_.at(maybe_name.value()).second; + const auto& receiver = emitter_receiver_map_.at(maybe_name.value()).second; return receiver; } @@ -122,7 +122,7 @@ expected EmitterReceiverRegistry::index_to_name( std::vector EmitterReceiverRegistry::registered_types() const { std::vector names; names.reserve(emitter_receiver_map_.size()); - for (auto& [key, _] : emitter_receiver_map_) { names.emplace_back(key); } + for (const auto& [key, _] : emitter_receiver_map_) { names.emplace_back(key); } return names; } diff --git a/python/holoscan/core/emitter_receiver_registry.hpp b/python/holoscan/core/emitter_receiver_registry.hpp index e4699f8c..db6f0c51 100644 --- a/python/holoscan/core/emitter_receiver_registry.hpp +++ b/python/holoscan/core/emitter_receiver_registry.hpp @@ -36,8 +36,6 @@ #include "holoscan/logger/logger.hpp" #include "io_context.hpp" -using std::string_literals::operator""s; - namespace py = pybind11; namespace holoscan { diff --git a/python/holoscan/core/emitter_receivers.hpp b/python/holoscan/core/emitter_receivers.hpp index c8c0d554..2e1acf5d 100644 --- a/python/holoscan/core/emitter_receivers.hpp +++ b/python/holoscan/core/emitter_receivers.hpp @@ -39,8 +39,6 @@ #include "io_context.hpp" #include "tensor.hpp" // for PyTensor -using std::string_literals::operator""s; - namespace py = pybind11; namespace holoscan { @@ -367,10 +365,10 @@ struct emitter_receiver { * A Python operator receiving a C++ nullptr will convert it to Python's None. 
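As the comment above notes, this specialization maps a C++ nullptr payload to Python's None on receive; a None emitted by a Python operator likewise arrives downstream as None. A minimal sketch with placeholder operator and port names:

from holoscan.core import Operator, OperatorSpec


class NoneTxOp(Operator):  # placeholder
    def setup(self, spec: OperatorSpec):
        spec.output("out")

    def compute(self, op_input, op_output, context):
        # Arrives at the downstream Python operator as None; a nullptr payload
        # emitted by a C++ operator is converted the same way on receive.
        op_output.emit(None, "out")


class NoneRxOp(Operator):  # placeholder
    def setup(self, spec: OperatorSpec):
        spec.input("in")

    def compute(self, op_input, op_output, context):
        value = op_input.receive("in")
        assert value is None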
*/ template <> -struct emitter_receiver { +struct emitter_receiver { static void emit(py::object& data, const std::string& name, PyOutputContext& op_output, const int64_t acq_timestamp = -1) { - op_output.emit(nullptr, name.c_str(), acq_timestamp); + op_output.emit(nullptr, name.c_str(), acq_timestamp); return; } static py::object receive(std::any result, const std::string& name, PyInputContext& op_input) { diff --git a/python/holoscan/core/execution_context.cpp b/python/holoscan/core/execution_context.cpp index 224b178b..0be9cbb5 100644 --- a/python/holoscan/core/execution_context.cpp +++ b/python/holoscan/core/execution_context.cpp @@ -25,13 +25,12 @@ #include "execution_context_pydoc.hpp" #include "holoscan/core/execution_context.hpp" -using pybind11::literals::operator""_a; - namespace py = pybind11; namespace holoscan { void init_execution_context(py::module_& m) { + // NOLINTNEXTLINE(bugprone-unused-raii) py::class_>( m, "ExecutionContext", doc::ExecutionContext::doc_ExecutionContext); diff --git a/python/holoscan/core/executor.cpp b/python/holoscan/core/executor.cpp index e05a69f4..71a70586 100644 --- a/python/holoscan/core/executor.cpp +++ b/python/holoscan/core/executor.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -24,7 +24,7 @@ #include "holoscan/core/executor.hpp" #include "holoscan/core/fragment.hpp" -using pybind11::literals::operator""_a; +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; diff --git a/python/holoscan/core/fragment.cpp b/python/holoscan/core/fragment.cpp index 674168a3..c14d8dcd 100644 --- a/python/holoscan/core/fragment.cpp +++ b/python/holoscan/core/fragment.cpp @@ -37,7 +37,7 @@ #include "holoscan/core/scheduler.hpp" #include "kwarg_handling.hpp" -using pybind11::literals::operator""_a; +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -59,10 +59,11 @@ void init_fragment(py::module_& m) { m, "Fragment", py::dynamic_attr(), doc::Fragment::doc_Fragment) .def(py::init(), doc::Fragment::doc_Fragment) // notation for this name setter is a bit tricky (couldn't seem to do it with overload_cast) - .def_property("name", - py::overload_cast<>(&Fragment::name, py::const_), - (Fragment & (Fragment::*)(const std::string&)&)&Fragment::name, - doc::Fragment::doc_name) + .def_property( + "name", + py::overload_cast<>(&Fragment::name, py::const_), + [](Fragment& f, const std::string& name) -> Fragment& { return f.name(name); }, + doc::Fragment::doc_name) .def_property("application", py::overload_cast<>(&Fragment::application, py::const_), py::overload_cast(&Fragment::application), @@ -101,9 +102,9 @@ void init_fragment(py::module_& m) { &Fragment::add_operator, "op"_a, doc::Fragment::doc_add_operator) // note: virtual function - // TODO: sphinx API doc build complains if more than one overloaded add_flow method has a - // docstring specified. For now using the docstring defined for 3-argument - // Operator-based version and describing the other variants in the Notes section. + // TODO(unknown): sphinx API doc build complains if more than one overloaded add_flow method + // has a docstring specified. 
For now using the docstring defined for 3-argument + // Operator-based version and describing the other variants in the Notes section. .def( // note: virtual function "add_flow", py::overload_cast&, const std::shared_ptr&>( @@ -133,11 +134,11 @@ void init_fragment(py::module_& m) { py::overload_cast<>(&Fragment::network_context), doc::Fragment::doc_network_context) .def("track", - &Application::track, + &Fragment::track, "num_start_messages_to_skip"_a = kDefaultNumStartMessagesToSkip, "num_last_messages_to_discard"_a = kDefaultNumLastMessagesToDiscard, "latency_threshold"_a = kDefaultLatencyThreshold, - doc::Application::doc_track, + doc::Fragment::doc_track, py::return_value_policy::reference_internal) .def_property("is_metadata_enabled", py::overload_cast<>(&Fragment::is_metadata_enabled, py::const_), @@ -158,7 +159,7 @@ void init_fragment(py::module_& m) { R"doc(Return repr(self).)doc"); } -PyFragment::PyFragment(py::object op) : Fragment() { +PyFragment::PyFragment(const py::object& op) { py::gil_scoped_acquire scope_guard; py_compose_ = py::getattr(op, "compose"); } diff --git a/python/holoscan/core/fragment.hpp b/python/holoscan/core/fragment.hpp index bcbe5a77..74f78ad2 100644 --- a/python/holoscan/core/fragment.hpp +++ b/python/holoscan/core/fragment.hpp @@ -58,7 +58,7 @@ class PyFragment : public Fragment { /* Inherit the constructors */ using Fragment::Fragment; - explicit PyFragment(py::object op); + explicit PyFragment(const py::object& op); /* Trampolines (need one for each virtual function) */ void add_operator(const std::shared_ptr& op) override; diff --git a/python/holoscan/core/fragment_pydoc.hpp b/python/holoscan/core/fragment_pydoc.hpp index 785bca73..4b541a18 100644 --- a/python/holoscan/core/fragment_pydoc.hpp +++ b/python/holoscan/core/fragment_pydoc.hpp @@ -229,6 +229,12 @@ num_last_messages_to_discard : int latency_threshold : int The minimum end-to-end latency in milliseconds to account for in the end-to-end latency metric calculations + +Returns +------- +tracker : holoscan.core.DataFlowTracker + The data flow tracker object that can be used to display metrics data for profiling along the + different paths through the computation graph. 
)doc") PYDOC(run, R"doc( diff --git a/python/holoscan/core/io_context.cpp b/python/holoscan/core/io_context.cpp index 18061a02..203ba14c 100644 --- a/python/holoscan/core/io_context.cpp +++ b/python/holoscan/core/io_context.cpp @@ -44,8 +44,8 @@ #include "operator.hpp" // for PyOperator #include "tensor.hpp" // for PyTensor -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -61,9 +61,10 @@ class PyRegistryContext { EmitterReceiverRegistry& registry_ = EmitterReceiverRegistry::get_instance(); }; +// NOLINTBEGIN(altera-struct-pack-align) template <> struct codec> { - static expected serialize(std::shared_ptr value, + static expected serialize(const std::shared_ptr& value, Endpoint* endpoint) { HOLOSCAN_LOG_TRACE("py_emit: cloudpickle serialization of Python object over a UCX connector"); std::string serialized_string; @@ -96,6 +97,7 @@ struct codec> { return std::move(maybe_obj.value()); } }; +// NOLINTEND(altera-struct-pack-align) static void register_py_object_codec() { auto& codec_registry = CodecRegistry::get_instance(); @@ -103,8 +105,9 @@ static void register_py_object_codec() { "std::shared_ptr"s); } +// NOLINTBEGIN(readability-function-cognitive-complexity) py::object PyInputContext::py_receive(const std::string& name, const std::string& kind) { - auto py_op = py_op_.cast(); + auto* py_op = py_op_.cast(); auto py_op_spec = py_op->py_shared_spec(); bool should_return_tuple = false; @@ -160,16 +163,16 @@ py::object PyInputContext::py_receive(const std::string& name, const std::string // Check element type (querying the first element using the name '{name}:0') auto& element = any_result[0]; - auto& element_type = element.type(); + const auto& element_type = element.type(); auto& registry = holoscan::EmitterReceiverRegistry::get_instance(); const auto& receiver_func = registry.get_receiver(element_type); py::tuple result_tuple(any_result.size()); int counter = 0; try { - for (auto& any_item : any_result) { - auto& item_type = any_item.type(); - if (item_type == typeid(kNoReceivedMessage) || item_type == typeid(nullptr_t)) { + for (const auto& any_item : any_result) { + const auto& item_type = any_item.type(); + if (item_type == typeid(kNoReceivedMessage) || item_type == typeid(std::nullptr_t)) { // add None to the tuple PyTuple_SET_ITEM(result_tuple.ptr(), counter++, py::none().release().ptr()); continue; @@ -188,18 +191,17 @@ py::object PyInputContext::py_receive(const std::string& name, const std::string e.what()); } return result_tuple; - } else { - auto maybe_result = receive(name.c_str()); - if (!maybe_result.has_value()) { - HOLOSCAN_LOG_DEBUG("Unable to receive input (std::any) with name '{}'", name); - return py::none(); - } - auto result = maybe_result.value(); - auto& result_type = result.type(); - auto& registry = holoscan::EmitterReceiverRegistry::get_instance(); - const auto& receiver_func = registry.get_receiver(result_type); - return receiver_func(result, name, *this); } + auto maybe_result = receive(name.c_str()); + if (!maybe_result.has_value()) { + HOLOSCAN_LOG_DEBUG("Unable to receive input (std::any) with name '{}'", name); + return py::none(); + } + auto result = maybe_result.value(); + const auto& result_type = result.type(); + auto& registry = holoscan::EmitterReceiverRegistry::get_instance(); + const auto& receiver_func = 
registry.get_receiver(result_type); + return receiver_func(result, name, *this); } void PyOutputContext::py_emit(py::object& data, const std::string& name, @@ -275,10 +277,10 @@ void PyOutputContext::py_emit(py::object& data, const std::string& name, } else { // If this operator doesn't have a UCX connector, can still determine if the app is // a multi-fragment app via the application pointer assigned to the fragment. - auto py_op = py_op_.cast(); + auto* py_op = py_op_.cast(); auto py_op_spec = py_op->py_shared_spec(); - auto app_ptr = py_op_spec->fragment()->application(); - if (app_ptr) { + auto* app_ptr = py_op_spec->fragment()->application(); + if (app_ptr != nullptr) { // a non-empty fragment graph means that the application is multi-fragment if (!(app_ptr->fragment_graph().is_empty())) { is_distributed_app = true; } } @@ -311,10 +313,10 @@ void PyOutputContext::py_emit(py::object& data, const std::string& name, HOLOSCAN_LOG_DEBUG("py_emit: emitting a std::shared_ptr"); const auto& emit_func = registry.get_emitter(typeid(std::shared_ptr)); emit_func(data, name, *this, acq_timestamp); - return; } void init_io_context(py::module_& m) { + // NOLINTNEXTLINE(bugprone-unused-raii) py::class_(m, "Message", doc::Message::doc_Message); py::class_> input_context( @@ -368,7 +370,7 @@ void init_io_context(py::module_& m) { // types. For user-defined operators that need to add additional types, the registry can be // imported from holoscan.core. See the holoscan.operators.HolovizOp source for an example. m.def("register_types", [](EmitterReceiverRegistry& registry) { - registry.add_emitter_receiver("nullptr_t"s, true); + registry.add_emitter_receiver("nullptr_t"s, true); registry.add_emitter_receiver("CloudPickleSerializedObject"s, true); registry.add_emitter_receiver("std::string"s, true); @@ -398,6 +400,7 @@ void init_io_context(py::module_& m) { "Return a reference to the static EmitterReceiverRegistry", py::return_value_policy::reference_internal); } +// NOLINTEND(readability-function-cognitive-complexity) PyInputContext::PyInputContext(ExecutionContext* execution_context, Operator* op, std::unordered_map>& inputs, diff --git a/python/holoscan/core/io_spec.cpp b/python/holoscan/core/io_spec.cpp index 1b07a935..d0751b1f 100644 --- a/python/holoscan/core/io_spec.cpp +++ b/python/holoscan/core/io_spec.cpp @@ -31,8 +31,7 @@ #include "io_spec_pydoc.hpp" #include "kwarg_handling.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -87,9 +86,9 @@ void init_io_spec(py::module_& m) { }, doc::IOSpec::doc_condition, py::return_value_policy::reference_internal) - // TODO: sphinx API doc build complains if more than one connector - // method has a docstring specified. For now just set the docstring for the - // first overload only and add information about the rest in the Notes section. + // TODO(unknown): sphinx API doc build complains if more than one connector + // method has a docstring specified. For now just set the docstring for the + // first overload only and add information about the rest in the Notes section. 
.def( "connector", // Note: The return type needs to be specified explicitly because pybind11 can't deduce it @@ -114,12 +113,12 @@ void init_io_spec(py::module_& m) { // Define IOSize constants in IOSpec module iospec .def_property_readonly_static( - "ANY_SIZE", [](py::object) { return IOSpec::kAnySize; }, "Any size") + "ANY_SIZE", [](const py::object&) { return IOSpec::kAnySize; }, "Any size") .def_property_readonly_static( "PRECEDING_COUNT", - [](py::object) { return IOSpec::kPrecedingCount; }, + [](const py::object&) { return IOSpec::kPrecedingCount; }, "Number of preceding connections") .def_property_readonly_static( - "SIZE_ONE", [](py::object) { return IOSpec::kSizeOne; }, "Size one"); + "SIZE_ONE", [](const py::object&) { return IOSpec::kSizeOne; }, "Size one"); } } // namespace holoscan diff --git a/python/holoscan/core/kwarg_handling.cpp b/python/holoscan/core/kwarg_handling.cpp index 0232a34c..80eacabc 100644 --- a/python/holoscan/core/kwarg_handling.cpp +++ b/python/holoscan/core/kwarg_handling.cpp @@ -19,6 +19,7 @@ #include #include // needed for py::cast to work with std::vector types +#include #include #include #include @@ -32,8 +33,8 @@ #include "kwarg_handling.hpp" #include "kwarg_handling_pydoc.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -64,10 +65,9 @@ inline YAML::Node cast_to_yaml_node(const py::handle& obj) { } void set_scalar_arg_via_dtype(const py::object& obj, const py::dtype& dt, Arg& out) { - std::string dtype_name = dt.attr("name").cast(); - if (dtype_name == "float16") { // currently promoting float16 scalars to float - out = cast_to_yaml_node(obj); - } else if (dtype_name == "float32") { + auto dtype_name = dt.attr("name").cast(); + if (dtype_name == "float16" || dtype_name == "float32") { + // currently promoting float16 scalars to float out = cast_to_yaml_node(obj); } else if (dtype_name == "float64") { out = cast_to_yaml_node(obj); @@ -92,7 +92,6 @@ void set_scalar_arg_via_dtype(const py::object& obj, const py::dtype& dt, Arg& o } else { throw std::runtime_error("unsupported dtype: "s + dtype_name + ", leaving Arg uninitialized"s); } - return; } template @@ -101,7 +100,7 @@ void set_vector_arg_via_numpy_array(const py::array& obj, Arg& out) { // for short arrays containing parameter settings to operators/resources if (obj.attr("ndim").cast() == 1) { YAML::Node yaml_node = YAML::Load("[]"); // Create an empty sequence - for (const auto& item : obj) yaml_node.push_back(cast_to_yaml_node(item)); + for (const auto& item : obj) { yaml_node.push_back(cast_to_yaml_node(item)); } out = yaml_node; } else if (obj.attr("ndim").cast() == 2) { YAML::Node yaml_node = YAML::Load("[]"); // Create an empty sequence @@ -118,6 +117,7 @@ void set_vector_arg_via_numpy_array(const py::array& obj, Arg& out) { } } +// NOLINTBEGIN(readability-function-cognitive-complexity) template void set_vector_arg_via_py_sequence(const py::sequence& seq, Arg& out) { // not intended for images or other large tensors, just @@ -142,7 +142,7 @@ void set_vector_arg_via_py_sequence(const py::sequence& seq, Arg& out) { std::vector v; size_t length = py::len(seq); v.reserve(length); - for (const auto& item : seq) v.push_back(item.cast()); + for (const auto& item : seq) { v.push_back(item.cast()); } out = v; } } else { @@ -161,11 +161,12 @@ void set_vector_arg_via_py_sequence(const 
py::sequence& seq, Arg& out) { } else { // 1d vector to handle a sequence of elements YAML::Node yaml_node = YAML::Load("[]"); // Create an empty sequence - for (const auto& item : seq) yaml_node.push_back(cast_to_yaml_node(item)); + for (const auto& item : seq) { yaml_node.push_back(cast_to_yaml_node(item)); } out = yaml_node; } } } +// NOLINTEND(readability-function-cognitive-complexity) void set_vector_arg_via_iterable(const py::object& obj, Arg& out) { py::sequence seq; @@ -200,7 +201,7 @@ void set_vector_arg_via_iterable(const py::object& obj, Arg& out) { throw std::runtime_error("Nested sequence of unsupported type."); } } else { - auto item = item0; + const auto& item = item0; if (py::isinstance(item)) { set_vector_arg_via_py_sequence(seq, out); } else if (py::isinstance(item)) { @@ -215,14 +216,12 @@ void set_vector_arg_via_iterable(const py::object& obj, Arg& out) { set_vector_arg_via_py_sequence>(seq, out); } } - return; } void set_vector_arg_via_dtype(const py::object& obj, const py::dtype& dt, Arg& out) { - std::string dtype_name = dt.attr("name").cast(); - if (dtype_name == "float16") { // currently promoting float16 scalars to float - set_vector_arg_via_numpy_array(obj, out); - } else if (dtype_name == "float32") { + auto dtype_name = dt.attr("name").cast(); + if (dtype_name == "float16" || dtype_name == "float32") { + // currently promoting float16 scalars to float set_vector_arg_via_numpy_array(obj, out); } else if (dtype_name == "float64") { set_vector_arg_via_numpy_array(obj, out); @@ -244,13 +243,12 @@ void set_vector_arg_via_dtype(const py::object& obj, const py::dtype& dt, Arg& o set_vector_arg_via_numpy_array(obj, out); } else if (dtype_name == "uint64") { set_vector_arg_via_numpy_array(obj, out); - } else if (dtype_name.find("str") == 0) { - py::list list_obj = obj.attr("tolist")().cast(); + } else if (dtype_name.find("str") == 0) { // NOLINT(abseil-string-find-startswith) + auto list_obj = obj.attr("tolist")().cast(); // TODO(grelee): set_vector_arg_via_seqeuence(list_obj, out); } else { throw std::runtime_error("unsupported dtype: "s + dtype_name + ", leaving Arg uninitialized"s); } - return; } template @@ -262,43 +260,47 @@ py::object vector_arg_to_py_object(Arg& arg) { } } -py::object yaml_node_to_py_object(YAML::Node node) { +// NOLINTBEGIN(misc-no-recursion) +py::object yaml_node_to_py_object(const YAML::Node& node) { if (node.IsSequence()) { py::list list; for (const auto& item : node) { list.append(yaml_node_to_py_object(item)); } return list; - } else if (node.IsMap()) { + } + if (node.IsMap()) { py::dict dict; for (const auto& item : node) { dict[py::str(item.first.as())] = yaml_node_to_py_object(item.second); } return dict; - } else if (node.IsScalar()) { + } + if (node.IsScalar()) { // Check if it is null. if (node.IsNull()) { return py::none(); } // Check if it is an integer. { - int64_t t; + int64_t t{}; if (YAML::convert::decode(node, t)) { return py::int_(t); } } // Check if it is a float. { - double t; + double t{}; if (YAML::convert::decode(node, t)) { return py::float_(t); } } // Check if it is a boolean. { - bool t; + bool t{}; if (YAML::convert::decode(node, t)) { return py::bool_(t); } } // Check if it is a string. { - std::string t; + std::string t{}; if (YAML::convert::decode(node, t)) { return py::str(t); } } } return py::none(); } +// NOLINTEND(misc-no-recursion) py::object arg_to_py_object(Arg& arg) { // Takes an Arg as input and returns an appropriate Python object equivalent. 
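For reference, the conversion that the yaml_node_to_py_object hunks above refactor boils down to a short recursion over yaml-cpp node types. The following is a minimal, self-contained sketch of that pattern, not the SDK implementation itself; the name yaml_to_py and the explicit includes are illustrative, and the scalar probes are ordered int, double, bool, string so that a value such as "3" round-trips as a Python int rather than a float.

#include <cstdint>
#include <string>
#include <pybind11/pybind11.h>
#include <yaml-cpp/yaml.h>

namespace py = pybind11;

// Illustrative sketch only: recursively convert a YAML node to a Python object.
py::object yaml_to_py(const YAML::Node& node) {  // pass by const reference: no copy, no mutation
  if (node.IsNull()) { return py::none(); }
  if (node.IsSequence()) {
    py::list list;
    for (const auto& item : node) { list.append(yaml_to_py(item)); }
    return list;
  }
  if (node.IsMap()) {
    py::dict dict;
    for (const auto& item : node) {
      dict[py::str(item.first.as<std::string>())] = yaml_to_py(item.second);
    }
    return dict;
  }
  if (node.IsScalar()) {
    // Value-initialize each decode target and try progressively looser interpretations.
    int64_t i{};
    if (YAML::convert<int64_t>::decode(node, i)) { return py::int_(i); }
    double d{};
    if (YAML::convert<double>::decode(node, d)) { return py::float_(d); }
    bool b{};
    if (YAML::convert<bool>::decode(node, b)) { return py::bool_(b); }
    std::string s;
    if (YAML::convert<std::string>::decode(node, s)) { return py::str(s); }
  }
  return py::none();
}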
@@ -306,6 +308,7 @@ py::object arg_to_py_object(Arg& arg) { auto t = arg.arg_type(); auto container_type = t.container_type(); auto element_type = t.element_type(); + // NOLINTBEGIN(clang-diagnostic-switch) if (container_type == ArgContainerType::kNative) { switch (element_type) { case ArgElementType::kBoolean: @@ -330,6 +333,10 @@ py::object arg_to_py_object(Arg& arg) { return py::cast(std::any_cast(arg.value())); case ArgElementType::kUnsigned64: return py::cast(std::any_cast(arg.value())); + case ArgElementType::kComplex64: + return py::cast(std::any_cast>(arg.value())); + case ArgElementType::kComplex128: + return py::cast(std::any_cast>(arg.value())); case ArgElementType::kString: return py::cast(std::any_cast(arg.value())); case ArgElementType::kYAMLNode: { @@ -362,11 +369,16 @@ py::object arg_to_py_object(Arg& arg) { return vector_arg_to_py_object(arg); case ArgElementType::kUnsigned64: return vector_arg_to_py_object(arg); + case ArgElementType::kComplex64: + return vector_arg_to_py_object>(arg); + case ArgElementType::kComplex128: + return vector_arg_to_py_object>(arg); case ArgElementType::kString: return vector_arg_to_py_object(arg); } // Not handled here: kHandle, kCustom, kIOSpec, kCondition, kResource, kYAMLNode } + // NOLINTEND(clang-diagnostic-switch) throw std::runtime_error(fmt::format( "Unable to convert Arg (name: {}, container_type: {}, element_type: {}) to Python object", @@ -375,7 +387,7 @@ py::object arg_to_py_object(Arg& arg) { static_cast(element_type))); } -Arg py_object_to_arg(py::object obj, std::string name = "") { +Arg py_object_to_arg(py::object obj, const std::string& name = ""s) { Arg out(name); if (py::isinstance(obj)) { out = cast_to_yaml_node(obj); @@ -396,7 +408,7 @@ Arg py_object_to_arg(py::object obj, std::string name = "") { out = cast_to_yaml_node(obj); } else if (PyComplex_Check(obj.ptr())) { throw std::runtime_error("complex value cannot be converted to Arg"); - } else if (PyNumber_Check(obj.ptr())) { + } else if (PyNumber_Check(obj.ptr()) == 1) { py::module_ np = py::module_::import("numpy"); auto numpy_generic = np.attr("generic"); if (py::isinstance(obj, numpy_generic)) { @@ -404,10 +416,9 @@ Arg py_object_to_arg(py::object obj, std::string name = "") { py::dtype dt = np.attr("dtype")(obj); set_scalar_arg_via_dtype(obj, dt, out); return out; - } else { - // cast any other unknown numeric type to double - out = cast_to_yaml_node(obj); } + // cast any other unknown numeric type to double + out = cast_to_yaml_node(obj); } else if (py::isinstance(obj)) { out = obj.cast>(); } else if (py::isinstance(obj)) { diff --git a/python/holoscan/core/kwarg_handling.hpp b/python/holoscan/core/kwarg_handling.hpp index 1e195c43..047b9ed2 100644 --- a/python/holoscan/core/kwarg_handling.hpp +++ b/python/holoscan/core/kwarg_handling.hpp @@ -42,9 +42,9 @@ template void set_vector_arg_via_py_sequence(const py::sequence&, Arg&); void set_vector_arg_via_iterable(const py::object&, Arg&); -Arg py_object_to_arg(py::object, std::string); +Arg py_object_to_arg(py::object, const std::string&); ArgList kwargs_to_arglist(const py::kwargs&); -py::object yaml_node_to_py_object(YAML::Node node); +py::object yaml_node_to_py_object(const YAML::Node& node); py::object arg_to_py_object(Arg&); py::dict arglist_to_kwargs(ArgList&); diff --git a/python/holoscan/core/metadata.cpp b/python/holoscan/core/metadata.cpp index f0c99fff..9cf287c5 100644 --- a/python/holoscan/core/metadata.cpp +++ b/python/holoscan/core/metadata.cpp @@ -31,18 +31,18 @@ #include "kwarg_handling.hpp" #include 
"metadata_pydoc.hpp" -using pybind11::literals::operator""_a; +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; // use a special class to differentiate a default value from Python's None -class _NoneValue {}; +class MetaNoneValue {}; namespace holoscan { void set_scalar_metadata_via_dtype(const py::object& obj, const py::dtype& dt, MetadataObject& out) { - std::string dtype_name = dt.attr("name").cast(); + auto dtype_name = dt.attr("name").cast(); if (dtype_name == "float32") { out.set_value(obj.cast()); } else if (dtype_name == "float64") { @@ -72,7 +72,6 @@ void set_scalar_metadata_via_dtype(const py::object& obj, const py::dtype& dt, } else { throw std::runtime_error("unsupported dtype: "s + dtype_name); } - return; } template @@ -82,16 +81,16 @@ void set_vector_metadata_via_numpy_array(const py::array& obj, MetadataObject& o if (obj.attr("ndim").cast() == 1) { std::vector v; v.reserve(obj.attr("size").cast()); - for (auto item : obj) v.push_back(item.cast()); + for (const auto& item : obj) { v.push_back(item.cast()); } out.set_value(v); } else if (obj.attr("ndim").cast() == 2) { std::vector> v; - std::vector shape = obj.attr("shape").cast>(); + auto shape = obj.attr("shape").cast>(); v.reserve(static_cast(shape[0])); - for (auto item : obj) { + for (const auto& item : obj) { std::vector vv; vv.reserve(static_cast(shape[1])); - for (auto inner_item : item) { vv.push_back(inner_item.cast()); } + for (const auto& inner_item : item) { vv.push_back(inner_item.cast()); } v.push_back(vv); } out.set_value(v); @@ -110,10 +109,10 @@ void set_vector_metadata_via_py_sequence(const py::sequence& seq, MetadataObject // Handle list of list and other sequence of sequence types. std::vector> v; v.reserve(static_cast(py::len(seq))); - for (auto item : seq) { + for (const auto& item : seq) { std::vector vv; vv.reserve(static_cast(py::len(item))); - for (auto inner_item : item) { vv.push_back(inner_item.cast()); } + for (const auto& inner_item : item) { vv.push_back(inner_item.cast()); } v.push_back(vv); } out.set_value(v); @@ -122,7 +121,7 @@ void set_vector_metadata_via_py_sequence(const py::sequence& seq, MetadataObject std::vector v; size_t length = py::len(seq); v.reserve(length); - for (auto item : seq) v.push_back(item.cast()); + for (const auto& item : seq) { v.push_back(item.cast()); } out.set_value(v); } } @@ -160,7 +159,7 @@ void set_vector_metadata_via_iterable(const py::object& obj, MetadataObject& out throw std::runtime_error("Nested sequence of unsupported type."); } } else { - auto item = item0; + const auto& item = item0; if (py::isinstance(item)) { set_vector_metadata_via_py_sequence(seq, out); } else if (py::isinstance(item)) { @@ -171,12 +170,11 @@ void set_vector_metadata_via_iterable(const py::object& obj, MetadataObject& out set_vector_metadata_via_py_sequence(seq, out); } } - return; } void set_vector_metadata_via_dtype(const py::object& obj, const py::dtype& dt, MetadataObject& out) { - std::string dtype_name = dt.attr("name").cast(); + auto dtype_name = dt.attr("name").cast(); if (dtype_name == "float32") { set_vector_metadata_via_numpy_array(obj, out); } else if (dtype_name == "float64") { @@ -206,7 +204,6 @@ void set_vector_metadata_via_dtype(const py::object& obj, const py::dtype& dt, } else { throw std::runtime_error("unsupported dtype: "s + dtype_name); } - return; } void py_object_to_metadata_object(MetadataObject& meta_obj, const py::object& value, @@ -246,9 +243,9 @@ void py_object_to_metadata_object(MetadataObject& 
meta_obj, const py::object& va auto data_ptr = std::make_shared(value); meta_obj.set_value(data_ptr); } - return; } +// NOLINTBEGIN(readability-function-cognitive-complexity) py::object metadata_obj_to_pyobject(MetadataObject& meta_obj) { std::any value = meta_obj.value(); const auto& id = value.type(); @@ -258,97 +255,112 @@ py::object metadata_obj_to_pyobject(MetadataObject& meta_obj) { } // For C++ types, support casting T, vector, and vector> types // where T is either std::string, bool or various integer or floating point types. - if (id == typeid(std::string)) { - return py::cast(std::any_cast(value)); - } else if (id == typeid(float)) { - return py::cast(std::any_cast(value)); - } else if (id == typeid(double)) { - return py::cast(std::any_cast(value)); - } else if (id == typeid(bool)) { - return py::cast(std::any_cast(value)); - } else if (id == typeid(int64_t)) { - return py::cast(std::any_cast(value)); - } else if (id == typeid(uint64_t)) { - return py::cast(std::any_cast(value)); - } else if (id == typeid(int32_t)) { - return py::cast(std::any_cast(value)); - } else if (id == typeid(uint32_t)) { - return py::cast(std::any_cast(value)); - } else if (id == typeid(int16_t)) { - return py::cast(std::any_cast(value)); - } else if (id == typeid(uint16_t)) { - return py::cast(std::any_cast(value)); - } else if (id == typeid(int8_t)) { - return py::cast(std::any_cast(value)); - } else if (id == typeid(uint8_t)) { - return py::cast(std::any_cast(value)); - } else if (id == typeid(std::complex)) { + if (id == typeid(std::string)) { return py::cast(std::any_cast(value)); } + if (id == typeid(float)) { return py::cast(std::any_cast(value)); } + if (id == typeid(double)) { return py::cast(std::any_cast(value)); } + if (id == typeid(bool)) { return py::cast(std::any_cast(value)); } + if (id == typeid(int64_t)) { return py::cast(std::any_cast(value)); } + if (id == typeid(uint64_t)) { return py::cast(std::any_cast(value)); } + if (id == typeid(int32_t)) { return py::cast(std::any_cast(value)); } + if (id == typeid(uint32_t)) { return py::cast(std::any_cast(value)); } + if (id == typeid(int16_t)) { return py::cast(std::any_cast(value)); } + if (id == typeid(uint16_t)) { return py::cast(std::any_cast(value)); } + if (id == typeid(int8_t)) { return py::cast(std::any_cast(value)); } + if (id == typeid(uint8_t)) { return py::cast(std::any_cast(value)); } + if (id == typeid(std::complex)) { return py::cast(std::any_cast>(value)); - } else if (id == typeid(std::complex)) { + } + if (id == typeid(std::complex)) { return py::cast(std::any_cast>(value)); - } else if (id == typeid(std::vector)) { + } + if (id == typeid(std::vector)) { return py::cast(std::any_cast>(value)); - } else if (id == typeid(std::vector)) { + } + if (id == typeid(std::vector)) { return py::cast(std::any_cast>(value)); - } else if (id == typeid(std::vector)) { + } + if (id == typeid(std::vector)) { return py::cast(std::any_cast>(value)); - } else if (id == typeid(std::vector)) { - return py::cast(std::any_cast>(value)); - } else if (id == typeid(std::vector)) { + } + if (id == typeid(std::vector)) { return py::cast(std::any_cast>(value)); } + if (id == typeid(std::vector)) { return py::cast(std::any_cast>(value)); - } else if (id == typeid(std::vector)) { + } + if (id == typeid(std::vector)) { return py::cast(std::any_cast>(value)); - } else if (id == typeid(std::vector)) { + } + if (id == typeid(std::vector)) { return py::cast(std::any_cast>(value)); - } else if (id == typeid(std::vector)) { + } + if (id == typeid(std::vector)) { 
return py::cast(std::any_cast>(value)); - } else if (id == typeid(std::vector)) { + } + if (id == typeid(std::vector)) { return py::cast(std::any_cast>(value)); - } else if (id == typeid(std::vector)) { + } + if (id == typeid(std::vector)) { return py::cast(std::any_cast>(value)); - } else if (id == typeid(std::vector)) { + } + if (id == typeid(std::vector)) { return py::cast(std::any_cast>(value)); - } else if (id == typeid(std::vector)) { + } + if (id == typeid(std::vector)) { return py::cast(std::any_cast>(value)); - } else if (id == typeid(std::vector>)) { + } + if (id == typeid(std::vector>)) { return py::cast(std::any_cast>>(value)); - } else if (id == typeid(std::vector>)) { + } + if (id == typeid(std::vector>)) { return py::cast(std::any_cast>>(value)); - } else if (id == typeid(std::vector>)) { + } + if (id == typeid(std::vector>)) { return py::cast(std::any_cast>>(value)); - } else if (id == typeid(std::vector>)) { + } + if (id == typeid(std::vector>)) { return py::cast(std::any_cast>>(value)); - } else if (id == typeid(std::vector>)) { + } + if (id == typeid(std::vector>)) { return py::cast(std::any_cast>>(value)); - } else if (id == typeid(std::vector>)) { + } + if (id == typeid(std::vector>)) { return py::cast(std::any_cast>>(value)); - } else if (id == typeid(std::vector>)) { + } + if (id == typeid(std::vector>)) { return py::cast(std::any_cast>>(value)); - } else if (id == typeid(std::vector>)) { + } + if (id == typeid(std::vector>)) { return py::cast(std::any_cast>>(value)); - } else if (id == typeid(std::vector>)) { + } + if (id == typeid(std::vector>)) { return py::cast(std::any_cast>>(value)); - } else if (id == typeid(std::vector>)) { + } + if (id == typeid(std::vector>)) { return py::cast(std::any_cast>>(value)); - } else if (id == typeid(std::vector>)) { + } + if (id == typeid(std::vector>)) { return py::cast(std::any_cast>>(value)); - } else if (id == typeid(std::vector>)) { + } + if (id == typeid(std::vector>)) { return py::cast(std::any_cast>>(value)); - } else if (id == typeid(std::vector>)) { + } + if (id == typeid(std::vector>)) { return py::cast(std::any_cast>>(value)); - } else if (id == typeid(std::vector>)) { + } + if (id == typeid(std::vector>)) { return py::cast(std::any_cast>>(value)); - } else if (id == typeid(std::vector>>)) { + } + if (id == typeid(std::vector>>)) { return py::cast(std::any_cast>>>(value)); - } else if (id == typeid(std::vector>>)) { + } + if (id == typeid(std::vector>>)) { return py::cast(std::any_cast>>>(value)); - } else { - return py::none(); } + return py::none(); } +// NOLINTEND(readability-function-cognitive-complexity) void init_metadata(py::module_& m) { - py::class_<_NoneValue>(m, "_NoneValue").def(py::init<>()); + py::class_(m, "MetaNoneValue").def(py::init<>()); py::enum_(m, "MetadataPolicy", doc::MetadataPolicy::doc_MetadataPolicy) .value("REJECT", MetadataPolicy::kReject) @@ -387,8 +399,8 @@ void init_metadata(py::module_& m) { [](MetadataDictionary& meta_dict) -> std::vector> { std::vector> items; items.reserve(meta_dict.size()); - for (auto& [key, value] : meta_dict) { - items.push_back({key, metadata_obj_to_pyobject(*value)}); + for (const auto& [key, value] : meta_dict) { + items.emplace_back(key, metadata_obj_to_pyobject(*value)); } return items; }, @@ -397,7 +409,7 @@ void init_metadata(py::module_& m) { "type_dict", [](MetadataDictionary& meta_dict) -> py::dict { py::dict type_dict; - for (auto& [key, v] : meta_dict) { + for (const auto& [key, v] : meta_dict) { type_dict[py::str(key)] = 
py::str(v->value().type().name()); } return type_dict; @@ -409,11 +421,8 @@ void init_metadata(py::module_& m) { const std::string& key, const py::object& default_value = py::none()) -> py::object { if (!meta_dict.has_key(key)) { - if (py::isinstance<_NoneValue>(default_value)) { - throw py::key_error(key); - } else { - return default_value; - } + if (py::isinstance(default_value)) { throw py::key_error(key); } + return default_value; } auto meta_obj = meta_dict.get(key); auto result = metadata_obj_to_pyobject(*meta_obj); @@ -421,7 +430,7 @@ void init_metadata(py::module_& m) { return result; }, "key"_a, - "default"_a = _NoneValue(), + "default"_a = MetaNoneValue(), doc::MetadataDictionary::doc_pop) .def( "set", @@ -432,7 +441,7 @@ void init_metadata(py::module_& m) { bool cast_to_cpp = false) { if (!cast_to_cpp) { auto data_ptr = std::make_shared(value); - meta_dict.set>(key, data_ptr); + meta_dict.set>(key, std::move(data_ptr)); } else { auto meta_obj = std::make_shared(); py_object_to_metadata_object(*meta_obj, value, dtype, cast_to_cpp); @@ -447,7 +456,7 @@ void init_metadata(py::module_& m) { .def("__setitem__", [](MetadataDictionary& meta_dict, const std::string& key, py::object& value) { auto data_ptr = std::make_shared(value); - meta_dict.set>(key, data_ptr); + meta_dict.set>(key, std::move(data_ptr)); }) .def_property("policy", py::overload_cast<>(&MetadataDictionary::policy, py::const_), diff --git a/python/holoscan/core/network_context.cpp b/python/holoscan/core/network_context.cpp index f0c5098f..480e6916 100644 --- a/python/holoscan/core/network_context.cpp +++ b/python/holoscan/core/network_context.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,8 +28,6 @@ #include "kwarg_handling.hpp" #include "network_context_pydoc.hpp" -using pybind11::literals::operator""_a; - namespace py = pybind11; namespace holoscan { @@ -41,12 +39,12 @@ class PyNetworkContext : public NetworkContext { // Define a kwargs-based constructor that can create an ArgList // for passing on to the variadic-template based constructor. 
- PyNetworkContext(const py::args& args, const py::kwargs& kwargs) : NetworkContext() { + PyNetworkContext(const py::args& args, const py::kwargs& kwargs) { using std::string_literals::operator""s; int n_fragments = 0; - for (auto& item : args) { - py::object arg_value = item.cast(); + for (const auto& item : args) { + auto arg_value = item.cast(); if (py::isinstance(arg_value)) { if (n_fragments > 0) { throw std::runtime_error("multiple Fragment objects provided"); } fragment_ = arg_value.cast(); @@ -56,8 +54,8 @@ class PyNetworkContext : public NetworkContext { } } for (const auto& [name, value] : kwargs) { - std::string kwarg_name = name.cast(); - py::object kwarg_value = value.cast(); + auto kwarg_name = name.cast(); + auto kwarg_value = value.cast(); if (kwarg_name == "name"s) { if (py::isinstance(kwarg_value)) { name_ = kwarg_value.cast(); @@ -99,7 +97,9 @@ void init_network_context(py::module_& m) { .def_property( "name", py::overload_cast<>(&NetworkContext::name, py::const_), - (NetworkContext & (NetworkContext::*)(const std::string&)&)&NetworkContext::name, + [](NetworkContext& c, const std::string& name) -> NetworkContext& { + return c.name(name); + }, doc::NetworkContext::doc_name) .def_property_readonly("fragment", py::overload_cast<>(&NetworkContext::fragment), diff --git a/python/holoscan/core/operator.cpp b/python/holoscan/core/operator.cpp index 1830ce25..5745d47f 100644 --- a/python/holoscan/core/operator.cpp +++ b/python/holoscan/core/operator.cpp @@ -10,7 +10,7 @@ * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +4 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ @@ -40,8 +40,7 @@ #include "kwarg_handling.hpp" #include "operator_pydoc.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -61,19 +60,19 @@ void init_operator(py::module_& m) { // it. // Otherwise, this method will return a new IOSpec object instead of a reference to // the existing one. - [](OperatorSpec& op, const std::string& name, py::object size) -> IOSpec& { + [](OperatorSpec& op, const std::string& name, const py::object& size) -> IOSpec& { // Check if 'size' is an int and convert to IOSpec::IOSize if necessary if (py::isinstance(size)) { - int size_int = size.cast(); + auto size_int = size.cast(); // Assuming IOSpec::IOSize can be constructed from an int return op.input(name, IOSpec::IOSize(size_int)); - } else if (py::isinstance(size)) { + } + if (py::isinstance(size)) { // Directly pass IOSpec::IOSize if 'size' is already the correct type return op.input(name, size.cast()); - } else { - throw std::runtime_error( - "Invalid type for 'size'. Expected 'int' or 'holoscan.core.IOSpec.IOSize'."); } + throw std::runtime_error( + "Invalid type for 'size'. 
Expected 'int' or 'holoscan.core.IOSpec.IOSize'."); }, "name"_a, py::kw_only(), @@ -132,10 +131,11 @@ void init_operator(py::module_& m) { operator_class .def(py::init(), doc::Operator::doc_Operator_args_kwargs) - .def_property("name", - py::overload_cast<>(&Operator::name, py::const_), - (Operator & (Operator::*)(const std::string&)) & Operator::name, - doc::Operator::doc_name) + .def_property( + "name", + py::overload_cast<>(&Operator::name, py::const_), + [](Operator& op, const std::string& name) -> Operator& { return op.name(name); }, + doc::Operator::doc_name) .def_property_readonly( "fragment", py::overload_cast<>(&Operator::fragment), doc::Operator::doc_fragment) .def_property("spec", @@ -220,7 +220,7 @@ void init_operator(py::module_& m) { .value("NATIVE", Operator::OperatorType::kNative) .value("GXF", Operator::OperatorType::kGXF) .value("VIRTUAL", Operator::OperatorType::kVirtual); -} +} // init_operator PyOperatorSpec::PyOperatorSpec(Fragment* fragment, py::object op) : OperatorSpec(fragment), py_op_(std::move(op)) {} @@ -233,7 +233,7 @@ void PyOperatorSpec::py_param(const std::string& name, const py::object& default std::string headline{""s}; std::string description{""s}; for (const auto& [kw_name, value] : kwargs) { - std::string param_name = kw_name.cast(); + auto param_name = kw_name.cast(); if (param_name == "headline") { headline = value.cast(); } else if (param_name == "description") { @@ -281,27 +281,25 @@ std::list>>& PyOperatorSpec::py_receivers() { } // PyOperator - -PyOperator::PyOperator(py::object op, Fragment* fragment, const py::args& args, +PyOperator::PyOperator(const py::object& op, Fragment* fragment, const py::args& args, const py::kwargs& kwargs) - : Operator() { + : py_op_(op), + py_compute_(py::getattr(op, "compute")), + py_initialize_(py::getattr(op, "initialize")), + py_start_(py::getattr(op, "start")), + py_stop_(py::getattr(op, "stop")) { using std::string_literals::operator""s; HOLOSCAN_LOG_TRACE("PyOperator::PyOperator()"); - py_op_ = op; - py_compute_ = py::getattr(op, "compute"); // cache the compute method - py_initialize_ = py::getattr(op, "initialize"); // cache the initialize method - py_start_ = py::getattr(op, "start"); // cache the start method - py_stop_ = py::getattr(op, "stop"); // cache the stop method fragment_ = fragment; // Store the application object to access the trace/profile functions - auto app = fragment_->application(); - py_app_ = static_cast(app); + auto* app = fragment_->application(); + py_app_ = dynamic_cast(app); // Parse args - for (auto& item : args) { - py::object arg_value = item.cast(); + for (const auto& item : args) { + auto arg_value = item.cast(); if (py::isinstance(arg_value)) { this->add_arg(arg_value.cast>()); } else if (py::isinstance(arg_value)) { @@ -319,8 +317,8 @@ PyOperator::PyOperator(py::object op, Fragment* fragment, const py::args& args, // Pars kwargs for (const auto& [name, value] : kwargs) { - std::string kwarg_name = name.cast(); - py::object kwarg_value = value.cast(); + auto kwarg_name = name.cast(); + auto kwarg_value = value.cast(); if (kwarg_name == "name"s) { if (py::isinstance(kwarg_value)) { this->name(kwarg_value.cast()); @@ -328,21 +326,16 @@ PyOperator::PyOperator(py::object op, Fragment* fragment, const py::args& args, throw std::runtime_error("name kwarg must be a string"); } } else if (kwarg_name == "fragment"s) { - if (py::isinstance(kwarg_value)) { - throw std::runtime_error( - "Cannot add kwarg fragment. 
Fragment can only be provided positionally"); - } else { - throw std::runtime_error("fragment kwarg must be a Fragment"); - } + throw std::runtime_error("fragment cannot be passed via a kwarg, only positionally"); } else if (py::isinstance(kwarg_value)) { // Set the condition's name to the kwarg name auto cond = kwarg_value.cast>(); - cond.get()->name(kwarg_name); + cond->name(kwarg_name); this->add_arg(cond); } else if (py::isinstance(kwarg_value)) { // Set the resource's name to the kwarg name auto resource = kwarg_value.cast>(); - resource.get()->name(kwarg_name); + resource->name(kwarg_name); this->add_arg(resource); } else { this->add_arg(py_object_to_arg(kwarg_value, kwarg_name)); @@ -350,7 +343,7 @@ PyOperator::PyOperator(py::object op, Fragment* fragment, const py::args& args, } // Set name if needed - if (name_ == "") { + if (name_.empty()) { static size_t op_number; op_number++; this->name("unnamed_operator_" + std::to_string(op_number)); @@ -424,6 +417,7 @@ PyOperator::TracingThreadLocal& PyOperator::get_tracing_data() { // Check if the module name starts with '_pydevd_bundle' which means that it is using // PyDevd debugger. If so, then we need to set the trace function to the current frame. auto module_name = trace_module.cast(); + // NOLINTNEXTLINE(abseil-string-find-str-contains) if (module_name.find("_pydevd_bundle") != std::string::npos) { if (data.pydevd_trace_func.is_none()) { // Get the trace function from the debugger @@ -474,7 +468,7 @@ void PyOperator::set_py_tracing() { // If tracing is not enabled, do nothing and return if (!tracing_data.in_tracing) { return; } - auto py_thread_state = _PyThreadState_UncheckedGet(); + auto* py_thread_state = _PyThreadState_UncheckedGet(); // If tracing_data.is_func_set is false, cache the current trace/profile functions for // the current thread. @@ -541,9 +535,11 @@ void PyOperator::set_py_tracing() { // https://github.com/python/cpython/blob/c184c6750e40ca4ffa4f62a5d145b892cbd066bc // /Doc/whatsnew/3.11.rst#L2301 // - tstate->frame is removed. + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) py_thread_state->cframe->current_frame = reinterpret_cast<_PyInterpreterFrame*>(tracing_data.py_last_frame); #else // < Python 3.11.0 + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) py_thread_state->frame = reinterpret_cast(tracing_data.py_last_frame); #endif @@ -620,7 +616,7 @@ void PyOperator::stop() { void PyOperator::compute(InputContext& op_input, OutputContext& op_output, ExecutionContext& context) { - auto gxf_context = context.context(); + auto* gxf_context = context.context(); // Get the compute method of the Python Operator class and call it py::gil_scoped_acquire scope_guard; diff --git a/python/holoscan/core/operator.hpp b/python/holoscan/core/operator.hpp index e7b39761..894fcc28 100644 --- a/python/holoscan/core/operator.hpp +++ b/python/holoscan/core/operator.hpp @@ -87,7 +87,8 @@ class PyOperator : public Operator { // Define a kwargs-based constructor that can create an ArgList // for passing on to the variadic-template based constructor. 
- PyOperator(py::object op, Fragment* fragment, const py::args& args, const py::kwargs& kwargs); + PyOperator(const py::object& op, Fragment* fragment, const py::args& args, + const py::kwargs& kwargs); // Override spec() method std::shared_ptr py_shared_spec(); @@ -150,11 +151,11 @@ class PyOperator : public Operator { ExecutionContext& context) override; private: - py::object py_op_ = py::none(); - py::object py_initialize_ = py::none(); - py::object py_start_ = py::none(); - py::object py_stop_ = py::none(); - py::object py_compute_ = py::none(); + py::object py_op_ = py::none(); ///> cache the Python operator + py::object py_initialize_ = py::none(); ///> cache the initialize method + py::object py_start_ = py::none(); ///> cache the start method + py::object py_stop_ = py::none(); ///> cache the stop method + py::object py_compute_ = py::none(); ///> cache the compute method /// Python application pointer to access the trace/profile functions PyApplication* py_app_ = nullptr; diff --git a/python/holoscan/core/resource.cpp b/python/holoscan/core/resource.cpp index b453c31f..fdc91919 100644 --- a/python/holoscan/core/resource.cpp +++ b/python/holoscan/core/resource.cpp @@ -31,8 +31,6 @@ #include "kwarg_handling.hpp" #include "resource_pydoc.hpp" -using pybind11::literals::operator""_a; - namespace py = pybind11; namespace holoscan { @@ -46,15 +44,14 @@ class PyResource : public Resource { // for passing on to the variadic-template based constructor. PyResource(py::object resource, Fragment* fragment, const py::args& args, const py::kwargs& kwargs) - : Resource() { + : py_resource_(std::move(resource)) { using std::string_literals::operator""s; - py_resource_ = std::move(resource); fragment_ = fragment; int n_fragments = 0; - for (auto& item : args) { - py::object arg_value = item.cast(); + for (const auto& item : args) { + auto arg_value = item.cast(); if (py::isinstance(arg_value)) { if (n_fragments > 0) { throw std::runtime_error("multiple Fragment objects provided"); } fragment_ = arg_value.cast(); @@ -64,8 +61,8 @@ class PyResource : public Resource { } } for (const auto& [name, value] : kwargs) { - std::string kwarg_name = name.cast(); - py::object kwarg_value = value.cast(); + auto kwarg_name = name.cast(); + auto kwarg_value = value.cast(); if (kwarg_name == "name"s) { if (py::isinstance(kwarg_value)) { name_ = kwarg_value.cast(); @@ -116,10 +113,11 @@ void init_resource(py::module_& m) { resource_class .def(py::init(), doc::Resource::doc_Resource_args_kwargs) - .def_property("name", - py::overload_cast<>(&Resource::name, py::const_), - (Resource & (Resource::*)(const std::string&)&)&Resource::name, - doc::Resource::doc_name) + .def_property( + "name", + py::overload_cast<>(&Resource::name, py::const_), + [](Resource& r, const std::string& name) -> Resource& { return r.name(name); }, + doc::Resource::doc_name) .def_property_readonly( "fragment", py::overload_cast<>(&Resource::fragment), doc::Resource::doc_fragment) .def_property("spec", diff --git a/python/holoscan/core/scheduler.cpp b/python/holoscan/core/scheduler.cpp index 45f20497..fced9921 100644 --- a/python/holoscan/core/scheduler.cpp +++ b/python/holoscan/core/scheduler.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,8 +28,6 @@ #include "kwarg_handling.hpp" #include "scheduler_pydoc.hpp" -using pybind11::literals::operator""_a; - namespace py = pybind11; namespace holoscan { @@ -41,12 +39,12 @@ class PyScheduler : public Scheduler { // Define a kwargs-based constructor that can create an ArgList // for passing on to the variadic-template based constructor. - PyScheduler(const py::args& args, const py::kwargs& kwargs) : Scheduler() { + PyScheduler(const py::args& args, const py::kwargs& kwargs) { using std::string_literals::operator""s; int n_fragments = 0; - for (auto& item : args) { - py::object arg_value = item.cast(); + for (const auto& item : args) { + auto arg_value = item.cast(); if (py::isinstance(arg_value)) { if (n_fragments > 0) { throw std::runtime_error("multiple Fragment objects provided"); } fragment_ = arg_value.cast(); @@ -56,8 +54,8 @@ class PyScheduler : public Scheduler { } } for (const auto& [name, value] : kwargs) { - std::string kwarg_name = name.cast(); - py::object kwarg_value = value.cast(); + auto kwarg_name = name.cast(); + auto kwarg_value = value.cast(); if (kwarg_name == "name"s) { if (py::isinstance(kwarg_value)) { name_ = kwarg_value.cast(); @@ -96,10 +94,11 @@ void init_scheduler(py::module_& m) { m, "Scheduler", doc::Scheduler::doc_Scheduler) .def(py::init(), doc::Scheduler::doc_Scheduler_args_kwargs) - .def_property("name", - py::overload_cast<>(&Scheduler::name, py::const_), - (Scheduler & (Scheduler::*)(const std::string&)&)&Scheduler::name, - doc::Scheduler::doc_name) + .def_property( + "name", + py::overload_cast<>(&Scheduler::name, py::const_), + [](Scheduler& s, const std::string& name) -> Scheduler& { return s.name(name); }, + doc::Scheduler::doc_name) .def_property_readonly( "fragment", py::overload_cast<>(&Scheduler::fragment), doc::Scheduler::doc_fragment) .def_property("spec", diff --git a/python/holoscan/core/tensor.cpp b/python/holoscan/core/tensor.cpp index 5e60527f..36c2574f 100644 --- a/python/holoscan/core/tensor.cpp +++ b/python/holoscan/core/tensor.cpp @@ -35,15 +35,14 @@ #include "tensor.hpp" #include "tensor_pydoc.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; namespace { -static constexpr const char* dlpack_capsule_name{"dltensor"}; -static constexpr const char* used_dlpack_capsule_name{"used_dltensor"}; +constexpr const char* dlpack_capsule_name{"dltensor"}; +constexpr const char* used_dlpack_capsule_name{"used_dltensor"}; } // namespace namespace holoscan { @@ -89,6 +88,7 @@ void init_tensor(py::module_& m) { .def_property_readonly( "data", [](const Tensor& t) { + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) return static_cast(reinterpret_cast(t.data())); }, doc::Tensor::doc_data) @@ -178,7 +178,10 @@ LazyDLManagedTensorDeleter::LazyDLManagedTensorDeleter() { // Register on_exit() to be called when the application exits. // Note that the child process will not call on_exit() when fork() is called and exit() is // called in the child process. 
- std::atexit(on_exit); + if (std::atexit(on_exit) != 0) { + HOLOSCAN_LOG_ERROR("Failed to register exit handler for LazyDLManagedTensorDeleter"); + // std::exit(EXIT_FAILURE); + } } s_is_running = true; @@ -234,10 +237,10 @@ void LazyDLManagedTensorDeleter::run() { lock.unlock(); // Call the deleter function for each pointer in the queue while (!local_queue.empty()) { - auto dl_managed_tensor_ptr = local_queue.front(); + auto* dl_managed_tensor_ptr = local_queue.front(); // Note: the deleter function can be nullptr (e.g. when the tensor is created from // __cuda_array_interface__ protocol) - if (dl_managed_tensor_ptr && dl_managed_tensor_ptr->deleter != nullptr) { + if (dl_managed_tensor_ptr != nullptr && dl_managed_tensor_ptr->deleter != nullptr) { // Call the deleter function with GIL acquired py::gil_scoped_acquire scope_guard; dl_managed_tensor_ptr->deleter(dl_managed_tensor_ptr); @@ -365,6 +368,7 @@ py::object PyTensor::from_dlpack_pyobj(const py::object& obj) { return py_tensor; } +// NOLINTBEGIN(readability-function-cognitive-complexity) std::shared_ptr PyTensor::from_array_interface(const py::object& obj, bool cuda) { auto memory_buf = std::make_shared(); memory_buf->obj_ref = obj; // hold obj to prevent it from being garbage collected @@ -392,7 +396,8 @@ std::shared_ptr PyTensor::from_array_interface(const py::object& obj, } } auto data_array = array_interface["data"].cast>(); - auto data_ptr = reinterpret_cast(data_array[0]); + // NOLINTNEXTLINE(performance-no-int-to-ptr,cppcoreguidelines-pro-type-reinterpret-cast) + auto* data_ptr = reinterpret_cast(data_array[0]); // bool data_readonly = data_array[1] > 0; // auto version = array_interface["version"].cast(); @@ -446,16 +451,18 @@ std::shared_ptr PyTensor::from_array_interface(const py::object& obj, "Invalid stream, valid stream should be None (no synchronization), 1 (legacy default " "stream), 2 " "(per-thread defaultstream), or a positive integer (stream pointer)"); - } else if (stream_id <= 2) { + } + if (stream_id <= 2) { stream_ptr = nullptr; } else { + // NOLINTNEXTLINE(performance-no-int-to-ptr,cppcoreguidelines-pro-type-reinterpret-cast) stream_ptr = reinterpret_cast(stream_id); } cudaStream_t curr_stream_ptr = nullptr; // legacy stream if (stream_id >= 0 && curr_stream_ptr != stream_ptr) { - cudaEvent_t curr_stream_event; + cudaEvent_t curr_stream_event{}; HOLOSCAN_CUDA_CALL_THROW_ERROR( cudaEventCreateWithFlags(&curr_stream_event, cudaEventDisableTiming), "Failure during call to cudaEventCreateWithFlags"); @@ -470,18 +477,20 @@ std::shared_ptr PyTensor::from_array_interface(const py::object& obj, } } // Create DLManagedTensor object - auto dl_managed_tensor_ctx = new DLManagedTensorContext; + // NOLINTNEXTLINE(cppcoreguidelines-owning-memory) + auto* dl_managed_tensor_ctx = new DLManagedTensorContext; auto& dl_managed_tensor = dl_managed_tensor_ctx->tensor; dl_managed_tensor_ctx->memory_ref = memory_buf; dl_managed_tensor.manager_ctx = dl_managed_tensor_ctx; dl_managed_tensor.deleter = [](DLManagedTensor* self) { - auto dl_managed_tensor_ctx = static_cast(self->manager_ctx); + auto* dl_managed_tensor_ctx = static_cast(self->manager_ctx); // Note: since 'memory_ref' is maintaining python object reference, we should acquire GIL in // case this function is called from another non-python thread, before releasing 'memory_ref'. 
py::gil_scoped_acquire scope_guard; dl_managed_tensor_ctx->memory_ref.reset(); + // NOLINTNEXTLINE(cppcoreguidelines-owning-memory) delete dl_managed_tensor_ctx; }; @@ -494,6 +503,7 @@ std::shared_ptr PyTensor::from_array_interface(const py::object& obj, return tensor; } +// NOLINTEND(readability-function-cognitive-complexity) std::shared_ptr PyTensor::from_dlpack(const py::object& obj) { // Pybind11 doesn't have a way to get/set a pointer with a name so we have to use the C API @@ -507,7 +517,7 @@ std::shared_ptr PyTensor::from_dlpack(const py::object& obj) { auto dlpack_device = py::cast(dlpack_device_func()); // https://dmlc.github.io/dlpack/latest/c_api.html#_CPPv48DLDevice DLDeviceType device_type = static_cast(dlpack_device[0].cast()); - int32_t device_id = dlpack_device[1].cast(); + auto device_id = dlpack_device[1].cast(); DLDevice device = {device_type, device_id}; @@ -551,7 +561,7 @@ std::shared_ptr PyTensor::from_dlpack(const py::object& obj) { PyObject* dlpack_capsule_ptr = dlpack_obj.ptr(); - if (!PyCapsule_IsValid(dlpack_capsule_ptr, dlpack_capsule_name)) { + if (PyCapsule_IsValid(dlpack_capsule_ptr, dlpack_capsule_name) == 0) { const char* capsule_name = PyCapsule_GetName(dlpack_capsule_ptr); throw std::runtime_error( fmt::format("Received an invalid DLPack capsule ('{}'). You might have already consumed " @@ -559,7 +569,7 @@ std::shared_ptr PyTensor::from_dlpack(const py::object& obj) { capsule_name)); } - DLManagedTensor* dl_managed_tensor = + auto* dl_managed_tensor = static_cast(PyCapsule_GetPointer(dlpack_capsule_ptr, dlpack_capsule_name)); // Set device @@ -596,7 +606,7 @@ py::tuple PyTensor::dlpack_device(const py::object& obj) { return py_dlpack_device(tensor.get()); } -bool is_tensor_like(py::object value) { +bool is_tensor_like(const py::object& value) { return ((py::hasattr(value, "__dlpack__") && py::hasattr(value, "__dlpack_device__")) || py::isinstance(value) || py::hasattr(value, "__cuda_array_interface__") || diff --git a/python/holoscan/core/tensor.hpp b/python/holoscan/core/tensor.hpp index 4f4c84d9..8308e139 100644 --- a/python/holoscan/core/tensor.hpp +++ b/python/holoscan/core/tensor.hpp @@ -182,7 +182,7 @@ class PyTensor : public Tensor { static py::tuple dlpack_device(const py::object& obj); }; -bool is_tensor_like(py::object value); +bool is_tensor_like(const py::object& value); } // namespace holoscan diff --git a/python/holoscan/decorator.py b/python/holoscan/decorator.py index d183ee61..3ccc951b 100644 --- a/python/holoscan/decorator.py +++ b/python/holoscan/decorator.py @@ -28,7 +28,7 @@ import inspect import textwrap from dataclasses import dataclass, field -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Optional, Union import cupy as cp import numpy as np @@ -91,9 +91,9 @@ class Input: name: str arg_map: Optional[Union[str, dict[str, str]]] = () condition_type: Optional[ConditionType] = None - condition_kwargs: Dict[str, Any] = field(default_factory=dict) + condition_kwargs: dict[str, Any] = field(default_factory=dict) connector_type: Optional[IOSpec.ConnectorType] = None - connector_kwargs: Dict[str, Any] = field(default_factory=dict) + connector_kwargs: dict[str, Any] = field(default_factory=dict) def create_input(self, spec: OperatorSpec) -> IOSpec: iospec = spec.input(self.name) @@ -130,11 +130,11 @@ class Output: """ name: str - tensor_names: Optional[Union[str, Tuple[str]]] = () + tensor_names: Optional[Union[str, tuple[str]]] = () condition_type: Optional[ConditionType] = None - condition_kwargs: Dict[str, 
Any] = field(default_factory=dict) + condition_kwargs: dict[str, Any] = field(default_factory=dict) connector_type: Optional[IOSpec.ConnectorType] = None - connector_kwargs: Dict[str, Any] = field(default_factory=dict) + connector_kwargs: dict[str, Any] = field(default_factory=dict) def create_output(self, spec: OperatorSpec) -> IOSpec: iospec = spec.output(self.name) @@ -210,8 +210,8 @@ def visit(self, node): def create_op( function_or_class=None, - inputs: Union[str, Input, Tuple[Union[str, Input]]] = (), - outputs: Union[str, Output, Tuple[Union[str, Output]]] = (), + inputs: Union[str, Input, tuple[Union[str, Input]]] = (), + outputs: Union[str, Output, tuple[Union[str, Output]]] = (), cast_tensors=True, ): """Decorator for creating an operator from a function or a class. @@ -488,7 +488,7 @@ def compute(self, op_input, op_output, context): out = self.func(*self.func_args, **self.fixed_kwargs, **self.dynamic_kwargs) # if the output is a tuple and there is >1 port, we distribute the outputs - if isinstance(out, Tuple) and (len(self.output_tensor_map) > 1): + if isinstance(out, tuple) and (len(self.output_tensor_map) > 1): # for tuple case, each port should correspond to each output tuple element if any([len(names) > 1 for names in self.output_tensor_map.values()]): raise ValueError( diff --git a/python/holoscan/executors/executors.cpp b/python/holoscan/executors/executors.cpp index 97db35dd..aa6749b5 100644 --- a/python/holoscan/executors/executors.cpp +++ b/python/holoscan/executors/executors.cpp @@ -25,11 +25,7 @@ #include "holoscan/core/fragment.hpp" #include "holoscan/core/graph.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; - -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; diff --git a/python/holoscan/graphs/graphs.cpp b/python/holoscan/graphs/graphs.cpp index 5ed0a8ef..55a294b7 100644 --- a/python/holoscan/graphs/graphs.cpp +++ b/python/holoscan/graphs/graphs.cpp @@ -35,16 +35,15 @@ #include "holoscan/core/operator.hpp" #include "holoscan/core/operator_spec.hpp" -using pybind11::literals::operator""_a; +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) - -namespace py = pybind11; +namespace py = pybind11; // NOLINT(misc-unused-alias-decls) +// NOLINTNEXTLINE(modernize-concat-nested-namespaces) namespace PYBIND11_NAMESPACE { namespace detail { +// NOLINTBEGIN(altera-struct-pack-align) template struct graph_caster { public: @@ -65,7 +64,7 @@ struct graph_caster { * std::vector instance or return false upon failure. The * second argument indicates whether implicit conversions should be applied. */ - bool load(handle src, bool) { + bool load([[maybe_unused]] handle src, [[maybe_unused]] bool use_implicit) { // not implemented return false; } @@ -83,7 +82,7 @@ struct graph_caster { for (auto&& value : src) { auto value_ = reinterpret_steal(value_conv::cast(std::forward(value), policy, parent)); - if (!value_) { return handle(); } + if (!value_) { return {}; } PyList_SET_ITEM(out.ptr(), index++, value_.release().ptr()); // steals a reference } return out.release(); @@ -97,6 +96,7 @@ class type_caster> template <> class type_caster> : public graph_caster<::holoscan::FragmentGraph::NodeType> {}; +// NOLINTEND(altera-struct-pack-align) } // namespace detail } // namespace PYBIND11_NAMESPACE @@ -164,6 +164,7 @@ PYBIND11_MODULE(_graphs, m) { .. 
currentmodule:: _graphs )pbdoc"; + // NOLINTBEGIN(bugprone-unused-raii) py::class_(m, "OperatorNodeType"); py::class_(m, "OperatorEdgeDataElementType"); py::class_(m, "OperatorEdgeDataType"); @@ -176,6 +177,7 @@ PYBIND11_MODULE(_graphs, m) { // py::class_(m, "FragmentEdgeDataType"); py::class_>( m, "FragmentGraph", doc::Graph::doc_Graph); + // NOLINTEND(bugprone-unused-raii) py::class_>( m, "OperatorFlowGraph", doc::FlowGraph::doc_FlowGraph) diff --git a/python/holoscan/gxf/entity.cpp b/python/holoscan/gxf/entity.cpp index ed792fbc..181c76fc 100644 --- a/python/holoscan/gxf/entity.cpp +++ b/python/holoscan/gxf/entity.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,7 +28,7 @@ #include "holoscan/core/domain/tensor.hpp" #include "holoscan/core/gxf/entity.hpp" -using pybind11::literals::operator""_a; +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; diff --git a/python/holoscan/gxf/gxf.cpp b/python/holoscan/gxf/gxf.cpp index d5bf063b..965f80f1 100644 --- a/python/holoscan/gxf/gxf.cpp +++ b/python/holoscan/gxf/gxf.cpp @@ -39,11 +39,7 @@ #include "gxf/core/gxf.h" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; - -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -68,13 +64,14 @@ PYBIND11_MODULE(_gxf, m) { init_entity(m); - // TODO: This method can be removed once Executor::extension_manager(), + // TODO(unknown): `load_extensions` can be removed once Executor::extension_manager(), // ExtensionManager, GXFExtensionManager are exposed to Python. 
m.def( "load_extensions", [](uint64_t context, const std::vector& extension_filenames, const std::vector& manifest_filenames) { + // NOLINTNEXTLINE(performance-no-int-to-ptr,cppcoreguidelines-pro-type-reinterpret-cast) gxf::GXFExtensionManager extension_manager(reinterpret_cast(context)); for (const auto& extension_filename : extension_filenames) { extension_manager.load_extension(extension_filename); diff --git a/python/holoscan/logger/logger.cpp b/python/holoscan/logger/logger.cpp index 705a0a4a..2ac6784a 100644 --- a/python/holoscan/logger/logger.cpp +++ b/python/holoscan/logger/logger.cpp @@ -22,9 +22,6 @@ #include "holoscan/logger/logger.hpp" #include "logger_pydoc.hpp" -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) - namespace py = pybind11; namespace holoscan { diff --git a/python/holoscan/network_contexts/network_contexts.cpp b/python/holoscan/network_contexts/network_contexts.cpp index 4645d2c6..fa8af86b 100644 --- a/python/holoscan/network_contexts/network_contexts.cpp +++ b/python/holoscan/network_contexts/network_contexts.cpp @@ -29,11 +29,8 @@ #include "holoscan/core/network_contexts/gxf/ucx_context.hpp" #include "holoscan/core/resources/gxf/ucx_entity_serializer.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; - -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -56,14 +53,14 @@ class PyUcxContext : public UcxContext { using UcxContext::UcxContext; // Define a constructor that fully initializes the object. - PyUcxContext(Fragment* fragment, std::shared_ptr serializer = nullptr, - const std::string& name = "ucx_context") - : UcxContext() { + explicit PyUcxContext(Fragment* fragment, + std::shared_ptr serializer = nullptr, + const std::string& name = "ucx_context") { if (serializer) { this->add_arg(Arg{"serializer", serializer}); } name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; // End of trampoline classes for handling Python kwargs diff --git a/python/holoscan/operators/aja_source/aja_source.cpp b/python/holoscan/operators/aja_source/aja_source.cpp index 534531ca..40726d69 100644 --- a/python/holoscan/operators/aja_source/aja_source.cpp +++ b/python/holoscan/operators/aja_source/aja_source.cpp @@ -18,10 +18,13 @@ #include #include +#include #include #include #include +#include #include +#include #include #include "../operator_util.hpp" @@ -32,11 +35,8 @@ #include "holoscan/core/operator_spec.hpp" #include "holoscan/operators/aja_source/aja_source.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; - -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -44,23 +44,23 @@ namespace holoscan::ops { namespace { -static std::unordered_map const NTV2ChannelMapping = { - {"NTV2_CHANNEL1", NTV2Channel::NTV2_CHANNEL1}, - {"NTV2_CHANNEL2", NTV2Channel::NTV2_CHANNEL2}, - {"NTV2_CHANNEL3", NTV2Channel::NTV2_CHANNEL3}, - {"NTV2_CHANNEL4", NTV2Channel::NTV2_CHANNEL4}, - {"NTV2_CHANNEL5", NTV2Channel::NTV2_CHANNEL5}, - {"NTV2_CHANNEL6", NTV2Channel::NTV2_CHANNEL6}, - {"NTV2_CHANNEL7", NTV2Channel::NTV2_CHANNEL7}, - {"NTV2_CHANNEL8", 
NTV2Channel::NTV2_CHANNEL8}}; - -static NTV2Channel ToNTV2Channel(const std::string& value) { - auto it = NTV2ChannelMapping.find(value); - if (it != NTV2ChannelMapping.end()) { - return it->second; - } else { - return NTV2Channel::NTV2_CHANNEL_INVALID; +// using constexpr constructor instead of unordered_map here to make clang-tidy happy +// (avoids warning of type: fuchsia-statically-constructed-objects) +constexpr std::array, 8> NTV2ChannelMapping = { + {{"NTV2_CHANNEL1", NTV2Channel::NTV2_CHANNEL1}, + {"NTV2_CHANNEL2", NTV2Channel::NTV2_CHANNEL2}, + {"NTV2_CHANNEL3", NTV2Channel::NTV2_CHANNEL3}, + {"NTV2_CHANNEL4", NTV2Channel::NTV2_CHANNEL4}, + {"NTV2_CHANNEL5", NTV2Channel::NTV2_CHANNEL5}, + {"NTV2_CHANNEL6", NTV2Channel::NTV2_CHANNEL6}, + {"NTV2_CHANNEL7", NTV2Channel::NTV2_CHANNEL7}, + {"NTV2_CHANNEL8", NTV2Channel::NTV2_CHANNEL8}}}; + +constexpr NTV2Channel ToNTV2Channel(std::string_view value) { + for (const auto& [name, channel] : NTV2ChannelMapping) { + if (name == value) { return channel; } } + return NTV2Channel::NTV2_CHANNEL_INVALID; } } // namespace @@ -83,10 +83,10 @@ class PyAJASourceOp : public AJASourceOp { // Define a constructor that fully initializes the object. PyAJASourceOp( Fragment* fragment, const py::args& args, const std::string& device = "0"s, - const std::variant channel = NTV2Channel::NTV2_CHANNEL1, + const std::variant& channel = NTV2Channel::NTV2_CHANNEL1, uint32_t width = 1920, uint32_t height = 1080, uint32_t framerate = 60, bool interlaced = false, bool rdma = false, bool enable_overlay = false, - const std::variant overlay_channel = NTV2Channel::NTV2_CHANNEL2, + const std::variant& overlay_channel = NTV2Channel::NTV2_CHANNEL2, bool overlay_rdma = true, const std::string& name = "aja_source") : AJASourceOp(ArgList{Arg{"device", device}, Arg{"width", width}, @@ -110,7 +110,7 @@ class PyAJASourceOp : public AJASourceOp { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; diff --git a/python/holoscan/operators/bayer_demosaic/bayer_demosaic.cpp b/python/holoscan/operators/bayer_demosaic/bayer_demosaic.cpp index 0f702d2d..3b320582 100644 --- a/python/holoscan/operators/bayer_demosaic/bayer_demosaic.cpp +++ b/python/holoscan/operators/bayer_demosaic/bayer_demosaic.cpp @@ -30,11 +30,8 @@ #include "holoscan/core/resources/gxf/cuda_stream_pool.hpp" #include "holoscan/operators/bayer_demosaic/bayer_demosaic.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; - -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -74,7 +71,7 @@ class PyBayerDemosaicOp : public BayerDemosaicOp { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; diff --git a/python/holoscan/operators/format_converter/format_converter.cpp b/python/holoscan/operators/format_converter/format_converter.cpp index d3d6b61a..43196460 100644 --- a/python/holoscan/operators/format_converter/format_converter.cpp +++ b/python/holoscan/operators/format_converter/format_converter.cpp @@ -33,11 +33,8 @@ #include "holoscan/core/resources/gxf/cuda_stream_pool.hpp" #include "holoscan/operators/format_converter/format_converter.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; - -#define STRINGIFY(x) #x -#define 
MACRO_STRINGIFY(x) STRINGIFY(x) +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -62,10 +59,10 @@ class PyFormatConverterOp : public FormatConverterOp { PyFormatConverterOp(Fragment* fragment, const py::args& args, std::shared_ptr pool, const std::string& out_dtype, const std::string& in_dtype = "", const std::string& in_tensor_name = "", - const std::string& out_tensor_name = "", float scale_min = 0.f, - float scale_max = 1.f, uint8_t alpha_value = static_cast(255), + const std::string& out_tensor_name = "", float scale_min = 0.F, + float scale_max = 1.F, uint8_t alpha_value = static_cast(255), int32_t resize_height = 0, int32_t resize_width = 0, int32_t resize_mode = 0, - const std::vector out_channel_order = std::vector{}, + const std::vector& out_channel_order = std::vector{}, std::shared_ptr cuda_stream_pool = nullptr, const std::string& name = "format_converter") : FormatConverterOp(ArgList{Arg{"in_tensor_name", in_tensor_name}, @@ -85,7 +82,7 @@ class PyFormatConverterOp : public FormatConverterOp { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; @@ -122,8 +119,8 @@ PYBIND11_MODULE(_format_converter, m) { "in_dtype"_a = ""s, "in_tensor_name"_a = ""s, "out_tensor_name"_a = ""s, - "scale_min"_a = 0.f, - "scale_max"_a = 1.f, + "scale_min"_a = 0.F, + "scale_max"_a = 1.F, "alpha_value"_a = static_cast(255), "resize_height"_a = 0, "resize_width"_a = 0, diff --git a/python/holoscan/operators/gxf_codelet/gxf_codelet.cpp b/python/holoscan/operators/gxf_codelet/gxf_codelet.cpp index fafd5533..6a2ff841 100644 --- a/python/holoscan/operators/gxf_codelet/gxf_codelet.cpp +++ b/python/holoscan/operators/gxf_codelet/gxf_codelet.cpp @@ -30,11 +30,8 @@ #include "holoscan/core/resources/gxf/cuda_stream_pool.hpp" #include "holoscan/operators/gxf_codelet/gxf_codelet.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; - -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -56,12 +53,11 @@ class PyGXFCodeletOp : public GXFCodeletOp { using GXFCodeletOp::GXFCodeletOp; // Define a constructor that fully initializes the object. 
- PyGXFCodeletOp(py::object op, Fragment* fragment, const std::string& gxf_typename, + PyGXFCodeletOp(const py::object& op, Fragment* fragment, const std::string& gxf_typename, const py::args& args, const std::string& name, const py::kwargs& kwargs) - : GXFCodeletOp(gxf_typename.c_str()) { - py_op_ = op; - py_initialize_ = py::getattr(op, "initialize"); // cache the initialize method - + : GXFCodeletOp(gxf_typename.c_str()), + py_op_(op), + py_initialize_(py::getattr(op, "initialize")) { add_positional_condition_and_resource_args(this, args); add_kwargs(this, kwargs); @@ -84,7 +80,7 @@ class PyGXFCodeletOp : public GXFCodeletOp { GXFCodeletOp::initialize(); } - protected: + private: py::object py_op_ = py::none(); py::object py_initialize_ = py::none(); }; diff --git a/python/holoscan/operators/holoviz/__init__.py b/python/holoscan/operators/holoviz/__init__.py index 8cf552d8..3e578ac9 100644 --- a/python/holoscan/operators/holoviz/__init__.py +++ b/python/holoscan/operators/holoviz/__init__.py @@ -172,7 +172,7 @@ def __init__( if isinstance(receiver, str): continue # skip # raise NotImpelementedError( - # "TODO: need to enable access to self.spec for the OperatorSpec" # noqa: FIX002 + # "TODO(unknown): need to enable access to self.spec for the OperatorSpec" # noqa: FIX002 # ) # receiver = IOSpec( # op_spec=self.spec, diff --git a/python/holoscan/operators/holoviz/holoviz.cpp b/python/holoscan/operators/holoviz/holoviz.cpp index 890bb008..fabf091d 100644 --- a/python/holoscan/operators/holoviz/holoviz.cpp +++ b/python/holoscan/operators/holoviz/holoviz.cpp @@ -29,7 +29,7 @@ #include // the default range for enums is 128 which is not enough for the Key enum, increase to 512 -#define MAGIC_ENUM_RANGE_MAX 512 +#define MAGIC_ENUM_RANGE_MAX 512 // NOLINT(cppcoreguidelines-macro-usage) #include #include "../operator_util.hpp" @@ -50,11 +50,8 @@ #include "holoscan/operators/holoviz/codecs.hpp" #include "holoscan/operators/holoviz/holoviz.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; - -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -100,16 +97,17 @@ class PyHolovizOp : public HolovizOp { std::vector receivers = std::vector(), const std::vector& tensors = std::vector(), const std::vector>& color_lut = std::vector>(), - const std::string& window_title = "Holoviz", const std::string& display_name = "", - uint32_t width = 1920, uint32_t height = 1080, float framerate = 60.f, + const std::string& window_title = "Holoviz"s, const std::string& display_name = ""s, + uint32_t width = 1920, uint32_t height = 1080, float framerate = 60.F, bool use_exclusive_display = false, bool fullscreen = false, bool headless = false, bool framebuffer_srgb = false, bool vsync = false, ColorSpace display_color_space = ColorSpace::AUTO, bool enable_render_buffer_input = false, bool enable_render_buffer_output = false, bool enable_camera_pose_output = false, - const std::string& camera_pose_output_type = "projection_matrix", - const std::array& camera_eye = {0.f, 0.f, 1.f}, - const std::array& camera_look_at = {0.f, 0.f, 0.f}, - const std::array& camera_up = {0.f, 1.f, 1.f}, + const std::string& camera_pose_output_type = "projection_matrix"s, + const std::array& camera_eye = {0.F, 0.F, 1.F}, + const std::array& camera_look_at = {0.F, 0.F, 0.F}, + const std::array& camera_up = {0.F, 1.F, 
1.F}, + // NOLINTBEGIN(performance-unnecessary-value-param) KeyCallbackFunction key_callback = KeyCallbackFunction(), UnicodeCharCallbackFunction unicode_char_callback = UnicodeCharCallbackFunction(), MouseButtonCallbackFunction mouse_button_callback = MouseButtonCallbackFunction(), @@ -117,7 +115,8 @@ class PyHolovizOp : public HolovizOp { CursorPosCallbackFunction cursor_pos_callback = CursorPosCallbackFunction(), FramebufferSizeCallbackFunction framebuffer_size_callback = FramebufferSizeCallbackFunction(), WindowSizeCallbackFunction window_size_callback = WindowSizeCallbackFunction(), - const std::string& font_path = "", + // NOLINTEND(performance-unnecessary-value-param) + const std::string& font_path = ""s, std::shared_ptr cuda_stream_pool = nullptr, const std::string& name = "holoviz_op") : HolovizOp(ArgList{Arg{"allocator", allocator}, @@ -143,8 +142,8 @@ class PyHolovizOp : public HolovizOp { Arg{"font_path", font_path}}) { // only append tensors argument if it is not empty // avoids [holoscan] [error] [gxf_operator.hpp:126] Unable to handle parameter 'tensors' - if (tensors.size() > 0) { this->add_arg(Arg{"tensors", tensors}); } - if (receivers.size() > 0) { this->add_arg(Arg{"receivers", receivers}); } + if (!tensors.empty()) { this->add_arg(Arg{"tensors", tensors}); } + if (!receivers.empty()) { this->add_arg(Arg{"receivers", receivers}); } if (cuda_stream_pool) { this->add_arg(Arg{"cuda_stream_pool", cuda_stream_pool}); } // check if callbacks are provided, for each callback take the GIL before calling the function @@ -190,18 +189,17 @@ class PyHolovizOp : public HolovizOp { })}); } if (framebuffer_size_callback) { - this->add_arg( - Arg{"framebuffer_size_callback", - FramebufferSizeCallbackFunction([framebuffer_size_callback](int width, int height) { - py::gil_scoped_acquire guard; - framebuffer_size_callback(width, height); - })}); + this->add_arg(Arg{"framebuffer_size_callback", + FramebufferSizeCallbackFunction([framebuffer_size_callback](int w, int h) { + py::gil_scoped_acquire guard; + framebuffer_size_callback(w, h); + })}); } if (window_size_callback) { this->add_arg(Arg{"window_size_callback", - WindowSizeCallbackFunction([window_size_callback](int width, int height) { + WindowSizeCallbackFunction([window_size_callback](int w, int h) { py::gil_scoped_acquire guard; - window_size_callback(width, height); + window_size_callback(w, h); })}); } @@ -209,7 +207,7 @@ class PyHolovizOp : public HolovizOp { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; @@ -287,10 +285,10 @@ PYBIND11_MODULE(_holoviz, m) { "enable_render_buffer_input"_a = false, "enable_render_buffer_output"_a = false, "enable_camera_pose_output"_a = false, - "camera_pose_output_type"_a = "projection_matrix", - "camera_eye"_a = std::array{0.f, 0.f, 1.f}, - "camera_look_at"_a = std::array{0.f, 0.f, 0.f}, - "camera_up"_a = std::array{0.f, 1.f, 1.f}, + "camera_pose_output_type"_a = "projection_matrix"s, + "camera_eye"_a = std::array{0.F, 0.F, 1.F}, + "camera_look_at"_a = std::array{0.F, 0.F, 0.F}, + "camera_up"_a = std::array{0.F, 1.F, 1.F}, "key_callback"_a = HolovizOp::KeyCallbackFunction(), "unicode_char_callback"_a = HolovizOp::UnicodeCharCallbackFunction(), "mouse_button_callback"_a = HolovizOp::MouseButtonCallbackFunction(), @@ -298,7 +296,7 @@ PYBIND11_MODULE(_holoviz, m) { "cursor_pos_callback"_a = HolovizOp::CursorPosCallbackFunction(), "framebuffer_size_callback"_a = HolovizOp::FramebufferSizeCallbackFunction(), "window_size_callback"_a 
= HolovizOp::WindowSizeCallbackFunction(), - "font_path"_a = "", + "font_path"_a = ""s, "cuda_stream_pool"_a = py::none(), "name"_a = "holoviz_op"s, doc::HolovizOp::doc_HolovizOp); diff --git a/python/holoscan/operators/holoviz/pydoc.hpp b/python/holoscan/operators/holoviz/pydoc.hpp index 0d6c81e2..4629155a 100644 --- a/python/holoscan/operators/holoviz/pydoc.hpp +++ b/python/holoscan/operators/holoviz/pydoc.hpp @@ -260,7 +260,7 @@ The details of the dictionary is as follows: `depth_map_color`. (default: `auto_detect`). - type: ``str`` -- **color**: RGBA color of rendered geometry (default: ``[1.f, 1.f, 1.f, 1.f]``) +- **color**: RGBA color of rendered geometry (default: ``[1.F, 1.F, 1.F, 1.F]``) - type: ``List[float]`` - **line_width**: line width for geometry made of lines (default: ``1.0``) diff --git a/python/holoscan/operators/inference/inference.cpp b/python/holoscan/operators/inference/inference.cpp index 9da11388..23d292a5 100644 --- a/python/holoscan/operators/inference/inference.cpp +++ b/python/holoscan/operators/inference/inference.cpp @@ -32,27 +32,24 @@ #include "holoscan/core/resources/gxf/cuda_stream_pool.hpp" #include "holoscan/operators/inference/inference.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; - -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; namespace holoscan::ops { -InferenceOp::DataMap _dict_to_inference_datamap(py::dict dict) { +InferenceOp::DataMap _dict_to_inference_datamap(const py::dict& dict) { InferenceOp::DataMap data_map; - for (auto& [key, value] : dict) { + for (const auto& [key, value] : dict) { data_map.insert(key.cast(), value.cast()); } return data_map; } -InferenceOp::DataVecMap _dict_to_inference_datavecmap(py::dict dict) { +InferenceOp::DataVecMap _dict_to_inference_datavecmap(const py::dict& dict) { InferenceOp::DataVecMap data_vec_map; - for (auto& [key, value] : dict) { + for (const auto& [key, value] : dict) { data_vec_map.insert(key.cast(), value.cast>()); } return data_vec_map; @@ -76,15 +73,16 @@ class PyInferenceOp : public InferenceOp { // Define a constructor that fully initializes the object. 
PyInferenceOp(Fragment* fragment, const py::args& args, const std::string& backend, std::shared_ptr<::holoscan::Allocator> allocator, - py::dict inference_map, // InferenceOp::DataVecMap - py::dict model_path_map, // InferenceOp::DataMap - py::dict pre_processor_map, // InferenceOp::DataVecMap - py::dict device_map, // InferenceOp::DataMap - py::dict temporal_map, // InferenceOp::DataMap - py::dict activation_map, // InferenceOp::DataMap - py::dict backend_map, // InferenceOp::DataMap + const py::dict& inference_map, // InferenceOp::DataVecMap + const py::dict& model_path_map, // InferenceOp::DataMap + const py::dict& pre_processor_map, // InferenceOp::DataVecMap + const py::dict& device_map, // InferenceOp::DataMap + const py::dict& temporal_map, // InferenceOp::DataMap + const py::dict& activation_map, // InferenceOp::DataMap + const py::dict& backend_map, // InferenceOp::DataMap const std::vector& in_tensor_names, - const std::vector& out_tensor_names, bool infer_on_cpu = false, + const std::vector& out_tensor_names, + const std::vector& trt_opt_profile, bool infer_on_cpu = false, bool parallel_inference = true, bool input_on_cuda = true, bool output_on_cuda = true, bool transmit_on_cuda = true, bool enable_fp16 = false, bool is_engine_path = false, @@ -96,6 +94,7 @@ class PyInferenceOp : public InferenceOp { Arg{"allocator", allocator}, Arg{"in_tensor_names", in_tensor_names}, Arg{"out_tensor_names", out_tensor_names}, + Arg{"trt_opt_profile", trt_opt_profile}, Arg{"infer_on_cpu", infer_on_cpu}, Arg{"parallel_inference", parallel_inference}, Arg{"input_on_cuda", input_on_cuda}, @@ -110,8 +109,8 @@ class PyInferenceOp : public InferenceOp { // Workaround to maintain backwards compatibility with the v0.5 API: // convert any single str values to List[str]. 
- py::dict inference_map_dict = inference_map.cast(); - for (auto& [key, value] : inference_map_dict) { + auto inference_map_dict = inference_map.cast(); + for (const auto& [key, value] : inference_map_dict) { if (py::isinstance(value)) { // warn about deprecated non-list input auto key_str = key.cast(); @@ -130,18 +129,18 @@ class PyInferenceOp : public InferenceOp { } } - py::dict temporal_map_infer = temporal_map.cast(); - for (auto& [key, value] : temporal_map_infer) { + auto temporal_map_infer = temporal_map.cast(); + for (const auto& [key, value] : temporal_map_infer) { if (!py::isinstance(value)) { temporal_map_infer[key] = py::str(value); } } - py::dict activation_map_infer = activation_map.cast(); - for (auto& [key, value] : activation_map_infer) { + auto activation_map_infer = activation_map.cast(); + for (const auto& [key, value] : activation_map_infer) { if (!py::isinstance(value)) { activation_map_infer[key] = py::str(value); } } - py::dict device_map_infer = device_map.cast(); - for (auto& [key, value] : device_map_infer) { + auto device_map_infer = device_map.cast(); + for (const auto& [key, value] : device_map_infer) { if (!py::isinstance(value)) { device_map_infer[key] = py::str(value); } } @@ -169,7 +168,7 @@ class PyInferenceOp : public InferenceOp { this->add_arg(Arg("pre_processor_map", pre_processor_datamap)); spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; @@ -198,6 +197,7 @@ PYBIND11_MODULE(_inference, m) { py::dict, const std::vector&, const std::vector&, + const std::vector&, bool, bool, bool, @@ -219,6 +219,7 @@ PYBIND11_MODULE(_inference, m) { "backend_map"_a = py::dict(), "in_tensor_names"_a = std::vector{}, "out_tensor_names"_a = std::vector{}, + "trt_opt_profile"_a = std::vector{1, 1, 1}, "infer_on_cpu"_a = false, "parallel_inference"_a = true, "input_on_cuda"_a = true, diff --git a/python/holoscan/operators/inference/pydoc.hpp b/python/holoscan/operators/inference/pydoc.hpp index 3c9a1df2..50dee22e 100644 --- a/python/holoscan/operators/inference/pydoc.hpp +++ b/python/holoscan/operators/inference/pydoc.hpp @@ -81,6 +81,8 @@ in_tensor_names : sequence of str, optional Input tensors. out_tensor_names : sequence of str, optional Output tensors. +trt_opt_profile : sequence of int, optional + TensorRT optimization profile for models with dynamic inputs. infer_on_cpu : bool, optional Whether to run the computation on the CPU instead of GPU. Default value is ``False``. 
parallel_inference : bool, optional diff --git a/python/holoscan/operators/inference_processor/inference_processor.cpp b/python/holoscan/operators/inference_processor/inference_processor.cpp index 1a3c3954..f90f593d 100644 --- a/python/holoscan/operators/inference_processor/inference_processor.cpp +++ b/python/holoscan/operators/inference_processor/inference_processor.cpp @@ -32,27 +32,24 @@ #include "holoscan/core/resources/gxf/cuda_stream_pool.hpp" #include "holoscan/operators/inference_processor/inference_processor.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; - -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; namespace holoscan::ops { -InferenceProcessorOp::DataMap _dict_to_processor_datamap(py::dict dict) { +InferenceProcessorOp::DataMap _dict_to_processor_datamap(const py::dict& dict) { InferenceProcessorOp::DataMap data_map; - for (auto& [key, value] : dict) { + for (const auto& [key, value] : dict) { data_map.insert(key.cast(), value.cast()); } return data_map; } -InferenceProcessorOp::DataVecMap _dict_to_processor_datavecmap(py::dict dict) { +InferenceProcessorOp::DataVecMap _dict_to_processor_datavecmap(const py::dict& dict) { InferenceProcessorOp::DataVecMap data_vec_map; - for (auto& [key, value] : dict) { + for (const auto& [key, value] : dict) { data_vec_map.insert(key.cast(), value.cast>()); } return data_vec_map; @@ -76,15 +73,15 @@ class PyInferenceProcessorOp : public InferenceProcessorOp { // Define a constructor that fully initializes the object. PyInferenceProcessorOp(Fragment* fragment, const py::args& args, std::shared_ptr<::holoscan::Allocator> allocator, - py::dict process_operations, // InferenceProcessorOp::DataVecMap - py::dict processed_map, // InferenceProcessorOp::DataVecMap + const py::dict& process_operations, // InferenceProcessorOp::DataVecMap + const py::dict& processed_map, // InferenceProcessorOp::DataVecMap const std::vector& in_tensor_names, const std::vector& out_tensor_names, bool input_on_cuda = false, bool output_on_cuda = false, bool transmit_on_cuda = false, bool disable_transmitter = false, std::shared_ptr cuda_stream_pool = nullptr, - const std::string& config_path = std::string(""), - const std::string& name = "postprocessor") + const std::string& config_path = ""s, + const std::string& name = "postprocessor"s) : InferenceProcessorOp(ArgList{Arg{"allocator", allocator}, Arg{"in_tensor_names", in_tensor_names}, Arg{"out_tensor_names", out_tensor_names}, @@ -105,8 +102,8 @@ class PyInferenceProcessorOp : public InferenceProcessorOp { // Workaround to maintain backwards compatibility with the v0.5 API: // convert any single str values to List[str]. 
- py::dict processed_map_dict = processed_map.cast(); - for (auto& [key, value] : processed_map_dict) { + auto processed_map_dict = processed_map.cast(); + for (const auto& [key, value] : processed_map_dict) { if (py::isinstance(value)) { // warn about deprecated non-list input auto key_str = key.cast(); @@ -131,7 +128,7 @@ class PyInferenceProcessorOp : public InferenceProcessorOp { spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; diff --git a/python/holoscan/operators/ping_tensor_rx/ping_tensor_rx.cpp b/python/holoscan/operators/ping_tensor_rx/ping_tensor_rx.cpp index bc1090e6..d5a1cc5c 100644 --- a/python/holoscan/operators/ping_tensor_rx/ping_tensor_rx.cpp +++ b/python/holoscan/operators/ping_tensor_rx/ping_tensor_rx.cpp @@ -28,11 +28,8 @@ #include "holoscan/core/operator_spec.hpp" #include "holoscan/operators/ping_tensor_rx/ping_tensor_rx.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; - -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -54,13 +51,12 @@ class PyPingTensorRxOp : public holoscan::ops::PingTensorRxOp { // Define a constructor that fully initializes the object. PyPingTensorRxOp(Fragment* fragment, const py::args& args, - const std::string& name = "ping_tensor_rx") - : PingTensorRxOp() { + const std::string& name = "ping_tensor_rx") { add_positional_condition_and_resource_args(this, args); name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; diff --git a/python/holoscan/operators/ping_tensor_tx/ping_tensor_tx.cpp b/python/holoscan/operators/ping_tensor_tx/ping_tensor_tx.cpp index b3412f9b..7b4c5920 100644 --- a/python/holoscan/operators/ping_tensor_tx/ping_tensor_tx.cpp +++ b/python/holoscan/operators/ping_tensor_tx/ping_tensor_tx.cpp @@ -34,11 +34,8 @@ #include "holoscan/operators/ping_tensor_tx/ping_tensor_tx.hpp" #include "holoscan/core/resources/gxf/allocator.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; - -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -65,7 +62,7 @@ class PyPingTensorTxOp : public holoscan::ops::PingTensorTxOp { std::optional batch_size = std::nullopt, int32_t rows = 32, std::optional columns = 64, std::optional channels = std::nullopt, - const std::variant dtype = "uint8_t", + const std::variant& dtype = "uint8_t", const std::string& tensor_name = "tensor", const std::string& name = "ping_tensor_tx") : PingTensorTxOp(ArgList{Arg{"storage_type", storage_type}, @@ -81,15 +78,12 @@ class PyPingTensorTxOp : public holoscan::ops::PingTensorTxOp { } else { auto dt = std::get(dtype); std::string data_type; - std::string dtype_name = dt.attr("name").cast(); - if (dtype_name == "float16") { // currently promoting float16 scalars to float - data_type = "float"; - } else if (dtype_name == "float32") { + auto dtype_name = dt.attr("name").cast(); + if (dtype_name == "float16" || dtype_name == "float32") { + // currently promoting float16 scalars to float data_type = "float"; } else if (dtype_name == "float64") { data_type = "double"; - } else if (dtype_name == "bool") { - data_type = "uint8_t"; } 
else if (dtype_name == "int8") { data_type = "int8_t"; } else if (dtype_name == "int16") { @@ -98,7 +92,7 @@ class PyPingTensorTxOp : public holoscan::ops::PingTensorTxOp { data_type = "int32_t"; } else if (dtype_name == "int64") { data_type = "int64_t"; - } else if (dtype_name == "uint8") { + } else if (dtype_name == "bool" || dtype_name == "uint8") { data_type = "uint8_t"; } else if (dtype_name == "uint16") { data_type = "uint16_t"; @@ -118,7 +112,7 @@ class PyPingTensorTxOp : public holoscan::ops::PingTensorTxOp { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; diff --git a/python/holoscan/operators/segmentation_postprocessor/segmentation_postprocessor.cpp b/python/holoscan/operators/segmentation_postprocessor/segmentation_postprocessor.cpp index e6f28546..3ad184bd 100644 --- a/python/holoscan/operators/segmentation_postprocessor/segmentation_postprocessor.cpp +++ b/python/holoscan/operators/segmentation_postprocessor/segmentation_postprocessor.cpp @@ -30,11 +30,8 @@ #include "holoscan/core/resources/gxf/cuda_stream_pool.hpp" #include "holoscan/operators/segmentation_postprocessor/segmentation_postprocessor.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; - -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -71,7 +68,7 @@ class PySegmentationPostprocessorOp : public SegmentationPostprocessorOp { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; diff --git a/python/holoscan/operators/v4l2_video_capture/v4l2_video_capture.cpp b/python/holoscan/operators/v4l2_video_capture/v4l2_video_capture.cpp index ab498971..38d4e7cb 100644 --- a/python/holoscan/operators/v4l2_video_capture/v4l2_video_capture.cpp +++ b/python/holoscan/operators/v4l2_video_capture/v4l2_video_capture.cpp @@ -33,11 +33,8 @@ #include "holoscan/operators/v4l2_video_capture/v4l2_video_capture.hpp" // #include "holoscan/core/gxf/gxf_operator.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; - -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -63,8 +60,7 @@ class PyV4L2VideoCaptureOp : public V4L2VideoCaptureOp { std::shared_ptr<::holoscan::Allocator> allocator, const std::string& device = "/dev/video0"s, uint32_t width = 0, uint32_t height = 0, uint32_t num_buffers = 4, - const std::string& pixel_format = "auto", - bool pass_through = false, + const std::string& pixel_format = "auto", bool pass_through = false, const std::string& name = "v4l2_video_capture", std::optional exposure_time = std::nullopt, std::optional gain = std::nullopt) @@ -81,7 +77,7 @@ class PyV4L2VideoCaptureOp : public V4L2VideoCaptureOp { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; diff --git a/python/holoscan/operators/video_stream_recorder/video_stream_recorder.cpp b/python/holoscan/operators/video_stream_recorder/video_stream_recorder.cpp index 120f9de7..32eba03d 100644 --- a/python/holoscan/operators/video_stream_recorder/video_stream_recorder.cpp +++ 
b/python/holoscan/operators/video_stream_recorder/video_stream_recorder.cpp @@ -28,11 +28,8 @@ #include "holoscan/core/operator_spec.hpp" #include "holoscan/operators/video_stream_recorder/video_stream_recorder.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; - -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -64,7 +61,7 @@ class PyVideoStreamRecorderOp : public VideoStreamRecorderOp { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; diff --git a/python/holoscan/operators/video_stream_replayer/pydoc.hpp b/python/holoscan/operators/video_stream_replayer/pydoc.hpp index 56d21efb..c4f4ac4e 100644 --- a/python/holoscan/operators/video_stream_replayer/pydoc.hpp +++ b/python/holoscan/operators/video_stream_replayer/pydoc.hpp @@ -35,6 +35,22 @@ Operator class to replay a video stream from a file. file being read, this tensor could be on either CPU or GPU. For the data used in examples distributed with the SDK, the tensor will be an unnamed GPU tensor (name == ""). +**==Device Memory Requirements==** + + This operator reads data from a file to an intermediate host buffer and then transfers the data + to the GPU. Because both host and device memory are needed, an allocator supporting both memory + types must be used. Options for this are `UnboundedAllocator` and `RMMAllocator`. When + specifying memory pool sizes for `RMMAllocator`, the following memory blocks are needed: + + 1. One block of host memory equal in size to a single uncompressed video frame + is needed. Note that for RMMAllocator, the memory sizes should be specified in MiB, so the + minimum value can be obtained by: + ``math.ceil(height * width * channels * element_size_bytes / (1024 * 1024))``. + 2. One block of device memory equal in size to the host memory block. + + When declaring an `RMMAllocator` memory pool, `host_memory_initial_size` and + `device_memory_initial_size` must be greater than or equal to the values discussed above.
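As a rough illustration of the sizing rule in the note above, the following Python sketch computes the minimum `RMMAllocator` pool sizes for a hypothetical 1920x1080, 3-channel, 8-bit stream and passes the pool to `VideoStreamReplayerOp`. The frame geometry, the `replayer_pool` name, and the directory/basename values are placeholders chosen for illustration, not values taken from this change.

```python
import math

from holoscan.core import Application
from holoscan.operators import VideoStreamReplayerOp
from holoscan.resources import RMMAllocator


class ReplayerApp(Application):
    def compose(self):
        # Hypothetical frame geometry; substitute the real values for your recorded stream.
        height, width, channels, element_size_bytes = 1080, 1920, 3, 1

        # Minimum pool size in MiB, per the formula in the note above.
        frame_mib = math.ceil(height * width * channels * element_size_bytes / (1024 * 1024))

        pool = RMMAllocator(
            self,
            name="replayer_pool",
            host_memory_initial_size=f"{frame_mib}MB",
            device_memory_initial_size=f"{frame_mib}MB",
        )
        replayer = VideoStreamReplayerOp(
            self,
            name="replayer",
            directory="/path/to/video_data",  # placeholder path
            basename="my_video",              # placeholder basename
            allocator=pool,
        )
        self.add_operator(replayer)
```

For a 1080p RGB uint8 frame this works out to ceil(5.93) = 6 MiB per pool, comfortably below the allocator's default maximum pool sizes.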
+ Parameters ---------- fragment : holoscan.core.Fragment (constructor only) diff --git a/python/holoscan/operators/video_stream_replayer/video_stream_replayer.cpp b/python/holoscan/operators/video_stream_replayer/video_stream_replayer.cpp index 6f9c8153..0b4b7d41 100644 --- a/python/holoscan/operators/video_stream_replayer/video_stream_replayer.cpp +++ b/python/holoscan/operators/video_stream_replayer/video_stream_replayer.cpp @@ -30,11 +30,8 @@ #include "holoscan/core/resources/gxf/allocator.hpp" #include "holoscan/operators/video_stream_replayer/video_stream_replayer.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; - -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -59,7 +56,7 @@ class PyVideoStreamReplayerOp : public VideoStreamReplayerOp { PyVideoStreamReplayerOp( Fragment* fragment, const py::args& args, const std::string& directory, const std::string& basename, size_t batch_size = 1UL, bool ignore_corrupted_entities = true, - float frame_rate = 0.f, bool realtime = true, bool repeat = false, uint64_t count = 0UL, + float frame_rate = 0.F, bool realtime = true, bool repeat = false, uint64_t count = 0UL, std::optional> allocator = std::nullopt, std::optional> entity_serializer = std::nullopt, const std::string& name = "video_stream_replayer") @@ -79,7 +76,7 @@ class PyVideoStreamReplayerOp : public VideoStreamReplayerOp { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; @@ -115,7 +112,7 @@ PYBIND11_MODULE(_video_stream_replayer, m) { "basename"_a, "batch_size"_a = 1UL, "ignore_corrupted_entities"_a = true, - "frame_rate"_a = 1.f, + "frame_rate"_a = 1.F, "realtime"_a = true, "repeat"_a = false, "count"_a = 0UL, diff --git a/python/holoscan/resources/__init__.py b/python/holoscan/resources/__init__.py index 137e5cde..256a9120 100644 --- a/python/holoscan/resources/__init__.py +++ b/python/holoscan/resources/__init__.py @@ -19,6 +19,7 @@ holoscan.resources.Allocator holoscan.resources.BlockMemoryPool holoscan.resources.Clock + holoscan.resources.CudaAllocator holoscan.resources.CudaStreamPool holoscan.resources.DoubleBufferReceiver holoscan.resources.DoubleBufferTransmitter @@ -27,9 +28,11 @@ holoscan.resources.MemoryStorageType holoscan.resources.RealtimeClock holoscan.resources.Receiver + holoscan.resources.RMMAllocator holoscan.resources.SerializationBuffer holoscan.resources.StdComponentSerializer holoscan.resources.StdEntitySerializer + holoscan.resources.StreamOrderedAllocator holoscan.resources.Transmitter holoscan.resources.UnboundedAllocator holoscan.resources.UcxComponentSerializer @@ -46,6 +49,7 @@ Allocator, BlockMemoryPool, Clock, + CudaAllocator, CudaStreamPool, DoubleBufferReceiver, DoubleBufferTransmitter, @@ -53,9 +57,11 @@ MemoryStorageType, RealtimeClock, Receiver, + RMMAllocator, SerializationBuffer, StdComponentSerializer, StdEntitySerializer, + StreamOrderedAllocator, Transmitter, UcxComponentSerializer, UcxEntitySerializer, @@ -73,6 +79,7 @@ "Allocator", "BlockMemoryPool", "Clock", + "CudaAllocator", "CudaStreamPool", "DoubleBufferReceiver", "DoubleBufferTransmitter", @@ -81,9 +88,11 @@ "MemoryStorageType", "RealtimeClock", "Receiver", + "RMMAllocator", "SerializationBuffer", "StdComponentSerializer", "StdEntitySerializer", + "StreamOrderedAllocator", "Transmitter", 
"UcxComponentSerializer", "UcxEntitySerializer", diff --git a/python/holoscan/resources/allocators.cpp b/python/holoscan/resources/allocators.cpp index 867d8378..d93afec9 100644 --- a/python/holoscan/resources/allocators.cpp +++ b/python/holoscan/resources/allocators.cpp @@ -27,16 +27,33 @@ #include "holoscan/core/gxf/gxf_resource.hpp" #include "holoscan/core/resources/gxf/allocator.hpp" #include "holoscan/core/resources/gxf/block_memory_pool.hpp" +#include "holoscan/core/resources/gxf/cuda_allocator.hpp" #include "holoscan/core/resources/gxf/cuda_stream_pool.hpp" +#include "holoscan/core/resources/gxf/rmm_allocator.hpp" +#include "holoscan/core/resources/gxf/stream_ordered_allocator.hpp" #include "holoscan/core/resources/gxf/unbounded_allocator.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; namespace holoscan { +namespace { +// constants copied from rmm_allocator.cpp +// kPoolInitialSize, kPoolMaxSize copied from rmm_allocator.cpp +#ifdef __aarch64__ +constexpr const char* kPoolInitialSize = "8MB"; // 8 MB initial pool size +constexpr const char* kPoolMaxSize = "16MB"; +#else +constexpr const char* kPoolInitialSize = "16MB"; // 16 MB initial pool size +constexpr const char* kPoolMaxSize = "32MB"; +#endif +constexpr const char* kReleaseThreshold = "4MB"; // 4MB release threshold + +} // namespace + /* Trampoline classes for handling Python kwargs * * These add a constructor that takes a Fragment for which to initialize the resource. @@ -63,7 +80,21 @@ class PyBlockMemoryPool : public BlockMemoryPool { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); + } +}; + +class PyUnboundedAllocator : public UnboundedAllocator { + public: + /* Inherit the constructors */ + using UnboundedAllocator::UnboundedAllocator; + + // Define a constructor that fully initializes the object. + explicit PyUnboundedAllocator(Fragment* fragment, const std::string& name = "cuda_stream_pool") { + name_ = name; + fragment_ = fragment; + spec_ = std::make_shared(fragment); + setup(*spec_); } }; @@ -73,9 +104,9 @@ class PyCudaStreamPool : public CudaStreamPool { using CudaStreamPool::CudaStreamPool; // Define a constructor that fully initializes the object. - PyCudaStreamPool(Fragment* fragment, int32_t dev_id = 0, uint32_t stream_flags = 0, - int32_t stream_priority = 0, uint32_t reserved_size = 1, uint32_t max_size = 0, - const std::string& name = "cuda_stream_pool") + explicit PyCudaStreamPool(Fragment* fragment, int32_t dev_id = 0, uint32_t stream_flags = 0, + int32_t stream_priority = 0, uint32_t reserved_size = 1, + uint32_t max_size = 0, const std::string& name = "cuda_stream_pool") : CudaStreamPool(ArgList{ Arg{"dev_id", dev_id}, Arg{"stream_flags", stream_flags}, @@ -86,22 +117,56 @@ class PyCudaStreamPool : public CudaStreamPool { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; -class PyUnboundedAllocator : public UnboundedAllocator { +class PyRMMAllocator : public RMMAllocator { public: /* Inherit the constructors */ - using UnboundedAllocator::UnboundedAllocator; + using RMMAllocator::RMMAllocator; // Define a constructor that fully initializes the object. 
- explicit PyUnboundedAllocator(Fragment* fragment, const std::string& name = "cuda_stream_pool") - : UnboundedAllocator() { + explicit PyRMMAllocator( + Fragment* fragment, + const std::string& device_memory_initial_size = std::string(kPoolInitialSize), + const std::string& device_memory_max_size = std::string(kPoolMaxSize), + const std::string& host_memory_initial_size = std::string(kPoolInitialSize), + const std::string& host_memory_max_size = std::string(kPoolMaxSize), int32_t dev_id = 0, + const std::string& name = "rmm_pool") + : RMMAllocator(ArgList{Arg{"device_memory_initial_size", device_memory_initial_size}, + Arg{"device_memory_max_size", device_memory_max_size}, + Arg{"host_memory_initial_size", host_memory_initial_size}, + Arg{"host_memory_max_size", host_memory_max_size}, + Arg{"dev_id", dev_id}}) { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); + } +}; + +class PyStreamOrderedAllocator : public StreamOrderedAllocator { + public: + /* Inherit the constructors */ + using StreamOrderedAllocator::StreamOrderedAllocator; + + // Define a constructor that fully initializes the object. + explicit PyStreamOrderedAllocator( + Fragment* fragment, + const std::string& device_memory_initial_size = std::string(kPoolInitialSize), + const std::string& device_memory_max_size = std::string(kPoolMaxSize), + const std::string& release_threshold = std::string(kReleaseThreshold), int32_t dev_id = 0, + const std::string& name = "stream_ordered_allocator") + : StreamOrderedAllocator( + ArgList{Arg{"device_memory_initial_size", device_memory_initial_size}, + Arg{"device_memory_max_size", device_memory_max_size}, + Arg{"release_threshold", release_threshold}, + Arg{"dev_id", dev_id}}) { + name_ = name; + fragment_ = fragment; + spec_ = std::make_shared(fragment); + setup(*spec_); } }; @@ -137,10 +202,10 @@ void init_allocators(py::module_& m) { py::init(), "fragment"_a, "dev_id"_a = 0, - "stream_flags"_a = 0u, + "stream_flags"_a = 0U, "stream_priority"_a = 0, - "reserved_size"_a = 1u, - "max_size"_a = 0u, + "reserved_size"_a = 1U, + "max_size"_a = 0U, "name"_a = "cuda_stream_pool"s, doc::CudaStreamPool::doc_CudaStreamPool); @@ -153,5 +218,59 @@ void init_allocators(py::module_& m) { "fragment"_a, "name"_a = "unbounded_allocator"s, doc::UnboundedAllocator::doc_UnboundedAllocator); + + py::class_>( + m, "CudaAllocator", doc::CudaAllocator::doc_CudaAllocator) + .def(py::init<>(), doc::CudaAllocator::doc_CudaAllocator) + // Haven't wrapped cudaStream_t yet from Python + // .def("allocate_async", + // &CudaAllocator::allocate_async, + // "size"_a, + // "stream"_a, + // doc::CudaAllocator::doc_allocate_async) + // .def("free_async", + // &CudaAllocator::free_async, + // "pointer"_a, + // "stream"_a, + // doc::CudaAllocator::doc_free_async) + .def_property_readonly( + "pool_size", &CudaAllocator::pool_size, doc::CudaAllocator::doc_pool_size); + + py::class_>( + m, "RMMAllocator", doc::RMMAllocator::doc_RMMAllocator) + .def(py::init(), + "fragment"_a, + "device_memory_initial_size"_a = std::string(kPoolInitialSize), + "device_memory_max_size"_a = std::string(kPoolMaxSize), + "host_memory_initial_size"_a = std::string(kPoolInitialSize), + "host_memory_max_size"_a = std::string(kPoolMaxSize), + "dev_id"_a = 0, + "name"_a = "rmm_pool", + doc::RMMAllocator::doc_RMMAllocator); + + py::class_>( + m, "StreamOrderedAllocator", doc::StreamOrderedAllocator::doc_StreamOrderedAllocator) + .def(py::init(), + "fragment"_a, + "device_memory_initial_size"_a = 
std::string(kPoolInitialSize), + "device_memory_max_size"_a = std::string(kPoolMaxSize), + "release_threshold"_a = std::string(kReleaseThreshold), + "dev_id"_a = 0, + "name"_a = "stream_ordered_allocator", + doc::StreamOrderedAllocator::doc_StreamOrderedAllocator); } } // namespace holoscan diff --git a/python/holoscan/resources/allocators_pydoc.hpp b/python/holoscan/resources/allocators_pydoc.hpp index 07265102..ae41b34b 100644 --- a/python/holoscan/resources/allocators_pydoc.hpp +++ b/python/holoscan/resources/allocators_pydoc.hpp @@ -75,6 +75,64 @@ pointer : PyCapsule } // namespace Allocator +namespace CudaAllocator { + +PYDOC(CudaAllocator, R"doc( +Base class for CUDA-based allocators. +)doc") + +PYDOC(allocate_async, R"doc( +Allocate amount(s) of memory asynchronously. + +Parameters +---------- +size : int + The amount of memory to allocate +stream : holoscan.CudaStream + The CUDA stream to use for the allocation. + +Returns +------- +Opaque PyCapsule object representing a std::byte* pointer to the allocated memory. +)doc") + +PYDOC(free_async, R"doc( +Free CUDA-based memory asynchronously. + +Parameters +---------- +pointer : PyCapsule + Opaque PyCapsule object representing a std::byte* pointer to the allocated + memory. +stream : holoscan.CudaStream + The CUDA stream to use for the allocation. +)doc") + +PYDOC(block_size, R"doc( +Get the block size of the allocator. + +Returns +------- +int + The block size of the allocator. Returns 1 for byte-based allocators. +)doc") + +PYDOC(pool_size, R"doc( +Return the memory pool size for the specified storage type. + +Parameters +---------- +storage_type : holoscan.resources.MemoryStorageType + Enum representing the type of memory to allocate. + +Returns +------- +size : int + The size of the memory pool for the specified storage type. +)doc") + +} // namespace CudaAllocator + namespace BlockMemoryPool { PYDOC(BlockMemoryPool, R"doc( @@ -152,6 +210,78 @@ name : str, optional } // namespace UnboundedAllocator +namespace RMMAllocator { + +PYDOC(RMMAllocator, R"doc( +Device and Host allocator using RAPIDS memory manager (RMM). + +Provides memory pools for asynchronously allocated CUDA device memory and pinned host memory. + +Parameters +---------- +fragment : holoscan.core.Fragment + The fragment to assign the resource to. +device_memory_initial_size : str, optional + The initial size of the device memory pool. See the Notes section for the format accepted. +device_memory_max_size : str, optional + The maximum size of the device memory pool. See the Notes section for the format accepted. +host_memory_initial_size : str, optional + The initial size of the host memory pool. See the Notes section for the format accepted. +host_memory_max_size : str, optional + The maximum size of the host memory pool. See the Notes section for the format accepted. +dev_id : int + GPU device ID. Specifies the device on which to create the memory pool. +name : str, optional + The name of the memory pool. + +Notes +----- +The values for the memory parameters, such as `device_memory_initial_size` must be specified in the +form of a string containing a non-negative integer value followed by a suffix representing the +units. Supported units are B, KB, MB, GB and TB where the values are powers of 1024 bytes +(e.g. MB = 1024 * 1024 bytes). Examples of valid units are "512MB", "256 KB", "1 GB". If a floating +point number is specified that decimal portion will be truncated (i.e. the value is rounded down to +the nearest integer). 
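To make the size-string format described in the Notes concrete, here is a minimal Python sketch constructing the two new pool allocators from this change. The pool sizes are arbitrary example values (matching the defaults quoted above), not recommendations, and the resources are only constructed, not wired to an operator.

```python
from holoscan.core import Application
from holoscan.resources import RMMAllocator, StreamOrderedAllocator


class AllocatorDemoApp(Application):
    def compose(self):
        # Sizes use the "<non-negative integer><unit>" format from the Notes,
        # where units (B, KB, MB, GB, TB) are powers of 1024.
        rmm_pool = RMMAllocator(
            self,
            name="rmm_pool",
            device_memory_initial_size="16MB",
            device_memory_max_size="32MB",
            host_memory_initial_size="16MB",
            host_memory_max_size="32MB",
            dev_id=0,
        )
        stream_pool = StreamOrderedAllocator(
            self,
            name="stream_ordered_allocator",
            device_memory_initial_size="16MB",
            device_memory_max_size="32MB",
            release_threshold="4MB",
            dev_id=0,
        )
        # Either resource would then be passed as the `allocator` argument of an
        # operator that allocates output buffers; construction only is shown here.
        _ = (rmm_pool, stream_pool)
```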
+ +)doc") + +} // namespace RMMAllocator + +namespace StreamOrderedAllocator { + +PYDOC(StreamOrderedAllocator, R"doc( +Device and Host allocator using RAPIDS memory manager (StreamOrdered). + +Provides memory pools for asynchronously allocated CUDA device memory and pinned host memory. + +Parameters +---------- +fragment : holoscan.core.Fragment + The fragment to assign the resource to. +device_memory_initial_size : str, optional + The initial size of the device memory pool. See the Notes section for the format accepted. +device_memory_max_size : str, optional + The maximum size of the device memory pool. See the Notes section for the format accepted. +release_threshold : str, optional + The amount of reserved memory to hold onto before trying to release memory back to the OS. See + the Notes section for the format accepted. +dev_id : int, optional + GPU device ID. Specifies the device on which to create the memory pool. +name : str, optional + The name of the memory pool. + +Notes +----- +The values for the memory parameters, such as `device_memory_initial_size` must be specified in the +form of a string containing a non-negative integer value followed by a suffix representing the +units. Supported units are B, KB, MB, GB and TB where the values are powers of 1024 bytes +(e.g. MB = 1024 * 1024 bytes). Examples of valid units are "512MB", "256 KB", "1 GB". If a floating +point number is specified that decimal portion will be truncated (i.e. the value is rounded down to +the nearest integer). +)doc") + +} // namespace StreamOrderedAllocator + } // namespace holoscan::doc #endif // PYHOLOSCAN_RESOURCES_ALLOCATORS_PYDOC_HPP diff --git a/python/holoscan/resources/clocks.cpp b/python/holoscan/resources/clocks.cpp index 9a7ee3be..0d0f5cb9 100644 --- a/python/holoscan/resources/clocks.cpp +++ b/python/holoscan/resources/clocks.cpp @@ -30,41 +30,36 @@ #include "holoscan/core/resources/gxf/manual_clock.hpp" #include "holoscan/core/resources/gxf/realtime_clock.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; namespace holoscan { int64_t get_duration_ns(const py::object& duration) { - if (py::isinstance(duration)) { - return py::cast(duration); - } else { - // Must acquire GIL before calling C API functions like PyDelta_Check - py::gil_scoped_acquire scope_guard; - - // Must initialize PyDateTime_IMPORT here in order to be able to use PyDelta_Check below - // see: https://docs.python.org/3/c-api/datetime.html?highlight=pydelta_check#datetime-objects - if (!PyDateTimeAPI) { PyDateTime_IMPORT; } - - if (PyDelta_Check(duration.ptr())) { - // timedelta stores integer days, seconds, microseconds - int64_t days, seconds, microseconds; - days = PyDateTime_DELTA_GET_DAYS(duration.ptr()); - seconds = PyDateTime_DELTA_GET_SECONDS(duration.ptr()); - if (days) { - int seconds_per_day = 24 * 3600; - seconds += days * seconds_per_day; - } - microseconds = PyDateTime_DELTA_GET_MICROSECONDS(duration.ptr()); - if (seconds) { microseconds += 1000000 * seconds; } - int64_t delta_ns = 1000 * microseconds; - return delta_ns; - } else { - throw std::runtime_error("expected an integer or datetime.timedelta type"); + if (py::isinstance(duration)) { return py::cast(duration); } + // Must acquire GIL before calling C API functions like PyDelta_Check + py::gil_scoped_acquire scope_guard; + + // Must initialize 
PyDateTime_IMPORT here in order to be able to use PyDelta_Check below + // see: https://docs.python.org/3/c-api/datetime.html?highlight=pydelta_check#datetime-objects + if (PyDateTimeAPI == nullptr) { PyDateTime_IMPORT; } + + if (PyDelta_Check(duration.ptr())) { + // timedelta stores integer days, seconds, microseconds + int64_t days = PyDateTime_DELTA_GET_DAYS(duration.ptr()); + int64_t seconds = PyDateTime_DELTA_GET_SECONDS(duration.ptr()); + if (days > 0) { + int seconds_per_day = 24 * 3600; + seconds += days * seconds_per_day; } + int64_t microseconds = PyDateTime_DELTA_GET_MICROSECONDS(duration.ptr()); + if (seconds > 0) { microseconds += 1000000 * seconds; } + int64_t delta_ns = 1000 * microseconds; + return delta_ns; } + throw std::runtime_error("expected an integer or datetime.timedelta type"); } class PyRealtimeClock : public RealtimeClock { @@ -82,15 +77,15 @@ class PyRealtimeClock : public RealtimeClock { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } /* Trampolines (need one for each virtual function) */ - double time() const override { + [[nodiscard]] double time() const override { /* , , , */ PYBIND11_OVERRIDE(double, RealtimeClock, time); } - int64_t timestamp() const override { + [[nodiscard]] int64_t timestamp() const override { /* , , , */ PYBIND11_OVERRIDE(int64_t, RealtimeClock, timestamp); } @@ -116,15 +111,15 @@ class PyManualClock : public ManualClock { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } /* Trampolines (need one for each virtual function) */ - double time() const override { + [[nodiscard]] double time() const override { /* , , , */ PYBIND11_OVERRIDE(double, ManualClock, time); } - int64_t timestamp() const override { + [[nodiscard]] int64_t timestamp() const override { /* , , , */ PYBIND11_OVERRIDE(int64_t, ManualClock, timestamp); } @@ -139,6 +134,7 @@ class PyManualClock : public ManualClock { }; void init_clocks(py::module_& m) { + // NOLINTNEXTLINE(bugprone-unused-raii) py::class_>(m, "Clock", doc::Clock::doc_Clock); py::class_>( diff --git a/python/holoscan/resources/component_serializers.cpp b/python/holoscan/resources/component_serializers.cpp index 5dae87fa..8e0112ea 100644 --- a/python/holoscan/resources/component_serializers.cpp +++ b/python/holoscan/resources/component_serializers.cpp @@ -29,8 +29,8 @@ #include "holoscan/core/resources/gxf/ucx_component_serializer.hpp" #include "holoscan/core/resources/gxf/ucx_holoscan_component_serializer.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -43,12 +43,11 @@ class PyStdComponentSerializer : public StdComponentSerializer { // Define a constructor that fully initializes the object. explicit PyStdComponentSerializer(Fragment* fragment, - const std::string& name = "std_component_serializer") - : StdComponentSerializer() { + const std::string& name = "std_component_serializer") { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; @@ -60,13 +59,12 @@ class PyUcxComponentSerializer : public UcxComponentSerializer { // Define a constructor that fully initializes the object. 
explicit PyUcxComponentSerializer(Fragment* fragment, std::shared_ptr allocator = nullptr, - const std::string& name = "ucx_component_serializer") - : UcxComponentSerializer() { + const std::string& name = "ucx_component_serializer") { if (allocator) { this->add_arg(Arg{"allocator", allocator}); } name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; @@ -78,13 +76,12 @@ class PyUcxHoloscanComponentSerializer : public UcxHoloscanComponentSerializer { // Define a constructor that fully initializes the object. explicit PyUcxHoloscanComponentSerializer( Fragment* fragment, std::shared_ptr allocator = nullptr, - const std::string& name = "ucx_holoscan_component_serializer") - : UcxHoloscanComponentSerializer() { + const std::string& name = "ucx_holoscan_component_serializer") { if (allocator) { this->add_arg(Arg{"allocator", allocator}); } name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; diff --git a/python/holoscan/resources/entity_serializers.cpp b/python/holoscan/resources/entity_serializers.cpp index 1c448367..233d28b2 100644 --- a/python/holoscan/resources/entity_serializers.cpp +++ b/python/holoscan/resources/entity_serializers.cpp @@ -27,8 +27,8 @@ #include "holoscan/core/gxf/gxf_resource.hpp" #include "holoscan/core/resources/gxf/ucx_entity_serializer.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -52,7 +52,7 @@ class PyUcxEntitySerializer : public UcxEntitySerializer { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; diff --git a/python/holoscan/resources/gxf_component_resource.cpp b/python/holoscan/resources/gxf_component_resource.cpp index e89bfdb9..8dddde28 100644 --- a/python/holoscan/resources/gxf_component_resource.cpp +++ b/python/holoscan/resources/gxf_component_resource.cpp @@ -29,8 +29,8 @@ #include "../operators/operator_util.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -44,12 +44,12 @@ class PyGXFComponentResource : public GXFComponentResource { using GXFComponentResource::GXFComponentResource; // Define a constructor that fully initializes the object. - PyGXFComponentResource(py::object component, Fragment* fragment, const std::string& gxf_typename, - const std::string& name, const py::kwargs& kwargs) - : GXFComponentResource(gxf_typename.c_str()) { - py_component_ = component; - py_initialize_ = py::getattr(component, "initialize"); // cache the initialize method - + PyGXFComponentResource(const py::object& component, Fragment* fragment, + const std::string& gxf_typename, const std::string& name, + const py::kwargs& kwargs) + : GXFComponentResource(gxf_typename.c_str()), + py_component_(component), + py_initialize_(py::getattr(component, "initialize")) { // We don't need to call `add_positional_condition_and_resource_args(this, args);` because // Holoscan resources don't accept the positional arguments for Condition and Resource. 
add_kwargs(this, kwargs); @@ -73,9 +73,9 @@ class PyGXFComponentResource : public GXFComponentResource { GXFComponentResource::initialize(); } - protected: + private: py::object py_component_ = py::none(); - py::object py_initialize_ = py::none(); + py::object py_initialize_ = py::none(); // cache the initialize method }; /* Trampoline classes for handling Python kwargs diff --git a/python/holoscan/resources/receivers.cpp b/python/holoscan/resources/receivers.cpp index 50af29fe..d71711cd 100644 --- a/python/holoscan/resources/receivers.cpp +++ b/python/holoscan/resources/receivers.cpp @@ -29,8 +29,8 @@ #include "holoscan/core/resources/gxf/receiver.hpp" #include "holoscan/core/resources/gxf/ucx_receiver.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -42,13 +42,14 @@ class PyDoubleBufferReceiver : public DoubleBufferReceiver { using DoubleBufferReceiver::DoubleBufferReceiver; // Define a constructor that fully initializes the object. - PyDoubleBufferReceiver(Fragment* fragment, uint64_t capacity = 1UL, uint64_t policy = 2UL, - const std::string& name = "double_buffer_receiver") + explicit PyDoubleBufferReceiver(Fragment* fragment, uint64_t capacity = 1UL, + uint64_t policy = 2UL, + const std::string& name = "double_buffer_receiver") : DoubleBufferReceiver(ArgList{Arg{"capacity", capacity}, Arg{"policy", policy}}) { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; @@ -58,10 +59,11 @@ class PyUcxReceiver : public UcxReceiver { using UcxReceiver::UcxReceiver; // Define a constructor that fully initializes the object. 
- PyUcxReceiver(Fragment* fragment, std::shared_ptr buffer = nullptr, - uint64_t capacity = 1UL, uint64_t policy = 2UL, - const std::string& address = std::string("0.0.0.0"), - uint32_t port = kDefaultUcxPort, const std::string& name = "ucx_receiver") + explicit PyUcxReceiver(Fragment* fragment, + std::shared_ptr buffer = nullptr, + uint64_t capacity = 1UL, uint64_t policy = 2UL, + const std::string& address = std::string("0.0.0.0"), + uint32_t port = kDefaultUcxPort, const std::string& name = "ucx_receiver") : UcxReceiver(ArgList{Arg{"capacity", capacity}, Arg{"policy", policy}, Arg{"address", address}, @@ -70,7 +72,7 @@ class PyUcxReceiver : public UcxReceiver { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; diff --git a/python/holoscan/resources/resources.cpp b/python/holoscan/resources/resources.cpp index ddf8f792..699ab8c1 100644 --- a/python/holoscan/resources/resources.cpp +++ b/python/holoscan/resources/resources.cpp @@ -18,9 +18,6 @@ #include #include // will include timedelta.h for us -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) - namespace py = pybind11; namespace holoscan { diff --git a/python/holoscan/resources/serialization_buffers.cpp b/python/holoscan/resources/serialization_buffers.cpp index 33fc9013..22b9a38c 100644 --- a/python/holoscan/resources/serialization_buffers.cpp +++ b/python/holoscan/resources/serialization_buffers.cpp @@ -28,8 +28,8 @@ #include "holoscan/core/resources/gxf/serialization_buffer.hpp" #include "holoscan/core/resources/gxf/ucx_serialization_buffer.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -52,7 +52,7 @@ class PySerializationBuffer : public SerializationBuffer { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; @@ -73,7 +73,7 @@ class PyUcxSerializationBuffer : public UcxSerializationBuffer { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; diff --git a/python/holoscan/resources/std_entity_serializer.cpp b/python/holoscan/resources/std_entity_serializer.cpp index d2e7e541..cd30d092 100644 --- a/python/holoscan/resources/std_entity_serializer.cpp +++ b/python/holoscan/resources/std_entity_serializer.cpp @@ -27,8 +27,8 @@ #include "holoscan/core/gxf/gxf_resource.hpp" #include "holoscan/core/resources/gxf/std_entity_serializer.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -41,12 +41,11 @@ class PyStdEntitySerializer : public StdEntitySerializer { // Define a constructor that fully initializes the object. 
explicit PyStdEntitySerializer(Fragment* fragment, - const std::string& name = "std_entity_serializer") - : StdEntitySerializer() { + const std::string& name = "std_entity_serializer") { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; diff --git a/python/holoscan/resources/transmitters.cpp b/python/holoscan/resources/transmitters.cpp index f7b56608..bcb1f62c 100644 --- a/python/holoscan/resources/transmitters.cpp +++ b/python/holoscan/resources/transmitters.cpp @@ -29,8 +29,8 @@ #include "holoscan/core/resources/gxf/transmitter.hpp" #include "holoscan/core/resources/gxf/ucx_transmitter.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -42,13 +42,14 @@ class PyDoubleBufferTransmitter : public DoubleBufferTransmitter { using DoubleBufferTransmitter::DoubleBufferTransmitter; // Define a constructor that fully initializes the object. - PyDoubleBufferTransmitter(Fragment* fragment, uint64_t capacity = 1UL, uint64_t policy = 2UL, - const std::string& name = "double_buffer_transmitter") + explicit PyDoubleBufferTransmitter(Fragment* fragment, uint64_t capacity = 1UL, + uint64_t policy = 2UL, + const std::string& name = "double_buffer_transmitter") : DoubleBufferTransmitter(ArgList{Arg{"capacity", capacity}, Arg{"policy", policy}}) { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; @@ -58,13 +59,14 @@ class PyUcxTransmitter : public UcxTransmitter { using UcxTransmitter::UcxTransmitter; // Define a constructor that fully initializes the object. 
- PyUcxTransmitter(Fragment* fragment, std::shared_ptr buffer = nullptr, - uint64_t capacity = 1UL, uint64_t policy = 2UL, - const std::string& receiver_address = std::string("0.0.0.0"), - const std::string& local_address = std::string("0.0.0.0"), - uint32_t port = kDefaultUcxPort, uint32_t local_port = 0, - uint32_t maximum_connection_retries = 10, - const std::string& name = "ucx_transmitter") + explicit PyUcxTransmitter(Fragment* fragment, + std::shared_ptr buffer = nullptr, + uint64_t capacity = 1UL, uint64_t policy = 2UL, + const std::string& receiver_address = std::string("0.0.0.0"), + const std::string& local_address = std::string("0.0.0.0"), + uint32_t port = kDefaultUcxPort, uint32_t local_port = 0, + uint32_t maximum_connection_retries = 10, + const std::string& name = "ucx_transmitter") : UcxTransmitter(ArgList{Arg{"capacity", capacity}, Arg{"policy", policy}, Arg{"receiver_address", receiver_address}, @@ -76,7 +78,7 @@ class PyUcxTransmitter : public UcxTransmitter { name_ = name; fragment_ = fragment; spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; diff --git a/python/holoscan/schedulers/event_based_scheduler.cpp b/python/holoscan/schedulers/event_based_scheduler.cpp index e8079b8a..983900de 100644 --- a/python/holoscan/schedulers/event_based_scheduler.cpp +++ b/python/holoscan/schedulers/event_based_scheduler.cpp @@ -30,8 +30,8 @@ #include "holoscan/core/resources/gxf/realtime_clock.hpp" #include "holoscan/core/schedulers/gxf/event_based_scheduler.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -72,7 +72,7 @@ class PyEventBasedScheduler : public EventBasedScheduler { this->add_arg(Arg{"clock", fragment_->make_resource("realtime_clock")}); } spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; diff --git a/python/holoscan/schedulers/greedy_scheduler.cpp b/python/holoscan/schedulers/greedy_scheduler.cpp index e29b2585..a0fc14ec 100644 --- a/python/holoscan/schedulers/greedy_scheduler.cpp +++ b/python/holoscan/schedulers/greedy_scheduler.cpp @@ -30,8 +30,8 @@ #include "holoscan/core/resources/gxf/realtime_clock.hpp" #include "holoscan/core/schedulers/gxf/greedy_scheduler.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -53,10 +53,11 @@ class PyGreedyScheduler : public GreedyScheduler { using GreedyScheduler::GreedyScheduler; // Define a constructor that fully initializes the object. 
- PyGreedyScheduler(Fragment* fragment, std::shared_ptr clock = nullptr, - bool stop_on_deadlock = true, int64_t max_duration_ms = -1LL, - double check_recession_period_ms = 5.0, int64_t stop_on_deadlock_timeout = 0LL, - const std::string& name = "greedy_scheduler") + explicit PyGreedyScheduler(Fragment* fragment, std::shared_ptr clock = nullptr, + bool stop_on_deadlock = true, int64_t max_duration_ms = -1LL, + double check_recession_period_ms = 5.0, + int64_t stop_on_deadlock_timeout = 0LL, + const std::string& name = "greedy_scheduler") : GreedyScheduler(ArgList{Arg{"stop_on_deadlock", stop_on_deadlock}, Arg{"check_recession_period_ms", check_recession_period_ms}, Arg{"stop_on_deadlock_timeout", stop_on_deadlock_timeout}}) { @@ -72,7 +73,7 @@ class PyGreedyScheduler : public GreedyScheduler { } spec_ = std::make_shared(fragment); HOLOSCAN_LOG_TRACE("in PyGreedyScheduler constructor"); - setup(*spec_.get()); + setup(*spec_); } }; void init_greedy_scheduler(py::module_& m) { diff --git a/python/holoscan/schedulers/multithread_scheduler.cpp b/python/holoscan/schedulers/multithread_scheduler.cpp index 2d187be2..a03b7efa 100644 --- a/python/holoscan/schedulers/multithread_scheduler.cpp +++ b/python/holoscan/schedulers/multithread_scheduler.cpp @@ -30,8 +30,8 @@ #include "holoscan/core/resources/gxf/realtime_clock.hpp" #include "holoscan/core/schedulers/gxf/multithread_scheduler.hpp" -using std::string_literals::operator""s; -using pybind11::literals::operator""_a; +using std::string_literals::operator""s; // NOLINT(misc-unused-using-decls) +using pybind11::literals::operator""_a; // NOLINT(misc-unused-using-decls) namespace py = pybind11; @@ -74,7 +74,7 @@ class PyMultiThreadScheduler : public MultiThreadScheduler { this->add_arg(Arg{"clock", fragment_->make_resource("realtime_clock")}); } spec_ = std::make_shared(fragment); - setup(*spec_.get()); + setup(*spec_); } }; diff --git a/python/holoscan/schedulers/schedulers.cpp b/python/holoscan/schedulers/schedulers.cpp index 65a3499e..eaae7089 100644 --- a/python/holoscan/schedulers/schedulers.cpp +++ b/python/holoscan/schedulers/schedulers.cpp @@ -30,9 +30,6 @@ #include "holoscan/core/schedulers/gxf/greedy_scheduler.hpp" #include "holoscan/core/schedulers/gxf/multithread_scheduler.hpp" -#define STRINGIFY(x) #x -#define MACRO_STRINGIFY(x) STRINGIFY(x) - namespace py = pybind11; namespace holoscan { diff --git a/python/requirements.txt b/python/requirements.txt index 3414512f..e3556196 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -6,4 +6,4 @@ Jinja2==3.1.3 packaging==23.1 pyyaml==6.0 requests==2.31.0 -psutil==5.9.6 +psutil==6.0.0 diff --git a/python/tests/cli/unit/common/test_argparse_types.py b/python/tests/cli/unit/common/test_argparse_types.py index 82fe164f..4c940823 100644 --- a/python/tests/cli/unit/common/test_argparse_types.py +++ b/python/tests/cli/unit/common/test_argparse_types.py @@ -19,7 +19,6 @@ import os import pathlib from pathlib import PosixPath -from typing import List import pytest @@ -114,7 +113,7 @@ class TestValidPlatforms: ), ], ) - def test_valid_platforms(self, platforms: List[Platform]): + def test_valid_platforms(self, platforms: list[Platform]): platform_strs = ",".join(x.value for x in platforms) result = valid_platforms(platform_strs) diff --git a/python/tests/cli/unit/packager/test_config_reader.py b/python/tests/cli/unit/packager/test_config_reader.py index ae7e5547..dae69f7a 100644 --- a/python/tests/cli/unit/packager/test_config_reader.py +++ 
b/python/tests/cli/unit/packager/test_config_reader.py @@ -17,7 +17,6 @@ import pathlib import tempfile -from typing import Dict import pytest import yaml @@ -261,7 +260,7 @@ def test_populate_package_manifest_multiple_models(self): assert model in result.models assert result.models[model] == str(build_parameters.models_dir / model) - def _generate_yaml(self) -> Dict: + def _generate_yaml(self) -> dict: data = { "application": { "title": "App Title", diff --git a/python/tests/cli/unit/packager/test_platforms.py b/python/tests/cli/unit/packager/test_platforms.py index 74f4ab46..cc0d99ef 100644 --- a/python/tests/cli/unit/packager/test_platforms.py +++ b/python/tests/cli/unit/packager/test_platforms.py @@ -56,9 +56,12 @@ def test_invalid_platform_options(self, monkeypatch): input_args.holoscan_sdk_file = Path("some-random-file") platform = Platform(self._artifact_source) - with tempfile.TemporaryDirectory( - prefix="holoscan_test", dir=tempfile.gettempdir() - ) as temp_dir, pytest.raises(IncompatiblePlatformConfigurationError): + with ( + tempfile.TemporaryDirectory( + prefix="holoscan_test", dir=tempfile.gettempdir() + ) as temp_dir, + pytest.raises(IncompatiblePlatformConfigurationError), + ): platform.configure_platforms( input_args, temp_dir, application_verison, ApplicationType.CppCMake ) @@ -83,9 +86,12 @@ def test_invalid_platform_options_holoscan_sdk_type_with_monai_deploy_sdk_file( input_args.monai_deploy_sdk_file = Path("some-random-file") platform = Platform(self._artifact_source) - with tempfile.TemporaryDirectory( - prefix="holoscan_test", dir=tempfile.gettempdir() - ) as temp_dir, pytest.raises(IncompatiblePlatformConfigurationError): + with ( + tempfile.TemporaryDirectory( + prefix="holoscan_test", dir=tempfile.gettempdir() + ) as temp_dir, + pytest.raises(IncompatiblePlatformConfigurationError), + ): platform.configure_platforms( input_args, temp_dir, application_verison, ApplicationType.CppCMake ) diff --git a/python/tests/system/distributed/test_distributed_app_three_ucx_receivers.py b/python/tests/system/distributed/test_distributed_app_three_ucx_receivers.py index 24f960a1..696c60ad 100644 --- a/python/tests/system/distributed/test_distributed_app_three_ucx_receivers.py +++ b/python/tests/system/distributed/test_distributed_app_three_ucx_receivers.py @@ -16,14 +16,16 @@ """ # noqa: E501 # # Uncomment the following line to use the real HolovizOp and VideoStreamReplayerOp operators -# import os +import os +import shutil +import tempfile import numpy as np import pytest from env_wrapper import env_var_context from holoscan.conditions import CountCondition -from holoscan.core import Application, Fragment, IOSpec, Operator, OperatorSpec +from holoscan.core import Application, Fragment, IOSpec, Operator, OperatorSpec, Tracker from utils import remove_ignored_errors # # Uncomment the following line to use the real HolovizOp and VideoStreamReplayerOp operators @@ -174,7 +176,7 @@ def compose(self): NUM_MSGS = 100 -def launch_app(use_new_receivers=True): +def launch_app(use_new_receivers=True, data_flow_tracking=False, logfile=None): env_var_settings = { # set the recession period to 5 ms to reduce debug messages ("HOLOSCAN_CHECK_RECESSION_PERIOD_MS", "5"), @@ -194,14 +196,42 @@ def launch_app(use_new_receivers=True): # "video_replayer_distributed.yaml") # app.config(config_file_path) - app.run() + if data_flow_tracking: + with Tracker(app, filename=logfile) as trackers: + app.run() + for fragment_name, tracker in trackers.items(): + print(f"Fragment: {fragment_name}") + 
tracker.print() + else: + app.run() -@pytest.mark.parametrize("use_new_receivers", [True, False]) -def test_distributed_app_three_ucx_receivers(use_new_receivers, capfd): +@pytest.mark.parametrize("use_new_receivers,", [True, False]) +@pytest.mark.parametrize("data_flow_tracking", [True, False]) +def test_distributed_app_three_ucx_receivers(use_new_receivers, data_flow_tracking, capfd): global NUM_MSGS - launch_app(use_new_receivers=use_new_receivers) + # only record the log for the use_new_receivers case so we also test the no-logging code path + write_logfile = data_flow_tracking and use_new_receivers + logfile_directory = tempfile.mkdtemp() if write_logfile else None + try: + logfile = logfile_directory + "/holoscan.log" if write_logfile else None + launch_app( + use_new_receivers=use_new_receivers, + data_flow_tracking=data_flow_tracking, + logfile=logfile, + ) + if data_flow_tracking and logfile is not None: + # verify that the logfile was created and is not empty + assert os.path.isfile(logfile) + with open(logfile) as f: + log_content = "".join(f.readlines()) + assert "fragment1.replayer" in log_content + assert "fragment1.triangle" in log_content + assert "fragment1.rectangle" in log_content + finally: + if logfile_directory is not None: + shutil.rmtree(logfile_directory) # assert that no errors were logged captured = capfd.readouterr() @@ -218,6 +248,14 @@ def test_distributed_app_three_ucx_receivers(use_new_receivers, capfd): # assert that the expected number of messages were received assert f"Received message {NUM_MSGS} (size: 3)" in captured.out + if data_flow_tracking: + # assert that the data flow tracking messages were printed + assert captured.out.count("Data Flow Tracking Results:") == 2 + # three paths: rectangle, replayer and triangle all connect to HolovizOp + assert captured.out.count("Total paths: 3") == 2 + assert "Fragment: fragment1" in captured.out + assert "Fragment: fragment2" in captured.out + if __name__ == "__main__": launch_app() diff --git a/python/tests/unit/test_conditions.py b/python/tests/unit/test_conditions.py index aa619843..165a9e66 100644 --- a/python/tests/unit/test_conditions.py +++ b/python/tests/unit/test_conditions.py @@ -24,6 +24,9 @@ AsynchronousEventState, BooleanCondition, CountCondition, + CudaBufferAvailableCondition, + CudaEventCondition, + CudaStreamCondition, DownstreamMessageAffordableCondition, ExpiringMessageAvailableCondition, MessageAvailableCondition, @@ -298,6 +301,86 @@ def test_invalid_recess_period_type(self, app): PeriodicCondition(app, recess_period="100s", name="periodic") +class TestCudaEventCondition: + def test_kwarg_based_initialization(self, app): # , capfd): + name = "cuda_event_condition" + event_name = "cuda_event" + cond = CudaEventCondition( + fragment=app, + event_name=event_name, + name=name, + ) + assert isinstance(cond, GXFCondition) + assert isinstance(cond, Condition) + assert cond.gxf_typename == "nvidia::gxf::CudaEventSchedulingTerm" + + assert f""" +name: {name} +fragment: "" +args: + - name: event_name + type: std::string + value: {event_name} +""" in repr(cond) + + # assert no warnings or errors logged + # captured = capfd.readouterr() + # assert "error" not in captured.err + # assert "warning" not in captured.err + pass + + def test_default_initialization(self, app): + CudaEventCondition(app) + + +class TestCudaStreamCondition: + def test_kwarg_based_initialization(self, app, capfd): + name = "cuda_stream_condition" + cond = CudaStreamCondition(fragment=app, name=name) + assert isinstance(cond, 
GXFCondition) + assert isinstance(cond, Condition) + assert cond.gxf_typename == "nvidia::gxf::CudaStreamSchedulingTerm" + + assert f""" +name: {name} +fragment: "" +args: + [] +""" in repr(cond) + + # assert no warnings or errors logged + captured = capfd.readouterr() + assert "error" not in captured.err + assert "warning" not in captured.err + + def test_default_initialization(self, app): + CudaStreamCondition(app) + + +class TestCudaBufferAvailableCondition: + def test_kwarg_based_initialization(self, app, capfd): + name = "cuda_buffer_available_condition" + cond = CudaBufferAvailableCondition(fragment=app, name=name) + assert isinstance(cond, GXFCondition) + assert isinstance(cond, Condition) + assert cond.gxf_typename == "nvidia::gxf::CudaBufferAvailableSchedulingTerm" + + assert f""" +name: {name} +fragment: "" +args: + [] +""" in repr(cond) + + # assert no warnings or errors logged + captured = capfd.readouterr() + assert "error" not in captured.err + assert "warning" not in captured.err + + def test_default_initialization(self, app): + CudaBufferAvailableCondition(app) + + #################################################################################################### # Test Ping app with no conditions on Rx operator #################################################################################################### diff --git a/python/tests/unit/test_operators_native.py b/python/tests/unit/test_operators_native.py index 1a479cdc..c1663d1c 100644 --- a/python/tests/unit/test_operators_native.py +++ b/python/tests/unit/test_operators_native.py @@ -1019,7 +1019,7 @@ def setup(self, spec: OperatorSpec): spec.input("in") def compute(self, op_input, op_output, context): - # TODO: Holoviz outputs a video buffer, but there is no support for video buffers in Python # noqa: FIX002, E501 + # TODO(unknown): Holoviz outputs a video buffer, but there is no support for video buffers in Python # noqa: FIX002, E501 # yet pass # message = op_input.receive("in") diff --git a/python/tests/unit/test_resources.py b/python/tests/unit/test_resources.py index 3e57a1b0..1ecdab65 100644 --- a/python/tests/unit/test_resources.py +++ b/python/tests/unit/test_resources.py @@ -23,6 +23,7 @@ Allocator, BlockMemoryPool, Clock, + CudaAllocator, CudaStreamPool, DoubleBufferReceiver, DoubleBufferTransmitter, @@ -30,9 +31,11 @@ MemoryStorageType, RealtimeClock, Receiver, + RMMAllocator, SerializationBuffer, StdComponentSerializer, StdEntitySerializer, + StreamOrderedAllocator, Transmitter, UcxComponentSerializer, UcxEntitySerializer, @@ -134,6 +137,65 @@ def test_default_initialization(self, app): UnboundedAllocator(app) +class TestRMMAllocator: + def test_kwarg_based_initialization(self, app, capfd): + name = "rmm-pool" + pool = RMMAllocator( + fragment=app, + name=name, + # can specify with or without space between number and unit + # supported units are B, KB, MB, GB, and TB. 
+ device_memory_initial_size="16 MB", + device_memory_max_size="32MB", + host_memory_initial_size="16.0 MB", + host_memory_max_size="32768 KB", + dev_id=0, + ) + assert isinstance(pool, CudaAllocator) + assert isinstance(pool, Allocator) + assert isinstance(pool, GXFResource) + assert isinstance(pool, ResourceBase) + assert pool.id == -1 + assert pool.gxf_typename == "nvidia::gxf::RMMAllocator" + + assert f"name: {name}" in repr(pool) + + # assert no warnings or errors logged + captured = capfd.readouterr() + assert "error" not in captured.err + + def test_default_initialization(self, app): + RMMAllocator(app) + + +class TestStreamOrderedAllocator: + def test_kwarg_based_initialization(self, app, capfd): + name = "stream-ordered-pool" + pool = StreamOrderedAllocator( + fragment=app, + name=name, + device_memory_initial_size="16MB", + device_memory_max_size="32MB", + release_threshold="0B", + dev_id=0, + ) + assert isinstance(pool, CudaAllocator) + assert isinstance(pool, Allocator) + assert isinstance(pool, GXFResource) + assert isinstance(pool, ResourceBase) + assert pool.id == -1 + assert pool.gxf_typename == "nvidia::gxf::StreamOrderedAllocator" + + assert f"name: {name}" in repr(pool) + + # assert no warnings or errors logged + captured = capfd.readouterr() + assert "error" not in captured.err + + def test_default_initialization(self, app): + StreamOrderedAllocator(app) + + class TestStdDoubleBufferReceiver: def test_kwarg_based_initialization(self, app, capfd): name = "db-receiver" @@ -408,7 +470,7 @@ def test_kwarg_based_initialization(self, app, capfd): assert isinstance(res, ResourceBase) assert isinstance(res, Receiver) assert res.id == -1 - assert res.gxf_typename == "nvidia::gxf::UcxReceiver" + assert res.gxf_typename == "holoscan::HoloscanUcxReceiver" assert f"name: {name}" in repr(res) # assert no warnings or errors logged @@ -442,7 +504,7 @@ def test_kwarg_based_initialization(self, app, capfd): assert isinstance(res, ResourceBase) assert isinstance(res, Transmitter) assert res.id == -1 - assert res.gxf_typename == "nvidia::gxf::UcxTransmitter" + assert res.gxf_typename == "holoscan::HoloscanUcxTransmitter" assert f"name: {name}" in repr(res) # assert no warnings or errors logged diff --git a/run b/run index da7dd5b2..835deb17 100755 --- a/run +++ b/run @@ -487,6 +487,9 @@ Arguments: a CMakeCache.txt does not exist in the build directory or if CMake detects a reconfigure is needed. Default: false + --tidy [true | false] : Build the SDK with clang-tidy (will be slower) + Default: false + Associated environment variable: HOLOSCAN_ENABLE_CLANG_TIDY ' } build() { @@ -514,6 +517,7 @@ build() { local install_prefix="${CMAKE_INSTALL_PREFIX:-$(get_install_dir)}" local reconfigure=false local build_libtorch="${HOLOSCAN_BUILD_LIBTORCH:-'ON'}" + local enable_clang_tidy="${HOLOSCAN_ENABLE_CLANG_TIDY:-'OFF'}" local config_args="" # Parse args @@ -529,6 +533,15 @@ build() { shift shift ;; + --tidy) + tidy_val=$(get_boolean "$2") + if [ "$tidy_val" == "true" ]; then + enable_clang_tidy='ON' + fi + reconfigure=true + shift + shift + ;; --cudaarchs) cuda_archs=$(get_cuda_archs "$2") reconfigure=true @@ -657,6 +670,7 @@ build() { if [ ! -f '${build_path}/build.ninja' ] || ${reconfigure} ; then cmake -S .
-B ${build_path} -G Ninja \ -D HOLOSCAN_BUILD_LIBTORCH=${build_libtorch} \ + -D HOLOSCAN_ENABLE_CLANG_TIDY=${enable_clang_tidy} \ -D CMAKE_CUDA_ARCHITECTURES='${cuda_archs}' \ -D CMAKE_BUILD_TYPE=${build_type} \ ${config_args} @@ -981,7 +995,7 @@ launch() { if [ -n "$video_id" ]; then groups+=" --group-add $video_id" fi - render_id=$(get_group_id rnder) + render_id=$(get_group_id render) if [ -n "$render_id" ]; then groups+=" --group-add $render_id" fi diff --git a/runtime_docker/Dockerfile b/runtime_docker/Dockerfile index a6a458ad..290ec48c 100644 --- a/runtime_docker/Dockerfile +++ b/runtime_docker/Dockerfile @@ -25,7 +25,7 @@ FROM ${BUILD_IMAGE} AS build # Base Image ############################################################ # CUDA-base image is ~250 MB and contains the CUDA runtime + sets PATH and LD_LIBRARY_PATH -FROM nvcr.io/nvidia/cuda:12.2.0-base-ubuntu22.04 AS base +FROM nvcr.io/nvidia/cuda:12.6.0-base-ubuntu22.04 AS base # Variables inherited by all downstream stages ARG HOST_INSTALL_DIR @@ -56,21 +56,35 @@ FROM base AS runtime_cpp_no_mkl # libv4l2 - V4L2 operator dependency # libpng16-16 - torchvision dependency # libjpeg8 - torchvision dependency -# libnvonnxparsers - TensorRT dependency # libnccl2 - libtorch & CuPy dependency -# libnvinfer-plugin - TensorRT dependency (Also installs libnvinfer8, libcublas, and libcudnn) -# libcublas - TensorRT & OnnxRT dependency, installed by libnvinfer8 (Installed explicitly to ensure correct version) +# libnvinfer-plugin - TensorRT dependency +# libnvonnxparsers - TensorRT dependency +# libnvinfer - TensorRT dependency (Installed explicitly to match required version) +# libcublas - OnnxRT dependency # libnpp - Needed by format_converter & bayer_demosaic # libcufft - Holoscan-python-core/OnnxRT dependency # libcurand - libtorch & CuPy dependency # libcusparse - libtorch & CuPy dependency +# libcusparseLt - libtorch dependency (Patch package file installs to match TensorRT base container) # cuda-nvrtc - libtorch & CuPy dependency # libnvjitlink - libtorch & CuPy dependency # libcusolver - libtorch & CuPy dependency # cuda-cupti - libtorch & CuPy dependency # cuda-nvtx - libtorch & CuPy dependency +# cudnn9-cuda - libtorch & ONNX Runtime dependency # libcudnn_train.so is removed since training is not needed in a runtime environment (saves ~200 MB) +ARG GPU_TYPE RUN apt-get update \ + && if [ $(uname -m) = "aarch64" ] && [ ${GPU_TYPE} = "dgpu" ]; then \ + dpkg-divert --rename --divert /usr/local/cuda/lib64/libcusparseLt.so.0.6.2.3 \ + /usr/lib/sbsa-linux-gnu/libcusparseLt.so.0.6.2.3 \ + && dpkg-divert --rename --divert /usr/local/cuda/lib64/libcusparseLt.so.0 \ + /usr/lib/sbsa-linux-gnu/libcusparseLt.so.0 \ + && dpkg-divert --rename --divert /usr/local/cuda/lib64/libcusparseLt.so \ + /usr/lib/sbsa-linux-gnu/libcusparseLt.so \ + && dpkg-divert --rename --divert /usr/local/cuda/include/cusparseLt.h \ + /usr/include/cusparseLt.h \ + ; fi \ && apt-get install --no-install-recommends --allow-downgrades -y \ libx11-6="2:1.7.5-*" \ libxcursor1="1:1.2.0-*" \ @@ -89,36 +103,50 @@ RUN apt-get update \ libv4l-0="1.22.1-*" \ libpng16-16="1.6.37-*" \ libjpeg-turbo8="2.1.2-*" \ - libnvinfer-plugin8="8.6.*+cuda12.0" \ - libnvonnxparsers8="8.6.*+cuda12.0" \ - libnccl2="2.19*+cuda12.2" \ - libcublas-12-2 \ - libnpp-12-2 \ - libcufft-12-2 \ - libcurand-12-2 \ - libcusparse-12-2 \ - cuda-nvrtc-12-2 \ - libnvjitlink-12-2 \ - libcusolver-12-2 \ - cuda-cupti-12-2 \ - cuda-nvtx-12-2 \ + libnvinfer-plugin10="10.3.*+cuda12.5" \ + libnvinfer10="10.3.*+cuda12.5" \ + 
libnvonnxparsers10="10.3.*+cuda12.5" \ + libnccl2="2.22*+cuda12.6" \ + libcublas-12-6 \ + libnpp-12-6 \ + libcufft-12-6 \ + libcurand-12-6 \ + libcusparse-12-6 \ + libcusparselt0="0.6.2.3-*" \ + cuda-nvrtc-12-6 \ + libnvjitlink-12-6 \ + libcusolver-12-6 \ + cuda-cupti-12-6 \ + cuda-nvtx-12-6 \ + cudnn9-cuda-12-6 \ && rm -rf /var/lib/apt/lists/* \ && rm -f /usr/lib/*/libcudnn*train.so* +# Install NVIDIA Performance Libraries on arm64 dGPU platform +# as a runtime requirement for the Holoinfer `libtorch` backend (2.5.0). +ARG GPU_TYPE +RUN if [ $(uname -m) = "aarch64" ] && [ ${GPU_TYPE} = "dgpu" ]; then \ + apt-get update \ + && apt-get install --no-install-recommends -y \ + nvpl-blas=0.2.0.1-* \ + nvpl-lapack=0.2.2.1-* \ + && rm -rf /var/lib/apt/lists/* \ + ; fi + # Copy ONNX Runtime -ARG ONNX_RUNTIME_VERSION=1.15.1_23.08 +ARG ONNX_RUNTIME_VERSION=1.18.1_38712740_24.08-cuda-12.6 ENV ONNX_RUNTIME=/opt/onnxruntime/${ONNX_RUNTIME_VERSION}/lib COPY --from=build ${ONNX_RUNTIME} ${ONNX_RUNTIME} ENV CMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}:${ONNX_RUNTIME}" # Copy Libtorch -ARG LIBTORCH_VERSION=2.1.0_23.08 +ARG LIBTORCH_VERSION=2.5.0_24.08 ENV LIBTORCH=/opt/libtorch/${LIBTORCH_VERSION}/lib COPY --from=build ${LIBTORCH} ${LIBTORCH} ENV CMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}:${LIBTORCH}" # Copy TorchVision -ARG TORCHVISION_VERSION=0.16.0_23.08 +ARG TORCHVISION_VERSION=0.20.0_24.08 ENV TORCHVISION=/opt/torchvision/${TORCHVISION_VERSION}/lib COPY --from=build ${TORCHVISION} ${TORCHVISION} ENV CMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}:${TORCHVISION}" @@ -130,8 +158,6 @@ COPY --from=build /opt/hpcx/ompi/lib/libopen-rte.so.40 $INSTALL_PATH/lib/libopen COPY --from=build /opt/hpcx/ompi/lib/libopen-pal.so.40 $INSTALL_PATH/lib/libopen-pal.so.40 # Install GRDAPI (needed by Holoscan-core) -ENV os=ubuntu2204 -ENV tag=8.6.1-cuda-12.0 RUN if [ $(uname -m) = "aarch64" ]; then \ GDR_REPO_ARCH=aarch64 DEB_ARCH=arm64 CUDA_REPO_ARCH=arm64 ; \ else \ @@ -139,8 +165,6 @@ RUN if [ $(uname -m) = "aarch64" ]; then \ fi \ && apt-get update -y \ && apt-get install --no-install-recommends -y curl \ - && curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${CUDA_REPO_ARCH}/cuda-keyring_1.0-1_all.deb \ - && dpkg -i cuda-keyring_1.0-1_all.deb \ && curl -O https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.2/ubuntu22_04/${GDR_REPO_ARCH}/libgdrapi_2.4-1_${DEB_ARCH}.Ubuntu22_04.deb \ && dpkg -i libgdrapi_2.4-1_${DEB_ARCH}.Ubuntu22_04.deb \ && rm -f libgdrapi_2.4-1_${DEB_ARCH}.Ubuntu22_04.deb \ @@ -164,7 +188,7 @@ WORKDIR $INSTALL_PATH ############################################################# # Runtime C++ + pip + MKL # -# MKL is an x8_64 C++ Holoscan dependency installed with PIP, +# MKL is an x86_64 C++ Holoscan dependency installed with PIP, # so Python must be installed. However, this stage does NOT # support running Python Holoscan apps. 
############################################################# diff --git a/scripts/gxf_entity_codec.py b/scripts/gxf_entity_codec.py index ebb3ba34..b48360d9 100644 --- a/scripts/gxf_entity_codec.py +++ b/scripts/gxf_entity_codec.py @@ -19,13 +19,14 @@ import os import struct import time +from collections.abc import Generator, Sequence from enum import Enum from io import BufferedIOBase, BytesIO -from typing import Any, Generator, List, Sequence, Tuple, Union +from typing import Any, Union import numpy as np -ArrayLike = Union[np.ndarray, List[float]] +ArrayLike = Union[np.ndarray, list[float]] ReadOnlyBuffer = bytes WriteableBuffer = Union[bytearray, memoryview] ReadableBuffer = Union[ReadOnlyBuffer, WriteableBuffer] @@ -54,7 +55,7 @@ class EntityIndex: def __init__( self, *, - data: Tuple[int, int, int] = None, + data: tuple[int, int, int] = None, buffer: Sequence = None, reader: BufferedIOBase = None, offset: int = 0, @@ -87,7 +88,7 @@ def __repr__(self) -> str: def read( self, *, - data: Tuple[int, int, int] = None, + data: tuple[int, int, int] = None, buffer: ReadableBuffer = None, reader: BufferedIOBase = None, offset: int = 0, @@ -170,7 +171,7 @@ class EntityHeader: def __init__( self, *, - data: Tuple[int, int, int, int, int, int] = None, + data: tuple[int, int, int, int, int, int] = None, buffer: Sequence = None, reader: BufferedIOBase = None, offset: int = 0, @@ -208,7 +209,7 @@ def __repr__(self) -> str: def deserialize( self, *, - data: Tuple[int, int, int, int, int, int] = None, + data: tuple[int, int, int, int, int, int] = None, buffer: ReadableBuffer = None, reader: BufferedIOBase = None, offset: int = 0, @@ -310,7 +311,7 @@ class ComponentHeader: def __init__( self, *, - data: Tuple[int, int, int, int] = None, + data: tuple[int, int, int, int] = None, buffer: Sequence = None, reader: BufferedIOBase = None, offset: int = 0, @@ -336,7 +337,7 @@ def __repr__(self) -> str: def deserialize( self, *, - data: Tuple[int, int, int, int] = None, + data: tuple[int, int, int, int] = None, buffer: ReadableBuffer = None, reader: BufferedIOBase = None, offset: int = 0, @@ -462,7 +463,7 @@ class TensorHeader: def __init__( self, *, - data: Tuple[int, int, int, int, Tuple[int, ...], Tuple[int, ...]] = None, + data: tuple[int, int, int, int, tuple[int, ...], tuple[int, ...]] = None, buffer: Sequence = None, reader: BufferedIOBase = None, offset: int = 0, @@ -487,11 +488,11 @@ def rank(self) -> int: return self._rank @property - def dims(self) -> Tuple[int, ...]: + def dims(self) -> tuple[int, ...]: return self._dims @property - def strides(self) -> Tuple[int, ...]: + def strides(self) -> tuple[int, ...]: return self._strides @property @@ -504,7 +505,7 @@ def __repr__(self) -> str: def deserialize( self, *, - data: Tuple[int, int, int, int, Tuple[int, ...], Tuple[int, ...]] = None, + data: tuple[int, int, int, int, tuple[int, ...], tuple[int, ...]] = None, buffer: ReadableBuffer = None, reader: BufferedIOBase = None, offset: int = 0, @@ -585,7 +586,7 @@ class Tensor: def __init__( self, *, - data: Tuple[TensorHeader, ArrayLike] = None, + data: tuple[TensorHeader, ArrayLike] = None, buffer: Sequence = None, reader: BufferedIOBase = None, offset: int = 0, @@ -611,7 +612,7 @@ def __repr__(self) -> str: def read( self, *, - data: Tuple[TensorHeader, ArrayLike] = None, + data: tuple[TensorHeader, ArrayLike] = None, buffer: ReadableBuffer = None, reader: BufferedIOBase = None, offset: int = 0, @@ -678,7 +679,7 @@ class Entity: def __init__( self, *, - data: Tuple[EntityHeader, List["Component"]] = 
None, + data: tuple[EntityHeader, list["Component"]] = None, buffer: Sequence = None, reader: BufferedIOBase = None, offset: int = 0, @@ -691,7 +692,7 @@ def header(self) -> EntityHeader: return self._header @property - def components(self) -> List["Component"]: + def components(self) -> list["Component"]: return self._components @property @@ -726,7 +727,7 @@ def create(sequence_number: int, array: ArrayLike) -> None: def read( self, *, - data: Tuple[EntityHeader, List["Component"]] = None, + data: tuple[EntityHeader, list["Component"]] = None, buffer: ReadableBuffer = None, reader: BufferedIOBase = None, offset: int = 0, @@ -793,7 +794,7 @@ class Component: def __init__( self, *, - data: Tuple[ComponentHeader, str, Tensor] = None, + data: tuple[ComponentHeader, str, Tensor] = None, buffer: Sequence = None, reader: BufferedIOBase = None, offset: int = 0, @@ -823,7 +824,7 @@ def __repr__(self) -> str: def read( self, *, - data: Tuple[ComponentHeader, str, Tensor] = None, + data: tuple[ComponentHeader, str, Tensor] = None, buffer: ReadableBuffer = None, reader: BufferedIOBase = None, offset: int = 0, diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 90cabb8e..761df246 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -136,6 +136,9 @@ add_holoscan_library(core core/conditions/gxf/asynchronous.cpp core/conditions/gxf/boolean.cpp core/conditions/gxf/count.cpp + core/conditions/gxf/cuda_buffer_available.cpp + core/conditions/gxf/cuda_event.cpp + core/conditions/gxf/cuda_stream.cpp core/conditions/gxf/downstream_affordable.cpp core/conditions/gxf/periodic.cpp core/conditions/gxf/message_available.cpp @@ -147,6 +150,7 @@ add_holoscan_library(core core/errors.cpp core/executors/gxf/gxf_executor.cpp core/executors/gxf/gxf_parameter_adaptor.cpp + core/flow_tracking_annotation.cpp core/fragment.cpp core/fragment_scheduler.cpp core/graphs/flow_graph.cpp @@ -176,17 +180,22 @@ add_holoscan_library(core core/resources/gxf/annotated_double_buffer_transmitter.cpp core/resources/gxf/block_memory_pool.cpp core/resources/gxf/clock.cpp + core/resources/gxf/cuda_allocator.cpp core/resources/gxf/cuda_stream_pool.cpp core/resources/gxf/double_buffer_receiver.cpp core/resources/gxf/double_buffer_transmitter.cpp core/resources/gxf/dfft_collector.cpp core/resources/gxf/gxf_component_resource.cpp + core/resources/gxf/holoscan_ucx_receiver.cpp + core/resources/gxf/holoscan_ucx_transmitter.cpp core/resources/gxf/manual_clock.cpp core/resources/gxf/realtime_clock.cpp core/resources/gxf/receiver.cpp + core/resources/gxf/rmm_allocator.cpp core/resources/gxf/serialization_buffer.cpp core/resources/gxf/std_component_serializer.cpp core/resources/gxf/std_entity_serializer.cpp + core/resources/gxf/stream_ordered_allocator.cpp core/resources/gxf/transmitter.cpp core/resources/gxf/ucx_component_serializer.cpp core/resources/gxf/ucx_entity_serializer.cpp @@ -228,6 +237,7 @@ target_link_libraries(core GXF::app GXF::core GXF::cuda + GXF::rmm GXF::serialization # for nvidia::gxf::Endpoint GXF::std GXF::ucx diff --git a/src/core/app_driver.cpp b/src/core/app_driver.cpp index 11477b62..19b8b2d4 100644 --- a/src/core/app_driver.cpp +++ b/src/core/app_driver.cpp @@ -50,6 +50,7 @@ #include "holoscan/core/signal_handler.hpp" #include "holoscan/core/system/network_utils.hpp" #include "holoscan/core/system/system_resource_manager.hpp" +#include "holoscan/utils/cuda_macros.hpp" #include "services/app_worker/client.hpp" #include "holoscan/logger/logger.hpp" @@ -212,10 +213,7 @@ void AppDriver::run() { if (connection_result) 
{ HOLOSCAN_LOG_INFO("Connected to driver"); // Install signal handler for app worker - auto sig_handler = [this](void* context, int signum) { - (void)signum; - (void)context; - + auto sig_handler = [this]([[maybe_unused]] void* context, [[maybe_unused]] int signum) { HOLOSCAN_LOG_ERROR("Interrupted by user for app worker"); auto worker_server = app_->worker().server(); @@ -244,10 +242,7 @@ void AppDriver::run() { } } else { // Install signal handler for app driver - auto sig_handler = [this](void* context, int signum) { - (void)signum; - (void)context; - + auto sig_handler = [this]([[maybe_unused]] void* context, [[maybe_unused]] int signum) { HOLOSCAN_LOG_ERROR("Interrupted by user for app driver"); // If the app is already in error state, we set global signal handler. @@ -255,8 +250,7 @@ void AppDriver::run() { HOLOSCAN_LOG_ERROR("Send interrupt once more to terminate immediately"); SignalHandler::unregister_signal_handler(context, signum); // Register the global signal handler. - SignalHandler::register_global_signal_handler(signum, [](int sig) { - (void)sig; + SignalHandler::register_global_signal_handler(signum, []([[maybe_unused]] int sig) { HOLOSCAN_LOG_ERROR("Interrupted by user (global signal handler)"); exit(1); }); @@ -681,7 +675,8 @@ bool AppDriver::collect_connections(holoscan::FragmentGraph& fragment_graph) { } } } - const auto& frag = worklist.front(); + // Get (copy) shared pointer before popping it from the worklist. + auto frag = worklist.front(); const auto& frag_name = frag->name(); worklist.pop_front(); @@ -1249,9 +1244,18 @@ std::future AppDriver::launch_fragments_async( // Disable CUDA Interprocess Communication (issue 4318442) set_ucx_to_exclude_cuda_ipc(); + int gpu_count = 0; + cudaError_t cuda_err = HOLOSCAN_CUDA_CALL_WARN_MSG( + cudaGetDeviceCount(&gpu_count), "Initializing UcxContext with support for CPU data only"); + if (cuda_err == cudaSuccess) { + HOLOSCAN_LOG_DEBUG("Detected {} GPU(s), initializing UcxContext with GPU support", gpu_count); + } + // Add the UCX network context + bool enable_async = get_bool_env_var("HOLOSCAN_UCX_ASYNCHRONOUS", true); for (auto& fragment : target_fragments) { - auto network_context = fragment->make_network_context("ucx_context"); + auto network_context = fragment->make_network_context( + "ucx_context", Arg("cpu_data_only", gpu_count == 0), Arg("enable_async", enable_async)); fragment->network_context(network_context); } diff --git a/src/core/app_worker.cpp b/src/core/app_worker.cpp index f70e7747..46d1a89a 100644 --- a/src/core/app_worker.cpp +++ b/src/core/app_worker.cpp @@ -34,6 +34,7 @@ #include "holoscan/core/network_contexts/gxf/ucx_context.hpp" #include "holoscan/core/schedulers/gxf/multithread_scheduler.hpp" #include "holoscan/core/services/app_worker/server.hpp" +#include "holoscan/utils/cuda_macros.hpp" #include "holoscan/logger/logger.hpp" @@ -120,9 +121,18 @@ bool AppWorker::execute_fragments( } } + int gpu_count = 0; + cudaError_t cuda_err = HOLOSCAN_CUDA_CALL_WARN_MSG( + cudaGetDeviceCount(&gpu_count), "Initializing UcxContext with support for CPU data only"); + if (cuda_err == cudaSuccess) { + HOLOSCAN_LOG_DEBUG("Detected {} GPU(s), initializing UcxContext with GPU support", gpu_count); + } + // Add the UCX network context + bool enable_async = AppDriver::get_bool_env_var("HOLOSCAN_UCX_ASYNCHRONOUS", true); for (auto& fragment : scheduled_fragments) { - auto network_context = fragment->make_network_context("ucx_context"); + auto network_context = fragment->make_network_context( + "ucx_context", 
Arg("cpu_data_only", gpu_count == 0), Arg("enable_async", enable_async)); fragment->network_context(network_context); } diff --git a/src/core/application.cpp b/src/core/application.cpp index d77c8056..e3e39ee1 100644 --- a/src/core/application.cpp +++ b/src/core/application.cpp @@ -22,11 +22,13 @@ #include #include #include +#include #include #include #include "holoscan/core/app_driver.hpp" #include "holoscan/core/config.hpp" +#include "holoscan/core/dataflow_tracker.hpp" #include "holoscan/core/executor.hpp" #include "holoscan/core/graphs/flow_graph.hpp" #include "holoscan/core/operator.hpp" @@ -220,6 +222,24 @@ std::future Application::run_async() { return driver().run_async(); } +std::unordered_map Application::track_distributed( + uint64_t num_start_messages_to_skip, uint64_t num_last_messages_to_discard, + int latency_threshold) { + if (!is_composed_) { compose_graph(); } + std::unordered_map trackers; + auto& frag_graph = fragment_graph(); + // iterate over all nodes in frag_graph + for (const auto& each_fragment : frag_graph.get_nodes()) { + // if track has not been called on the fragment, then call the tracker + if (!each_fragment->data_flow_tracker()) { + each_fragment->track( + num_start_messages_to_skip, num_last_messages_to_discard, latency_threshold); + } + trackers[each_fragment->name()] = each_fragment->data_flow_tracker(); + } + return trackers; +} + AppDriver& Application::driver() { if (!app_driver_) { app_driver_ = std::make_shared(this); } return *app_driver_; @@ -374,7 +394,7 @@ void Application::set_scheduler_for_fragments(std::vector& tar // If it is, then we should use the default scheduler. // Otherwise, we should set new multi-thread scheduler. - // TODO: consider use of event-based scheduler? + // TODO(unknown): consider use of event-based scheduler? auto multi_thread_scheduler = std::dynamic_pointer_cast(scheduler); if (!multi_thread_scheduler) { scheduler_setting = SchedulerType::kMultiThread; } @@ -416,7 +436,7 @@ void Application::set_scheduler_for_fragments(std::vector& tar scheduler = fragment->make_scheduler("event-based-scheduler"); unsigned int num_processors = std::thread::hardware_concurrency(); - // TODO: check number of threads setting needed for event-based scheduler + // TODO(unknown): check number of threads setting needed for event-based scheduler // Currently, we use the number of operators in the fragment as the number of worker threads int64_t worker_thread_number = std::min(fragment->graph().get_nodes().size(), static_cast(num_processors)); diff --git a/src/core/cli_options.cpp b/src/core/cli_options.cpp index 607e3fe0..cc1e8098 100644 --- a/src/core/cli_options.cpp +++ b/src/core/cli_options.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -35,8 +35,8 @@ static bool is_valid_ipv4(const std::string& ip) { } static bool is_valid_ipv6(const std::string& ip) { - struct sockaddr_in sa; - return inet_pton(AF_INET6, ip.c_str(), &(sa.sin_addr)) != 0; + struct sockaddr_in6 sa; + return inet_pton(AF_INET6, ip.c_str(), &(sa.sin6_addr)) != 0; } std::string CLIOptions::resolve_hostname(const std::string& hostname) { diff --git a/src/core/conditions/gxf/asynchronous.cpp b/src/core/conditions/gxf/asynchronous.cpp index 07b19dff..f8944ab6 100644 --- a/src/core/conditions/gxf/asynchronous.cpp +++ b/src/core/conditions/gxf/asynchronous.cpp @@ -31,10 +31,6 @@ nvidia::gxf::AsynchronousSchedulingTerm* AsynchronousCondition::get() const { return static_cast(gxf_cptr_); } -void AsynchronousCondition::setup(ComponentSpec& spec) { - (void)spec; // no parameters to set -} - void AsynchronousCondition::event_state(AsynchronousEventState state) { auto asynchronous_scheduling_term = get(); if (asynchronous_scheduling_term) { asynchronous_scheduling_term->setEventState(state); } diff --git a/src/core/conditions/gxf/cuda_buffer_available.cpp b/src/core/conditions/gxf/cuda_buffer_available.cpp new file mode 100644 index 00000000..533b813f --- /dev/null +++ b/src/core/conditions/gxf/cuda_buffer_available.cpp @@ -0,0 +1,41 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "holoscan/core/conditions/gxf/cuda_buffer_available.hpp" + +#include + +#include "holoscan/core/component_spec.hpp" + +namespace holoscan { + +CudaBufferAvailableCondition::CudaBufferAvailableCondition( + const std::string& name, nvidia::gxf::CudaBufferAvailableSchedulingTerm* term) + : GXFCondition(name, term) {} + +nvidia::gxf::CudaBufferAvailableSchedulingTerm* CudaBufferAvailableCondition::get() const { + return static_cast(gxf_cptr_); +} + +void CudaBufferAvailableCondition::setup(ComponentSpec& spec) { + spec.param(receiver_, + "receiver", + "Queue channel", + "The receiver on which data will be available once the stream completes."); +} + +} // namespace holoscan diff --git a/src/core/conditions/gxf/cuda_event.cpp b/src/core/conditions/gxf/cuda_event.cpp new file mode 100644 index 00000000..96f77ab4 --- /dev/null +++ b/src/core/conditions/gxf/cuda_event.cpp @@ -0,0 +1,46 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "holoscan/core/conditions/gxf/cuda_event.hpp" + +#include + +#include "holoscan/core/component_spec.hpp" + +namespace holoscan { + +CudaEventCondition::CudaEventCondition(const std::string& name, + nvidia::gxf::CudaEventSchedulingTerm* term) + : GXFCondition(name, term) {} + +nvidia::gxf::CudaEventSchedulingTerm* CudaEventCondition::get() const { + return static_cast(gxf_cptr_); +} + +void CudaEventCondition::setup(ComponentSpec& spec) { + spec.param(receiver_, + "receiver", + "Queue channel", + "The receiver on which data will be available once the stream completes."); + spec.param(event_name_, + "event_name", + "Event name", + "The event name on which the cudaEventQuery API is called to get the status", + std::string("")); +} + +} // namespace holoscan diff --git a/src/core/conditions/gxf/cuda_stream.cpp b/src/core/conditions/gxf/cuda_stream.cpp new file mode 100644 index 00000000..d4af25f4 --- /dev/null +++ b/src/core/conditions/gxf/cuda_stream.cpp @@ -0,0 +1,41 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#include "holoscan/core/conditions/gxf/cuda_stream.hpp" + +#include + +#include "holoscan/core/component_spec.hpp" + +namespace holoscan { + +CudaStreamCondition::CudaStreamCondition(const std::string& name, + nvidia::gxf::CudaStreamSchedulingTerm* term) + : GXFCondition(name, term) {} + +nvidia::gxf::CudaStreamSchedulingTerm* CudaStreamCondition::get() const { + return static_cast(gxf_cptr_); +} + +void CudaStreamCondition::setup(ComponentSpec& spec) { + spec.param(receiver_, + "receiver", + "Queue channel", + "The receiver on which data will be available once the stream completes."); +} + +} // namespace holoscan diff --git a/src/core/dataflow_tracker.cpp b/src/core/dataflow_tracker.cpp index a617f786..41004b1d 100644 --- a/src/core/dataflow_tracker.cpp +++ b/src/core/dataflow_tracker.cpp @@ -60,9 +60,13 @@ void DataFlowTracker::print() const { std::cout << "\n"; } - std::cout << "Number of source messages [format: source operator->transmitter name: number of " "messages]:\n"; - for (const auto& it : source_messages_) { std::cout << it.first << ": " << it.second << "\n"; } + if (source_messages_.empty()) { + std::cout << "No source messages found.\n"; + } else { + std::cout << "Number of source messages [format: source operator->transmitter name: number of " "messages]:\n"; + for (const auto& it : source_messages_) { std::cout << it.first << ": " << it.second << "\n"; } + } std::cout.flush(); // flush standard output; otherwise output may not be printed } diff --git a/src/core/executors/gxf/gxf_executor.cpp b/src/core/executors/gxf/gxf_executor.cpp index 181ca6fa..1a93753f 100644 --- a/src/core/executors/gxf/gxf_executor.cpp +++ b/src/core/executors/gxf/gxf_executor.cpp @@ -64,6 +64,8 @@ #include "holoscan/core/resources/gxf/dfft_collector.hpp" #include "holoscan/core/resources/gxf/double_buffer_receiver.hpp" #include "holoscan/core/resources/gxf/double_buffer_transmitter.hpp" +#include "holoscan/core/resources/gxf/holoscan_ucx_receiver.hpp" +#include "holoscan/core/resources/gxf/holoscan_ucx_transmitter.hpp" #include "holoscan/core/services/common/forward_op.hpp" #include "holoscan/core/services/common/virtual_operator.hpp" #include "holoscan/core/signal_handler.hpp" @@ -102,8 +104,8 @@ std::pair get_capacity_and_policy( } bool has_ucx_connector(std::shared_ptr graph_entity) { - auto has_ucx_receiver = graph_entity->try_get("nvidia::gxf::UcxReceiver"); - auto has_ucx_transmitter = graph_entity->try_get("nvidia::gxf::UcxTransmitter"); + auto has_ucx_receiver = graph_entity->try_get("holoscan::HoloscanUcxReceiver"); + auto has_ucx_transmitter = graph_entity->try_get("holoscan::HoloscanUcxTransmitter"); return has_ucx_receiver || has_ucx_transmitter; } @@ -113,6 +115,7 @@ static const std::vector kDefaultGXFExtensions{ "libgxf_std.so", "libgxf_cuda.so", "libgxf_multimedia.so", + "libgxf_rmm.so", "libgxf_serialization.so", "libgxf_ucx.so", // UcxContext, UcxReceiver, UcxTransmitter, etc.
}; @@ -385,7 +388,7 @@ void bind_input_port(Fragment* fragment, gxf_context_t gxf_context, gxf_uid_t ei const char* rx_name, IOSpec::ConnectorType rx_type, Operator* op) { // Can't currently use GraphEntity API for this OperatorWrapper/bind_port code path if (rx_type != IOSpec::ConnectorType::kDefault) { - // TODO: update bind_port code path for types other than ConnectorType::kDefault + // TODO(unknown): update bind_port code path for types other than ConnectorType::kDefault throw std::runtime_error(fmt::format( "Unable to support types other than ConnectorType::kDefault (rx_name: '{}')", rx_name)); } @@ -469,14 +472,6 @@ void GXFExecutor::create_input_port(Fragment* fragment, gxf_context_t gxf_contex const char* rx_name = io_spec->name().c_str(); // input port name auto rx_type = io_spec->connector_type(); - if (fragment->data_flow_tracker()) { - if ((rx_type != IOSpec::ConnectorType::kDefault) && - (rx_type != IOSpec::ConnectorType::kDoubleBuffer)) { - throw std::runtime_error( - "Currently the data flow tracking feature requires ConnectorType::kDefault or " - "ConnectorType::kDoubleBuffer."); - } - } auto graph_entity = op->graph_entity(); // If this executor is used by OperatorWrapper (bind_port == true) to wrap Native Operator, @@ -556,9 +551,6 @@ void GXFExecutor::create_input_port(Fragment* fragment, gxf_context_t gxf_contex break; case IOSpec::ConnectorType::kUCX: rx_resource = std::dynamic_pointer_cast(io_spec->connector()); - if (fragment->data_flow_tracker()) { - HOLOSCAN_LOG_ERROR("data flow tracking not implemented for UCX ports"); - } break; default: HOLOSCAN_LOG_ERROR("Unsupported GXF connector_type: '{}'", static_cast(rx_type)); @@ -587,20 +579,23 @@ void GXFExecutor::create_input_port(Fragment* fragment, gxf_context_t gxf_contex rx_resource->add_to_graph_entity(op); if (fragment->data_flow_tracker()) { - holoscan::AnnotatedDoubleBufferReceiver* dbl_ptr; + holoscan::AnnotatedDoubleBufferReceiver* dbl_buffer_ptr; + holoscan::HoloscanUcxReceiver* ucx_ptr; switch (rx_type) { case IOSpec::ConnectorType::kDefault: case IOSpec::ConnectorType::kDoubleBuffer: - dbl_ptr = + dbl_buffer_ptr = reinterpret_cast(rx_resource->gxf_cptr()); - dbl_ptr->op(op); + dbl_buffer_ptr->op(op); break; case IOSpec::ConnectorType::kUCX: - HOLOSCAN_LOG_ERROR("UCX-based receiver doesn't currently support data flow tracking"); + std::dynamic_pointer_cast(rx_resource)->track(); + ucx_ptr = reinterpret_cast(rx_resource->gxf_cptr()); + ucx_ptr->op(op); break; default: HOLOSCAN_LOG_ERROR( - "Annotated data flow tracking not implemented for GXF " + "Data flow tracking not implemented for GXF " "connector_type: '{}'", static_cast(rx_type)); } @@ -660,7 +655,7 @@ void GXFExecutor::create_input_port(Fragment* fragment, gxf_context_t gxf_contex // No condition break; default: - throw std::runtime_error("Unsupported condition type"); // TODO: use std::expected + throw std::runtime_error("Unsupported condition type"); // TODO(unknown): use std::expected } } } @@ -679,7 +674,7 @@ namespace { void bind_output_port(Fragment* fragment, gxf_context_t gxf_context, gxf_uid_t eid, IOSpec* io_spec, const char* tx_name, IOSpec::ConnectorType tx_type, Operator* op) { if (tx_type != IOSpec::ConnectorType::kDefault) { - // TODO: update bind_port code path for types other than ConnectorType::kDefault + // TODO(unknown): update bind_port code path for types other than ConnectorType::kDefault throw std::runtime_error(fmt::format( "Unable to support types other than ConnectorType::kDefault (tx_name: '{}')", tx_name)); } @@ -761,14 
+756,6 @@ void GXFExecutor::create_output_port(Fragment* fragment, gxf_context_t gxf_conte const char* tx_name = io_spec->name().c_str(); auto tx_type = io_spec->connector_type(); - if (fragment->data_flow_tracker()) { - if ((tx_type != IOSpec::ConnectorType::kDefault) && - (tx_type != IOSpec::ConnectorType::kDoubleBuffer)) { - throw std::runtime_error( - "Currently the data flow tracking feature requires ConnectorType::kDefault or " - "ConnectorType::kDoubleBuffer."); - } - } auto graph_entity = op->graph_entity(); // If this executor is used by OperatorWrapper (bind_port == true) to wrap Native Operator, // then we need to call `bind_output_port` to set the existing GXF Transmitter for this output. @@ -803,9 +790,6 @@ void GXFExecutor::create_output_port(Fragment* fragment, gxf_context_t gxf_conte break; case IOSpec::ConnectorType::kUCX: tx_resource = std::dynamic_pointer_cast(io_spec->connector()); - if (fragment->data_flow_tracker()) { - HOLOSCAN_LOG_ERROR("data flow tracking not implemented for UCX ports"); - } break; default: HOLOSCAN_LOG_ERROR("Unsupported GXF connector_type: '{}'", static_cast(tx_type)); @@ -820,20 +804,23 @@ void GXFExecutor::create_output_port(Fragment* fragment, gxf_context_t gxf_conte tx_resource->add_to_graph_entity(op); if (fragment->data_flow_tracker()) { - holoscan::AnnotatedDoubleBufferTransmitter* dbl_ptr; + holoscan::AnnotatedDoubleBufferTransmitter* dbl_buffer_ptr; + holoscan::HoloscanUcxTransmitter* ucx_ptr; switch (tx_type) { case IOSpec::ConnectorType::kDefault: case IOSpec::ConnectorType::kDoubleBuffer: - dbl_ptr = reinterpret_cast( + dbl_buffer_ptr = reinterpret_cast( tx_resource->gxf_cptr()); - dbl_ptr->op(op); + dbl_buffer_ptr->op(op); break; case IOSpec::ConnectorType::kUCX: - HOLOSCAN_LOG_ERROR("UCX-based receiver doesn't currently support data flow tracking"); + std::dynamic_pointer_cast(tx_resource)->track(); + ucx_ptr = reinterpret_cast(tx_resource->gxf_cptr()); + ucx_ptr->op(op); break; default: HOLOSCAN_LOG_ERROR( - "Annotated data flow tracking not implemented for GXF " + "Data flow tracking not implemented for GXF " "connector_type: '{}'", static_cast(tx_type)); } @@ -877,7 +864,7 @@ void GXFExecutor::create_output_port(Fragment* fragment, gxf_context_t gxf_conte // No condition break; default: - throw std::runtime_error("Unsupported condition type"); // TODO: use std::expected + throw std::runtime_error("Unsupported condition type"); // TODO(unknown): use std::expected } } } @@ -1432,7 +1419,8 @@ bool GXFExecutor::initialize_fragment() { } } } - const auto& op = worklist.front(); + // Get (copy) shared pointer before popping it from the worklist. 
+ auto op = worklist.front(); worklist.pop_front(); auto op_spec = op->spec(); @@ -1806,7 +1794,7 @@ bool GXFExecutor::initialize_gxf_graph(OperatorGraph& graph) { bool codelet_statistics = AppDriver::get_bool_env_var("HOLOSCAN_GXF_JOB_STATISTICS_CODELET", false); uint32_t event_history_count = - AppDriver::get_int_env_var("HOLOSCAN_GXF_JOB_STATISTICS_COUNT", 100u); + AppDriver::get_int_env_var("HOLOSCAN_GXF_JOB_STATISTICS_COUNT", 100U); // GXF issue 4552622: can't create FilePath Arg, so we call setParameter below instead std::vector jobstats_args{ @@ -1855,11 +1843,22 @@ bool GXFExecutor::initialize_gxf_graph(OperatorGraph& graph) { // Identify leaf and root operators and add to the DFFTCollector object for (auto& op : graph.get_nodes()) { - if (op->is_leaf()) { - dfft_collector_ptr->add_leaf_op(op.get()); - } else if (op->is_root() || op->is_user_defined_root()) { - dfft_collector_ptr->add_root_op(op.get()); - } + bool is_current_op_leaf = + op->is_leaf() || + holoscan::Operator::is_all_operator_successor_virtual(op, fragment_->graph()); + bool is_current_op_root = + op->is_root() || op->is_user_defined_root() || + holoscan::Operator::is_all_operator_predecessor_virtual(op, fragment_->graph()); + HOLOSCAN_LOG_DEBUG("fragment: {}, operator {}, id: {}, leaf: {}, root: {}", + fragment_->name(), + op->name(), + op->id(), + is_current_op_leaf, + is_current_op_root); + if (is_current_op_leaf) { dfft_collector_ptr->add_leaf_op(op.get()); } + // root and leaf operators may also be the same if there is only one operator in a + // fragment + if (is_current_op_root) { dfft_collector_ptr->add_root_op(op.get()); } } } @@ -1981,16 +1980,14 @@ void GXFExecutor::run_gxf_graph() { auto context = context_; // Install signal handler - auto sig_handler = [](void* context, int signum) { - (void)signum; + auto sig_handler = [](void* context, [[maybe_unused]] int signum) { gxf_result_t code = GxfGraphInterrupt(context); if (code != GXF_SUCCESS) { HOLOSCAN_LOG_ERROR("GxfGraphInterrupt Error: {}", GxfResultStr(code)); HOLOSCAN_LOG_ERROR("Send interrupt once more to terminate immediately"); SignalHandler::unregister_signal_handler(context, signum); // Register the global signal handler. - SignalHandler::register_global_signal_handler(signum, [](int sig) { - (void)sig; + SignalHandler::register_global_signal_handler(signum, []([[maybe_unused]] int sig) { HOLOSCAN_LOG_ERROR("Interrupted by user (global signal handler)"); exit(1); }); @@ -2015,7 +2012,7 @@ void GXFExecutor::run_gxf_graph() { } is_gxf_graph_activated_ = false; - // TODO: do we want to move the log level of these info messages to debug? + // TODO(unknown): do we want to move the log level of these info messages to debug? 
HOLOSCAN_LOG_INFO("{}Graph execution finished.", frag_name_display); // clean up any shared pointers to graph entities within operators, scheulder, network context @@ -2092,6 +2089,12 @@ void GXFExecutor::register_extensions() { nvidia::gxf::DoubleBufferTransmitter>( "Holoscan's annotated double buffer transmitter", {0x444505a86c014d90, 0xab7503bcd0782877}); + extension_factory.add_component( + "Holoscan's annotated ucx receiver", {0x9c8026256e4a4303, 0x865df1fe4428ed32}); + + extension_factory.add_component( + "Holoscan's annotated ucx transmitter", {0x01dbcc609f0942f9, 0x8e04927ac35a6f24}); + extension_factory.add_type("Holoscan message Label", {0x6e09e888ccfa4a32, 0xbc501cd20c8b4337}); diff --git a/src/core/flow_tracking_annotation.cpp b/src/core/flow_tracking_annotation.cpp new file mode 100644 index 00000000..f0f7ff1e --- /dev/null +++ b/src/core/flow_tracking_annotation.cpp @@ -0,0 +1,153 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "holoscan/core/flow_tracking_annotation.hpp" + +#include + +#include "holoscan/core/fragment.hpp" +#include "holoscan/core/gxf/gxf_utils.hpp" +#include "holoscan/core/message.hpp" +#include "holoscan/core/messagelabel.hpp" +#include "holoscan/core/operator.hpp" +#include "holoscan/logger/logger.hpp" + +namespace holoscan { + +gxf_result_t annotate_message(gxf_uid_t uid, const gxf_context_t& context, Operator* op, + const char* transmitter_name) { + HOLOSCAN_LOG_DEBUG("annotate_message"); + if (!op) { + HOLOSCAN_LOG_ERROR("Operator is nullptr. Transmitter: {}", transmitter_name); + return GXF_FAILURE; + } else if (op->operator_type() == Operator::OperatorType::kVirtual) { + HOLOSCAN_LOG_DEBUG("Virtual Operators are not timestamped."); + return GXF_SUCCESS; + } else { + auto gxf_entity = nvidia::gxf::Entity::Shared(context, uid); + if (!gxf_entity) { + HOLOSCAN_LOG_ERROR("Failed to get GXF Entity with uid: {}", uid); + return gxf_entity.error(); + } + // GXF Entity is activated by CreateTensorMap function but it generally is not activated for + // message Entity. Therefore, we should not need to deactivate the received GXF Entity. + // Previously, it was done to resolve the issue in CreateTensorMap. It is being commented out in + // expectation of removal of the Entity activation in CreateTensorMap. 
+ // gxf_entity->deactivate(); + MessageLabel m; + m = std::move(op->get_consolidated_input_label()); + m.update_last_op_publish(); + + // Check if a message_label component already exists in the entity + static gxf_tid_t message_label_tid = GxfTidNull(); + if (message_label_tid == GxfTidNull()) { + GxfComponentTypeId(context, "holoscan::MessageLabel", &message_label_tid); + } + // If a message_label component already exists in the entity, just update the value of the + // MessageLabel + if (gxf::has_component(context, uid, message_label_tid, "message_label")) { + HOLOSCAN_LOG_DEBUG( + "Found a message label already inside the entity. Replacing the original with a new " + "one with timestamp."); + auto maybe_buffer = gxf_entity.value().get("message_label"); + if (!maybe_buffer) { + // Fail early if we cannot add the MessageLabel + HOLOSCAN_LOG_ERROR(GxfResultStr(maybe_buffer.error())); + return maybe_buffer.error(); + } + *maybe_buffer.value() = m; + } else { // if no message_label component exists in the entity, add a new one + auto maybe_buffer = gxf_entity.value().add("message_label"); + if (!maybe_buffer) { + // Fail early if we cannot add the MessageLabel + HOLOSCAN_LOG_ERROR(GxfResultStr(maybe_buffer.error())); + return maybe_buffer.error(); + } + *maybe_buffer.value() = m; + } + } + return GXF_SUCCESS; +} + +gxf_result_t deannotate_message(gxf_uid_t* uid, const gxf_context_t& context, Operator* op, + const char* receiver_name) { + HOLOSCAN_LOG_DEBUG("deannotate_message"); + if (!op) { + HOLOSCAN_LOG_ERROR("Operator is nullptr. Receiver: {}", receiver_name); + return GXF_FAILURE; + } else if (op->operator_type() == Operator::OperatorType::kVirtual) { + HOLOSCAN_LOG_DEBUG("Virtual Operators are not timestamped."); + return GXF_SUCCESS; + } + static gxf_tid_t message_label_tid = GxfTidNull(); + if (message_label_tid == GxfTidNull()) { + HOLOSCAN_GXF_CALL(GxfComponentTypeId(context, "holoscan::MessageLabel", &message_label_tid)); + } + + if (gxf::has_component(context, *uid, message_label_tid, "message_label")) { + auto gxf_entity = nvidia::gxf::Entity::Shared(context, *uid); + auto buffer = gxf_entity.value().get("message_label"); + MessageLabel m = *(buffer.value()); + + // Create a new operator timestamp with only receive timestamp + OperatorTimestampLabel cur_op_timestamp(op->qualified_name()); + // Find whether current operator is already in the paths of message label m + auto cyclic_path_indices = m.has_operator(op->qualified_name()); + if (cyclic_path_indices.empty()) { // No cyclic paths + m.add_new_op_timestamp(cur_op_timestamp); + op->update_input_message_label(receiver_name, m); + } else { + // Update the publish timestamp of current operator where the cycle ends, to be the same as + // the receive timestamp. For cycles, we don't want to include the last operator's + // execution time. And the end-to-end latency for cycles is the difference of the start of + // the first operator and the *start* of the last operator. For others, the end-to-end + // latency is the start of the first operator and the *end* of the last operator. 
+ cur_op_timestamp.pub_timestamp = cur_op_timestamp.rec_timestamp; + m.add_new_op_timestamp(cur_op_timestamp); + MessageLabel label_wo_cycles; + // For each cyclic path in m, update the flow tracker + // For all non-cyclic paths, add to the label_wo_cycles + int cycle_index = 0; + for (int i = 0; i < m.num_paths(); i++) { + if (cycle_index < (int)cyclic_path_indices.size() && + i == cyclic_path_indices[cycle_index]) { + // Update flow tracker here for cyclic paths + op->fragment()->data_flow_tracker()->update_latency(m.get_path_name(i), + m.get_e2e_latency_ms(i)); + op->fragment()->data_flow_tracker()->write_to_logfile( + MessageLabel::to_string(m.get_path(i))); + cycle_index++; + } else { + // For non-cyclic paths, prepare the label_wo_cycles to propagate to the next operator + label_wo_cycles.add_new_path(m.get_path(i)); + } + } + if (!label_wo_cycles.num_paths()) { + // Since there are no paths in label_wo_cycles, add the current operator in a new path + label_wo_cycles.add_new_op_timestamp(cur_op_timestamp); + } + op->update_input_message_label(receiver_name, label_wo_cycles); + } + } else { + HOLOSCAN_LOG_DEBUG("{} - {} - No message label found", op->qualified_name(), receiver_name); + op->delete_input_message_label(receiver_name); + return GXF_FAILURE; + } + return GXF_SUCCESS; +} + +} // namespace holoscan diff --git a/src/core/fragment.cpp b/src/core/fragment.cpp index c22386ec..c72c3a68 100644 --- a/src/core/fragment.cpp +++ b/src/core/fragment.cpp @@ -73,7 +73,6 @@ Application* Fragment::application() const { } void Fragment::config(const std::string& config_file, const std::string& prefix) { - (void)prefix; // prefix is used for from_config() method. if (config_) { HOLOSCAN_LOG_WARN("Config object was already created. Overwriting..."); } // If the application is executed with `--config` option or HOLOSCAN_CONFIG_PATH environment, diff --git a/src/core/gxf/gxf_component.cpp b/src/core/gxf/gxf_component.cpp index a928a9fd..22bcff67 100644 --- a/src/core/gxf/gxf_component.cpp +++ b/src/core/gxf/gxf_component.cpp @@ -114,7 +114,7 @@ void GXFComponent::gxf_initialize() { gxf_component_ = handle; gxf_cid_ = handle->cid(); } else { - // TODO: make sure all components always get initialized via GraphEntity so we can + // TODO(unknown): make sure all components always get initialized via GraphEntity so we can // remove this code path. Some cases such as passing Arg of type Condition or // Resource to make_operator will currently still use this code path. HOLOSCAN_LOG_TRACE( @@ -123,7 +123,7 @@ void GXFComponent::gxf_initialize() { GxfComponentAdd(gxf_context_, gxf_eid_, gxf_tid_, gxf_cname().c_str(), &gxf_cid_)); } - // TODO: replace gxf_cptr_ with Handle? + // TODO(unknown): replace gxf_cptr_ with Handle? 
HOLOSCAN_GXF_CALL( GxfComponentPointer(gxf_context_, gxf_cid_, gxf_tid_, reinterpret_cast(&gxf_cptr_))); } @@ -139,7 +139,7 @@ void GXFComponent::set_gxf_parameter(const std::string& component_name, const st "component '{}':: failed to set GXF parameter '{}'", component_name, key); - // TODO: handle error + // TODO(unknown): handle error } } // namespace holoscan::gxf diff --git a/src/core/gxf/gxf_resource.cpp b/src/core/gxf/gxf_resource.cpp index 17c0d77c..b8a962d0 100644 --- a/src/core/gxf/gxf_resource.cpp +++ b/src/core/gxf/gxf_resource.cpp @@ -177,7 +177,10 @@ bool GXFResource::handle_dev_id(std::optional& dev_id_value) { if (dev_id_value.has_value()) { int32_t device_id = dev_id_value.value(); - auto devices = gxf_graph_entity_->findAll(); + // Can just cap to a single device in findAll as we only want to check if there are any + // device resources in the entity. The default for the second template argument to findAll is + // kMaxComponents from gxf.h (1024 in GXF 4.1), so setting 1 reduces stack memory use. + auto devices = gxf_graph_entity_->findAll(); if (devices.size() > 0) { HOLOSCAN_LOG_WARN("Existing entity already has a GPUDevice resource"); } @@ -198,7 +201,8 @@ bool GXFResource::handle_dev_id(std::optional& dev_id_value) { if (dev_handle.is_null()) { HOLOSCAN_LOG_ERROR("Failed to create GPUDevice for resource '{}'", name_); } else { - // TODO: warn and handle case if the resource was already in a different entity group + // TODO(unknown): warn and handle case if the resource was already in a different entity + // group // The GPUDevice and this resource have the same eid. // Make their eid is added to the newly created entity group. diff --git a/src/core/gxf/gxf_utils.cpp b/src/core/gxf/gxf_utils.cpp index 29c4fc8e..12f46638 100644 --- a/src/core/gxf/gxf_utils.cpp +++ b/src/core/gxf/gxf_utils.cpp @@ -41,7 +41,7 @@ std::string get_full_component_name(gxf_context_t context, gxf_uid_t cid) { gxf_uid_t eid; HOLOSCAN_GXF_CALL_FATAL(GxfComponentEntity(context, cid, &eid)); const char* ename; - HOLOSCAN_GXF_CALL_FATAL(GxfComponentName(context, eid, &ename)); + HOLOSCAN_GXF_CALL_FATAL(GxfEntityGetName(context, eid, &ename)); std::stringstream sstream; sstream << ename << "/" << cname; diff --git a/src/core/messagelabel.cpp b/src/core/messagelabel.cpp index 3fc8c87d..2d5b7cd5 100644 --- a/src/core/messagelabel.cpp +++ b/src/core/messagelabel.cpp @@ -29,7 +29,7 @@ namespace holoscan { OperatorTimestampLabel& OperatorTimestampLabel::operator=(const OperatorTimestampLabel& o) { if (this != &o) { - this->operator_ptr = o.operator_ptr; + this->operator_name = o.operator_name; this->rec_timestamp = o.rec_timestamp; this->pub_timestamp = o.pub_timestamp; } @@ -68,15 +68,11 @@ std::string MessageLabel::to_string() const { std::string MessageLabel::to_string(MessageLabel::TimestampedPath path) { auto msg_buf = fmt::memory_buffer(); for (auto& it : path) { - if (!it.operator_ptr) { - HOLOSCAN_LOG_ERROR("MessageLabel::to_string - Operator pointer is null"); - } else { - fmt::format_to(std::back_inserter(msg_buf), - "({},{},{}) -> ", - it.operator_ptr->name(), - std::to_string(it.rec_timestamp), - std::to_string(it.pub_timestamp)); - } + fmt::format_to(std::back_inserter(msg_buf), + "({},{},{}) -> ", + it.operator_name, + std::to_string(it.rec_timestamp), + std::to_string(it.pub_timestamp)); } msg_buf.resize(msg_buf.size() - 3); fmt::format_to(std::back_inserter(msg_buf), "\n"); @@ -100,7 +96,7 @@ void MessageLabel::add_new_op_timestamp(holoscan::OperatorTimestampLabel o_times 
message_paths[0].push_back(o_timestamp); PathOperators new_path_operators; - new_path_operators.insert(o_timestamp.operator_ptr->name()); + new_path_operators.insert(o_timestamp.operator_name); message_path_operators.push_back(new_path_operators); } else { for (int i = 0; i < num_paths(); i++) { @@ -110,7 +106,7 @@ void MessageLabel::add_new_op_timestamp(holoscan::OperatorTimestampLabel o_times message_paths[i].push_back(o_timestamp); // Add the new operator to the set of operators in the path - message_path_operators[i].insert(o_timestamp.operator_ptr->name()); + message_path_operators[i].insert(o_timestamp.operator_name); } } } @@ -122,7 +118,7 @@ void MessageLabel::update_last_op_publish() { void MessageLabel::add_new_path(MessageLabel::TimestampedPath path) { message_paths.push_back(path); PathOperators new_path_operators; - for (auto& op : path) { new_path_operators.insert(op.operator_ptr->name()); } + for (auto& op : path) { new_path_operators.insert(op.operator_name); } message_path_operators.push_back(new_path_operators); } @@ -133,13 +129,7 @@ MessageLabel::TimestampedPath MessageLabel::get_path(int index) { std::string MessageLabel::get_path_name(int index) { auto pathstring = fmt::memory_buffer(); for (auto& oplabel : message_paths[index]) { - if (!oplabel.operator_ptr) { - HOLOSCAN_LOG_ERROR( - "MessageLabel::get_path_name - Operator pointer is null. Path until now: {}.", - fmt::to_string(pathstring)); - } else { - fmt::format_to(std::back_inserter(pathstring), "{},", oplabel.operator_ptr->name()); - } + fmt::format_to(std::back_inserter(pathstring), "{},", oplabel.operator_name); } pathstring.resize(pathstring.size() - 1); return fmt::to_string(pathstring); @@ -157,7 +147,7 @@ void MessageLabel::set_operator_rec_timestamp(int path_index, int op_index, int6 message_paths[path_index][op_index].rec_timestamp = rec_timestamp; } -std::vector MessageLabel::has_operator(std::string op_name) { +std::vector MessageLabel::has_operator(const std::string& op_name) const { std::vector valid_paths; valid_paths.reserve(DEFAULT_NUM_PATHS); diff --git a/src/core/metadata.cpp b/src/core/metadata.cpp index 6e69f5be..4668a399 100644 --- a/src/core/metadata.cpp +++ b/src/core/metadata.cpp @@ -76,7 +76,7 @@ void MetadataDictionary::set(const std::string& key, std::shared_ptrgraph().is_leaf(op_shared_ptr); } +bool Operator::is_all_operator_successor_virtual(OperatorNodeType op, OperatorGraph& graph) { + auto next_nodes = graph.get_next_nodes(op); + for (auto& next_node : next_nodes) { + if (next_node->operator_type() != Operator::OperatorType::kVirtual) { return false; } + } + return true; +} + +bool Operator::is_all_operator_predecessor_virtual(OperatorNodeType op, OperatorGraph& graph) { + auto prev_nodes = graph.get_previous_nodes(op); + for (auto& prev_node : prev_nodes) { + if (prev_node->operator_type() != Operator::OperatorType::kVirtual) { return false; } + } + return true; +} + +std::string Operator::qualified_name() { + if (!this->fragment()->name().empty()) { + return fmt::format("{}.{}", this->fragment()->name(), name()); + } else { + return name(); + } +} + std::pair Operator::parse_port_name(const std::string& op_port_name) { auto pos = op_port_name.find('.'); if (pos == std::string::npos) { return std::make_pair(op_port_name, ""); } @@ -120,7 +144,8 @@ holoscan::MessageLabel Operator::get_consolidated_input_label() { 1000; // Set the receive timestamp for the root operator - OperatorTimestampLabel new_op_label(this, get_current_time_us() - cur_exec_time, -1); + OperatorTimestampLabel 
new_op_label( + this->qualified_name(), get_current_time_us() - cur_exec_time, -1); m.add_new_op_timestamp(new_op_label); } else { diff --git a/src/core/resources/gxf/annotated_double_buffer_receiver.cpp b/src/core/resources/gxf/annotated_double_buffer_receiver.cpp index 7fd6ec8a..1583cb66 100644 --- a/src/core/resources/gxf/annotated_double_buffer_receiver.cpp +++ b/src/core/resources/gxf/annotated_double_buffer_receiver.cpp @@ -16,6 +16,7 @@ */ #include "holoscan/core/resources/gxf/annotated_double_buffer_receiver.hpp" +#include "holoscan/core/flow_tracking_annotation.hpp" #include "holoscan/core/fragment.hpp" #include "holoscan/core/gxf/gxf_utils.hpp" #include "holoscan/core/message.hpp" @@ -30,63 +31,7 @@ namespace holoscan { gxf_result_t AnnotatedDoubleBufferReceiver::receive_abi(gxf_uid_t* uid) { gxf_result_t code = nvidia::gxf::DoubleBufferReceiver::receive_abi(uid); - static gxf_tid_t message_label_tid = GxfTidNull(); - if (message_label_tid == GxfTidNull()) { - HOLOSCAN_GXF_CALL(GxfComponentTypeId(context(), "holoscan::MessageLabel", &message_label_tid)); - } - - if (gxf::has_component(context(), *uid, message_label_tid, "message_label")) { - auto gxf_entity = nvidia::gxf::Entity::Shared(context(), *uid); - auto buffer = gxf_entity.value().get("message_label"); - MessageLabel m = *(buffer.value()); - - if (!this->op()) { - HOLOSCAN_LOG_ERROR("AnnotatedDoubleBufferReceiver: {} - Operator* is nullptr", name()); - } else { - // Create a new operator timestamp with only receive timestamp - OperatorTimestampLabel cur_op_timestamp(op()); - // Find whether current operator is already in the paths of message label m - auto cyclic_path_indices = m.has_operator(op()->name()); - if (cyclic_path_indices.empty()) { // No cyclic paths - m.add_new_op_timestamp(cur_op_timestamp); - op()->update_input_message_label(name(), m); - } else { - // Update the publish timestamp of current operator where the cycle ends, to be the same as - // the receive timestamp. For cycles, we don't want to include the last operator's - // execution time. And the end-to-end latency for cycles is the difference of the start of - // the first operator and the *start* of the last operator. For others, the end-to-end - // latency is the start of the first operator and the *end* of the last operator. 
- cur_op_timestamp.pub_timestamp = cur_op_timestamp.rec_timestamp; - m.add_new_op_timestamp(cur_op_timestamp); - MessageLabel label_wo_cycles; - // For each cyclic path in m, update the flow tracker - // For all non-cyclic paths, add to the label_wo_cycles - int cycle_index = 0; - for (int i = 0; i < m.num_paths(); i++) { - if (cycle_index < (int)cyclic_path_indices.size() && - i == cyclic_path_indices[cycle_index]) { - // Update flow tracker here for cyclic paths - op()->fragment()->data_flow_tracker()->update_latency(m.get_path_name(i), - m.get_e2e_latency_ms(i)); - op()->fragment()->data_flow_tracker()->write_to_logfile( - MessageLabel::to_string(m.get_path(i))); - cycle_index++; - } else { - // For non-cyclic paths, prepare the label_wo_cycles to propagate to the next operator - label_wo_cycles.add_new_path(m.get_path(i)); - } - } - if (!label_wo_cycles.num_paths()) { - // Since there are no paths in label_wo_cycles, add the current operator in a new path - label_wo_cycles.add_new_op_timestamp(cur_op_timestamp); - } - op()->update_input_message_label(name(), label_wo_cycles); - } - } - } else { - HOLOSCAN_LOG_DEBUG("AnnotatedDoubleBufferReceiver: {} - No message label found", name()); - op()->delete_input_message_label(name()); - } + deannotate_message(uid, context(), op(), name()); return code; } diff --git a/src/core/resources/gxf/annotated_double_buffer_transmitter.cpp b/src/core/resources/gxf/annotated_double_buffer_transmitter.cpp index d93bbcdd..5b7dc6b3 100644 --- a/src/core/resources/gxf/annotated_double_buffer_transmitter.cpp +++ b/src/core/resources/gxf/annotated_double_buffer_transmitter.cpp @@ -16,8 +16,13 @@ */ #include "holoscan/core/resources/gxf/annotated_double_buffer_transmitter.hpp" + #include -#include "holoscan/core/message.hpp" + +#include +#include + +#include "holoscan/core/fragment.hpp" #include "holoscan/core/messagelabel.hpp" #include "holoscan/core/operator.hpp" #include "holoscan/logger/logger.hpp" @@ -25,55 +30,27 @@ namespace holoscan { gxf_result_t AnnotatedDoubleBufferTransmitter::publish_abi(gxf_uid_t uid) { - if (!this->op()) { - HOLOSCAN_LOG_ERROR("Operator is nullptr."); - return GXF_FAILURE; - } else { - auto gxf_entity = nvidia::gxf::Entity::Shared(context(), uid); - gxf_entity->deactivate(); // GXF Entity might be activated by the caller; so deactivate it to - // add MessageLabel - MessageLabel m; - m = op()->get_consolidated_input_label(); - m.update_last_op_publish(); - - // Check if a message_label component already exists in the entity - static gxf_tid_t message_label_tid = GxfTidNull(); - if (message_label_tid == GxfTidNull()) { - GxfComponentTypeId(context(), "holoscan::MessageLabel", &message_label_tid); - } - // If a message_label component already exists in the entity, just update the value of the - // MessageLabel - if (gxf::has_component(context(), uid, message_label_tid, "message_label")) { - HOLOSCAN_LOG_DEBUG( - "Found a message label already inside the entity. 
Replacing the original with a new " - "one with timestamp."); - auto maybe_buffer = gxf_entity.value().get("message_label"); - if (!maybe_buffer) { - // Fail early if we cannot add the MessageLabel - HOLOSCAN_LOG_ERROR(GxfResultStr(maybe_buffer.error())); - return maybe_buffer.error(); - } - *maybe_buffer.value() = m; - } else { // if no message_label component exists in the entity, add a new one - auto maybe_buffer = gxf_entity.value().add("message_label"); - if (!maybe_buffer) { - // Fail early if we cannot add the MessageLabel - HOLOSCAN_LOG_ERROR(GxfResultStr(maybe_buffer.error())); - return maybe_buffer.error(); - } - *maybe_buffer.value() = m; - } - - // We do not activate the GXF Entity because these message entities are not supposed to be - // activated by default. + auto code = annotate_message(uid, context(), op(), name()); + if (code != GXF_SUCCESS) { + HOLOSCAN_LOG_ERROR("Failed to annotate message"); + return code; } // Call the Base class' publish_abi now - gxf_result_t code = nvidia::gxf::DoubleBufferTransmitter::publish_abi(uid); + code = nvidia::gxf::DoubleBufferTransmitter::publish_abi(uid); + + // Check whether the associated operator is a root operator for the first time. + if (is_op_root_ == -1) { + std::shared_ptr op_shared_ptr(op(), [](Operator*) {}); + is_op_root_ = op()->is_root() || op()->is_user_defined_root() || + Operator::is_all_operator_predecessor_virtual(std::move(op_shared_ptr), + op()->fragment()->graph()); + } - if (op()->is_root() || op()->is_user_defined_root()) { + // After the first time, only update number of published messages for a root operator only. + if (is_op_root_) { if (!op_transmitter_name_pair_.size()) - op_transmitter_name_pair_ = fmt::format("{}->{}", op()->name(), name()); + op_transmitter_name_pair_ = fmt::format("{}->{}", op()->qualified_name(), name()); op()->update_published_messages(op_transmitter_name_pair_); } diff --git a/src/core/resources/gxf/block_memory_pool.cpp b/src/core/resources/gxf/block_memory_pool.cpp index c38c4338..b2804f05 100644 --- a/src/core/resources/gxf/block_memory_pool.cpp +++ b/src/core/resources/gxf/block_memory_pool.cpp @@ -86,7 +86,7 @@ nvidia::gxf::MemoryStorageType BlockMemoryPool::storage_type() const { if (pool) { return pool->storage_type(); } else { - // TODO: throw error or return Unexpected? + // TODO(unknown): throw error or return Unexpected? HOLOSCAN_LOG_ERROR("BlockMemoryPool component not yet registered with GXF"); return nvidia::gxf::MemoryStorageType::kSystem; } diff --git a/src/core/resources/gxf/cuda_allocator.cpp b/src/core/resources/gxf/cuda_allocator.cpp new file mode 100644 index 00000000..9613681a --- /dev/null +++ b/src/core/resources/gxf/cuda_allocator.cpp @@ -0,0 +1,61 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "holoscan/core/resources/gxf/cuda_allocator.hpp" + +#include + +namespace holoscan { + +CudaAllocator::CudaAllocator(const std::string& name, nvidia::gxf::CudaAllocator* component) + : Allocator(name, component) {} + +nvidia::gxf::CudaAllocator* CudaAllocator::get() const { + return static_cast(gxf_cptr_); +} + +nvidia::byte* CudaAllocator::allocate_async(uint64_t size, cudaStream_t stream) { + auto allocator = get(); + if (allocator) { + auto result = allocator->allocate_async(size, stream); + if (result) { return result.value(); } + } + + HOLOSCAN_LOG_ERROR("Failed to asynchronously allocate memory of size {}", size); + + return nullptr; +} + +void CudaAllocator::free_async(nvidia::byte* pointer, cudaStream_t stream) { + auto allocator = get(); + if (allocator) { + auto result = allocator->free_async(pointer, stream); + if (!result) { + HOLOSCAN_LOG_ERROR("Failed to asynchronously free memory at {}", static_cast(pointer)); + } + } +} + +size_t CudaAllocator::pool_size(MemoryStorageType type) const { + auto allocator = get(); + if (!allocator) { throw std::runtime_error("null GXF component pointer"); } + auto maybe_size = allocator->get_pool_size(static_cast(type)); + if (!maybe_size) { throw std::runtime_error("failed to get pool size"); } + return maybe_size.value(); +} + +} // namespace holoscan diff --git a/src/core/resources/gxf/cuda_stream_pool.cpp b/src/core/resources/gxf/cuda_stream_pool.cpp index 58e7c367..873e5417 100644 --- a/src/core/resources/gxf/cuda_stream_pool.cpp +++ b/src/core/resources/gxf/cuda_stream_pool.cpp @@ -64,9 +64,9 @@ nvidia::gxf::CudaStreamPool* CudaStreamPool::get() const { } void CudaStreamPool::setup(ComponentSpec& spec) { - // TODO: The dev_id parameter was removed in GXF 3.0 and replaced with a GPUDevice Resource - // Note: We are currently working around this with special handling of the "dev_id" parameter - // in GXFResource::initialize(). + // TODO(unknown): The dev_id parameter was removed in GXF 3.0 and replaced with a GPUDevice + // Resource Note: We are currently working around this with special handling of the "dev_id" + // parameter in GXFResource::initialize(). spec.param( dev_id_, "dev_id", "Device Id", "Create CUDA Stream on which device.", kDefaultDeviceId); spec.param(stream_flags_, diff --git a/src/core/resources/gxf/dfft_collector.cpp b/src/core/resources/gxf/dfft_collector.cpp index 0760d115..8de63c8a 100644 --- a/src/core/resources/gxf/dfft_collector.cpp +++ b/src/core/resources/gxf/dfft_collector.cpp @@ -15,13 +15,14 @@ * limitations under the License. 
*/ +#include "holoscan/core/resources/gxf/dfft_collector.hpp" + #include +#include #include "gxf/std/clock.hpp" #include "gxf/std/codelet.hpp" - #include "holoscan/core/operator.hpp" -#include "holoscan/core/resources/gxf/dfft_collector.hpp" #include "holoscan/logger/logger.hpp" namespace holoscan { @@ -55,7 +56,7 @@ gxf_result_t DFFTCollector::on_execute_abi(gxf_uid_t eid, uint64_t timestamp, gx if (leaf_ops_.find(codelet_id) != leaf_ops_.end() && codelet.value()->getExecutionCount() > leaf_last_execution_count_[codelet_id]) { leaf_last_execution_count_[codelet_id] = codelet.value()->getExecutionCount(); - MessageLabel m = leaf_ops_[codelet_id]->get_consolidated_input_label(); + MessageLabel m = std::move(leaf_ops_[codelet_id]->get_consolidated_input_label()); leaf_ops_[codelet_id]->reset_input_message_labels(); if (m.num_paths()) { @@ -66,8 +67,9 @@ gxf_result_t DFFTCollector::on_execute_abi(gxf_uid_t eid, uint64_t timestamp, gx } data_flow_tracker_->write_to_logfile(m.to_string()); } - - } else if (root_ops_.find(codelet_id) != root_ops_.end()) { + } + // leaf can also be root, especially for distributed app + if (root_ops_.find(codelet_id) != root_ops_.end()) { holoscan::Operator* cur_op = root_ops_[codelet_id]; for (auto& it : cur_op->num_published_messages_map()) { data_flow_tracker_->update_source_messages_number(it.first, it.second); diff --git a/src/core/resources/gxf/holoscan_ucx_receiver.cpp b/src/core/resources/gxf/holoscan_ucx_receiver.cpp new file mode 100644 index 00000000..cda49cdf --- /dev/null +++ b/src/core/resources/gxf/holoscan_ucx_receiver.cpp @@ -0,0 +1,32 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "holoscan/core/resources/gxf/holoscan_ucx_receiver.hpp" +#include "holoscan/core/flow_tracking_annotation.hpp" +#include "holoscan/logger/logger.hpp" + +namespace holoscan { + +gxf_result_t HoloscanUcxReceiver::receive_abi(gxf_uid_t* uid) { + gxf_result_t code = nvidia::gxf::UcxReceiver::receive_abi(uid); + + if (tracking_) { deannotate_message(uid, context(), op(), name()); } + + return code; +} + +} // namespace holoscan diff --git a/src/core/resources/gxf/holoscan_ucx_transmitter.cpp b/src/core/resources/gxf/holoscan_ucx_transmitter.cpp new file mode 100644 index 00000000..ae9ce134 --- /dev/null +++ b/src/core/resources/gxf/holoscan_ucx_transmitter.cpp @@ -0,0 +1,55 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "holoscan/core/flow_tracking_annotation.hpp" +#include "holoscan/core/fragment.hpp" +#include "holoscan/core/resources/gxf/holoscan_ucx_transmitter.hpp" +#include "holoscan/logger/logger.hpp" + +namespace holoscan { +gxf_result_t HoloscanUcxTransmitter::publish_abi(gxf_uid_t uid) { + if (tracking_) { + auto code = annotate_message(uid, context(), op(), name()); + if (code != GXF_SUCCESS) { + HOLOSCAN_LOG_ERROR("Failed to annotate message"); + return code; + } + } + + // Call the Base class' publish_abi now + auto code = nvidia::gxf::UcxTransmitter::publish_abi(uid); + + if (tracking_) { + if (is_op_root == -1) { + std::shared_ptr op_shared_ptr(op(), [](Operator*) {}); + is_op_root = op()->is_root() || op()->is_user_defined_root() || + Operator::is_all_operator_predecessor_virtual(std::move(op_shared_ptr), + op()->fragment()->graph()); + } + if (is_op_root) { + if (!op_transmitter_name_pair_.size()) + op_transmitter_name_pair_ = fmt::format("{}->{}", op()->qualified_name(), name()); + op()->update_published_messages(op_transmitter_name_pair_); + } + } + + return code; +} +} // namespace holoscan diff --git a/src/core/resources/gxf/rmm_allocator.cpp b/src/core/resources/gxf/rmm_allocator.cpp new file mode 100644 index 00000000..a15b6f5e --- /dev/null +++ b/src/core/resources/gxf/rmm_allocator.cpp @@ -0,0 +1,114 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "holoscan/core/resources/gxf/rmm_allocator.hpp" + +#include +#include + +#include "gxf/std/resources.hpp" // for GPUDevice + +#include "holoscan/core/component_spec.hpp" + +namespace holoscan { + +namespace { + +// kPoolInitialSize, kPoolMaxSize copied from rmm_allocator.cpp +#ifdef __aarch64__ +constexpr const char* kPoolInitialSize = "8MB"; // 8 MB initial pool size +constexpr const char* kPoolMaxSize = "16MB"; +#else +constexpr const char* kPoolInitialSize = "16MB"; // 16 MB initial pool size +constexpr const char* kPoolMaxSize = "32MB"; +#endif +constexpr int32_t kDefaultDeviceId = 0; + +} // namespace + +RMMAllocator::RMMAllocator(const std::string& name, nvidia::gxf::RMMAllocator* component) + : CudaAllocator(name, component) { + auto maybe_device_initial = component->getParameter("device_memory_initial_size"); + if (!maybe_device_initial) { + throw std::runtime_error("Failed to get device_memory_initial_size"); + } + device_memory_initial_size_ = maybe_device_initial.value(); + + auto maybe_device_max = component->getParameter("device_memory_max_size"); + if (!maybe_device_max) { throw std::runtime_error("Failed to get device_memory_max_size"); } + device_memory_max_size_ = maybe_device_max.value(); + + auto maybe_host_initial = component->getParameter("host_memory_initial_size"); + if (!maybe_host_initial) { throw std::runtime_error("Failed to get host_memory_initial_size"); } + host_memory_initial_size_ = maybe_host_initial.value(); + + auto maybe_host_max = component->getParameter("host_memory_max_size"); + if (!maybe_host_max) { throw std::runtime_error("Failed to get host_memory_max_size"); } + host_memory_max_size_ = maybe_host_max.value(); + + auto maybe_gpu_device = + component->getParameter>("dev_id"); + if (!maybe_gpu_device) { throw std::runtime_error("Failed to get dev_id"); } + auto gpu_device_handle = maybe_gpu_device.value(); + dev_id_ = gpu_device_handle->device_id(); +} + +nvidia::gxf::RMMAllocator* RMMAllocator::get() const { + return static_cast(gxf_cptr_); +} + +void RMMAllocator::setup(ComponentSpec& spec) { + spec.param(device_memory_initial_size_, + "device_memory_initial_size", + "Device Memory Pool Initial Size", + "The initial memory pool size used by this device. Examples of valid values are " + "'512MB', '256 KB', '1 GB'. The format is a non-negative integer value followed by " + "an optional space and then a suffix representing the units. Supported units are " + "B, KB, MB, GB and TB where the values are powers of 1024 bytes.", + std::string(kPoolInitialSize)); + spec.param(device_memory_max_size_, + "device_memory_max_size", + "Device Memory Pool Maximum Size", + "The max memory pool size used by this device. Examples of valid values are " + "'512MB', '256 KB', '1 GB'. The format is a non-negative integer value followed by " + "an optional space and then a suffix representing the units. Supported units are " + "B, KB, MB, GB and TB where the values are powers of 1024 bytes.", + std::string(kPoolMaxSize)); + spec.param(host_memory_initial_size_, + "host_memory_initial_size", + "Host Memory Pool Initial Size", + "The initial memory pool size used by the host. Examples of valid values are " + "'512MB', '256 KB', '1 GB'. The format is a non-negative integer value followed by " + "an optional space and then a suffix representing the units. 
Supported units are " + "B, KB, MB, GB and TB where the values are powers of 1024 bytes.", + std::string(kPoolInitialSize)); + spec.param(host_memory_max_size_, + "host_memory_max_size", + "Host Memory Pool Maximum Size", + "The max memory pool size used by the host. Examples of valid values are " + "'512MB', '256 KB', '1 GB'. The format is a non-negative integer value followed by " + "an optional space and then a suffix representing the units. Supported units are " + "B, KB, MB, GB and TB where the values are powers of 1024 bytes.", + std::string(kPoolMaxSize)); + spec.param(dev_id_, + "dev_id", + "Device Id", + "Device on which to create the memory pool.", + kDefaultDeviceId); +} + +} // namespace holoscan diff --git a/src/core/resources/gxf/stream_ordered_allocator.cpp b/src/core/resources/gxf/stream_ordered_allocator.cpp new file mode 100644 index 00000000..cbf5fcd5 --- /dev/null +++ b/src/core/resources/gxf/stream_ordered_allocator.cpp @@ -0,0 +1,104 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "holoscan/core/resources/gxf/stream_ordered_allocator.hpp" + +#include + +#include "gxf/std/resources.hpp" // for GPUDevice + +#include "holoscan/core/component_spec.hpp" + +namespace holoscan { + +namespace { +// default values copied from gxf/cuda/stream_ordered_allocator.cpp +#ifdef __aarch64__ +constexpr const char* kPoolInitialSize = "8MB"; // 8 MB initial pool size +constexpr const char* kPoolMaxSize = "16MB"; +#else +constexpr const char* kPoolInitialSize = "16MB"; // 16 MB initial pool size +constexpr const char* kPoolMaxSize = "32MB"; +#endif +constexpr const char* kReleaseThreshold = "4MB"; // 4MB release threshold +constexpr int32_t kDefaultDeviceId = 0; + +} // namespace + +StreamOrderedAllocator::StreamOrderedAllocator(const std::string& name, + nvidia::gxf::StreamOrderedAllocator* component) + : CudaAllocator(name, component) { + auto maybe_device_initial = component->getParameter("device_memory_initial_size"); + if (!maybe_device_initial) { + throw std::runtime_error("Failed to get device_memory_initial_size"); + } + device_memory_initial_size_ = maybe_device_initial.value(); + + auto maybe_device_max = component->getParameter("device_memory_max_size"); + if (!maybe_device_max) { throw std::runtime_error("Failed to get device_memory_max_size"); } + device_memory_max_size_ = maybe_device_max.value(); + + auto maybe_release_threshold = component->getParameter("release_threshold"); + if (!maybe_release_threshold) { throw std::runtime_error("Failed to get release_threshold"); } + release_threshold_ = maybe_release_threshold.value(); + + auto maybe_gpu_device = + component->getParameter>("dev_id"); + if (!maybe_gpu_device) { throw std::runtime_error("Failed to get dev_id"); } + auto gpu_device_handle = maybe_gpu_device.value(); + dev_id_ = gpu_device_handle->device_id(); +} + +void 
StreamOrderedAllocator::setup(ComponentSpec& spec) { + spec.param(device_memory_initial_size_, + "device_memory_initial_size", + "Device Memory Pool Initial Size", + "The initial memory pool size used by this device. Examples of valid values are " + "'512MB', '256 KB', '1 GB'. The format is a non-negative integer value followed by " + "an optional space and then a suffix representing the units. Supported units are " + "B, KB, MB, GB and TB where the values are powers of 1024 bytes.", + std::string(kPoolInitialSize)); + spec.param(device_memory_max_size_, + "device_memory_max_size", + "Device Memory Pool Maximum Size", + "The max memory pool size used by this device. Examples of valid values are " + "'512MB', '256 KB', '1 GB'. The format is a non-negative integer value followed by " + "an optional space and then a suffix representing the units. Supported units are " + "B, KB, MB, GB and TB where the values are powers of 1024 bytes.", + std::string(kPoolMaxSize)); + spec.param(release_threshold_, + "release_threadhold", + "Amount of reserved memory to hold onto before trying to release memory back " + "to the OS", + "The release threshold specifies the maximum amount of memory the pool caches. " + "Examples of valid values are '512MB', '256 KB', '1 GB'. The format is a " + "non-negative integer value followed by an optional space and then a suffix " + "representing the units. Supported units are B, KB, MB, GB and TB where the values " + "are powers of 1024 bytes.", + std::string(kReleaseThreshold)); + spec.param(dev_id_, + "dev_id", + "Device Id", + "Device on which to create the memory pool.", + static_cast(0)); +} + +nvidia::gxf::StreamOrderedAllocator* StreamOrderedAllocator::get() const { + return static_cast(gxf_cptr_); +} + +} // namespace holoscan diff --git a/src/core/resources/gxf/ucx_receiver.cpp b/src/core/resources/gxf/ucx_receiver.cpp index 6476c0b0..4a8090a8 100644 --- a/src/core/resources/gxf/ucx_receiver.cpp +++ b/src/core/resources/gxf/ucx_receiver.cpp @@ -25,6 +25,7 @@ #include "holoscan/core/fragment.hpp" #include "holoscan/core/gxf/gxf_resource.hpp" #include "holoscan/core/gxf/gxf_utils.hpp" +#include "holoscan/core/resources/gxf/holoscan_ucx_receiver.hpp" #include "holoscan/core/resources/gxf/ucx_serialization_buffer.hpp" namespace holoscan { @@ -65,7 +66,7 @@ void UcxReceiver::setup(ComponentSpec& spec) { spec.param(port_, "port", "rx_port", "RX port", kDefaultUcxPort); spec.param(buffer_, "buffer", "Serialization Buffer", ""); - // TODO: implement OperatorSpec::resource for managing nvidia::gxf:Resource types + // TODO(unknown): implement OperatorSpec::resource for managing nvidia::gxf:Resource types // spec.resource(gpu_device_, "Optional GPU device resource"); } @@ -100,4 +101,9 @@ uint32_t UcxReceiver::port() { return port_.get(); } +void UcxReceiver::track() { + auto receiver_ptr = static_cast(gxf_cptr_); + receiver_ptr->track(); +} + } // namespace holoscan diff --git a/src/core/resources/gxf/ucx_transmitter.cpp b/src/core/resources/gxf/ucx_transmitter.cpp index af733956..1471984d 100644 --- a/src/core/resources/gxf/ucx_transmitter.cpp +++ b/src/core/resources/gxf/ucx_transmitter.cpp @@ -25,6 +25,7 @@ #include "holoscan/core/fragment.hpp" #include "holoscan/core/gxf/gxf_resource.hpp" #include "holoscan/core/gxf/gxf_utils.hpp" +#include "holoscan/core/resources/gxf/holoscan_ucx_transmitter.hpp" #include "holoscan/core/resources/gxf/ucx_receiver.hpp" // for kDefaultUcxPort #include "holoscan/core/resources/gxf/ucx_serialization_buffer.hpp" @@ -98,7 +99,7 @@ void 
UcxTransmitter::setup(ComponentSpec& spec) { spec.param(buffer_, "buffer", "Serialization Buffer", ""); - // TODO: implement OperatorSpec::resource for managing nvidia::gxf:Resource types + // TODO(unknown): implement OperatorSpec::resource for managing nvidia::gxf:Resource types // spec.resource(gpu_device_, "Optional GPU device resource"); } @@ -141,4 +142,11 @@ uint32_t UcxTransmitter::local_port() { return local_port_.get(); } +void UcxTransmitter::track() { + auto transmitter_ptr = static_cast(gxf_cptr_); + if (transmitter_ptr) transmitter_ptr->track(); + else + HOLOSCAN_LOG_ERROR("Failed to track UcxTransmitter"); +} + } // namespace holoscan diff --git a/src/core/services/common/forward_op.cpp b/src/core/services/common/forward_op.cpp index d13e803e..05646ebf 100644 --- a/src/core/services/common/forward_op.cpp +++ b/src/core/services/common/forward_op.cpp @@ -26,7 +26,8 @@ void ForwardOp::setup(OperatorSpec& spec) { spec.output("out"); } -void ForwardOp::compute(InputContext& op_input, OutputContext& op_output, ExecutionContext&) { +void ForwardOp::compute(InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) { auto in_message = op_input.receive("in"); if (in_message) { auto value = in_message.value(); diff --git a/src/core/system/cpu_resource_monitor.cpp b/src/core/system/cpu_resource_monitor.cpp index 634b2398..4cf54025 100644 --- a/src/core/system/cpu_resource_monitor.cpp +++ b/src/core/system/cpu_resource_monitor.cpp @@ -167,7 +167,7 @@ CPUInfo& CPUResourceMonitor::update(CPUInfo& cpu_info, uint64_t metric_flags) { if (idle_diff > 0 && total_diff > 0) { cpu_info.cpu_usage = static_cast(1.0 - (static_cast(idle_diff) / static_cast(total_diff))) * - 100.0f; + 100.0F; } // Update the last total stats @@ -188,7 +188,7 @@ CPUInfo& CPUResourceMonitor::update(CPUInfo& cpu_info, uint64_t metric_flags) { fmt::format("Invalid cpu_info.memory_total value: {}", memory_total)); } cpu_info.memory_usage = - static_cast(1.0 - (static_cast(mem_info[2]) / memory_total)) * 100.0f; + static_cast(1.0 - (static_cast(mem_info[2]) / memory_total)) * 100.0F; } if (metric_flags & CPUMetricFlag::SHARED_MEMORY_USAGE) { @@ -200,7 +200,7 @@ CPUInfo& CPUResourceMonitor::update(CPUInfo& cpu_info, uint64_t metric_flags) { cpu_info.shared_memory_available = shm_info[2]; cpu_info.shared_memory_usage = static_cast(1.0 - (static_cast(shm_info[2]) / static_cast(shm_info[0]))) * - 100.0f; + 100.0F; } return cpu_info; diff --git a/src/core/system/gpu_resource_monitor.cpp b/src/core/system/gpu_resource_monitor.cpp index fe7fdb1f..85b70d97 100644 --- a/src/core/system/gpu_resource_monitor.cpp +++ b/src/core/system/gpu_resource_monitor.cpp @@ -145,6 +145,7 @@ GPUResourceMonitor::GPUResourceMonitor(uint64_t metric_flags) : metric_flags_(me } GPUResourceMonitor::~GPUResourceMonitor() { + // suppress potential exceptions from logging within close() close(); } @@ -256,7 +257,7 @@ GPUInfo& GPUResourceMonitor::update(uint32_t index, GPUInfo& gpu_info, uint64_t gpu_info.memory_total = memory.total; gpu_info.memory_free = memory.free; gpu_info.memory_used = memory.used; - gpu_info.memory_usage = memory.total ? 100.0 * memory.used / memory.total : 0.0f; + gpu_info.memory_usage = memory.total ? 
100.0 * memory.used / memory.total : 0.0F; } if (metric_flags & GPUMetricFlag::POWER_LIMIT) { @@ -539,7 +540,10 @@ bool GPUResourceMonitor::init_nvml() { HOLOSCAN_NVML_CALL_RETURN_VALUE_MSG(nvmlInit(), false, "Could not initialize NVML"); // Get the GPU count and initialize the GPU info vector - HOLOSCAN_NVML_CALL(nvmlDeviceGetCount(&gpu_count_)); + if (HOLOSCAN_NVML_CALL_WARN(nvmlDeviceGetCount(&gpu_count_)) != 0) { + HOLOSCAN_LOG_ERROR("Could not get the number of GPUs"); + gpu_count_ = 0; + } // Initialize nvml devices vector nvml_devices_.resize(gpu_count_, nullptr); @@ -598,8 +602,13 @@ bool GPUResourceMonitor::init_cuda_runtime() { HOLOSCAN_LOG_DEBUG("CUDA Runtime API library loaded from '{}'", libcudart_path); bind_cuda_runtime_methods(); int gpu_count = 0; - HOLOSCAN_CUDA_CALL_RETURN_VALUE_MSG( - cudaGetDeviceCount(&gpu_count), false, "Could not get the number of GPUs"); + auto holoscan_cuda_err = HOLOSCAN_CUDA_CALL_CHECK_HANDLE(cudaGetDeviceCount(&gpu_count)); + if (holoscan_cuda_err != 0) { + HOLOSCAN_LOG_WARN("Could not get the number of GPUs"); + shutdown_cuda_runtime(); + gpu_count_ = 0; + return false; + } gpu_count_ = gpu_count; return true; @@ -609,7 +618,13 @@ void GPUResourceMonitor::shutdown_nvml() noexcept { if (handle_) { if (nvmlShutdown) { nvml::nvmlReturn_t result = nvmlShutdown(); - if (result != 0) { HOLOSCAN_LOG_ERROR("Could not shutdown NVML"); } + if (result != 0) { + // ignore potential exception from logging + // (shutdown_nvml is called from ~GPUResourceMonitor) + try { + HOLOSCAN_LOG_ERROR("Could not shutdown NVML"); + } catch (const std::exception& e) {} + } } dlclose(handle_); handle_ = nullptr; diff --git a/src/operators/aja_source/CMakeLists.txt b/src/operators/aja_source/CMakeLists.txt index b0ba29d5..bf3cd6ad 100644 --- a/src/operators/aja_source/CMakeLists.txt +++ b/src/operators/aja_source/CMakeLists.txt @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -15,7 +15,7 @@ add_holoscan_operator(aja aja_source.cpp) -# TODO: try making AJA::ajantv2 private, need to remove headers from the operator headers +# TODO(unknown): try making AJA::ajantv2 private, need to remove headers from the operator headers target_link_libraries(op_aja PUBLIC holoscan::core diff --git a/src/operators/aja_source/aja_source.cpp b/src/operators/aja_source/aja_source.cpp index bf6236d6..8b018656 100644 --- a/src/operators/aja_source/aja_source.cpp +++ b/src/operators/aja_source/aja_source.cpp @@ -450,18 +450,19 @@ void AJASourceOp::initialize() { void AJASourceOp::start() { // Determine whether or not we're using the iGPU. - // TODO: This assumes we're using the first GPU device (as does the rest of the operator). + // TODO(unknown): This assumes we're using the first GPU device (as does the rest of the + // operator). 
cudaDeviceProp prop; cudaGetDeviceProperties(&prop, 0); is_igpu_ = prop.integrated; float framerate; if (framerate_ == 23) { - framerate = 23.98f; + framerate = 23.98F; } else if (framerate_ == 29) { - framerate = 29.97f; + framerate = 29.97F; } else if (framerate_ == 59) { - framerate = 59.94f; + framerate = 59.94F; } else { framerate = framerate_; } diff --git a/src/operators/async_ping_rx/async_ping_rx.cpp b/src/operators/async_ping_rx/async_ping_rx.cpp index 3ac83bf2..6eae5e3e 100644 --- a/src/operators/async_ping_rx/async_ping_rx.cpp +++ b/src/operators/async_ping_rx/async_ping_rx.cpp @@ -71,7 +71,8 @@ void AsyncPingRxOp::start() { async_thread_ = std::thread([this] { async_ping(); }); } -void AsyncPingRxOp::compute(InputContext& op_input, OutputContext&, ExecutionContext&) { +void AsyncPingRxOp::compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) { auto value = op_input.receive("in").value(); HOLOSCAN_LOG_INFO("Rx message value: {}", value); diff --git a/src/operators/async_ping_tx/async_ping_tx.cpp b/src/operators/async_ping_tx/async_ping_tx.cpp index 05314ad3..4833662a 100644 --- a/src/operators/async_ping_tx/async_ping_tx.cpp +++ b/src/operators/async_ping_tx/async_ping_tx.cpp @@ -73,7 +73,8 @@ void AsyncPingTxOp::start() { async_thread_ = std::thread([this] { async_ping(); }); } -void AsyncPingTxOp::compute(InputContext&, OutputContext& op_output, ExecutionContext&) { +void AsyncPingTxOp::compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) { ++index_; if (index_ == count_) { // Reached max count of ticks diff --git a/src/operators/format_converter/format_converter.cpp b/src/operators/format_converter/format_converter.cpp index 4c001550..ef6fc939 100644 --- a/src/operators/format_converter/format_converter.cpp +++ b/src/operators/format_converter/format_converter.cpp @@ -962,8 +962,8 @@ void FormatConverterOp::setup(OperatorSpec& spec) { "Name of the output tensor.", std::string("")); spec.param(out_dtype_str_, "out_dtype", "OutputDataType", "Destination data type."); - spec.param(scale_min_, "scale_min", "Scale min", "Minimum value of the scale.", 0.f); - spec.param(scale_max_, "scale_max", "Scale max", "Maximum value of the scale.", 1.f); + spec.param(scale_min_, "scale_min", "Scale min", "Minimum value of the scale.", 0.F); + spec.param(scale_max_, "scale_max", "Scale max", "Maximum value of the scale.", 1.F); spec.param(alpha_value_, "alpha_value", "Alpha value", diff --git a/src/operators/holoviz/holoviz.cpp b/src/operators/holoviz/holoviz.cpp index 2546f1f8..08a2a97d 100644 --- a/src/operators/holoviz/holoviz.cpp +++ b/src/operators/holoviz/holoviz.cpp @@ -762,7 +762,7 @@ HolovizOp::InputSpec::InputSpec(const std::string& tensor_name, const std::strin void HolovizOp::setup(OperatorSpec& spec) { constexpr uint32_t DEFAULT_WIDTH = 1920; constexpr uint32_t DEFAULT_HEIGHT = 1080; - constexpr float DEFAULT_FRAMERATE = 60.f; + constexpr float DEFAULT_FRAMERATE = 60.F; static const std::string DEFAULT_WINDOW_TITLE("Holoviz"); static const std::string DEFAULT_DISPLAY_NAME(""); constexpr bool DEFAULT_EXCLUSIVE_DISPLAY = false; @@ -899,13 +899,13 @@ void HolovizOp::setup(OperatorSpec& spec) { "Type of data output at `camera_pose_output`. 
Supported values are " "`projection_matrix` and `extrinsics_model`.", std::string("projection_matrix")); - spec.param(camera_eye_, "camera_eye", "Camera Eye", "Camera eye position", {{0.f, 0.f, 1.f}}); + spec.param(camera_eye_, "camera_eye", "Camera Eye", "Camera eye position", {{0.F, 0.F, 1.F}}); spec.param(camera_look_at_, "camera_look_at", "Camera Look At", "Camera look at position", - {{0.f, 0.f, 0.f}}); - spec.param(camera_up_, "camera_up", "Camera Up", "Camera up vector", {{0.f, 1.f, 0.f}}); + {{0.F, 0.F, 0.F}}); + spec.param(camera_up_, "camera_up", "Camera Up", "Camera up vector", {{0.F, 1.F, 0.F}}); spec.param(key_callback_, "key_callback", @@ -1088,7 +1088,7 @@ void HolovizOp::set_input_spec_geometry(const InputSpec& input_spec) { set_input_spec(input_spec); // now set geometry layer specific properties - std::array color{1.f, 1.f, 1.f, 1.f}; + std::array color{1.F, 1.F, 1.F, 1.F}; for (size_t index = 0; index < std::min(input_spec.color_.size(), color.size()); ++index) { color[index] = input_spec.color_[index]; } @@ -1292,9 +1292,8 @@ void HolovizOp::render_geometry(const ExecutionContext& context, const InputSpec buffer_info.bytes_size, cudaMemcpyDeviceToHost, cuda_stream_handler_.get_cuda_stream(context.context()))); - // wait for the CUDA memory copy to finish - HOLOSCAN_CUDA_CALL( - cudaStreamSynchronize(cuda_stream_handler_.get_cuda_stream(context.context()))); + // When copying from device memory to pagable memory the call is synchronous with the host + // execution. No need to synchronize here. buffer_info.buffer_ptr = host_buffer.data(); } @@ -1318,7 +1317,7 @@ void HolovizOp::render_geometry(const ExecutionContext& context, const InputSpec } uintptr_t src_coord = reinterpret_cast(buffer_info.buffer_ptr); constexpr uint32_t values_per_coordinate = 3; - float coords[values_per_coordinate]{0.f, 0.f, 0.05f}; + float coords[values_per_coordinate]{0.F, 0.F, 0.05F}; for (uint32_t index = 0; index < coordinates; ++index) { uint32_t component_index = 0; // copy from source array @@ -1361,7 +1360,7 @@ void HolovizOp::render_geometry(const ExecutionContext& context, const InputSpec primitive_count = coordinates; coordinate_count = primitive_count; values_per_coordinate = 2; - default_coord = {0.f, 0.f}; + default_coord = {0.F, 0.F}; break; case InputType::LINES: // line primitives, two coordinates (x0, y0) and (x1, y1) per primitive @@ -1373,7 +1372,7 @@ void HolovizOp::render_geometry(const ExecutionContext& context, const InputSpec primitive_count = coordinates / 2; coordinate_count = primitive_count * 2; values_per_coordinate = 2; - default_coord = {0.f, 0.f}; + default_coord = {0.F, 0.F}; break; case InputType::LINE_STRIP: // line strip primitive, a line primitive i is defined by each coordinate (xi, yi) and @@ -1386,7 +1385,7 @@ void HolovizOp::render_geometry(const ExecutionContext& context, const InputSpec primitive_count = coordinates - 1; coordinate_count = coordinates; values_per_coordinate = 2; - default_coord = {0.f, 0.f}; + default_coord = {0.F, 0.F}; break; case InputType::TRIANGLES: // triangle primitive, three coordinates (x0, y0), (x1, y1) and (x2, y2) per primitive @@ -1398,7 +1397,7 @@ void HolovizOp::render_geometry(const ExecutionContext& context, const InputSpec primitive_count = coordinates / 3; coordinate_count = primitive_count * 3; values_per_coordinate = 2; - default_coord = {0.f, 0.f}; + default_coord = {0.F, 0.F}; break; case InputType::CROSSES: // cross primitive, a cross is defined by the center coordinate and the size (xi, yi, @@ -1412,7 +1411,7 @@ 
void HolovizOp::render_geometry(const ExecutionContext& context, const InputSpec primitive_count = coordinates; coordinate_count = primitive_count; values_per_coordinate = 3; - default_coord = {0.f, 0.f, 0.05f}; + default_coord = {0.F, 0.F, 0.05F}; break; case InputType::RECTANGLES: // axis aligned rectangle primitive, each rectangle is defined by two coordinates (xi, @@ -1425,7 +1424,7 @@ void HolovizOp::render_geometry(const ExecutionContext& context, const InputSpec primitive_count = coordinates / 2; coordinate_count = primitive_count * 2; values_per_coordinate = 2; - default_coord = {0.f, 0.f}; + default_coord = {0.F, 0.F}; break; case InputType::OVALS: // oval primitive, an oval primitive is defined by the center coordinate and the axis @@ -1438,7 +1437,7 @@ void HolovizOp::render_geometry(const ExecutionContext& context, const InputSpec primitive_count = coordinates; coordinate_count = primitive_count; values_per_coordinate = 4; - default_coord = {0.f, 0.f, 0.05f, 0.05f}; + default_coord = {0.F, 0.F, 0.05F, 0.05F}; break; case InputType::POINTS_3D: // point primitives, one coordinate (x, y, z) per primitive @@ -1450,7 +1449,7 @@ void HolovizOp::render_geometry(const ExecutionContext& context, const InputSpec primitive_count = coordinates; coordinate_count = primitive_count; values_per_coordinate = 3; - default_coord = {0.f, 0.f, 0.f}; + default_coord = {0.F, 0.F, 0.F}; break; case InputType::LINES_3D: // line primitives, two coordinates (x0, y0, z0) and (x1, y1, z1) per primitive @@ -1462,7 +1461,7 @@ void HolovizOp::render_geometry(const ExecutionContext& context, const InputSpec primitive_count = coordinates / 2; coordinate_count = primitive_count * 2; values_per_coordinate = 3; - default_coord = {0.f, 0.f, 0.f}; + default_coord = {0.F, 0.F, 0.F}; break; case InputType::LINE_STRIP_3D: // line primitives, two coordinates (x0, y0, z0) and (x1, y1, z1) per primitive @@ -1475,7 +1474,7 @@ void HolovizOp::render_geometry(const ExecutionContext& context, const InputSpec primitive_count = coordinates - 1; coordinate_count = coordinates; values_per_coordinate = 3; - default_coord = {0.f, 0.f}; + default_coord = {0.F, 0.F}; break; case InputType::TRIANGLES_3D: // triangle primitive, three coordinates (x0, y0, z0), (x1, y1, z1) and (x2, y2, z2) @@ -1489,7 +1488,7 @@ void HolovizOp::render_geometry(const ExecutionContext& context, const InputSpec primitive_count = coordinates / 3; coordinate_count = primitive_count * 3; values_per_coordinate = 3; - default_coord = {0.f, 0.f}; + default_coord = {0.F, 0.F}; break; default: throw std::runtime_error( @@ -1656,7 +1655,7 @@ void HolovizOp::start() { std::lock_guard guard(mutex_); // set the font to be used - if (!font_path_.get().empty()) { viz::SetFont(font_path_.get().c_str(), 25.f); } + if (!font_path_.get().empty()) { viz::SetFont(font_path_.get().c_str(), 25.F); } // create Holoviz instance instance_ = viz::Create(); @@ -1675,7 +1674,7 @@ void HolovizOp::start() { if (use_exclusive_display_) { viz::Init( - display_name_.get().c_str(), width_, height_, uint32_t(framerate_ * 1000.f), init_flags); + display_name_.get().c_str(), width_, height_, uint32_t(framerate_ * 1000.F), init_flags); } else { viz::Init(width_, height_, @@ -1810,15 +1809,6 @@ void HolovizOp::compute(InputContext& op_input, OutputContext& op_output, // nothing to do if minimized if (viz::WindowIsMinimized()) { return; } - // create vector of nvidia::gxf::Entity as expected by the code below - std::vector messages; - messages.reserve(receivers_messages.size()); - for (auto& 
receivers_message : receivers_messages) { - // cast each holoscan::gxf:Entity to its base class - nvidia::gxf::Entity message = static_cast<nvidia::gxf::Entity>(receivers_message); - messages.push_back(message); - } - // handle camera messages if (camera_eye_message || camera_look_at_message || camera_up_message) { if (camera_eye_message) { camera_eye_cur_ = camera_eye_message.value(); } @@ -1849,8 +1839,8 @@ void HolovizOp::compute(InputContext& op_input, OutputContext& op_output, // then get all tensors and video buffers of all messages, check if an input spec for the tensor // is already there, if not try to detect the input spec from the tensor or video buffer // information - for (auto&& message : messages) { - const auto tensors = message.findAll<nvidia::gxf::Tensor>(); + for (auto&& message : receivers_messages) { + const auto tensors = message.nvidia::gxf::Entity::findAll<nvidia::gxf::Tensor>(); for (auto&& tensor : tensors.value()) { // check if an input spec with the same tensor name already exist const std::string tensor_name(tensor->name()); @@ -1896,7 +1886,8 @@ void HolovizOp::compute(InputContext& op_input, OutputContext& op_output, } // get the CUDA stream from the input message - const gxf_result_t result = cuda_stream_handler_.from_messages(context.context(), messages); + const gxf_result_t result = + cuda_stream_handler_.from_messages(context.context(), receivers_messages); if (result != GXF_SUCCESS) { throw std::runtime_error("Failed to get the CUDA stream from incoming messages"); } @@ -1920,22 +1911,24 @@ void HolovizOp::compute(InputContext& op_input, OutputContext& op_output, nvidia::gxf::Unexpected{GXF_UNINITIALIZED_VALUE}; nvidia::gxf::Expected<nvidia::gxf::Handle<nvidia::gxf::VideoBuffer>> maybe_input_video = nvidia::gxf::Unexpected{GXF_UNINITIALIZED_VALUE}; - auto message = messages.begin(); - while (message != messages.end()) { - maybe_input_tensor = message->get<nvidia::gxf::Tensor>(input_spec.tensor_name_.c_str()); + auto message = receivers_messages.begin(); + while (message != receivers_messages.end()) { + maybe_input_tensor = + message->nvidia::gxf::Entity::get<nvidia::gxf::Tensor>(input_spec.tensor_name_.c_str()); if (maybe_input_tensor) { // pick the first one with that name break; } // check for video if no tensor found - maybe_input_video = message->get<nvidia::gxf::VideoBuffer>(input_spec.tensor_name_.c_str()); + maybe_input_video = message->nvidia::gxf::Entity::get<nvidia::gxf::VideoBuffer>( input_spec.tensor_name_.c_str()); if (maybe_input_video) { // pick the first one with that name break; } ++message; } - if (message == messages.end()) { + if (message == receivers_messages.end()) { throw std::runtime_error( fmt::format("Failed to retrieve input '{}'", input_spec.tensor_name_)); } diff --git a/src/operators/inference/inference.cpp b/src/operators/inference/inference.cpp index 3d88a330..b2f6e2e6 100644 --- a/src/operators/inference/inference.cpp +++ b/src/operators/inference/inference.cpp @@ -159,6 +159,11 @@ void InferenceOp::setup(OperatorSpec& spec) { DataVecMap()); spec.param(in_tensor_names_, "in_tensor_names", "Input Tensors", "Input tensors", {}); spec.param(out_tensor_names_, "out_tensor_names", "Output Tensors", "Output tensors", {}); + spec.param(trt_opt_profile_, + "trt_opt_profile", + "TensorRT Opt Profile", + "Optimization profile for input tensors", + {1, 1, 1}); spec.param(allocator_, "allocator", "Allocator", "Output Allocator"); spec.param(infer_on_cpu_, "infer_on_cpu", "Inference on CPU", "Use CPU.", false); spec.param(is_engine_path_, "is_engine_path", "Input path is engine file", "", false); @@ -206,6 +211,7 @@ void InferenceOp::start() { device_map_.get().get_map(), temporal_map_.get().get_map(), activation_map_.get().get_map(),
+ trt_opt_profile_.get(), is_engine_path_.get(), infer_on_cpu_.get(), parallel_inference_.get(), @@ -233,6 +239,7 @@ void InferenceOp::start() { } void InferenceOp::stop() { + inference_specs_.reset(); holoscan_infer_context_.reset(); } @@ -273,7 +280,8 @@ void InferenceOp::compute(InputContext& op_input, OutputContext& op_output, inference_specs_->set_activation_map(activation_map_.get().get_map()); - auto status = holoscan_infer_context_->execute_inference(inference_specs_); + auto status = holoscan_infer_context_->execute_inference( + inference_specs_, cuda_stream_handler_.get_cuda_stream(cont)); HoloInfer::timer_init(e_time); HoloInfer::timer_check(s_time, e_time, "Inference Operator: Inference execution"); if (status.get_code() != HoloInfer::holoinfer_code::H_SUCCESS) { diff --git a/src/operators/inference_processor/inference_processor.cpp b/src/operators/inference_processor/inference_processor.cpp index bdf91d7e..aa708a94 100644 --- a/src/operators/inference_processor/inference_processor.cpp +++ b/src/operators/inference_processor/inference_processor.cpp @@ -27,6 +27,7 @@ #include "holoscan/core/io_context.hpp" #include "holoscan/core/operator_spec.hpp" #include "holoscan/core/resources/gxf/allocator.hpp" +#include "holoscan/utils/cuda_macros.hpp" #include "holoscan/utils/holoinfer_utils.hpp" template <> @@ -177,10 +178,6 @@ void InferenceProcessorOp::start() { try { // Check for the validity of parameters from configuration - if (input_on_cuda_.get() || output_on_cuda_.get()) { - HoloInfer::raise_error(module_, "CUDA based data not supported in processor"); - } - auto status = HoloInfer::processor_validity_check( processed_map_.get().get_map(), in_tensor_names_.get(), out_tensor_names_.get()); if (status.get_code() != HoloInfer::holoinfer_code::H_SUCCESS) { @@ -206,6 +203,11 @@ void InferenceProcessorOp::start() { } } +void InferenceProcessorOp::stop() { + data_per_tensor_.clear(); + holoscan_postprocess_context_.reset(); +} + void InferenceProcessorOp::compute(InputContext& op_input, OutputContext& op_output, ExecutionContext& context) { // get Handle to underlying nvidia::gxf::Allocator from std::shared_ptr @@ -213,6 +215,9 @@ void InferenceProcessorOp::compute(InputContext& op_input, OutputContext& op_out nvidia::gxf::Handle::Create(context.context(), allocator_->gxf_cid()); auto cont = context.context(); + // process with CUDA if input is on CUDA + const bool process_with_cuda = input_on_cuda_.get(); + try { // Extract relevant data from input GXF Receivers, and update inference specifications gxf_result_t stat = holoscan::utils::get_data_per_model(op_input, @@ -229,10 +234,13 @@ void InferenceProcessorOp::compute(InputContext& op_input, OutputContext& op_out HoloInfer::TimePoint s_time, e_time; HoloInfer::timer_init(s_time); // Execute processing - auto status = holoscan_postprocess_context_->process(process_operations_.get().get_map(), - processed_map_.get().get_map(), - data_per_tensor_, - dims_per_tensor_); + auto status = + holoscan_postprocess_context_->process(process_operations_.get().get_map(), + processed_map_.get().get_map(), + data_per_tensor_, + dims_per_tensor_, + process_with_cuda, + cuda_stream_handler_.get_cuda_stream(cont)); if (status.get_code() != HoloInfer::holoinfer_code::H_SUCCESS) { status.display_message(); HoloInfer::report_error(module_, "Tick, post_process"); @@ -253,7 +261,7 @@ void InferenceProcessorOp::compute(InputContext& op_input, OutputContext& op_out op_output, out_tensor_names_.get(), processed_dims_map, - output_on_cuda_.get(), + 
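// --- Illustrative aside (editor's sketch, not part of this patch or the Holoscan sources) ---
// The hunks above add a "trt_opt_profile" parameter to InferenceOp (default {1, 1, 1}) and
// forward it into the inference context at start(). A hypothetical way to override it when
// composing an application; interpreting the three values as a min/opt/max TensorRT
// optimization profile is an assumption based on the name and default, not stated by the patch.
#include <cstdint>
#include <vector>
#include <holoscan/holoscan.hpp>
#include <holoscan/operators/inference/inference.hpp>

class HypotheticalInferenceApp : public holoscan::Application {
 public:
  void compose() override {
    using namespace holoscan;
    auto inference = make_operator<ops::InferenceOp>(
        "inference",
        from_config("inference"),                                      // backend, model maps, ...
        Arg("trt_opt_profile", std::vector<std::int32_t>{1, 4, 8}));   // hypothetical values
    // ... add_flow() calls wiring the operator into the rest of the pipeline would go here ...
  }
};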
process_with_cuda, transmit_on_cuda_.get(), allocator.value(), module_, diff --git a/src/operators/ping_rx/ping_rx.cpp b/src/operators/ping_rx/ping_rx.cpp index f6944140..6762d456 100644 --- a/src/operators/ping_rx/ping_rx.cpp +++ b/src/operators/ping_rx/ping_rx.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -23,7 +23,8 @@ void PingRxOp::setup(OperatorSpec& spec) { spec.input("in"); } -void PingRxOp::compute(InputContext& op_input, OutputContext&, ExecutionContext&) { +void PingRxOp::compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) { auto value = op_input.receive("in").value(); HOLOSCAN_LOG_INFO("Rx message value: {}", value); } diff --git a/src/operators/ping_tensor_rx/ping_tensor_rx.cpp b/src/operators/ping_tensor_rx/ping_tensor_rx.cpp index 0eccc2ff..c4614999 100644 --- a/src/operators/ping_tensor_rx/ping_tensor_rx.cpp +++ b/src/operators/ping_tensor_rx/ping_tensor_rx.cpp @@ -27,7 +27,8 @@ void PingTensorRxOp::setup(OperatorSpec& spec) { spec.input("in"); } -void PingTensorRxOp::compute(InputContext& op_input, OutputContext&, ExecutionContext&) { +void PingTensorRxOp::compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) { auto maybe_in_message = op_input.receive("in"); if (!maybe_in_message) { HOLOSCAN_LOG_ERROR("Failed to receive message from port 'in'"); diff --git a/src/operators/ping_tensor_tx/ping_tensor_tx.cpp b/src/operators/ping_tensor_tx/ping_tensor_tx.cpp index a96d4132..59fb7e9d 100644 --- a/src/operators/ping_tensor_tx/ping_tensor_tx.cpp +++ b/src/operators/ping_tensor_tx/ping_tensor_tx.cpp @@ -120,7 +120,8 @@ nvidia::gxf::PrimitiveType PingTensorTxOp::primitive_type(const std::string& dat throw std::runtime_error(std::string("Unrecognized data_type: ") + data_type); } -void PingTensorTxOp::compute(InputContext&, OutputContext& op_output, ExecutionContext& context) { +void PingTensorTxOp::compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, + ExecutionContext& context) { // the type of out_message is TensorMap TensorMap out_message; diff --git a/src/operators/ping_tx/ping_tx.cpp b/src/operators/ping_tx/ping_tx.cpp index f2eb6015..f683ebfa 100644 --- a/src/operators/ping_tx/ping_tx.cpp +++ b/src/operators/ping_tx/ping_tx.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -25,7 +25,8 @@ void PingTxOp::setup(OperatorSpec& spec) { spec.output("out"); } -void PingTxOp::compute(InputContext&, OutputContext& op_output, ExecutionContext&) { +void PingTxOp::compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) { int value = index_++; op_output.emit(value, "out"); } diff --git a/src/operators/segmentation_postprocessor/segmentation_postprocessor.cu b/src/operators/segmentation_postprocessor/segmentation_postprocessor.cu index f13d93e9..203363e2 100644 --- a/src/operators/segmentation_postprocessor/segmentation_postprocessor.cu +++ b/src/operators/segmentation_postprocessor/segmentation_postprocessor.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -68,10 +68,10 @@ __global__ void postprocessing_kernel(Shape shape, const float* input, output_ty switch (network_output_type) { case NetworkOutputType::kSigmoid: { const float value = input[data_format_to_index(shape, y, x, 0)]; - max_index = value >= 0.5f ? 1 : 0; + max_index = value >= 0.5F ? 1 : 0; } break; case NetworkOutputType::kSoftmax: { - float max_value = 0.0f; + float max_value = 0.0F; for (uint32_t c = 0; c < shape.channels; c++) { const float value = input[data_format_to_index(shape, y, x, c)]; if (value > max_value) { diff --git a/src/operators/v4l2_video_capture/v4l2_video_capture.cpp b/src/operators/v4l2_video_capture/v4l2_video_capture.cpp index e98e667f..0aa9d7c2 100644 --- a/src/operators/v4l2_video_capture/v4l2_video_capture.cpp +++ b/src/operators/v4l2_video_capture/v4l2_video_capture.cpp @@ -210,11 +210,8 @@ void V4L2VideoCaptureOp::start() { v4l2_start(); } -void V4L2VideoCaptureOp::compute(InputContext& op_input, OutputContext& op_output, +void V4L2VideoCaptureOp::compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, ExecutionContext& context) { - // Avoid warning about unused variable - (void)op_input; - // Read buffer. 
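// --- Illustrative aside (editor's sketch, not part of this patch or the Holoscan sources) ---
// Several hunks above replace the old "(void)arg;" idiom with the C++17 [[maybe_unused]]
// attribute on unused compute() parameters. A minimal stand-alone comparison of the two styles,
// using hypothetical free functions instead of operator methods:
#include <cstdio>

// Pre-C++17 style: name the parameter and cast it to void in the body to silence the warning.
void legacy_compute(int op_input, int op_output) {
  (void)op_input;  // avoid warning about unused variable
  std::printf("emitting value on output %d\n", op_output);
}

// C++17 style: annotate the parameter in the signature; no extra statement in the body.
void modern_compute([[maybe_unused]] int op_input, int op_output) {
  std::printf("emitting value on output %d\n", op_output);
}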
struct v4l2_buffer buf; CLEAR(buf); diff --git a/src/operators/video_stream_recorder/video_stream_recorder.cpp b/src/operators/video_stream_recorder/video_stream_recorder.cpp index 7c94dd64..aba281f2 100644 --- a/src/operators/video_stream_recorder/video_stream_recorder.cpp +++ b/src/operators/video_stream_recorder/video_stream_recorder.cpp @@ -148,11 +148,9 @@ void VideoStreamRecorderOp::stop() { } } -void VideoStreamRecorderOp::compute(InputContext& op_input, OutputContext& op_output, +void VideoStreamRecorderOp::compute(InputContext& op_input, + [[maybe_unused]] OutputContext& op_output, ExecutionContext& context) { - // avoid warning about unused variable - (void)op_output; - auto entity = op_input.receive("input").value(); // dynamic cast from holoscan::Resource to holoscan::StdEntitySerializer diff --git a/src/operators/video_stream_replayer/video_stream_replayer.cpp b/src/operators/video_stream_replayer/video_stream_replayer.cpp index 4af6886c..e41cad13 100644 --- a/src/operators/video_stream_replayer/video_stream_replayer.cpp +++ b/src/operators/video_stream_replayer/video_stream_replayer.cpp @@ -80,7 +80,7 @@ void VideoStreamReplayerOp::setup(OperatorSpec& spec) { "frame_rate", "Frame rate", "Frame rate to replay. If zero value is specified, it follows timings in timestamps.", - 0.f); + 0.F); spec.param(realtime_, "realtime", "Realtime playback", @@ -220,11 +220,8 @@ VideoStreamReplayerOp::~VideoStreamReplayerOp() { } } -void VideoStreamReplayerOp::compute(InputContext& op_input, OutputContext& op_output, - ExecutionContext& context) { - // avoid warning about unused variable - (void)op_input; - +void VideoStreamReplayerOp::compute([[maybe_unused]] InputContext& op_input, + OutputContext& op_output, ExecutionContext& context) { for (size_t i = 0; i < batch_size_; i++) { // Read entity index from index file // Break if index not found and clear stream errors diff --git a/src/utils/cuda_stream_handler.cpp b/src/utils/cuda_stream_handler.cpp index 4de2dc67..279394f7 100644 --- a/src/utils/cuda_stream_handler.cpp +++ b/src/utils/cuda_stream_handler.cpp @@ -93,7 +93,23 @@ gxf_result_t CudaStreamHandler::fromMessage( } gxf_result_t CudaStreamHandler::from_messages(gxf_context_t context, - const std::vector& messages) { + const std::vector& messages) { + // call the common internal version using the pointer to the vector data, this only works + // if the size of the nvidia and holoscan gxf::Entity versions is identical + static_assert(sizeof(holoscan::gxf::Entity) == sizeof(nvidia::gxf::Entity)); + return from_messages(context, messages.size(), messages.data()); +} + +gxf_result_t CudaStreamHandler::from_messages(gxf_context_t context, + const std::vector& messages) { + // call the common internal version using the pointer to the vector data, this only works + // if the size of the nvidia and holoscan gxf::Entity versions is identical + static_assert(sizeof(holoscan::gxf::Entity) == sizeof(nvidia::gxf::Entity)); + return from_messages(context, messages.size(), messages.data()); +} + +gxf_result_t CudaStreamHandler::from_messages(gxf_context_t context, size_t message_count, + const nvidia::gxf::Entity* messages) { const gxf_result_t result = allocate_internal_stream(context); if (result != GXF_SUCCESS) { return result; } @@ -107,8 +123,8 @@ gxf_result_t CudaStreamHandler::from_messages(gxf_context_t context, // iterate through all messages and use events to chain incoming streams with the internal // stream auto event_it = cuda_events_.begin(); - for (auto& msg : messages) { - const auto 
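// --- Illustrative aside (editor's sketch, not part of this patch or the Holoscan sources) ---
// The CudaStreamHandler::from_messages() overloads above both forward to a single
// (count, pointer) implementation and guard the cast with a static_assert that
// holoscan::gxf::Entity and nvidia::gxf::Entity have the same size. A minimal sketch of that
// dispatch pattern with hypothetical types:
#include <cstddef>
#include <vector>

struct BaseEntity { void* handle = nullptr; };
// Wrapper that adds behaviour but no data members, so its object size matches the base class.
struct WrappedEntity : BaseEntity {};

// Common implementation working on a raw pointer and an element count.
int count_valid(std::size_t count, const BaseEntity* items) {
  int valid = 0;
  for (std::size_t i = 0; i < count; ++i) {
    if (items[i].handle != nullptr) { ++valid; }
  }
  return valid;
}

// Overload for the wrapper type: only sound because the wrapper adds no state, which is exactly
// what the static_assert in from_messages() checks before treating the vector's storage as an
// array of the base type.
int count_valid(const std::vector<WrappedEntity>& items) {
  static_assert(sizeof(WrappedEntity) == sizeof(BaseEntity));
  return count_valid(items.size(), items.data());
}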
maybe_cuda_stream_id = msg.get(); + for (size_t index = 0; index < message_count; ++index) { + const auto maybe_cuda_stream_id = messages[index].get(); if (maybe_cuda_stream_id) { const auto maybe_cuda_stream_handle = nvidia::gxf::Handle::Create( context, maybe_cuda_stream_id.value()->stream_cid); diff --git a/src/utils/holoinfer_utils.cpp b/src/utils/holoinfer_utils.cpp index a5d367af..607ca2e9 100644 --- a/src/utils/holoinfer_utils.cpp +++ b/src/utils/holoinfer_utils.cpp @@ -22,10 +22,10 @@ #include #include -#include "gxf/std/tensor.hpp" #include #include #include +#include "gxf/std/tensor.hpp" #include "holoscan/core/io_context.hpp" #include "holoscan/utils/holoinfer_utils.hpp" @@ -33,37 +33,44 @@ namespace HoloInfer = holoscan::inference; namespace holoscan::utils { -template -gxf_result_t extract_data(nvidia::gxf::MemoryStorageType to, +GxfTensorBuffer::GxfTensorBuffer(const holoscan::gxf::Entity& entity, + const nvidia::gxf::Handle& tensor) + : entity_(entity), tensor_(tensor) {} + +void* GxfTensorBuffer::data() { + return reinterpret_cast(tensor_->pointer()); +} + +size_t GxfTensorBuffer::size() const { + return tensor_->element_count(); +} + +size_t GxfTensorBuffer::get_bytes() const { + return tensor_->bytes_size(); +} + +void GxfTensorBuffer::resize(size_t /*number_of_elements*/) { + throw std::runtime_error("Resizing of GxfTensorBuffer is not supported"); +} + +gxf_result_t extract_data(const std::shared_ptr& db, + nvidia::gxf::MemoryStorageType to, nvidia::gxf::MemoryStorageType storage_type, HoloInfer::holoinfer_datatype dtype, void* in_tensor_data, - HoloInfer::DataMap& data_per_input_tensor, - const std::string& current_tensor, size_t buffer_size, - const std::string& module, cudaStream_t cstream) { - if (data_per_input_tensor.find(current_tensor) == data_per_input_tensor.end()) { - auto db = std::make_shared(dtype); - db->host_buffer.resize(buffer_size); - db->device_buffer->resize(buffer_size); - - data_per_input_tensor.insert({current_tensor, std::move(db)}); - } else { - // allocate buffer for dynamic tensor size - auto tensor_db = data_per_input_tensor.at(current_tensor); - if (tensor_db->host_buffer.size() != buffer_size) { - tensor_db->host_buffer.resize(buffer_size); - } - if (tensor_db->device_buffer->size() != buffer_size) { - tensor_db->device_buffer->resize(buffer_size); - } + size_t buffer_size, const std::string& module, cudaStream_t cstream) { + if (to == nvidia::gxf::MemoryStorageType::kHost) { + db->host_buffer_->resize(buffer_size); + } else if (to == nvidia::gxf::MemoryStorageType::kDevice) { + db->device_buffer_->resize(buffer_size); } if (to == nvidia::gxf::MemoryStorageType::kHost) { - auto in_tensor_ptr = data_per_input_tensor.at(current_tensor)->host_buffer.data(); + auto in_tensor_ptr = db->host_buffer_->data(); if (storage_type == nvidia::gxf::MemoryStorageType::kDevice) { cudaError_t cuda_result = cudaMemcpyAsync(in_tensor_ptr, static_cast(in_tensor_data), - buffer_size * sizeof(T), + buffer_size * get_element_size(dtype), cudaMemcpyDeviceToHost, cstream); if (cuda_result != cudaSuccess) { @@ -71,24 +78,18 @@ gxf_result_t extract_data(nvidia::gxf::MemoryStorageType to, cudaGetErrorString(cuda_result)); return HoloInfer::report_error(module, "Data extraction, DtoH cudaMemcpy."); } - - cuda_result = cudaStreamSynchronize(cstream); - if (cuda_result != cudaSuccess) { - HOLOSCAN_LOG_ERROR("Cuda stream synchronization failed: {}", - cudaGetErrorString(cuda_result)); - return HoloInfer::report_error(module, "Data extraction, Stream synchronization."); - 
} + // When copying from device memory to pagable memory the call is synchronous with the host + // execution. No need to synchronize here. } else if (storage_type == nvidia::gxf::MemoryStorageType::kHost) { - memcpy( - static_cast(in_tensor_ptr), static_cast(in_tensor_data), buffer_size * sizeof(T)); + memcpy(in_tensor_ptr, in_tensor_data, buffer_size * get_element_size(dtype)); } } else { if (to == nvidia::gxf::MemoryStorageType::kDevice) { - void* device_buff = data_per_input_tensor.at(current_tensor)->device_buffer->data(); + void* device_buff = db->device_buffer_->data(); if (storage_type == nvidia::gxf::MemoryStorageType::kDevice) { cudaError_t cuda_result = cudaMemcpyAsync(static_cast(device_buff), static_cast(in_tensor_data), - buffer_size * sizeof(T), + buffer_size * get_element_size(dtype), cudaMemcpyDeviceToDevice, cstream); @@ -119,38 +120,32 @@ gxf_result_t get_data_per_model(InputContext& op_input, const std::vector>("receivers").value(); + // get the CUDA stream from the input messages + if (cuda_stream_handler.from_messages(context, messages) != GXF_SUCCESS) { + throw std::runtime_error("Failed to get the CUDA stream from incoming messages"); + } + const cudaStream_t cstream = cuda_stream_handler.get_cuda_stream(context); for (unsigned int i = 0; i < in_tensors.size(); ++i) { - // nvidia::gxf::Handle in_tensor; HOLOSCAN_LOG_DEBUG("Extracting data from tensor {}", in_tensors[i]); - std::shared_ptr in_tensor; - cudaStream_t cstream = 0; + nvidia::gxf::Expected> maybe_in_tensor = + nvidia::gxf::Unexpected{GXF_UNINITIALIZED_VALUE}; + size_t message_index; for (unsigned int j = 0; j < messages.size(); ++j) { - const auto& in_message = messages[j]; - const auto maybe_tensor = in_message.get(in_tensors[i].c_str(), false); - if (maybe_tensor) { + maybe_in_tensor = + messages[j].nvidia::gxf::Entity::get(in_tensors[i].c_str()); + if (maybe_in_tensor) { // break out if the expected tensor name was found in this message - in_tensor = maybe_tensor; - // get the CUDA stream from the input message - gxf_result_t stream_handler_result = - cuda_stream_handler.from_message(context, in_message); - if (stream_handler_result != GXF_SUCCESS) { - throw std::runtime_error("Failed to get the CUDA stream from incoming messages"); - } - cstream = cuda_stream_handler.get_cuda_stream(context); + message_index = j; break; } } - if (!in_tensor) + if (!maybe_in_tensor) { return HoloInfer::report_error(module, "Data extraction, Tensor " + in_tensors[i] + " not found"); + } - // convert from Tensor to nvidia::gxf::Tensor so code below can be re-used as-is. 
- // (otherwise cannot easily get element_type, storage_type) - nvidia::gxf::Tensor in_tensor_gxf{in_tensor->dl_ctx()}; - void* in_tensor_data = in_tensor_gxf.pointer(); - - auto element_type = in_tensor_gxf.element_type(); - auto storage_type = in_tensor_gxf.storage_type(); + const auto& in_tensor = maybe_in_tensor.value(); + const auto storage_type = in_tensor->storage_type(); if (storage_type != nvidia::gxf::MemoryStorageType::kHost && storage_type != nvidia::gxf::MemoryStorageType::kDevice) { @@ -163,69 +158,23 @@ gxf_result_t get_data_per_model(InputContext& op_input, const std::vector dims; - for (unsigned int di = 0; di < in_tensor_gxf.shape().rank(); ++di) - dims.push_back(in_tensor_gxf.shape().dimension(di)); - - size_t buffer_size = std::accumulate(dims.begin(), dims.end(), 1, std::multiplies()); - dims_per_tensor[in_tensors[i]] = std::move(dims); - - gxf_result_t status = GXF_SUCCESS; + HoloInfer::holoinfer_datatype dtype; + const auto element_type = in_tensor->element_type(); switch (element_type) { case nvidia::gxf::PrimitiveType::kFloat32: - status = extract_data(to, - storage_type, - HoloInfer::holoinfer_datatype::h_Float32, - in_tensor_data, - data_per_input_tensor, - in_tensors[i], - buffer_size, - module, - cstream); + dtype = HoloInfer::holoinfer_datatype::h_Float32; break; case nvidia::gxf::PrimitiveType::kInt32: - status = extract_data(to, - storage_type, - HoloInfer::holoinfer_datatype::h_Int32, - in_tensor_data, - data_per_input_tensor, - in_tensors[i], - buffer_size, - module, - cstream); + dtype = HoloInfer::holoinfer_datatype::h_Int32; break; case nvidia::gxf::PrimitiveType::kInt8: - status = extract_data(to, - storage_type, - HoloInfer::holoinfer_datatype::h_Int8, - in_tensor_data, - data_per_input_tensor, - in_tensors[i], - buffer_size, - module, - cstream); + dtype = HoloInfer::holoinfer_datatype::h_Int8; break; case nvidia::gxf::PrimitiveType::kInt64: - status = extract_data(to, - storage_type, - HoloInfer::holoinfer_datatype::h_Int64, - in_tensor_data, - data_per_input_tensor, - in_tensors[i], - buffer_size, - module, - cstream); + dtype = HoloInfer::holoinfer_datatype::h_Int64; break; case nvidia::gxf::PrimitiveType::kUnsigned8: - status = extract_data(to, - storage_type, - HoloInfer::holoinfer_datatype::h_Int8, - in_tensor_data, - data_per_input_tensor, - in_tensors[i], - buffer_size, - module, - cstream); + dtype = HoloInfer::holoinfer_datatype::h_UInt8; break; default: { HOLOSCAN_LOG_INFO("Incoming tensors must be of type: float, int32, int64, int8, uint8"); @@ -233,8 +182,41 @@ gxf_result_t get_data_per_model(InputContext& op_input, const std::vector(dtype)}) + .first; + } + auto& db = data_map->second; + + std::vector dims; + for (unsigned int di = 0; di < in_tensor->shape().rank(); ++di) { + dims.push_back(in_tensor->shape().dimension(di)); + } + dims_per_tensor[in_tensors[i]] = std::move(dims); + + if (to == storage_type) { + auto buffer = + std::make_shared(messages[message_index], in_tensor); + if (to == nvidia::gxf::MemoryStorageType::kDevice) { + db->device_buffer_ = buffer; + } else { + db->host_buffer_ = buffer; + } + } else { + gxf_result_t status = extract_data(db, + to, + storage_type, + dtype, + in_tensor->pointer(), + in_tensor->element_count(), + module, + cstream); + if (status != GXF_SUCCESS) { + return HoloInfer::report_error(module, "Data extraction, In tensor extraction failed."); + } } } @@ -270,7 +252,7 @@ gxf_result_t transmit_data(nvidia::gxf::MemoryStorageType from, nvidia::gxf::Mem auto current_model_output = 
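// --- Illustrative aside (editor's sketch, not part of this patch or the Holoscan sources) ---
// The refactored extract_data() above replaces the per-type template instantiations
// (buffer_size * sizeof(T)) with a runtime dtype plus get_element_size(dtype). A minimal sketch
// of that byte-size computation with a hypothetical enum; the real HoloInfer::holoinfer_datatype
// and get_element_size() live in the HoloInfer library and cover more types.
#include <cstddef>
#include <cstdint>

enum class ExampleDtype { kFloat32, kInt32, kInt64, kInt8, kUInt8 };

constexpr std::size_t element_size(ExampleDtype dtype) {
  switch (dtype) {
    case ExampleDtype::kFloat32: return sizeof(float);
    case ExampleDtype::kInt32:   return sizeof(std::int32_t);
    case ExampleDtype::kInt64:   return sizeof(std::int64_t);
    case ExampleDtype::kInt8:    return sizeof(std::int8_t);
    case ExampleDtype::kUInt8:   return sizeof(std::uint8_t);
  }
  return 0;  // unreachable for valid enum values
}

// Byte count of a tensor buffer: element count times per-element size, the quantity passed to
// cudaMemcpyAsync()/memcpy() in the hunks above.
constexpr std::size_t byte_count(std::size_t element_count, ExampleDtype dtype) {
  return element_count * element_size(dtype);
}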
input_data_map.at(current_tensor); memcpy(out_tensor_data.value(), - current_model_output->host_buffer.data(), + current_model_output->host_buffer_->data(), buffer_size * sizeof(T)); } else { // to is on device out_tensor.value()->reshape( @@ -282,7 +264,7 @@ gxf_result_t transmit_data(nvidia::gxf::MemoryStorageType from, nvidia::gxf::Mem if (!out_tensor_data) return HoloInfer::report_error(module, "Data transmission, Getting out tensor data."); - auto current_model_dev_buff = input_data_map.at(current_tensor)->host_buffer.data(); + auto current_model_dev_buff = input_data_map.at(current_tensor)->host_buffer_->data(); cudaError_t cuda_result = cudaMemcpyAsync(static_cast(out_tensor_data.value()), static_cast(current_model_dev_buff), buffer_size * sizeof(T), @@ -292,12 +274,9 @@ gxf_result_t transmit_data(nvidia::gxf::MemoryStorageType from, nvidia::gxf::Mem HOLOSCAN_LOG_ERROR("Data transmission (HtoD) failed: {}", cudaGetErrorString(cuda_result)); return HoloInfer::report_error(module, "Data Transmission, HtoD cudaMemcpy."); } - cuda_result = cudaStreamSynchronize(cstream); - if (cuda_result != cudaSuccess) { - HOLOSCAN_LOG_ERROR("Cuda stream synchronization failed: {}", - cudaGetErrorString(cuda_result)); - return HoloInfer::report_error(module, "Data transmission, Stream synchronization."); - } + // When copying from pagable memory to device memory cudaMemcpyAsync() is copying the memory + // to staging memory first and therefore is synchronous with the host execution. No need to + // synchronize here. } } else { if (from == nvidia::gxf::MemoryStorageType::kDevice) { @@ -311,7 +290,7 @@ gxf_result_t transmit_data(nvidia::gxf::MemoryStorageType from, nvidia::gxf::Mem if (!out_tensor_data) return HoloInfer::report_error(module, "Data Transmission, getting out tensor data."); - void* current_model_dev_buff = input_data_map.at(current_tensor)->device_buffer->data(); + void* current_model_dev_buff = input_data_map.at(current_tensor)->device_buffer_->data(); cudaError_t cuda_result = cudaMemcpyAsync(static_cast(out_tensor_data.value()), static_cast(current_model_dev_buff), buffer_size * sizeof(T), @@ -332,7 +311,7 @@ gxf_result_t transmit_data(nvidia::gxf::MemoryStorageType from, nvidia::gxf::Mem if (!out_tensor_data) return HoloInfer::report_error(module, "Data Transmission, getting out tensor data"); - void* current_model_dev_buff = input_data_map.at(current_tensor)->device_buffer->data(); + void* current_model_dev_buff = input_data_map.at(current_tensor)->device_buffer_->data(); cudaError_t cuda_result = cudaMemcpyAsync(static_cast(out_tensor_data.value()), static_cast(current_model_dev_buff), buffer_size * sizeof(T), @@ -343,12 +322,8 @@ gxf_result_t transmit_data(nvidia::gxf::MemoryStorageType from, nvidia::gxf::Mem cudaGetErrorString(cuda_result)); return HoloInfer::report_error(module, "Data transmission, DtoH cudaMemcpy"); } - cuda_result = cudaStreamSynchronize(cstream); - if (cuda_result != cudaSuccess) { - HOLOSCAN_LOG_ERROR("Cuda stream synchronization failed: {}", - cudaGetErrorString(cuda_result)); - return HoloInfer::report_error(module, "Data transmission, Stream synchronization."); - } + // When copying from device memory to pagable memory the call is synchronous with the host + // execution. No need to synchronize here. 
} } } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 06808942..f468fea8 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -31,7 +31,7 @@ function(ConfigureTest CMAKE_TEST_NAME) target_include_directories(${CMAKE_TEST_NAME} PRIVATE - ${HOLOSCAN_TOP}/gxf_extensions # TODO: expose in targets instead + ${HOLOSCAN_TOP}/gxf_extensions # TODO(unknown): expose in targets instead ) target_link_libraries(${CMAKE_TEST_NAME} diff --git a/tests/codecs/codecs.cpp b/tests/codecs/codecs.cpp index 6f6a5271..43a866bd 100644 --- a/tests/codecs/codecs.cpp +++ b/tests/codecs/codecs.cpp @@ -156,7 +156,7 @@ void codec_shared_vector_compare(std::shared_ptr value, size_t buffer_siz } } -// TODO: update size check here +// TODO(unknown): update size check here template void codec_vector_vector_compare(dataT& vectors, size_t buffer_size = 4096, bool omit_size_check = true, bool omit_values_check = false) { @@ -185,7 +185,7 @@ void codec_vector_vector_compare(dataT& vectors, size_t buffer_size = 4096, } } -// TODO: update size check here +// TODO(unknown): update size check here template void codec_shared_vector_vector_compare(std::shared_ptr vectors, size_t buffer_size = 4096, bool omit_size_check = true, diff --git a/tests/core/app_driver.cpp b/tests/core/app_driver.cpp index 8c2bec36..1294d6f9 100644 --- a/tests/core/app_driver.cpp +++ b/tests/core/app_driver.cpp @@ -70,10 +70,16 @@ TEST(AppDriver, TestSetUcxToExcludeCudaIpc) { TEST(AppDriver, TestExcludeCudaIpcTransportOnIgpu) { const char* env_orig = std::getenv("UCX_TLS"); - holoscan::GPUResourceMonitor gpu_resource_monitor; - gpu_resource_monitor.update(); - bool is_integrated = - (gpu_resource_monitor.num_gpus() > 0) && gpu_resource_monitor.is_integrated_gpu(0); + bool is_integrated = false; + // Ensure that GPUResourceMonitor is instantiated in a separate scope to avoid nested nvmlInit() + // calls (AppDriver::exclude_cuda_ipc_transport_on_igpu() creates a GPUResourceMonitor instance). + { + // Check if we are running on an integrated GPU. + holoscan::GPUResourceMonitor gpu_resource_monitor; + gpu_resource_monitor.update(); + is_integrated = + (gpu_resource_monitor.num_gpus() > 0) && gpu_resource_monitor.is_integrated_gpu(0); + } // if unset and on iGPU, will be set to ^cuda_ipc if (env_orig) { unsetenv("UCX_TLS"); } diff --git a/tests/core/arg.cpp b/tests/core/arg.cpp index 01f4d981..08ac1644 100644 --- a/tests/core/arg.cpp +++ b/tests/core/arg.cpp @@ -399,8 +399,8 @@ TEST(Yaml, TestYamlCplxDecode) { // works with spaces around + and with "j" to indicate imaginary component YAML::Node node = YAML::Load("2.0 + 1.5j"); std::complex cf = node.as>(); - EXPECT_EQ(cf.real(), 2.0f); - EXPECT_EQ(cf.imag(), 1.5f); + EXPECT_EQ(cf.real(), 2.0F); + EXPECT_EQ(cf.imag(), 1.5F); // works without white space and with "i" to indicate imaginary component node = YAML::Load("-2.102-3i"); diff --git a/tests/core/component.cpp b/tests/core/component.cpp index 7372aed5..244f2219 100644 --- a/tests/core/component.cpp +++ b/tests/core/component.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -41,7 +41,7 @@ TEST(Component, TestComponentAddArg) { // add ConstArg Arg a2 = Arg("alpha2"); - a2 = 3.0f; + a2 = 3.0F; const Arg ca2 = a2; C.add_arg(ca2); diff --git a/tests/core/component_spec.cpp b/tests/core/component_spec.cpp index c5ac641e..ce96a577 100644 --- a/tests/core/component_spec.cpp +++ b/tests/core/component_spec.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -97,7 +97,7 @@ TEST(ComponentSpec, TestComponentSpecDefaultLValue) { ComponentSpec spec; // add a parameter without any value - uint32_t default_val = 15u; + uint32_t default_val = 15U; MetaParameter empty_int = Parameter(); spec.param(empty_int, "beta3", "headline3", "description3", default_val); auto params = spec.params(); diff --git a/tests/core/condition_classes.cpp b/tests/core/condition_classes.cpp index 7f4aa99e..7f66230a 100644 --- a/tests/core/condition_classes.cpp +++ b/tests/core/condition_classes.cpp @@ -32,6 +32,9 @@ #include "holoscan/core/conditions/gxf/asynchronous.hpp" #include "holoscan/core/conditions/gxf/boolean.hpp" #include "holoscan/core/conditions/gxf/count.hpp" +#include "holoscan/core/conditions/gxf/cuda_buffer_available.hpp" +#include "holoscan/core/conditions/gxf/cuda_event.hpp" +#include "holoscan/core/conditions/gxf/cuda_stream.hpp" #include "holoscan/core/conditions/gxf/downstream_affordable.hpp" #include "holoscan/core/conditions/gxf/periodic.hpp" #include "holoscan/core/conditions/gxf/message_available.hpp" @@ -143,7 +146,7 @@ TEST(ConditionClasses, TestCountConditionGXFComponentMethods) { TEST_F(ConditionClassesWithGXFContext, TestCountConditionInitializeWithoutSpec) { CountCondition count{10}; count.fragment(&F); - // TODO: avoid segfault if initialize is called before the fragment is assigned + // TODO(unknown): avoid segfault if initialize is called before the fragment is assigned // test that an error is logged if initialize is called before a spec as assigned testing::internal::CaptureStderr(); @@ -214,19 +217,6 @@ TEST(ConditionClasses, TestMessageAvailableCondition) { EXPECT_TRUE(condition->description().find("name: " + name) != std::string::npos); } -TEST(ConditionClasses, TestExpiringMessageAvailableCondition) { - Fragment F; - const std::string name{"expiring-message-available-condition"}; - ArgList arglist{Arg{"min_size", 1L}, Arg{"front_stage_max_size", 2L}}; - auto condition = F.make_condition(name, arglist); - EXPECT_EQ(condition->name(), name); - EXPECT_EQ(typeid(condition), - typeid(std::make_shared(arglist))); - EXPECT_EQ(std::string(condition->gxf_typename()), - "nvidia::gxf::ExpiringMessageAvailableSchedulingTerm"s); - EXPECT_TRUE(condition->description().find("name: " + name) != std::string::npos); -} - TEST(ConditionClasses, TestMessageAvailableConditionDefaultConstructor) { Fragment F; auto condition = F.make_condition(); @@ -245,6 +235,24 @@ TEST(ConditionClasses, TestMessageAvailableConditionSizeMethods) { EXPECT_EQ(condition->front_stage_max_size(), 5); } +TEST(ConditionClasses, TestExpiringMessageAvailableCondition) { + Fragment F; + const std::string name{"expiring-message-available-condition"}; + ArgList arglist{Arg{"min_size", 1L}, 
Arg{"front_stage_max_size", 2L}}; + auto condition = F.make_condition(name, arglist); + EXPECT_EQ(condition->name(), name); + EXPECT_EQ(typeid(condition), + typeid(std::make_shared(arglist))); + EXPECT_EQ(std::string(condition->gxf_typename()), + "nvidia::gxf::ExpiringMessageAvailableSchedulingTerm"s); + EXPECT_TRUE(condition->description().find("name: " + name) != std::string::npos); +} + +TEST(ConditionClasses, TestExpiringMessageAvailableConditionDefaultConstructor) { + Fragment F; + auto condition = F.make_condition(); +} + TEST(ConditionClasses, TestPeriodicCondition) { Fragment F; const std::string name{"periodic-condition"}; @@ -349,7 +357,7 @@ TEST(ConditionClasses, TestPeriodicConditionGXFComponentMethods) { TEST_F(ConditionClassesWithGXFContext, TestPeriodicConditionInitializeWithoutSpec) { PeriodicCondition periodic{1000000}; periodic.fragment(&F); - // TODO: avoid segfault if initialize is called before the fragment is assigned + // TODO(unknown): avoid segfault if initialize is called before the fragment is assigned // test that an error is logged if initialize is called before a spec as assigned testing::internal::CaptureStderr(); @@ -421,4 +429,51 @@ TEST_F(ConditionClassesWithGXFContext, TestPeriodicConditionInitializeWithUnreco << log_output << "\n===========\n"; } +TEST(ConditionClasses, TestCudaBufferAvailableCondition) { + Fragment F; + const std::string name{"cuda-buffer-available-condition"}; + auto condition = F.make_condition(name); + EXPECT_EQ(condition->name(), name); + EXPECT_EQ(typeid(condition), typeid(std::make_shared())); + EXPECT_EQ(std::string(condition->gxf_typename()), + "nvidia::gxf::CudaBufferAvailableSchedulingTerm"s); + EXPECT_TRUE(condition->description().find("name: " + name) != std::string::npos); +} + +TEST(ConditionClasses, TestCudaBufferAvailableConditionDefaultConstructor) { + Fragment F; + auto condition = F.make_condition(); +} + +TEST(ConditionClasses, TestCudaStreamCondition) { + Fragment F; + const std::string name{"cuda-stream-condition"}; + auto condition = F.make_condition(name); + EXPECT_EQ(condition->name(), name); + EXPECT_EQ(typeid(condition), typeid(std::make_shared())); + EXPECT_EQ(std::string(condition->gxf_typename()), "nvidia::gxf::CudaStreamSchedulingTerm"s); + EXPECT_TRUE(condition->description().find("name: " + name) != std::string::npos); +} + +TEST(ConditionClasses, TestCudaStreamConditionDefaultConstructor) { + Fragment F; + auto condition = F.make_condition(); +} + +TEST(ConditionClasses, TestCudaEventCondition) { + Fragment F; + const std::string name{"cuda-event-condition"}; + const std::string event_name{"cuda-event"}; + auto condition = F.make_condition(name, Arg{"event_name", event_name}); + EXPECT_EQ(condition->name(), name); + EXPECT_EQ(typeid(condition), typeid(std::make_shared())); + EXPECT_EQ(std::string(condition->gxf_typename()), "nvidia::gxf::CudaEventSchedulingTerm"s); + EXPECT_TRUE(condition->description().find("name: " + name) != std::string::npos); +} + +TEST(ConditionClasses, TestCudaEventConditionDefaultConstructor) { + Fragment F; + auto condition = F.make_condition(); +} + } // namespace holoscan diff --git a/tests/core/fragment.cpp b/tests/core/fragment.cpp index 4e8f310f..bb0c9884 100644 --- a/tests/core/fragment.cpp +++ b/tests/core/fragment.cpp @@ -75,7 +75,7 @@ TEST(Fragment, TestFragmentAssignApplication) { delete A; } -// TODO: how to properly specify path to the config file here +// TODO(unknown): how to properly specify path to the config file here // ? 
maybe define a path in the CMake config that we can reference here? TEST(Fragment, TestFragmentConfig) { diff --git a/tests/core/fragment_allocation.cpp b/tests/core/fragment_allocation.cpp index acb6580d..b73f8064 100644 --- a/tests/core/fragment_allocation.cpp +++ b/tests/core/fragment_allocation.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -36,10 +36,10 @@ namespace holoscan { // struct SystemResourceRequirement { // std::string fragment_name; -// float cpu = -1.0f; -// float cpu_limit = -1.0f; -// float gpu = -1.0f; -// float gpu_limit = -1.0f; +// float cpu = -1.0F; +// float cpu_limit = -1.0F; +// float gpu = -1.0F; +// float gpu_limit = -1.0F; // uint64_t memory = 0; // uint64_t memory_limit = 0; // uint64_t shared_memory = 0; diff --git a/tests/core/io_spec.cpp b/tests/core/io_spec.cpp index d4117740..faef146b 100644 --- a/tests/core/io_spec.cpp +++ b/tests/core/io_spec.cpp @@ -210,13 +210,13 @@ TEST(IOSpec, TestIOSpecConnectorUcxReceiver) { ASSERT_TRUE(spec.connector() != nullptr); EXPECT_EQ(typeid(spec.connector()), typeid(std::make_shared())); auto receiver = std::dynamic_pointer_cast(spec.connector()); - EXPECT_EQ(std::string(receiver->gxf_typename()), std::string("nvidia::gxf::UcxReceiver")); + EXPECT_EQ(std::string(receiver->gxf_typename()), std::string("holoscan::HoloscanUcxReceiver")); // two arguments spec.connector(IOSpec::ConnectorType::kUCX, Arg("capacity", 2), Arg("policy", 1)); EXPECT_EQ(spec.connector_type(), IOSpec::ConnectorType::kUCX); receiver = std::dynamic_pointer_cast(spec.connector()); - EXPECT_EQ(std::string(receiver->gxf_typename()), std::string("nvidia::gxf::UcxReceiver")); + EXPECT_EQ(std::string(receiver->gxf_typename()), std::string("holoscan::HoloscanUcxReceiver")); // arglist spec.connector(IOSpec::ConnectorType::kUCX, @@ -226,7 +226,7 @@ TEST(IOSpec, TestIOSpecConnectorUcxReceiver) { Arg("port", static_cast(13337))}); EXPECT_EQ(spec.connector_type(), IOSpec::ConnectorType::kUCX); receiver = std::dynamic_pointer_cast(spec.connector()); - EXPECT_EQ(std::string(receiver->gxf_typename()), std::string("nvidia::gxf::UcxReceiver")); + EXPECT_EQ(std::string(receiver->gxf_typename()), std::string("holoscan::HoloscanUcxReceiver")); } TEST(IOSpec, TestIOSpecConnectorDoubleBufferTransmitter) { @@ -277,13 +277,15 @@ TEST(IOSpec, TestIOSpecConnectorUcxTransmitter) { ASSERT_TRUE(spec.connector() != nullptr); EXPECT_EQ(typeid(spec.connector()), typeid(std::make_shared())); auto transmitter = std::dynamic_pointer_cast(spec.connector()); - EXPECT_EQ(std::string(transmitter->gxf_typename()), std::string("nvidia::gxf::UcxTransmitter")); + EXPECT_EQ(std::string(transmitter->gxf_typename()), + std::string("holoscan::HoloscanUcxTransmitter")); // two arguments spec.connector(IOSpec::ConnectorType::kUCX, Arg("capacity", 2), Arg("policy", 1)); EXPECT_EQ(spec.connector_type(), IOSpec::ConnectorType::kUCX); transmitter = std::dynamic_pointer_cast(spec.connector()); - EXPECT_EQ(std::string(transmitter->gxf_typename()), std::string("nvidia::gxf::UcxTransmitter")); + EXPECT_EQ(std::string(transmitter->gxf_typename()), + std::string("holoscan::HoloscanUcxTransmitter")); // arglist spec.connector(IOSpec::ConnectorType::kUCX, @@ -295,7 +297,8 @@ TEST(IOSpec, TestIOSpecConnectorUcxTransmitter) { 
Arg("local_port", static_cast(0))}); EXPECT_EQ(spec.connector_type(), IOSpec::ConnectorType::kUCX); transmitter = std::dynamic_pointer_cast(spec.connector()); - EXPECT_EQ(std::string(transmitter->gxf_typename()), std::string("nvidia::gxf::UcxTransmitter")); + EXPECT_EQ(std::string(transmitter->gxf_typename()), + std::string("holoscan::HoloscanUcxTransmitter")); } TEST(IOSpec, TestIOSpecQueueSize) { diff --git a/tests/core/metadata.cpp b/tests/core/metadata.cpp index 4d891570..46eba698 100644 --- a/tests/core/metadata.cpp +++ b/tests/core/metadata.cpp @@ -197,14 +197,14 @@ TEST(MetadataDictionary, TestMetadataPolicy) { d.set("patient name", "Mr. Smith"s); EXPECT_EQ(d.get("patient name"), "John Doe"s); - // update using new metadta object + // update using new metadata object auto shared_obj = d.get("patient name"); d.policy(MetadataPolicy::kUpdate); d.set("patient name", "Mr. Smith"s); EXPECT_EQ(d.get("patient name"), "Mr. Smith"s); EXPECT_EQ(std::any_cast(shared_obj->value()), "John Doe"s); - // update existing metadta object in-place + // update existing metadata object in-place shared_obj = d.get("patient name"); d.policy(MetadataPolicy::kInplaceUpdate); d.set("patient name", "Mr. Nobody"s); diff --git a/tests/core/native_operator.cpp b/tests/core/native_operator.cpp index a92847d3..6b9b0dbc 100644 --- a/tests/core/native_operator.cpp +++ b/tests/core/native_operator.cpp @@ -45,7 +45,8 @@ class PingTxOp : public Operator { spec.output("out2"); } - void compute(InputContext&, OutputContext& op_output, ExecutionContext&) override { + void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto value1 = std::make_shared(1); op_output.emit(value1, "out1"); @@ -64,7 +65,8 @@ class PingRxOp : public Operator { spec.input>>("receivers", IOSpec::kAnySize); } - void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override { + void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto value_vector = op_input.receive>("receivers").value(); HOLOSCAN_LOG_INFO("Rx message received (count: {}, size: {})", count_++, value_vector.size()); diff --git a/tests/core/resource_classes.cpp b/tests/core/resource_classes.cpp index d5c0ab30..03dd1768 100644 --- a/tests/core/resource_classes.cpp +++ b/tests/core/resource_classes.cpp @@ -24,6 +24,7 @@ #include "../config.hpp" #include "../utils.hpp" +#include "common/assert.hpp" #include "holoscan/core/arg.hpp" #include "holoscan/core/component_spec.hpp" #include "holoscan/core/config.hpp" @@ -37,9 +38,11 @@ #include "holoscan/core/resources/gxf/double_buffer_transmitter.hpp" #include "holoscan/core/resources/gxf/manual_clock.hpp" #include "holoscan/core/resources/gxf/realtime_clock.hpp" +#include "holoscan/core/resources/gxf/rmm_allocator.hpp" #include "holoscan/core/resources/gxf/serialization_buffer.hpp" #include "holoscan/core/resources/gxf/std_component_serializer.hpp" #include "holoscan/core/resources/gxf/std_entity_serializer.hpp" +#include "holoscan/core/resources/gxf/stream_ordered_allocator.hpp" #include "holoscan/core/resources/gxf/ucx_component_serializer.hpp" #include "holoscan/core/resources/gxf/ucx_entity_serializer.hpp" #include "holoscan/core/resources/gxf/ucx_holoscan_component_serializer.hpp" @@ -47,7 +50,6 @@ #include "holoscan/core/resources/gxf/ucx_serialization_buffer.hpp" #include "holoscan/core/resources/gxf/ucx_transmitter.hpp" #include 
"holoscan/core/resources/gxf/unbounded_allocator.hpp" -#include "common/assert.hpp" using namespace std::string_literals; @@ -93,6 +95,45 @@ TEST_F(ResourceClassesWithGXFContext, TestCudaStreamPoolDefaultConstructor) { auto resource = F.make_resource(); } +TEST_F(ResourceClassesWithGXFContext, TestRMMAllocator) { + const std::string name{"rmm-pool"}; + ArgList arglist{ + Arg{"device_memory_initial_size", std::string{"10MB"}}, + Arg{"device_memory_max_size", std::string{"20MB"}}, + Arg{"host_memory_initial_size", std::string{"10MB"}}, + Arg{"host_memory_max_size", std::string{"20MB"}}, + Arg{"dev_id", static_cast(0)}, + }; + auto resource = F.make_resource(name, arglist); + EXPECT_EQ(resource->name(), name); + EXPECT_EQ(typeid(resource), typeid(std::make_shared(arglist))); + EXPECT_EQ(std::string(resource->gxf_typename()), "nvidia::gxf::RMMAllocator"s); + EXPECT_TRUE(resource->description().find("name: " + name) != std::string::npos); +} + +TEST_F(ResourceClassesWithGXFContext, TestRMMAllocatorDefaultConstructor) { + auto resource = F.make_resource(); +} + +TEST_F(ResourceClassesWithGXFContext, TestStreamOrderedAllocator) { + const std::string name{"rmm-pool"}; + ArgList arglist{ + Arg{"device_memory_initial_size", std::string{"10MB"}}, + Arg{"device_memory_max_size", std::string{"20MB"}}, + Arg{"release_threadhold", std::string{"0B"}}, + Arg{"dev_id", static_cast(0)}, + }; + auto resource = F.make_resource(name, arglist); + EXPECT_EQ(resource->name(), name); + EXPECT_EQ(typeid(resource), typeid(std::make_shared(arglist))); + EXPECT_EQ(std::string(resource->gxf_typename()), "nvidia::gxf::StreamOrderedAllocator"s); + EXPECT_TRUE(resource->description().find("name: " + name) != std::string::npos); +} + +TEST_F(ResourceClassesWithGXFContext, TestStreamOrderedAllocatorDefaultConstructor) { + auto resource = F.make_resource(); +} + TEST_F(ResourceClassesWithGXFContext, TestDoubleBufferReceiver) { const std::string name{"receiver"}; ArgList arglist{ @@ -380,7 +421,7 @@ TEST_F(ResourceClassesWithGXFContext, TestUcxReceiver) { auto resource = F.make_resource(name, arglist); EXPECT_EQ(resource->name(), name); EXPECT_EQ(typeid(resource), typeid(std::make_shared(arglist))); - EXPECT_EQ(std::string(resource->gxf_typename()), "nvidia::gxf::UcxReceiver"s); + EXPECT_EQ(std::string(resource->gxf_typename()), "holoscan::HoloscanUcxReceiver"s); EXPECT_TRUE(resource->description().find("name: " + name) != std::string::npos); } @@ -409,7 +450,7 @@ TEST_F(ResourceClassesWithGXFContext, TestUcxTransmitter) { auto resource = F.make_resource(name, arglist); EXPECT_EQ(resource->name(), name); EXPECT_EQ(typeid(resource), typeid(std::make_shared(arglist))); - EXPECT_EQ(std::string(resource->gxf_typename()), "nvidia::gxf::UcxTransmitter"s); + EXPECT_EQ(std::string(resource->gxf_typename()), "holoscan::HoloscanUcxTransmitter"s); EXPECT_TRUE(resource->description().find("name: " + name) != std::string::npos); } diff --git a/tests/data/validation_frames/video_replayer/cpp_video_replayer.patch b/tests/data/validation_frames/video_replayer/cpp_video_replayer.patch index 6288ab36..2b63974e 100644 --- a/tests/data/validation_frames/video_replayer/cpp_video_replayer.patch +++ b/tests/data/validation_frames/video_replayer/cpp_video_replayer.patch @@ -13,7 +13,7 @@ class VideoReplayerApp : public holoscan::Application { public: void compose() override { -@@ -48,6 +54,9 @@ class VideoReplayerApp : public holoscan::Application { +@@ -52,6 +58,9 @@ class VideoReplayerApp : public holoscan::Application { auto visualizer2 = 
make_operator("holoviz2", from_config("holoviz")); add_flow(replayer, visualizer2, {{"output", "receivers"}}); } diff --git a/tests/data/validation_frames/video_replayer/python_video_replayer.patch b/tests/data/validation_frames/video_replayer/python_video_replayer.patch index b5e2a0cf..816a1083 100644 --- a/tests/data/validation_frames/video_replayer/python_video_replayer.patch +++ b/tests/data/validation_frames/video_replayer/python_video_replayer.patch @@ -1,37 +1,42 @@ --- ../examples/video_replayer/python/video_replayer.py 2023-11-16 01:35:53.593301380 +0000 +++ examples/video_replayer/python/video_replayer_test.py 2023-11-20 03:25:57.853150634 +0000 -@@ -19,7 +19,8 @@ - import sys +@@ -18,8 +18,13 @@ limitations under the License. + import os from holoscan.core import Application -from holoscan.operators import HolovizOp, VideoStreamReplayerOp -+from holoscan.operators import HolovizOp, VideoStreamReplayerOp, VideoStreamRecorderOp, FormatConverterOp -+from holoscan.resources import UnboundedAllocator +-from holoscan.resources import RMMAllocator ++from holoscan.operators import ( ++ FormatConverterOp, ++ HolovizOp, ++ VideoStreamRecorderOp, ++ VideoStreamReplayerOp, ++) ++from holoscan.resources import RMMAllocator, UnboundedAllocator sample_data_path = os.environ.get("HOLOSCAN_INPUT_PATH", "../data") -@@ -50,6 +51,24 @@ +@@ -58,6 +63,23 @@ class VideoReplayerApp(Application): # Define the workflow self.add_flow(replayer, visualizer, {("output", "receivers")}) + recorder_format_converter = FormatConverterOp( -+ self, -+ name="recorder_format_converter", ++ self, ++ name="recorder_format_converter", + in_dtype="rgba8888", + out_dtype="rgb888", -+ pool=UnboundedAllocator(self, name="pool") -+ ) -+ recorder = VideoStreamRecorderOp( -+ self, -+ name="recorder", -+ **self.kwargs("recorder") ++ pool=UnboundedAllocator(self, name="pool"), + ) + ++ recorder = VideoStreamRecorderOp(self, name="recorder", **self.kwargs("recorder")) ++ + visualizer.add_arg(allocator=UnboundedAllocator(self, name="allocator")) + -+ self.add_flow(visualizer, recorder_format_converter, {("render_buffer_output", "source_video")}) ++ self.add_flow( ++ visualizer, recorder_format_converter, {("render_buffer_output", "source_video")} ++ ) + self.add_flow(recorder_format_converter, recorder) + - - if __name__ == "__main__": - config_file = os.path.join(os.path.dirname(__file__), "video_replayer.yaml") + # Check if the YAML dual_window parameter is set and add a second visualizer in that case + dual_window = self.kwargs("dual_window").get("dual_window", False) + if dual_window: \ No newline at end of file diff --git a/tests/flow_tracking/entity_passthrough.cpp b/tests/flow_tracking/entity_passthrough.cpp index 7a7cd2e5..c69d0764 100644 --- a/tests/flow_tracking/entity_passthrough.cpp +++ b/tests/flow_tracking/entity_passthrough.cpp @@ -59,7 +59,8 @@ class OneOutOp : public Operator { void setup(OperatorSpec& spec) override { spec.output("out"); } - void compute(InputContext&, OutputContext& op_output, ExecutionContext& context) override { + void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, + ExecutionContext& context) override { auto out_message = gxf::Entity::New(&context); op_output.emit(out_message); diff --git a/tests/flow_tracking/flow_tracking_cycle.cpp b/tests/flow_tracking/flow_tracking_cycle.cpp index 04523617..9beb81db 100644 --- a/tests/flow_tracking/flow_tracking_cycle.cpp +++ b/tests/flow_tracking/flow_tracking_cycle.cpp @@ -59,7 +59,8 @@ class OneOutOp : public Operator { void 
setup(OperatorSpec& spec) override { spec.output("out"); } - void compute(InputContext&, OutputContext& op_output, ExecutionContext& context) override { + void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, + ExecutionContext& context) override { auto out_message = gxf::Entity::New(&context); op_output.emit(out_message); diff --git a/tests/holoinfer/inference/test_core.cpp b/tests/holoinfer/inference/test_core.cpp index 754c2436..28372fb0 100644 --- a/tests/holoinfer/inference/test_core.cpp +++ b/tests/holoinfer/inference/test_core.cpp @@ -118,6 +118,7 @@ void HoloInferTests::setup_specifications() { device_map, temporal_map, activation_map, + batch_sizes, is_engine_path, infer_on_cpu, parallel_inference, @@ -151,8 +152,8 @@ HoloInfer::InferStatus HoloInferTests::prepare_for_inference() { size_t buffer_size = std::accumulate(td.second.begin(), td.second.end(), 1, std::multiplies()); - db->device_buffer->resize(buffer_size); - db->host_buffer.resize(buffer_size); + db->device_buffer_->resize(buffer_size); + db->host_buffer_->resize(buffer_size); inference_specs_->data_per_tensor_.insert({td.first, std::move(db)}); } diff --git a/tests/holoinfer/inference/test_core.hpp b/tests/holoinfer/inference/test_core.hpp index 50e94124..9e6533c6 100644 --- a/tests/holoinfer/inference/test_core.hpp +++ b/tests/holoinfer/inference/test_core.hpp @@ -62,6 +62,7 @@ class HoloInferTests { std::vector out_tensor_names = {"m1_infer", "m2_infer"}; std::string model_folder = "../tests/holoinfer/test_models/"; + std::vector batch_sizes = {1, 1, 1}; std::map model_path_map = { {"model_1", model_folder + "identity_model.onnx"}, @@ -172,7 +173,8 @@ class HoloInferTests { {30, "TRT backend, Parallel inference on multi-GPU with Input on host"}, {31, "TRT backend, Parallel inference on multi-GPU with Output on host"}, {32, "TRT backend, multi rank test (rank 5)"}, - {33, "TRT backend, multi rank test (rank 9)"}}; + {33, "TRT backend, multi rank test (rank 9)"}, + {34, "Torch backend, Basic inference"}}; }; #endif /* HOLOINFER_INFERENCE_TESTS_HPP */ diff --git a/tests/holoinfer/inference/test_inference.cpp b/tests/holoinfer/inference/test_inference.cpp index b37a9aec..21fbaa79 100644 --- a/tests/holoinfer/inference/test_inference.cpp +++ b/tests/holoinfer/inference/test_inference.cpp @@ -59,41 +59,41 @@ void HoloInferTests::inference_tests() { inference_specs_->output_per_model_.insert({"m2_infer", dm}); // Test: TRT backend, Empty input cuda buffer 1 - auto dbs = inference_specs_->data_per_tensor_.at("m1_pre_proc")->device_buffer->size(); - inference_specs_->data_per_tensor_.at("m1_pre_proc")->device_buffer->resize(0); - inference_specs_->data_per_tensor_.at("m1_pre_proc")->device_buffer = nullptr; + auto dbs = inference_specs_->data_per_tensor_.at("m1_pre_proc")->device_buffer_->size(); + inference_specs_->data_per_tensor_.at("m1_pre_proc")->device_buffer_->resize(0); + inference_specs_->data_per_tensor_.at("m1_pre_proc")->device_buffer_ = nullptr; status = do_inference(); holoinfer_assert( status, test_module, 5, test_identifier_infer.at(5), HoloInfer::holoinfer_code::H_ERROR); - inference_specs_->data_per_tensor_.at("m1_pre_proc")->device_buffer = + inference_specs_->data_per_tensor_.at("m1_pre_proc")->device_buffer_ = std::make_shared(); // Test: TRT backend, Empty input cuda buffer 2 status = do_inference(); holoinfer_assert( status, test_module, 6, test_identifier_infer.at(6), HoloInfer::holoinfer_code::H_ERROR); - 
inference_specs_->data_per_tensor_.at("m1_pre_proc")->device_buffer->resize(dbs); + inference_specs_->data_per_tensor_.at("m1_pre_proc")->device_buffer_->resize(dbs); // Test: TRT backend, Empty output cuda buffer 1 - dbs = inference_specs_->output_per_model_.at("m2_infer")->device_buffer->size(); - inference_specs_->output_per_model_.at("m2_infer")->device_buffer->resize(0); + dbs = inference_specs_->output_per_model_.at("m2_infer")->device_buffer_->size(); + inference_specs_->output_per_model_.at("m2_infer")->device_buffer_->resize(0); status = do_inference(); holoinfer_assert( status, test_module, 7, test_identifier_infer.at(7), HoloInfer::holoinfer_code::H_ERROR); // Test: TRT backend, Empty output cuda buffer 2 - inference_specs_->output_per_model_.at("m2_infer")->device_buffer = nullptr; + inference_specs_->output_per_model_.at("m2_infer")->device_buffer_ = nullptr; status = do_inference(); holoinfer_assert( status, test_module, 8, test_identifier_infer.at(8), HoloInfer::holoinfer_code::H_ERROR); - inference_specs_->output_per_model_.at("m2_infer")->device_buffer = + inference_specs_->output_per_model_.at("m2_infer")->device_buffer_ = std::make_shared(); // Test: TRT backend, Empty output cuda buffer 3 status = do_inference(); holoinfer_assert( status, test_module, 9, test_identifier_infer.at(9), HoloInfer::holoinfer_code::H_ERROR); - inference_specs_->output_per_model_.at("m2_infer")->device_buffer->resize(dbs); + inference_specs_->output_per_model_.at("m2_infer")->device_buffer_->resize(dbs); // Test: TRT backend, Basic end-to-end cuda inference status = do_inference(); @@ -133,20 +133,20 @@ void HoloInferTests::inference_tests() { // Test: TRT backend, Empty host input size_t re_dbs = 0; - dbs = inference_specs_->data_per_tensor_.at("m1_pre_proc")->host_buffer.size(); - inference_specs_->data_per_tensor_.at("m1_pre_proc")->host_buffer.resize(re_dbs); + dbs = inference_specs_->data_per_tensor_.at("m1_pre_proc")->host_buffer_->size(); + inference_specs_->data_per_tensor_.at("m1_pre_proc")->host_buffer_->resize(re_dbs); status = do_inference(); holoinfer_assert( status, test_module, 15, test_identifier_infer.at(15), HoloInfer::holoinfer_code::H_ERROR); - inference_specs_->data_per_tensor_.at("m1_pre_proc")->host_buffer.resize(dbs); + inference_specs_->data_per_tensor_.at("m1_pre_proc")->host_buffer_->resize(dbs); // Test: TRT backend, Empty host output - dbs = inference_specs_->output_per_model_.at("m2_infer")->host_buffer.size(); - inference_specs_->output_per_model_.at("m2_infer")->host_buffer.resize(re_dbs); + dbs = inference_specs_->output_per_model_.at("m2_infer")->host_buffer_->size(); + inference_specs_->output_per_model_.at("m2_infer")->host_buffer_->resize(re_dbs); status = do_inference(); holoinfer_assert( status, test_module, 16, test_identifier_infer.at(16), HoloInfer::holoinfer_code::H_ERROR); - inference_specs_->output_per_model_.at("m2_infer")->host_buffer.resize(dbs); + inference_specs_->output_per_model_.at("m2_infer")->host_buffer_->resize(dbs); if (use_onnxruntime) { // Test: ONNX backend, Basic parallel inference on CPU @@ -194,26 +194,26 @@ void HoloInferTests::inference_tests() { HoloInfer::holoinfer_code::H_SUCCESS); // Test: ONNX backend, Empty host input - dbs = inference_specs_->data_per_tensor_.at("m1_pre_proc")->host_buffer.size(); - inference_specs_->data_per_tensor_.at("m1_pre_proc")->host_buffer.resize(0); + dbs = inference_specs_->data_per_tensor_.at("m1_pre_proc")->host_buffer_->size(); + 
inference_specs_->data_per_tensor_.at("m1_pre_proc")->host_buffer_->resize(0); status = do_inference(); holoinfer_assert(status, test_module, 21, test_identifier_infer.at(21), HoloInfer::holoinfer_code::H_ERROR); - inference_specs_->data_per_tensor_.at("m1_pre_proc")->host_buffer.resize(dbs); + inference_specs_->data_per_tensor_.at("m1_pre_proc")->host_buffer_->resize(dbs); // Test: ONNX backend, Empty host output - dbs = inference_specs_->output_per_model_.at("m2_infer")->host_buffer.size(); - inference_specs_->output_per_model_.at("m2_infer")->host_buffer.resize(0); + dbs = inference_specs_->output_per_model_.at("m2_infer")->host_buffer_->size(); + inference_specs_->output_per_model_.at("m2_infer")->host_buffer_->resize(0); status = do_inference(); holoinfer_assert(status, test_module, 22, test_identifier_infer.at(22), HoloInfer::holoinfer_code::H_ERROR); - inference_specs_->output_per_model_.at("m2_infer")->host_buffer.resize(dbs); + inference_specs_->output_per_model_.at("m2_infer")->host_buffer_->resize(dbs); } else { // Test: ONNX backend on ARM, Basic sequential inference on GPU infer_on_cpu = false; @@ -287,6 +287,10 @@ void HoloInferTests::inference_tests() { 31, test_identifier_infer.at(31), HoloInfer::holoinfer_code::H_SUCCESS); + } else { + // make sure the last error is reset, else Torch tests below will fail since they check for + // the last error without doing a CUDA call before. + cudaGetLastError(); } device_map.at("model_1") = "0"; @@ -360,6 +364,48 @@ void HoloInferTests::inference_tests() { in_tensor_dimensions["m2_pre_proc"] = original_dim; } + if (use_torch) { + // Test: torch backend, Basic inference + backend = "torch"; + + auto backup_path_map = std::move(model_path_map); + auto backup_pre_map = std::move(pre_processor_map); + auto backup_infer_map = std::move(inference_map); + auto backup_in_tensor_dimensions = std::move(in_tensor_dimensions); + auto backup_device_map = std::move(device_map); + + model_path_map = {{"test_model", model_folder + "identity_model.pt"}}; + pre_processor_map = {{"test_model", {"input"}}}; + inference_map = {{"test_model", {"output"}}}; + in_tensor_dimensions = {{"input", {3, 10, 10}}}; + device_map = {}; + + YAML::Node torch_inference; + torch_inference["inference"]["input_nodes"]["input"]["dtype"] = "kFloat32"; + torch_inference["inference"]["input_nodes"]["input"]["dim"] = "3 10 10"; + torch_inference["inference"]["output_nodes"]["output"]["dtype"] = "kFloat32"; + + std::ofstream torch_config_file(model_folder + "identity_model.yaml"); + torch_config_file << torch_inference; + torch_config_file.close(); + + status = prepare_for_inference(); + status = do_inference(); + holoinfer_assert(status, + test_module, + 34, + test_identifier_infer.at(34), + HoloInfer::holoinfer_code::H_SUCCESS); + + // Restore all changes to previous state + std::filesystem::remove(model_folder + "identity_model.yaml"); + model_path_map = std::move(backup_path_map); + pre_processor_map = std::move(backup_pre_map); + inference_map = std::move(backup_infer_map); + in_tensor_dimensions = std::move(backup_in_tensor_dimensions); + device_map = std::move(backup_device_map); + } + // cleaning engine files for (const auto& file : std::filesystem::directory_iterator(model_folder)) { if (file.is_regular_file()) { diff --git a/tests/holoinfer/inference/test_parameters.cpp b/tests/holoinfer/inference/test_parameters.cpp index eed82b03..3cf19fdd 100644 --- a/tests/holoinfer/inference/test_parameters.cpp +++ b/tests/holoinfer/inference/test_parameters.cpp @@ -181,7 +181,7 
@@ void HoloInferTests::parameter_setup_test() { auto backup_infer_map = std::move(inference_map); auto backup_device_map = std::move(device_map); - model_path_map = {{"test_model", model_folder + "identity_model.pt"}}; + model_path_map = {{"test_model", model_folder + "model.pt"}}; pre_processor_map = {{"test_model", {"input_"}}}; inference_map = {{"test_model", {"output_"}}}; device_map = {}; @@ -193,14 +193,14 @@ void HoloInferTests::parameter_setup_test() { status, test_module, 25, test_identifier_params.at(25), HoloInfer::holoinfer_code::H_ERROR); // Test: Torch backend, Config file missing - std::filesystem::rename(model_folder + "identity_model.onnx", model_folder + "identity_model.pt"); + std::filesystem::rename(model_folder + "identity_model.pt", model_folder + "model.pt"); status = create_specifications(); clear_specs(); holoinfer_assert( status, test_module, 26, test_identifier_params.at(26), HoloInfer::holoinfer_code::H_ERROR); // Test: Torch backend, Inference node missing in Config file - std::ofstream torch_config_file(model_folder + "identity_model.yaml"); + std::ofstream torch_config_file(model_folder + "model.yaml"); status = create_specifications(); clear_specs(); holoinfer_assert( @@ -219,7 +219,7 @@ void HoloInferTests::parameter_setup_test() { status, test_module, 28, test_identifier_params.at(28), HoloInfer::holoinfer_code::H_ERROR); // Test: Torch backend, dtype missing in input node in Config file - torch_config_file.open(model_folder + "identity_model.yaml", std::ofstream::trunc); + torch_config_file.open(model_folder + "model.yaml", std::ofstream::trunc); torch_inference["inference"]["input_nodes"]["input"]["id"] = "1"; std::cout << torch_inference << std::endl; torch_config_file << torch_inference; @@ -230,7 +230,7 @@ void HoloInferTests::parameter_setup_test() { status, test_module, 29, test_identifier_params.at(29), HoloInfer::holoinfer_code::H_ERROR); // Test: Torch backend, Incorrect dtype in config file - torch_config_file.open(model_folder + "identity_model.yaml", std::ofstream::trunc); + torch_config_file.open(model_folder + "model.yaml", std::ofstream::trunc); torch_inference["inference"]["input_nodes"]["input"]["dtype"] = "float"; torch_config_file << torch_inference; torch_config_file.close(); @@ -240,7 +240,7 @@ void HoloInferTests::parameter_setup_test() { status, test_module, 30, test_identifier_params.at(30), HoloInfer::holoinfer_code::H_ERROR); // Test: Torch backend, Output node missing in config file correct - torch_config_file.open(model_folder + "identity_model.yaml", std::ofstream::trunc); + torch_config_file.open(model_folder + "model.yaml", std::ofstream::trunc); torch_inference["inference"]["input_nodes"]["input"]["dtype"] = "kFloat32"; torch_config_file << torch_inference; torch_config_file.close(); @@ -250,8 +250,8 @@ void HoloInferTests::parameter_setup_test() { status, test_module, 31, test_identifier_params.at(31), HoloInfer::holoinfer_code::H_ERROR); // Restore all changes to previous state - std::filesystem::remove(model_folder + "identity_model.yaml"); - std::filesystem::rename(model_folder + "identity_model.pt", model_folder + "identity_model.onnx"); + std::filesystem::rename(model_folder + "model.pt", model_folder + "identity_model.pt"); + std::filesystem::remove(model_folder + "model.yaml"); model_path_map = std::move(backup_path_map); pre_processor_map = std::move(backup_pre_map); inference_map = std::move(backup_infer_map); diff --git a/tests/holoinfer/processing/test_core.cpp b/tests/holoinfer/processing/test_core.cpp index 
594dd112..18d4c737 100644 --- a/tests/holoinfer/processing/test_core.cpp +++ b/tests/holoinfer/processing/test_core.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -65,7 +65,11 @@ HoloInfer::InferStatus ProcessingTests::setup_processor() { } HoloInfer::InferStatus ProcessingTests::execute_processor() { - auto status = holoscan_processor_context_->process( - process_operations, processed_map, data_per_tensor, dims_per_tensor); + auto status = holoscan_processor_context_->process(process_operations, + processed_map, + data_per_tensor, + dims_per_tensor, + process_with_cuda, + cuda_stream); return status; } diff --git a/tests/holoinfer/processing/test_core.hpp b/tests/holoinfer/processing/test_core.hpp index a7208b0a..766049eb 100644 --- a/tests/holoinfer/processing/test_core.hpp +++ b/tests/holoinfer/processing/test_core.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -60,6 +60,8 @@ class ProcessingTests { std::unique_ptr holoscan_processor_context_; HoloInfer::DataMap data_per_tensor; std::map> dims_per_tensor; + bool process_with_cuda = false; + cudaStream_t cuda_stream = 0; std::string config_path = ""; const std::map test_identifier_process = { diff --git a/tests/holoinfer/test_models/create_torch_identity_model.py b/tests/holoinfer/test_models/create_torch_identity_model.py new file mode 100644 index 00000000..5c1bd657 --- /dev/null +++ b/tests/holoinfer/test_models/create_torch_identity_model.py @@ -0,0 +1,32 @@ +""" +SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +SPDX-License-Identifier: Apache-2.0 + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" # noqa: E501 + +import torch + + +class IdentityModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x: list[torch.Tensor]): + # For rcnn, output in (losses, detections) + return (0, [{"output": torch.zeros(1, dtype=torch.float32, device=x[0].device)}]) + + +module = IdentityModule() +script_module = torch.jit.script(module) +script_module.save("identity_model.pt") diff --git a/tests/holoinfer/test_models/identity_model.pt b/tests/holoinfer/test_models/identity_model.pt new file mode 100644 index 00000000..08b8657d Binary files /dev/null and b/tests/holoinfer/test_models/identity_model.pt differ diff --git a/tests/operators/operator_classes.cpp b/tests/operators/operator_classes.cpp index e604fc05..f378bc85 100644 --- a/tests/operators/operator_classes.cpp +++ b/tests/operators/operator_classes.cpp @@ -127,8 +127,8 @@ TEST_F(OperatorClassesWithGXFContext, TestFormatConverterOp) { Arg{"in_dtype", ""s}, Arg{"out_tensor_name", "out"s}, Arg{"out_dtype", "float32"s}, - Arg{"scale_min", 0.f}, - Arg{"scale_max", 1.f}, + Arg{"scale_min", 0.F}, + Arg{"scale_max", 1.F}, Arg{"alpha_value", static_cast(255)}, Arg{"resize_width", 0}, Arg{"resize_height", 0}, @@ -178,7 +178,7 @@ TEST_F(OperatorClassesWithGXFContext, TestVideoStreamReplayerOp) { Arg{"basename", "racerx"s}, Arg{"batch_size", static_cast(1UL)}, Arg{"ignore_corrupted_entities", true}, - Arg{"frame_rate", 0.f}, + Arg{"frame_rate", 0.F}, Arg{"realtime", true}, Arg{"repeat", false}, }; @@ -221,13 +221,13 @@ TEST_F(OperatorClassesWithGXFContext, TestHolovizOp) { ArgList kwargs = F.from_config("holoviz"); std::vector> color_lut = { - {0.65f, 0.81f, 0.89f, 0.1f}, - {0.2f, 0.63f, 0.17f, 0.7f}, - {0.98f, 0.6f, 0.6f, 0.7f}, - {0.89f, 0.1f, 0.11f, 0.7f}, - {0.99f, 0.75f, 0.44f, 0.7f}, - {1.0f, 0.5f, 0.0f, 0.7f}, - {0.0f, 0.0f, 0.0f, 0.1f}, + {0.65F, 0.81F, 0.89F, 0.1F}, + {0.2F, 0.63F, 0.17F, 0.7F}, + {0.98F, 0.6F, 0.6F, 0.7F}, + {0.89F, 0.1F, 0.11F, 0.7F}, + {0.99F, 0.75F, 0.44F, 0.7F}, + {1.0F, 0.5F, 0.0F, 0.7F}, + {0.0F, 0.0F, 0.0F, 0.1F}, }; kwargs.add(Arg{"color_lut", color_lut}); diff --git a/tests/stress/ping_multi_port_test.cpp b/tests/stress/ping_multi_port_test.cpp index 77177427..25612233 100644 --- a/tests/stress/ping_multi_port_test.cpp +++ b/tests/stress/ping_multi_port_test.cpp @@ -28,7 +28,11 @@ class ValueData { explicit ValueData(int value) : data_(value) { HOLOSCAN_LOG_TRACE("ValueData::ValueData(): {}", data_); } - ~ValueData() { HOLOSCAN_LOG_TRACE("ValueData::~ValueData(): {}", data_); } + ~ValueData() { + try { + HOLOSCAN_LOG_TRACE("ValueData::~ValueData(): {}", data_); + } catch (const std::exception& e) {} + } void data(int value) { data_ = value; } @@ -51,7 +55,8 @@ class PingTxOp : public Operator { spec.output>("out2"); } - void compute(InputContext&, OutputContext& op_output, ExecutionContext&) override { + void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto value1 = std::make_shared(index_++); op_output.emit(value1, "out1"); @@ -75,7 +80,8 @@ class PingMxOp : public Operator { spec.param(multiplier_, "multiplier", "Multiplier", "Multiply the input by this value", 2); } - void compute(InputContext& op_input, OutputContext& op_output, ExecutionContext&) override { + void compute(InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto value1 = op_input.receive>("in1").value(); auto value2 = op_input.receive>("in2").value(); @@ -107,7 +113,8 
@@ class PingRxOp : public Operator { spec.input>>("receivers", IOSpec::kAnySize); } - void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override { + void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto value_vector = op_input.receive>>("receivers").value(); diff --git a/tests/system/cycle.cpp b/tests/system/cycle.cpp index 37fd7939..ed19166c 100644 --- a/tests/system/cycle.cpp +++ b/tests/system/cycle.cpp @@ -59,7 +59,8 @@ class OneOutOp : public Operator { void setup(OperatorSpec& spec) override { spec.output("out"); } - void compute(InputContext&, OutputContext& op_output, ExecutionContext& context) override { + void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, + ExecutionContext& context) override { auto out_message = gxf::Entity::New(&context); op_output.emit(out_message); diff --git a/tests/system/demosaic_op_app.cpp b/tests/system/demosaic_op_app.cpp index a8c005f2..9721241e 100644 --- a/tests/system/demosaic_op_app.cpp +++ b/tests/system/demosaic_op_app.cpp @@ -45,12 +45,11 @@ class DummyDemosaicApp : public holoscan::Application { auto cuda_stream_pool = make_resource("cuda_stream", 0, 0, 0, 1, 5); if (explicit_stream_pool_init_) { cuda_stream_pool->initialize(); } - bool generate_alpha = false; - int32_t out_channels = generate_alpha ? 4 : 3; + int32_t out_channels = 3; ArgList demosaic_arglist = ArgList{ Arg("in_tensor_name", tensor_name), Arg("out_tensor_name", tensor_name), - Arg("generate_alpha", generate_alpha), + Arg("generate_alpha", false), Arg("bayer_grid_pos", 2), Arg("interpolation_mode", 0), Arg("pool", make_resource("pool", 1, rows_ * columns_ * out_channels, 2)), diff --git a/tests/system/distributed/distributed_app.cpp b/tests/system/distributed/distributed_app.cpp index 4bdae263..445a6c80 100644 --- a/tests/system/distributed/distributed_app.cpp +++ b/tests/system/distributed/distributed_app.cpp @@ -24,7 +24,7 @@ #include -#include "../env_wrapper.hpp" +#include "distributed_app_fixture.hpp" #include "utility_apps.hpp" namespace holoscan { @@ -32,13 +32,15 @@ namespace holoscan { // Tests /////////////////////////////////////////////////////////////////////////////// -TEST(DistributedApp, TestTwoParallelFragmentsApp) { +TEST_F(DistributedApp, TestTwoParallelFragmentsApp) { auto app = make_application(); // capture output so that we can check that the expected value is present testing::internal::CaptureStderr(); - app->run(); + try { + app->run(); + } catch (const std::exception& e) { HOLOSCAN_LOG_ERROR("Exception: {}", e.what()); } std::string log_output = testing::internal::GetCapturedStderr(); EXPECT_TRUE(log_output.find("SingleOp fragment1.op: 0 - 10") != std::string::npos) @@ -49,13 +51,15 @@ TEST(DistributedApp, TestTwoParallelFragmentsApp) { << log_output << "\n===========\n"; } -TEST(DistributedApp, TestTwoMultiInputsOutputsFragmentsApp) { +TEST_F(DistributedApp, TestTwoMultiInputsOutputsFragmentsApp) { auto app = make_application(); // capture output so that we can check that the expected value is present testing::internal::CaptureStderr(); - app->run(); + try { + app->run(); + } catch (const std::exception& e) { HOLOSCAN_LOG_ERROR("Exception: {}", e.what()); } std::string log_output = testing::internal::GetCapturedStderr(); EXPECT_TRUE(log_output.find("received count: 10") != std::string::npos) @@ -63,13 +67,15 @@ TEST(DistributedApp, TestTwoMultiInputsOutputsFragmentsApp) { << log_output << "\n===========\n"; } 
-TEST(DistributedApp, TestTwoMultipleSingleOutputOperatorsApp) { +TEST_F(DistributedApp, TestTwoMultipleSingleOutputOperatorsApp) { auto app = make_application(); // capture output so that we can check that the expected value is present testing::internal::CaptureStderr(); - app->run(); + try { + app->run(); + } catch (const std::exception& e) { HOLOSCAN_LOG_ERROR("Exception: {}", e.what()); } std::string log_output = testing::internal::GetCapturedStderr(); EXPECT_TRUE(log_output.find("received count: 10") != std::string::npos) @@ -77,13 +83,15 @@ TEST(DistributedApp, TestTwoMultipleSingleOutputOperatorsApp) { << log_output << "\n===========\n"; } -TEST(DistributedApp, TestTwoMultipleSingleOutputOperatorsBroadcastApp) { +TEST_F(DistributedApp, TestTwoMultipleSingleOutputOperatorsBroadcastApp) { auto app = make_application(); // capture output so that we can check that the expected value is present testing::internal::CaptureStderr(); - app->run(); + try { + app->run(); + } catch (const std::exception& e) { HOLOSCAN_LOG_ERROR("Exception: {}", e.what()); } std::string log_output = testing::internal::GetCapturedStderr(); EXPECT_TRUE(log_output.find("received count: 10") != std::string::npos) @@ -91,13 +99,15 @@ TEST(DistributedApp, TestTwoMultipleSingleOutputOperatorsBroadcastApp) { << log_output << "\n===========\n"; } -TEST(DistributedApp, TestOneTxBroadcastOneRxTwoInputs) { +TEST_F(DistributedApp, TestOneTxBroadcastOneRxTwoInputs) { auto app = make_application(); // capture output so that we can check that the expected value is present testing::internal::CaptureStderr(); - app->run(); + try { + app->run(); + } catch (const std::exception& e) { HOLOSCAN_LOG_ERROR("Exception: {}", e.what()); } std::string log_output = testing::internal::GetCapturedStderr(); EXPECT_TRUE(log_output.find("received count: 10") != std::string::npos) @@ -105,13 +115,15 @@ TEST(DistributedApp, TestOneTxBroadcastOneRxTwoInputs) { << log_output << "\n===========\n"; } -TEST(DistributedApp, TestTwoMultiInputsOutputsFragmentsApp2) { +TEST_F(DistributedApp, TestTwoMultiInputsOutputsFragmentsApp2) { auto app = make_application(); // capture output so that we can check that the expected value is present testing::internal::CaptureStderr(); - app->run(); + try { + app->run(); + } catch (const std::exception& e) { HOLOSCAN_LOG_ERROR("Exception: {}", e.what()); } std::string log_output = testing::internal::GetCapturedStderr(); EXPECT_TRUE(log_output.find("received count: 10") != std::string::npos) @@ -119,13 +131,15 @@ TEST(DistributedApp, TestTwoMultiInputsOutputsFragmentsApp2) { << log_output << "\n===========\n"; } -TEST(DistributedApp, TestUCXConnectionApp) { +TEST_F(DistributedApp, TestUCXConnectionApp) { auto app = make_application(); // capture output so that we can check that the expected value is present testing::internal::CaptureStderr(); - app->run(); + try { + app->run(); + } catch (const std::exception& e) { HOLOSCAN_LOG_ERROR("Exception: {}", e.what()); } std::string log_output = testing::internal::GetCapturedStderr(); EXPECT_TRUE(log_output.find("received count: 10") != std::string::npos) @@ -133,13 +147,15 @@ TEST(DistributedApp, TestUCXConnectionApp) { << log_output << "\n===========\n"; } -TEST(DistributedApp, TestUCXConnectionApp2) { +TEST_F(DistributedApp, TestUCXConnectionApp2) { auto app = make_application(); // capture output so that we can check that the expected value is present testing::internal::CaptureStderr(); - app->run(); + try { + app->run(); + } catch (const std::exception& e) { 
HOLOSCAN_LOG_ERROR("Exception: {}", e.what()); } std::string log_output = testing::internal::GetCapturedStderr(); EXPECT_TRUE(log_output.find("received count: 10") != std::string::npos) @@ -147,12 +163,14 @@ << log_output << "\n===========\n"; } -TEST(DistributedApp, TestUCXLinearPipelineApp) { +TEST_F(DistributedApp, TestUCXLinearPipelineApp) { auto app = make_application(); testing::internal::CaptureStderr(); - app->run(); + try { + app->run(); + } catch (const std::exception& e) { HOLOSCAN_LOG_ERROR("Exception: {}", e.what()); } std::string log_output = testing::internal::GetCapturedStderr(); EXPECT_TRUE(log_output.find("received count: 20") != std::string::npos) @@ -160,13 +178,15 @@ << log_output << "\n===========\n"; } -TEST(DistributedApp, TestUCXBroadcastApp) { +TEST_F(DistributedApp, TestUCXBroadcastApp) { auto app = make_application(); // capture output so that we can check that the expected value is present testing::internal::CaptureStderr(); - app->run(); + try { + app->run(); + } catch (const std::exception& e) { HOLOSCAN_LOG_ERROR("Exception: {}", e.what()); } std::string log_output = testing::internal::GetCapturedStderr(); EXPECT_TRUE(log_output.find("Rx fragment3.rx message received count: 10") != std::string::npos) @@ -177,15 +197,42 @@ << log_output << "\n===========\n"; } -TEST(DistributedApp, TestUCXBroadCastMultiReceiverApp) { +TEST_F(DistributedApp, TestUCXBroadCastMultiReceiverAppLocal) { + // 'AppDriver::launch_fragments_async()' path will be tested. auto app = make_application(); // capture output so that we can check that the expected value is present testing::internal::CaptureStderr(); - app->run(); + try { + app->run(); + } catch (const std::exception& e) { HOLOSCAN_LOG_ERROR("Exception: {}", e.what()); } std::string log_output = testing::internal::GetCapturedStderr(); + + EXPECT_TRUE(log_output.find("RxParam fragment2.rx message received (count: 10, size: 2)") != + std::string::npos) + << "=== LOG ===\n" + << log_output << "\n===========\n"; + EXPECT_TRUE(log_output.find("Rx fragment4.rx message received count: 10") != std::string::npos) + << "=== LOG ===\n" + << log_output << "\n===========\n"; +} + +TEST_F(DistributedApp, TestUCXBroadCastMultiReceiverAppWorker) { + // With these arguments, this will go through 'AppWorkerServiceImpl::GetAvailablePorts()' path + std::vector args{"app", "--driver", "--worker", "--fragments=all"}; + auto app = make_application(args); + + // capture output so that we can check that the expected value is present + testing::internal::CaptureStderr(); + + try { + app->run(); + } catch (const std::exception& e) { HOLOSCAN_LOG_ERROR("Exception: {}", e.what()); } + + std::string log_output = testing::internal::GetCapturedStderr(); + + EXPECT_TRUE(log_output.find("RxParam fragment2.rx message received (count: 10, size: 2)") != std::string::npos) << "=== LOG ===\n" @@ -195,7 +242,7 @@ << log_output << "\n===========\n"; } -TEST(DistributedApp, TestDriverTerminationWithConnectionFailure) { +TEST_F(DistributedApp, TestDriverTerminationWithConnectionFailure) { const char* env_orig = std::getenv("HOLOSCAN_MAX_CONNECTION_RETRY_COUNT"); // Set retry count to 1 to save time @@ -203,8 +250,8 @@ setenv("HOLOSCAN_MAX_CONNECTION_RETRY_COUNT", new_env_var, 1); // Test that the driver terminates
when both the driver and the worker are started but the - // connection to the driver from the worker fails (wrong IP address such as '22' which is usually - // used for SSH and not bindable so we can safely assume that the connection will fail). + // connection to the driver from the worker fails (wrong IP address such as '22' which is + // usually used for SSH and not bindable so we can safely assume that the connection will fail). // // Note:: This test will hang if the port number 22 is bindable. const std::vector args{ @@ -215,7 +262,9 @@ TEST(DistributedApp, TestDriverTerminationWithConnectionFailure) { // capture output so that we can check that the expected value is present testing::internal::CaptureStderr(); - app->run(); + try { + app->run(); + } catch (const std::exception& e) { HOLOSCAN_LOG_ERROR("Exception: {}", e.what()); } // The driver should terminate after the connection failure (after 1 retry) std::string log_output = testing::internal::GetCapturedStderr(); diff --git a/tests/system/distributed/distributed_app_fixture.hpp b/tests/system/distributed/distributed_app_fixture.hpp new file mode 100644 index 00000000..c58b7baa --- /dev/null +++ b/tests/system/distributed/distributed_app_fixture.hpp @@ -0,0 +1,106 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef SYSTEM_DISTRIBUTED_DISTRIBUTED_APP_FIXTURE_HPP +#define SYSTEM_DISTRIBUTED_DISTRIBUTED_APP_FIXTURE_HPP + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include "holoscan/core/system/network_utils.hpp" + +#include "../env_wrapper.hpp" + +namespace holoscan { + +/////////////////////////////////////////////////////////////////////////////// +// Network Utility Functions +/////////////////////////////////////////////////////////////////////////////// + +static std::vector generate_random_ports(int num_ports, int min_port, int max_port) { + std::vector ports; + std::unordered_set used_ports; + + // Initialize a random number generator with a seed based on the current time + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<> dis(min_port, max_port); + + for (int i = 0; i < num_ports; ++i) { + int port = dis(gen); + while (used_ports.find(port) != used_ports.end()) { port = dis(gen); } + used_ports.insert(port); + ports.push_back(port); + } + return ports; +} + +static bool are_ports_in_vector(const std::vector& ports, const std::vector& vector, + int range = 0) { + for (const auto& port : ports) { + bool found = false; + for (const auto& v : vector) { + if (port >= v && port <= v + range) { + found = true; + break; + } + } + if (!found) { return false; } + } + return true; +} + +} // namespace holoscan + +class DistributedApp : public ::testing::Test { + protected: + void SetUp() override { + using namespace holoscan; + + log_level_orig_ = log_level(); + candidates_ = generate_random_ports(1, 11000, 60000); + HOLOSCAN_LOG_INFO("candidate port: {}", candidates_[0]); + env_var_value_ = fmt::format("{}", candidates_[0]); + + unsetenv("HOLOSCAN_LOG_LEVEL"); + set_log_level(LogLevel::DEBUG); + + wrapper_ = + std::make_unique(std::initializer_list>{ + {"HOLOSCAN_EXECUTOR_LOG_LEVEL", "INFO"}, {"HOLOSCAN_UCX_PORTS", env_var_value_}}); + } + + void TearDown() override { + using namespace holoscan; + set_log_level(log_level_orig_); + } + + holoscan::LogLevel log_level_orig_; + std::vector candidates_; + std::string env_var_value_; + std::unique_ptr wrapper_; +}; + +#endif /* SYSTEM_DISTRIBUTED_DISTRIBUTED_APP_FIXTURE_HPP */ diff --git a/tests/system/distributed/distributed_demosaic_op_app.cpp b/tests/system/distributed/distributed_demosaic_op_app.cpp index 9abe46cd..5b8e503a 100644 --- a/tests/system/distributed/distributed_demosaic_op_app.cpp +++ b/tests/system/distributed/distributed_demosaic_op_app.cpp @@ -27,8 +27,14 @@ #include "holoscan/operators/ping_tensor_rx/ping_tensor_rx.hpp" #include "holoscan/operators/ping_tensor_tx/ping_tensor_tx.hpp" +#include "../env_wrapper.hpp" +#include "distributed_app_fixture.hpp" +#include "utility_apps.hpp" + static HoloscanTestConfig test_config; +class DistributedDemosaicOpApp : public DistributedApp {}; + class GenerateAndDemosaicFragment : public holoscan::Fragment { public: void compose() override { @@ -46,12 +52,11 @@ class GenerateAndDemosaicFragment : public holoscan::Fragment { make_condition(3)); auto cuda_stream_pool = make_resource("cuda_stream", 0, 0, 0, 1, 5); - bool generate_alpha = false; - int32_t out_channels = generate_alpha ? 
4 : 3; + int32_t out_channels = 3; ArgList demosaic_arglist = ArgList{ Arg("in_tensor_name", tensor_name), Arg("out_tensor_name", tensor_name), - Arg("generate_alpha", generate_alpha), + Arg("generate_alpha", false), Arg("bayer_grid_pos", 2), Arg("interpolation_mode", 0), // The pool size is set to 10 to prevent memory allocation errors during testing. @@ -112,7 +117,7 @@ class DistributedDummyDemosaicApp : public holoscan::Application { } }; -TEST(DistributedDemosaicOpApp, TestDistributedDummyDemosaicApp) { +TEST_F(DistributedDemosaicOpApp, TestDistributedDummyDemosaicApp) { using namespace holoscan; auto app = make_application(); @@ -120,7 +125,9 @@ TEST(DistributedDemosaicOpApp, TestDistributedDummyDemosaicApp) { // capture output so that we can check that the expected value is present testing::internal::CaptureStderr(); - app->run(); + try { + app->run(); + } catch (const std::exception& e) { HOLOSCAN_LOG_ERROR("Exception: {}", e.what()); } std::string log_output = testing::internal::GetCapturedStderr(); EXPECT_TRUE(log_output.find("Graph activation failed") == std::string::npos) diff --git a/tests/system/distributed/distributed_gxf_operator_app.cpp b/tests/system/distributed/distributed_gxf_operator_app.cpp index e478c74b..47573446 100644 --- a/tests/system/distributed/distributed_gxf_operator_app.cpp +++ b/tests/system/distributed/distributed_gxf_operator_app.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -26,6 +27,10 @@ #include "receive_tensor_gxf.hpp" +#include "../env_wrapper.hpp" +#include "distributed_app_fixture.hpp" +#include "utility_apps.hpp" + namespace holoscan { namespace { @@ -106,13 +111,17 @@ class GXFOperatorsDistributedApp : public holoscan::Application { // Tests /////////////////////////////////////////////////////////////////////////////// -TEST(DistributedGXFOperatorApps, TestDistributedAppGXFOperatorReceive) { +class DistributedGXFOperatorApps : public DistributedApp {}; + +TEST_F(DistributedGXFOperatorApps, TestDistributedAppGXFOperatorReceive) { auto app = make_application(); // capture output so that we can check that the expected value is present testing::internal::CaptureStderr(); - app->run(); + try { + app->run(); + } catch (const std::exception& e) { HOLOSCAN_LOG_ERROR("Exception: {}", e.what()); } std::string log_output = testing::internal::GetCapturedStderr(); EXPECT_TRUE(log_output.find("Failed to access in tensor") == std::string::npos) diff --git a/tests/system/distributed/holoscan_ucx_ports_env.cpp b/tests/system/distributed/holoscan_ucx_ports_env.cpp index 479bd14c..fb630671 100644 --- a/tests/system/distributed/holoscan_ucx_ports_env.cpp +++ b/tests/system/distributed/holoscan_ucx_ports_env.cpp @@ -26,6 +26,8 @@ #include "../env_wrapper.hpp" #include "utility_apps.hpp" +#include "distributed_app_fixture.hpp" + namespace holoscan { TEST(HOLOSCAN_UCX_PORTS, TestGetPreferredNetworkPorts) { @@ -66,49 +68,33 @@ TEST(HOLOSCAN_UCX_PORTS, TestGetPreferredNetworkPorts) { } } -TEST(HOLOSCAN_UCX_PORTS, TestUCXBroadCastMultiReceiverAppLocal) { +TEST(HOLOSCAN_UCX_PORTS, TestUnusedNetworkPortStartingFromCandidatePort) { auto log_level_orig = log_level(); { - // When 'HOLOSCAN_UCX_PORTS=50007' and the application requires three ports, - // the application should use 50007, 50008, and 50009. 
- EnvVarWrapper wrapper({std::make_pair("HOLOSCAN_LOG_LEVEL", "DEBUG"), - std::make_pair("HOLOSCAN_EXECUTOR_LOG_LEVEL", "INFO"), - std::make_pair("HOLOSCAN_UCX_PORTS", "50007")}); + auto candidates = generate_random_ports(1, 11000, 60000); // Unset HOLOSCAN_LOG_LEVEL environment variable so that the log level is not overridden unsetenv("HOLOSCAN_LOG_LEVEL"); // use DEBUG log level to be able to check detailed messages in the output set_log_level(LogLevel::DEBUG); - // Collect three unused network ports starting from 50007 for verification - auto unused_ports = get_unused_network_ports(3, 50007, 65535, {}, {50007}); - EXPECT_EQ(unused_ports.size(), 3); - EXPECT_GE(unused_ports[0], 50007); - EXPECT_GT(unused_ports[1], unused_ports[0]); - EXPECT_GT(unused_ports[2], unused_ports[1]); - EXPECT_LE(unused_ports[2], 65535); - - auto verification_str = fmt::format("unused_ports={}", fmt::join(unused_ports, ",")); - - // 'AppDriver::launch_fragments_async()' path will be tested. - auto app = make_application(); - // capture output so that we can check that the expected value is present testing::internal::CaptureStderr(); - app->run(); + // Collect three unused network ports starting from a candidate port for verification + auto unused_ports = get_unused_network_ports(3, 11000, 60000, {}, candidates); + auto verification_str = fmt::format("unused_ports={}", fmt::join(unused_ports, ",")); std::string log_output = testing::internal::GetCapturedStderr(); - EXPECT_TRUE(log_output.find(verification_str) != std::string::npos) + EXPECT_EQ(unused_ports.size(), 3) << "=== LOG ===\n" << log_output << "\n===========\n"; + // Each unused port should either match a candidate port or fall within the allowed range + // above that candidate port. + EXPECT_TRUE(are_ports_in_vector(unused_ports, candidates, 100)) << "=== LOG ===\n" << log_output << "\n===========\n"; - EXPECT_TRUE(log_output.find("RxParam fragment2.rx message received (count: 10, size: 2)") != - std::string::npos) - << "=== LOG ===\n" - << log_output << "\n===========\n"; - EXPECT_TRUE(log_output.find("Rx fragment4.rx message received count: 10") != std::string::npos) + EXPECT_TRUE(log_output.find(verification_str) != std::string::npos) << "=== LOG ===\n" << log_output << "\n===========\n"; } @@ -117,50 +103,33 @@ set_log_level(log_level_orig); } -TEST(HOLOSCAN_UCX_PORTS, TestUCXBroadCastMultiReceiverAppWorker) { +TEST(HOLOSCAN_UCX_PORTS, TestUnusedNetworkPortFromTwoCandidatePorts) { auto log_level_orig = log_level(); { - // When 'HOLOSCAN_UCX_PORTS=50101,50105' and the application requires three ports, - // the application should use 50101, 50105, and 50106.
- EnvVarWrapper wrapper({std::make_pair("HOLOSCAN_LOG_LEVEL", "DEBUG"), - std::make_pair("HOLOSCAN_EXECUTOR_LOG_LEVEL", "INFO"), - std::make_pair("HOLOSCAN_UCX_PORTS", "50101,50105")}); + auto candidates = generate_random_ports(2, 11000, 60000); // Unset HOLOSCAN_LOG_LEVEL environment variable so that the log level is not overridden unsetenv("HOLOSCAN_LOG_LEVEL"); // use DEBUG log level to be able to check detailed messages in the output set_log_level(LogLevel::DEBUG); - // Collect three unused network ports including port numbers 50101, 50105 for verification - auto unused_ports = get_unused_network_ports(3, 50101, 65535, {}, {50101, 50105}); - EXPECT_EQ(unused_ports.size(), 3); - EXPECT_GE(unused_ports[0], 50007); - EXPECT_GT(unused_ports[1], unused_ports[0]); - EXPECT_GT(unused_ports[2], unused_ports[1]); - EXPECT_LE(unused_ports[2], 65535); - - auto verification_str = fmt::format("unused_ports={}", fmt::join(unused_ports, ",")); - - // With this arguments, this will go through 'AppWorkerServiceImpl::GetAvailablePorts()' path - std::vector args{"app", "--driver", "--worker", "--fragments=all"}; - auto app = make_application(args); - // capture output so that we can check that the expected value is present testing::internal::CaptureStderr(); - app->run(); + // Collect three unused network ports starting from candidate ports for verification + auto unused_ports = get_unused_network_ports(3, 11000, 60000, {}, candidates); + auto verification_str = fmt::format("unused_ports={}", fmt::join(unused_ports, ",")); std::string log_output = testing::internal::GetCapturedStderr(); - EXPECT_TRUE(log_output.find(verification_str) != std::string::npos) - << "=== LOG ===\n" - << log_output << "\n===========\n"; - EXPECT_TRUE(log_output.find("RxParam fragment2.rx message received (count: 10, size: 2)") != - std::string::npos) + EXPECT_EQ(unused_ports.size(), 3) << "=== LOG ===\n" << log_output << "\n===========\n"; + // Each unused port should either match a candidate port or fall within the allowed range + // above that candidate port. + EXPECT_TRUE(are_ports_in_vector(unused_ports, candidates, 100)) << "=== LOG ===\n" << log_output << "\n===========\n"; - EXPECT_TRUE(log_output.find("Rx fragment4.rx message received count: 10") != std::string::npos) + EXPECT_TRUE(log_output.find(verification_str) != std::string::npos) << "=== LOG ===\n" << log_output << "\n===========\n"; } diff --git a/tests/system/distributed/ping_message_rx_op.cpp b/tests/system/distributed/ping_message_rx_op.cpp index ce411741..10d9dbbd 100644 --- a/tests/system/distributed/ping_message_rx_op.cpp +++ b/tests/system/distributed/ping_message_rx_op.cpp @@ -88,7 +88,8 @@ void PingMessageRxOp::setup(OperatorSpec& spec) { } } -void PingMessageRxOp::compute(InputContext& op_input, OutputContext&, ExecutionContext&) { +void PingMessageRxOp::compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) { // NOTE: Values in PingMessageRxOp::compute and PingMessageTxOp::compute must remain consistent. // If any value is changed in PingMessageTxOp, please also update the check here.
bool valid_value = false; diff --git a/tests/system/distributed/ping_message_rx_op.hpp b/tests/system/distributed/ping_message_rx_op.hpp index a392cd06..9c928a73 100644 --- a/tests/system/distributed/ping_message_rx_op.hpp +++ b/tests/system/distributed/ping_message_rx_op.hpp @@ -76,7 +76,8 @@ class PingMessageRxOp : public Operator { void setup(OperatorSpec& spec) override; - void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override; + void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override; private: MessageType type_ = MessageType::FLOAT; diff --git a/tests/system/distributed/ping_message_tx_op.cpp b/tests/system/distributed/ping_message_tx_op.cpp index 7631c048..b5fe2b22 100644 --- a/tests/system/distributed/ping_message_tx_op.cpp +++ b/tests/system/distributed/ping_message_tx_op.cpp @@ -88,7 +88,8 @@ void PingMessageTxOp::setup(OperatorSpec& spec) { } } -void PingMessageTxOp::compute(InputContext&, OutputContext& op_output, ExecutionContext&) { +void PingMessageTxOp::compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) { // NOTE: Values in PingMessageTxOp::compute and PingMessageRxOp::compute must remain consistent. // If any value is changed here, please make the corresponding change in PingMessageRxOp. diff --git a/tests/system/distributed/ping_message_tx_op.hpp b/tests/system/distributed/ping_message_tx_op.hpp index 1a792baa..2b1ea076 100644 --- a/tests/system/distributed/ping_message_tx_op.hpp +++ b/tests/system/distributed/ping_message_tx_op.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -35,7 +35,8 @@ class PingMessageTxOp : public Operator { void setup(OperatorSpec& spec) override; - void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override; + void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override; private: MessageType type_ = MessageType::FLOAT; diff --git a/tests/system/distributed/standalone_fragments.cpp b/tests/system/distributed/standalone_fragments.cpp index f6f7eb60..2f681801 100644 --- a/tests/system/distributed/standalone_fragments.cpp +++ b/tests/system/distributed/standalone_fragments.cpp @@ -24,6 +24,8 @@ #include +#include "distributed_app_fixture.hpp" + namespace holoscan { namespace { @@ -36,7 +38,8 @@ class DummyOp : public Operator { void setup(OperatorSpec& spec) override {} - void compute(InputContext&, OutputContext& op_output, ExecutionContext&) override { + void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { HOLOSCAN_LOG_INFO("Operator: {}, Index: {}", name(), index_); // Sleep for 100ms to simulate some work std::this_thread::sleep_for(std::chrono::milliseconds(100)); @@ -92,7 +95,7 @@ class StandaloneFragmentApp : public holoscan::Application { // Tests /////////////////////////////////////////////////////////////////////////////// -TEST(DistributedApp, TestStandaloneFragments) { +TEST_F(DistributedApp, TestStandaloneFragments) { // Test that two fragments can be run independently in a distributed app (issue 4616519). const std::vector args{"test_app", "--driver", "--worker", "--fragments", "all"}; auto app = make_application(args); @@ -100,7 +103,9 @@ TEST(DistributedApp, TestStandaloneFragments) { // capture output so that we can check that the expected value is present testing::internal::CaptureStderr(); - app->run(); + try { + app->run(); + } catch (const std::exception& e) { HOLOSCAN_LOG_ERROR("Exception: {}", e.what()); } std::string log_output = testing::internal::GetCapturedStderr(); EXPECT_TRUE(log_output.find("Operator: tx, Index: 10") != std::string::npos) diff --git a/tests/system/distributed/utility_apps.hpp b/tests/system/distributed/utility_apps.hpp index 31da4226..7a910265 100644 --- a/tests/system/distributed/utility_apps.hpp +++ b/tests/system/distributed/utility_apps.hpp @@ -42,7 +42,8 @@ class SingleOp : public Operator { void setup(OperatorSpec& spec) override { spec.param(id_, "id", "id", "id", 0L); } - void compute(InputContext&, OutputContext& op_output, ExecutionContext&) override { + void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { HOLOSCAN_LOG_INFO("SingleOp {}.{}: {} - {}", fragment()->name(), name(), id_.get(), index_++); // sleep for 0.1 seconds std::this_thread::sleep_for(std::chrono::milliseconds(100)); @@ -63,7 +64,8 @@ class PingTxOp : public Operator { void setup(OperatorSpec& spec) override { spec.output("out"); } - void compute(InputContext&, OutputContext& op_output, ExecutionContext& context) override { + void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, + ExecutionContext& context) override { auto out_message = gxf::Entity::New(&context); op_output.emit(out_message); @@ -82,7 +84,8 @@ class PingTxTwoOutputsOp : public Operator { spec.output("out2"); } - void compute(InputContext&, OutputContext& 
op_output, ExecutionContext& context) override { + void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output, + ExecutionContext& context) override { auto out_message1 = gxf::Entity::New(&context); auto out_message2 = gxf::Entity::New(&context); @@ -108,7 +111,7 @@ class PingMxOp : public Operator { auto in_message = op_input.receive("in"); auto out_message = gxf::Entity::New(&context); - // TODO: Send a tensor. For now, the output message is just an empty Entity. + // TODO(unknown): Send a tensor. For now, the output message is just an empty Entity. // out_message.add(tensor, "tensor"); op_output.emit(out_message); @@ -124,7 +127,8 @@ class PingRxOp : public Operator { void setup(OperatorSpec& spec) override { spec.input("in"); } - void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override { + void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto in_message = op_input.receive("in"); HOLOSCAN_LOG_INFO("Rx {}.{} message received count: {}", fragment()->name(), name(), count_++); }; @@ -144,7 +148,8 @@ class PingRxTwoInputsOp : public Operator { spec.input("in2"); } - void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override { + void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto in_message1 = op_input.receive("in1"); auto in_message2 = op_input.receive("in2"); HOLOSCAN_LOG_INFO("Rx {}.{} message received count: {}", fragment()->name(), name(), count_++); @@ -164,7 +169,8 @@ class PingMultiReceiversParamRxOp : public Operator { spec.input>("receivers", IOSpec::kAnySize); } - void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override { + void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { auto value_vector = op_input.receive>("receivers").value(); HOLOSCAN_LOG_INFO("RxParam {}.{} message received (count: {}, size: {})", fragment()->name(), diff --git a/tests/system/exception_handling.cpp b/tests/system/exception_handling.cpp index aa764b85..7f9465df 100644 --- a/tests/system/exception_handling.cpp +++ b/tests/system/exception_handling.cpp @@ -85,7 +85,8 @@ class MinimalThrowOp : public Operator { } }; - void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override { + void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output, + [[maybe_unused]] ExecutionContext& context) override { if (throw_type_.get() == ThrowMethod::kCompute) { throw std::runtime_error("Exception occurred in MinimalThrowOp::compute"); } diff --git a/tests/system/multi_receiver_operator_ping_app.cpp b/tests/system/multi_receiver_operator_ping_app.cpp index 17cab5a3..ae0cc60e 100644 --- a/tests/system/multi_receiver_operator_ping_app.cpp +++ b/tests/system/multi_receiver_operator_ping_app.cpp @@ -59,8 +59,9 @@ class PingTxOp : public holoscan::Operator { spec.output>("out2"); } - void compute(holoscan::InputContext&, holoscan::OutputContext& op_output, - holoscan::ExecutionContext&) override { + void compute([[maybe_unused]] holoscan::InputContext& op_input, + holoscan::OutputContext& op_output, + [[maybe_unused]] holoscan::ExecutionContext& context) override { auto value1 = std::make_shared(index_++); op_output.emit(value1, "out1"); @@ -81,8 +82,9 @@ class PingNullSharedPtrTxOp : public holoscan::Operator { spec.output>("out2"); } - void 
compute(holoscan::InputContext&, holoscan::OutputContext& op_output, - holoscan::ExecutionContext&) override { + void compute([[maybe_unused]] holoscan::InputContext& op_input, + holoscan::OutputContext& op_output, + [[maybe_unused]] holoscan::ExecutionContext& context) override { auto value1 = std::make_shared(index_++); op_output.emit(nullptr, "out1"); @@ -103,8 +105,9 @@ class PingRawNullPtrTxOp : public holoscan::Operator { spec.output("out2"); } - void compute(holoscan::InputContext&, holoscan::OutputContext& op_output, - holoscan::ExecutionContext&) override { + void compute([[maybe_unused]] holoscan::InputContext& op_input, + holoscan::OutputContext& op_output, + [[maybe_unused]] holoscan::ExecutionContext& context) override { static const char values[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'}; op_output.emit(nullptr, "out1"); @@ -135,8 +138,8 @@ class PingTensorMapTxOp : public holoscan::Operator { Operator::initialize(); } - void compute(holoscan::InputContext&, holoscan::OutputContext& op_output, - holoscan::ExecutionContext& context) override { + void compute([[maybe_unused]] holoscan::InputContext& op_input, + holoscan::OutputContext& op_output, holoscan::ExecutionContext& context) override { const nvidia::gxf::Shape out_shape1{1, 2, 3}; const nvidia::gxf::Shape out_shape2{3, 2, 1}; const nvidia::gxf::Shape out_shape3{2, 3, 4}; @@ -221,7 +224,7 @@ class PingMxOp : public holoscan::Operator { } void compute(holoscan::InputContext& op_input, holoscan::OutputContext& op_output, - holoscan::ExecutionContext&) override { + [[maybe_unused]] holoscan::ExecutionContext& context) override { auto value1 = op_input.receive>("in1").value(); auto value2 = op_input.receive>("in2").value(); @@ -263,8 +266,9 @@ class PingRxOp : public holoscan::Operator { } } - void compute(holoscan::InputContext& op_input, holoscan::OutputContext&, - holoscan::ExecutionContext&) override { + void compute(holoscan::InputContext& op_input, + [[maybe_unused]] holoscan::OutputContext& op_output, + [[maybe_unused]] holoscan::ExecutionContext& context) override { if (should_receive_vector_) { auto value_vector = op_input.receive>>("receivers").value(); @@ -323,8 +327,9 @@ class PingRawPtrRxOp : public holoscan::Operator { } } - void compute(holoscan::InputContext& op_input, holoscan::OutputContext&, - holoscan::ExecutionContext&) override { + void compute(holoscan::InputContext& op_input, + [[maybe_unused]] holoscan::OutputContext& op_output, + [[maybe_unused]] holoscan::ExecutionContext& context) override { if (should_receive_vector_) { auto maybe_value_vector = op_input.receive>("receivers"); @@ -389,8 +394,9 @@ class PingTensorMapRxOp : public holoscan::Operator { } } - void compute(holoscan::InputContext& op_input, holoscan::OutputContext&, - holoscan::ExecutionContext&) override { + void compute(holoscan::InputContext& op_input, + [[maybe_unused]] holoscan::OutputContext& op_output, + [[maybe_unused]] holoscan::ExecutionContext& context) override { if (should_receive_vector_) { auto value_vector = op_input.receive>("receivers").value(); @@ -452,8 +458,9 @@ class PingEntityRxOp : public holoscan::Operator { } } - void compute(holoscan::InputContext& op_input, holoscan::OutputContext&, - holoscan::ExecutionContext&) override { + void compute(holoscan::InputContext& op_input, + [[maybe_unused]] holoscan::OutputContext& op_output, + [[maybe_unused]] holoscan::ExecutionContext& context) override { if (should_receive_vector_) { auto value_vector = 
op_input.receive>("receivers").value();
diff --git a/tests/system/native_operator_minimal_app.cpp b/tests/system/native_operator_minimal_app.cpp
index 5a02eca5..252d8e89 100644
--- a/tests/system/native_operator_minimal_app.cpp
+++ b/tests/system/native_operator_minimal_app.cpp
@@ -58,7 +58,8 @@ class MinimalOp : public Operator {
     spec.param(value_, "value", "value", "value stored by the operator", 2.5);
   }
 
-  void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override {
+  void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output,
+               [[maybe_unused]] ExecutionContext& context) override {
     HOLOSCAN_LOG_INFO("MinimalOp: count: {}", count_++);
     HOLOSCAN_LOG_INFO("MinimalOp: value: {}", value_.get());
   };
@@ -82,7 +83,8 @@ class ComplexValueParameterOp : public Operator {
                {2.5, -3.0});
   }
 
-  void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override {
+  void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output,
+               [[maybe_unused]] ExecutionContext& context) override {
     HOLOSCAN_LOG_INFO("ComplexValueParameterOp: count: {}", count_++);
     auto cval = cplx_value_.get();
     HOLOSCAN_LOG_INFO("ComplexValueParameterOp: value: {}{}{}j",
diff --git a/tests/system/native_operator_ping_app.cpp b/tests/system/native_operator_ping_app.cpp
index 33ef646b..05c1b695 100644
--- a/tests/system/native_operator_ping_app.cpp
+++ b/tests/system/native_operator_ping_app.cpp
@@ -50,7 +50,8 @@ class ForwardTestOp : public Operator {
     spec.output("data");
   }
 
-  void compute(InputContext& op_input, OutputContext& op_output, ExecutionContext&) override {
+  void compute(InputContext& op_input, OutputContext& op_output,
+               [[maybe_unused]] ExecutionContext& context) override {
     auto value = op_input.receive("data").value();
     op_output.emit(value, "data");
   }
@@ -70,7 +71,8 @@ class ForwardTestOpTwoOutputs : public Operator {
     spec.output("data2");
   }
 
-  void compute(InputContext& op_input, OutputContext& op_output, ExecutionContext&) override {
+  void compute(InputContext& op_input, OutputContext& op_output,
+               [[maybe_unused]] ExecutionContext& context) override {
     auto value = op_input.receive("data").value();
     op_output.emit(value, "data");
     op_output.emit(value, "data2");
@@ -91,7 +93,8 @@ class ForwardTestOpTwoInputs : public Operator {
     spec.output("data");
   }
 
-  void compute(InputContext& op_input, OutputContext& op_output, ExecutionContext&) override {
+  void compute(InputContext& op_input, OutputContext& op_output,
+               [[maybe_unused]] ExecutionContext& context) override {
     auto value = op_input.receive("data").value();
     auto value2 = op_input.receive("data2").value();
     op_output.emit(value + value2, "data");
diff --git a/tests/system/operator_metadata_apps.cpp b/tests/system/operator_metadata_apps.cpp
index fdeb2fa8..4ca49679 100644
--- a/tests/system/operator_metadata_apps.cpp
+++ b/tests/system/operator_metadata_apps.cpp
@@ -47,7 +47,8 @@ class PingTxMetadataOp : public Operator {
 
   void setup(OperatorSpec& spec) override { spec.output("out"); }
 
-  void compute(InputContext&, OutputContext& op_output, ExecutionContext&) override {
+  void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output,
+               [[maybe_unused]] ExecutionContext& context) override {
     if (is_metadata_enabled()) {
       auto dynamic_metadata = metadata();
       if (num_keys_ == 1) {
@@ -82,7 +83,8 @@ class ForwardOp : public Operator {
     spec.output("out");
   }
 
-  void compute(InputContext& op_input, OutputContext& op_output, ExecutionContext&) override {
+  void compute(InputContext& op_input, OutputContext& op_output,
+               [[maybe_unused]] ExecutionContext& context) override {
     auto value = op_input.receive("in").value();
     if (is_metadata_enabled()) {
       HOLOSCAN_LOG_INFO("fwd metadata()->size() = {}", metadata()->size());
@@ -101,7 +103,8 @@ class ForwardAddMetadataOp : public Operator {
     spec.input("in");
     spec.output("out");
   }
-  void compute(InputContext& op_input, OutputContext& op_output, ExecutionContext&) override {
+  void compute(InputContext& op_input, OutputContext& op_output,
+               [[maybe_unused]] ExecutionContext& context) override {
     auto value = op_input.receive("in").value();
     if (is_metadata_enabled()) {
       auto meta = metadata();
@@ -122,7 +125,8 @@ class ForwardAddMetadataOp2 : public Operator {
     spec.input("in");
     spec.output("out");
   }
-  void compute(InputContext& op_input, OutputContext& op_output, ExecutionContext&) override {
+  void compute(InputContext& op_input, OutputContext& op_output,
+               [[maybe_unused]] ExecutionContext& context) override {
     auto value = op_input.receive("in").value();
     if (is_metadata_enabled()) {
       auto meta = metadata();
@@ -146,7 +150,8 @@ class PingThreeRxMetadataOp : public Operator {
     spec.input("in3");
   }
 
-  void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override {
+  void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output,
+               [[maybe_unused]] ExecutionContext& context) override {
     auto value1 = op_input.receive("in1").value();
     auto value2 = op_input.receive("in2").value();
     auto value3 = op_input.receive("in3").value();
@@ -172,7 +177,8 @@ class PingSingleRxMetadataOp : public Operator {
 
   void setup(OperatorSpec& spec) override { spec.input("in1"); }
 
-  void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override {
+  void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output,
+               [[maybe_unused]] ExecutionContext& context) override {
     auto value1 = op_input.receive("in1").value();
     auto meta = metadata();
     HOLOSCAN_LOG_INFO("{} metadata has {} keys", name(), meta->size());
@@ -212,7 +218,8 @@ class PingTxTensorMapMetadataOp : public Operator {
     Operator::initialize();
   }
 
-  void compute(InputContext&, OutputContext& op_output, ExecutionContext& context) override {
+  void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output,
+               ExecutionContext& context) override {
     if (is_metadata_enabled()) {
       auto dynamic_metadata = metadata();
       if (num_keys_ == 1) {
@@ -289,7 +296,8 @@ class PingSingleRxTensorMapMetadataOp : public Operator {
 
   void setup(OperatorSpec& spec) override { spec.input("in1"); }
 
-  void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override {
+  void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output,
+               [[maybe_unused]] ExecutionContext& context) override {
     auto value1 = op_input.receive("in1").value();
     auto meta = metadata();
     HOLOSCAN_LOG_INFO("{} metadata has {} keys", name(), meta->size());
diff --git a/tests/system/ping_rx_op.cpp b/tests/system/ping_rx_op.cpp
index aa5f14fc..dafabbaa 100644
--- a/tests/system/ping_rx_op.cpp
+++ b/tests/system/ping_rx_op.cpp
@@ -23,10 +23,11 @@ namespace holoscan {
 namespace ops {
 
 void PingMultiRxOp::setup(OperatorSpec& spec) {
-  spec.input>("receivers", IOSpec::kAnySize);
+  spec.input>("receivers", IOSpec::kAnySize);
 }
 
-void PingMultiRxOp::compute(InputContext& op_input, OutputContext&, ExecutionContext&) {
+void PingMultiRxOp::compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output,
+                            [[maybe_unused]] ExecutionContext& context) {
   auto value_vector = op_input.receive>("receivers").value();
   HOLOSCAN_LOG_INFO("Rx message received (count: {}, size: {})", count_++, value_vector.size());
 
diff --git a/tests/system/ping_rx_op.hpp b/tests/system/ping_rx_op.hpp
index e3c3d576..d3adb779 100644
--- a/tests/system/ping_rx_op.hpp
+++ b/tests/system/ping_rx_op.hpp
@@ -33,7 +33,8 @@ class PingMultiRxOp : public Operator {
 
   void setup(OperatorSpec& spec) override;
 
-  void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override;
+  void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output,
+               [[maybe_unused]] ExecutionContext& context) override;
 
  private:
   int count_ = 1;
diff --git a/tests/system/ping_tx_op.cpp b/tests/system/ping_tx_op.cpp
index ecb963df..af26464f 100644
--- a/tests/system/ping_tx_op.cpp
+++ b/tests/system/ping_tx_op.cpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -27,7 +27,8 @@ void PingMultiTxOp::setup(OperatorSpec& spec) {
   spec.output("out2");
 }
 
-void PingMultiTxOp::compute(InputContext&, OutputContext& op_output, ExecutionContext&) {
+void PingMultiTxOp::compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output,
+                            [[maybe_unused]] ExecutionContext& context) {
   int value1 = 1;
   op_output.emit(value1, "out1");
 
diff --git a/tests/system/ping_tx_op.hpp b/tests/system/ping_tx_op.hpp
index 1f970ecd..5e9bd172 100644
--- a/tests/system/ping_tx_op.hpp
+++ b/tests/system/ping_tx_op.hpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -31,7 +31,8 @@ class PingMultiTxOp : public Operator {
 
   void setup(OperatorSpec& spec) override;
 
-  void compute(InputContext&, OutputContext& op_output, ExecutionContext&) override;
+  void compute([[maybe_unused]] InputContext& op_input, OutputContext& op_output,
+               [[maybe_unused]] ExecutionContext& context) override;
 };
 
 }  // namespace ops
diff --git a/tests/system/tensor_compare_op.cpp b/tests/system/tensor_compare_op.cpp
index 29d14702..ae89d08f 100644
--- a/tests/system/tensor_compare_op.cpp
+++ b/tests/system/tensor_compare_op.cpp
@@ -32,7 +32,8 @@ void TensorCompareOp::setup(OperatorSpec& spec) {
   spec.input("input2");
 }
 
-void TensorCompareOp::compute(InputContext& op_input, OutputContext&, ExecutionContext&) {
+void TensorCompareOp::compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output,
+                              [[maybe_unused]] ExecutionContext& context) {
   auto input1 = op_input.receive("input1").value();
   if (input1.size() != 1) {
     HOLOSCAN_LOG_ERROR("Expected one tensor at `input1`");
diff --git a/tests/system/tensor_compare_op.hpp b/tests/system/tensor_compare_op.hpp
index 028f0646..bde9cf97 100644
--- a/tests/system/tensor_compare_op.hpp
+++ b/tests/system/tensor_compare_op.hpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -34,7 +34,8 @@ class TensorCompareOp : public Operator {
 
   void setup(OperatorSpec& spec) override;
 
-  void compute(InputContext& op_input, OutputContext&, ExecutionContext&) override;
+  void compute(InputContext& op_input, [[maybe_unused]] OutputContext& op_output,
+               [[maybe_unused]] ExecutionContext& context) override;
 };
 
 }  // namespace ops
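Editor's note: the recurring change in the hunks above replaces anonymous compute() parameters with named parameters tagged with the C++17 [[maybe_unused]] attribute. Naming the parameters keeps the signatures self-documenting, while the attribute tells the compiler and static analyzers that leaving those names unused is intentional, so they do not raise unused-parameter diagnostics. A minimal, self-contained sketch of the pattern follows; the Processor, InCtx, OutCtx, and ExecCtx types are hypothetical stand-ins for illustration only, not Holoscan SDK classes.

// [[maybe_unused]] on named-but-unused virtual-function parameters (sketch).
#include <iostream>

// Hypothetical context types; not part of the Holoscan SDK.
struct InCtx { int value = 7; };
struct OutCtx {};
struct ExecCtx {};

struct Processor {
  virtual ~Processor() = default;
  virtual void compute(InCtx& in, OutCtx& out, ExecCtx& exec) = 0;
};

struct PrintProcessor : Processor {
  // Only `in` is used here. Keeping `out` and `exec` named documents the
  // interface, and [[maybe_unused]] suppresses the unused-parameter warnings
  // that named-but-unused parameters would otherwise trigger.
  void compute(InCtx& in, [[maybe_unused]] OutCtx& out,
               [[maybe_unused]] ExecCtx& exec) override {
    std::cout << "value: " << in.value << '\n';
  }
};

int main() {
  InCtx in;
  OutCtx out;
  ExecCtx exec;
  PrintProcessor p;
  p.compute(in, out, exec);  // prints "value: 7"
  return 0;
}

The attribute is purely a hint: it does not change the function's type or overriding behavior, so applying it only in the override (as the hunks above do) is well-formed.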