diff --git a/.Doxyfile b/.Doxyfile index 066a53c02..dac8a3acc 100644 --- a/.Doxyfile +++ b/.Doxyfile @@ -960,16 +960,16 @@ INPUT = README.md \ include/rppi_logical_operations.h \ include/rppi_morphological_transforms.h \ include/rppi_statistical_operations.h \ + include/rppt_tensor_arithmetic_operations.h \ + include/rppt_tensor_audio_augmentations.h \ include/rppt_tensor_color_augmentations.h \ include/rppt_tensor_data_exchange_operations.h \ include/rppt_tensor_effects_augmentations.h \ include/rppt_tensor_filter_augmentations.h \ include/rppt_tensor_geometric_augmentations.h \ + include/rppt_tensor_logical_operations.h \ include/rppt_tensor_morphological_operations.h \ - include/rppt_tensor_statistical_operations.h \ - include/rppt_tensor_arithmetic_operations.h \ - include/rppt_tensor_audio_augmentations.h \ - include/rppt_tensor_logical_operations.h + include/rppt_tensor_statistical_operations.h # This tag can be used to specify the character encoding of the source files @@ -2381,7 +2381,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = GPU_SUPPORT RPP_BACKEND_HIP HIP_COMPILE +PREDEFINED = GPU_SUPPORT RPP_BACKEND_HIP HIP_COMPILE AUDIO_SUPPORT # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The diff --git a/.azuredevops/rocm-ci.yml b/.azuredevops/rocm-ci.yml index 3a33cebd7..1e11589cd 100644 --- a/.azuredevops/rocm-ci.yml +++ b/.azuredevops/rocm-ci.yml @@ -13,6 +13,8 @@ trigger: batch: true branches: include: + - develop + - mainline - master paths: exclude: @@ -27,8 +29,9 @@ pr: autoCancel: true branches: include: - - master - develop + - mainline + - master paths: exclude: - .github diff --git a/CHANGELOG.md b/CHANGELOG.md index 16c4251f4..ca19c7eb0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,8 @@ # Changelog for RPP -Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/rpp/en/latest/). 
+Full documentation for RPP is available at [https://rocm.docs.amd.com/projects/rpp/en/latest](https://rocm.docs.amd.com/projects/rpp/en/latest) -### RPP 1.8.0 (unreleased) +## (Unreleased) RPP 1.8.0 ### Changes @@ -25,7 +25,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * CMake - Version `3.22.3` * IEEE 754-based half-precision floating-point library - Version `1.12.0` -### RPP 1.5.0 +### RPP 1.5.0 for ROCm 6.1.1 ### Changes @@ -42,7 +42,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * CMake - Version `3.22.3` * IEEE 754-based half-precision floating-point library - Version `1.12.0` -## RPP 1.4.0 +## RPP 1.4.0 for ROCm 6.0.0 ### Additions @@ -76,7 +76,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * CMake - Version `3.22.3` * IEEE 754-based half-precision floating-point library - Version `1.12.0` -## RPP 1.3.0 +## RPP 1.3.0 for ROCm 5.7.1 ### Additions @@ -106,7 +106,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * Boost - Version `1.72` * IEEE 754-based half-precision floating-point library - Version `1.12.0` -## RPP 1.2.0 +## RPP 1.2.0 for ROCm 5.7.1 ### Additions @@ -137,7 +137,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * Boost - Version `1.72` * IEEE 754-based half-precision floating-point library - Version `1.12.0` -## RPP 1.1.0 +## RPP 1.1.0 for ROCm 5.7.0 ### Additions @@ -172,7 +172,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * SLES - the Clang package is missing in the latest updates, which means Clang must be manually installed. -## RPP 1.0.0 +## RPP 1.0.0 for ROCm 5.7.0 ### Additions @@ -212,7 +212,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * SLES - the Clang package is missing in the latest updates, which means Clang must be manually installed. 
-## RPP 0.99 +## RPP 0.99 for ROCm 5.7.0 ### Additions @@ -241,7 +241,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * Boost - Version `1.72` * IEEE 754-based half-precision floating-point library - Version `1.12.0` -## RPP 0.98 +## RPP 0.98 for ROCm 5.7.0 ### Additions * Dockers @@ -251,11 +251,11 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * Readme updates -#### Changes +### Changes * CMakeList -#### Fixes +### Fixes * Minor bugs and warnings @@ -270,7 +270,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * Boost - Version `1.72` * IEEE 754-based half-precision floating-point library - Version `1.12.0` -## RPP 0.97 +## RPP 0.97 for ROCm 5.7.0 ### Additions @@ -301,7 +301,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * Boost - Version `1.72` * IEEE 754-based half-precision floating-point library - Version `1.12.0` -## RPP 0.96 +## RPP 0.96 for ROCm 5.7.0 ### Additions @@ -334,7 +334,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * RPP is not supported on CentOS 7 and SLES SP2 -## RPP 0.95 +## RPP 0.95 for ROCm 5.7.0 ### Additions @@ -368,7 +368,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * ROCm reorganization: install updates no longer match ROCm specifications -## RPP 0.93 +## RPP 0.93 for ROCm 5.7.0 ### Additions diff --git a/CMakeLists.txt b/CMakeLists.txt index 7963ff864..df233e5dc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,6 +58,9 @@ endif(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) # RPP Default Options set(DEFAULT_BUILD_TYPE "Release") +### RPP_AUDIO_SUPPORT - default = ON, NOTE: support currently only on Ubuntu - user to set to OFF otherwise +option(RPP_AUDIO_SUPPORT "Build RPP with Audio Support" ON) +option(BUILD_WITH_AMD_ADVANCE "Build RPP for advanced AMD GPU Architecture" OFF) # Set message options if(NOT WIN32) @@ -77,6 +80,7 @@ endif() if(APPLE) set(CMAKE_MACOSX_RPATH 1) set(BACKEND "CPU") + set(RPP_AUDIO_SUPPORT OFF) message("-- ${Magenta}Apple macOS Detected -- GPU Support turned OFF${ColourReset}") endif() @@ -134,9 +138,16 @@ list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) find_package(HALF REQUIRED) include_directories(${HALF_INCLUDE_DIRS}) +if (RPP_AUDIO_SUPPORT) + add_definitions(-DAUDIO_SUPPORT) # For compile flags in RPP + set(RPP_AUDIO_AUGMENTATIONS_SUPPORT 1) # For cmakedefine01 in rpp_audio_augmentations_support.h.in +endif() + message("-- ${Cyan}RPP Developer Options${ColourReset}") message("-- ${Cyan} -D BACKEND=${BACKEND} [Select RPP Backend [options:CPU/OPENCL/HIP](default:HIP)]${ColourReset}") message("-- ${Cyan} -D CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} [Select RPP build type [options:Debug/Release](default:Release)]${ColourReset}") +message("-- ${Cyan} -D RPP_AUDIO_SUPPORT=${RPP_AUDIO_SUPPORT} [Select RPP audio support [options:ON/OFF](default:ON)]${ColourReset}") +message("-- ${Cyan} -D BUILD_WITH_AMD_ADVANCE=${BUILD_WITH_AMD_ADVANCE} [Turn ON/OFF Build for AMD advanced GPUs(default:OFF)]${ColourReset}") # OpenMP find_package(OpenMP REQUIRED) @@ -207,8 +218,23 @@ if("${BACKEND}" STREQUAL "HIP") endif() list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/hip) + + # Set supported GPU Targets set(DEFAULT_AMDGPU_TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1031;gfx1032;gfx1100;gfx1101;gfx1102") - set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types 
for library to target") + if (BUILD_WITH_AMD_ADVANCE) + set(DEFAULT_AMDGPU_TARGETS ${DEFAULT_AMDGPU_TARGETS} "gfx1200;gfx1201") + endif() + + # Set AMDGPU_TARGETS + if(DEFINED ENV{AMDGPU_TARGETS}) + set(AMDGPU_TARGETS $ENV{AMDGPU_TARGETS} CACHE STRING "List of specific machine types for library to target") + elseif(AMDGPU_TARGETS) + message("-- ${White}${PROJECT_NAME} -- AMDGPU_TARGETS set with -D option${ColourReset}") + else() + set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for library to target") + endif() + message("-- ${White}${PROJECT_NAME} -- AMDGPU_TARGETS: ${AMDGPU_TARGETS}${ColourReset}") + find_package(HIP QUIET) if(HIP_FOUND) message("-- ${White}${PROJECT_NAME} -- Using HIP - Path:" ${HIP_PATH} "\tVersion:" ${HIP_VERSION} "\tCompiler:" ${HIP_COMPILER}${ColourReset}) @@ -273,6 +299,7 @@ if("${BACKEND}" STREQUAL "CPU") endif() configure_file("${PROJECT_SOURCE_DIR}/include/rpp_backend.h.in" "${PROJECT_BINARY_DIR}/include/rpp_backend.h") +configure_file("${PROJECT_SOURCE_DIR}/include/rpp_audio_augmentations_support.h.in" "${PROJECT_BINARY_DIR}/include/rpp_audio_augmentations_support.h") # Enable SIMD for HOST code (in both OpenCL and HIP backends) if(NOT DEFINED SIMD_ENABLE) @@ -294,7 +321,12 @@ message("-- ${White}${PROJECT_NAME} -- Link Libraries: ${LINK_LIBRARY_LIST}${Col target_link_libraries(${PROJECT_NAME} ${LINK_LIBRARY_LIST}) set_target_properties(${PROJECT_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(${PROJECT_NAME} PROPERTIES LINKER_LANGUAGE CXX) -target_link_libraries(${PROJECT_NAME} ${PROJECT_SOURCE_DIR}/libs/third_party/ffts/libffts.a) +if(RPP_AUDIO_SUPPORT) + target_link_libraries(${PROJECT_NAME} ${PROJECT_SOURCE_DIR}/libs/third_party/ffts/libffts.a) + message("-- ${Green}${PROJECT_NAME} set to build with RPP_AUDIO_SUPPORT${ColourReset}") +else() + message("-- ${Yellow}${PROJECT_NAME} set to build without RPP_AUDIO_SUPPORT${ColourReset}") +endif() set_target_properties(${PROJECT_NAME} PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION ${PROJECT_VERSION_MAJOR}) target_include_directories(${PROJECT_NAME} @@ -335,6 +367,9 @@ install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION ${CMAKE_INSTA install(FILES ${PROJECT_BINARY_DIR}/include/rpp_backend.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rpp COMPONENT dev) +install(FILES ${PROJECT_BINARY_DIR}/include/rpp_audio_augmentations_support.h + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rpp + COMPONENT dev) # install Test install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cmake DESTINATION ${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}/test COMPONENT test) diff --git a/docs/data/doxygenInputs/lens_img640x480.png b/docs/data/doxygenInputs/lens_img640x480.png new file mode 100644 index 000000000..897955d77 Binary files /dev/null and b/docs/data/doxygenInputs/lens_img640x480.png differ diff --git a/docs/data/doxygenOutputs/effects_augmentations_glitch_img150x150.png b/docs/data/doxygenOutputs/effects_augmentations_glitch_img150x150.png new file mode 100644 index 000000000..d4d5b749b Binary files /dev/null and b/docs/data/doxygenOutputs/effects_augmentations_glitch_img150x150.png differ diff --git a/docs/data/doxygenOutputs/effects_augmentations_jitter_150x150.png b/docs/data/doxygenOutputs/effects_augmentations_jitter_150x150.png new file mode 100644 index 000000000..8aef1cbe6 Binary files /dev/null and b/docs/data/doxygenOutputs/effects_augmentations_jitter_150x150.png differ diff --git 
a/docs/data/doxygenOutputs/geometric_augmentations_lens_correction_img_640x480.png b/docs/data/doxygenOutputs/geometric_augmentations_lens_correction_img_640x480.png new file mode 100644 index 000000000..63a52819d Binary files /dev/null and b/docs/data/doxygenOutputs/geometric_augmentations_lens_correction_img_640x480.png differ diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile index 18d9a73bc..9773637df 100644 --- a/docs/doxygen/Doxyfile +++ b/docs/doxygen/Doxyfile @@ -962,14 +962,16 @@ INPUT = ../../README.md \ ../../include/rppi_logical_operations.h \ ../../include/rppi_morphological_transforms.h \ ../../include/rppi_statistical_operations.h \ + ../../include/rppt_tensor_arithmetic_operations.h \ + ../../include/rppt_tensor_audio_augmentations.h \ ../../include/rppt_tensor_color_augmentations.h \ ../../include/rppt_tensor_data_exchange_operations.h \ ../../include/rppt_tensor_effects_augmentations.h \ ../../include/rppt_tensor_filter_augmentations.h \ ../../include/rppt_tensor_geometric_augmentations.h \ + ../../include/rppt_tensor_logical_operations.h \ ../../include/rppt_tensor_morphological_operations.h \ - ../../include/rppt_tensor_statistical_operations.h \ - ../../include/rppt_tensor_logical_operations.h + ../../include/rppt_tensor_statistical_operations.h # This tag can be used to specify the character encoding of the source files @@ -2381,7 +2383,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = GPU_SUPPORT RPP_BACKEND_HIP HIP_COMPILE +PREDEFINED = GPU_SUPPORT RPP_BACKEND_HIP HIP_COMPILE AUDIO_SUPPORT # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 221c93045..c316de276 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1 +1 @@ -rocm-docs-core[api_reference]==1.4.0 +rocm-docs-core[api_reference]==1.5.1 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 8d0f37727..2c9286b18 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -110,7 +110,7 @@ requests==2.28.2 # via # pygithub # sphinx -rocm-docs-core[api-reference]==1.4.0 +rocm-docs-core[api-reference]==1.5.1 # via -r requirements.in smmap==5.0.0 # via gitdb diff --git a/include/rpp_audio_augmentations_support.h.in b/include/rpp_audio_augmentations_support.h.in new file mode 100644 index 000000000..6e8e8c66f --- /dev/null +++ b/include/rpp_audio_augmentations_support.h.in @@ -0,0 +1,6 @@ +#ifndef GUARD_RPP_AUDIO_AUGMENTATIONS_SUPPORT_H_IN +#define GUARD_RPP_AUDIO_AUGMENTATIONS_SUPPORT_H_IN + +#cmakedefine01 RPP_AUDIO_AUGMENTATIONS_SUPPORT + +#endif \ No newline at end of file diff --git a/include/rppdefs.h b/include/rppdefs.h index 28876d7f5..6eb025665 100644 --- a/include/rppdefs.h +++ b/include/rppdefs.h @@ -64,6 +64,7 @@ SOFTWARE. const float ONE_OVER_6 = 1.0f / 6; const float ONE_OVER_3 = 1.0f / 3; const float ONE_OVER_255 = 1.0f / 255; +const uint MMS_MAX_SCRATCH_MEMORY = 76800000; // maximum scratch memory size (number of floats) needed for MMS buffer in RNNT training /******************** RPP typedefs ********************/ @@ -136,7 +137,15 @@ typedef enum /*! \brief src and dst layout mismatch \ingroup group_rppdefs */ RPP_ERROR_LAYOUT_MISMATCH = -18, /*! \brief Number of channels is invalid. 
(Needs to adhere to function specification.) \ingroup group_rppdefs */ - RPP_ERROR_INVALID_CHANNELS = -19 + RPP_ERROR_INVALID_CHANNELS = -19, + /*! \brief Invalid output tile length (Needs to adhere to function specification.) \ingroup group_rppdefs */ + RPP_ERROR_INVALID_OUTPUT_TILE_LENGTH = -20, + /*! \brief Shared memory size needed is beyond the bounds (Needs to adhere to function specification.) \ingroup group_rppdefs */ + RPP_ERROR_OUT_OF_BOUND_SHARED_MEMORY_SIZE = -21, + /*! \brief Scratch memory size needed is beyond the bounds (Needs to adhere to function specification.) \ingroup group_rppdefs */ + RPP_ERROR_OUT_OF_BOUND_SCRATCH_MEMORY_SIZE = -22, + /*! \brief Number of src dims is invalid. (Needs to adhere to function specification.) \ingroup group_rppdefs */ + RPP_ERROR_INVALID_SRC_DIMS = -23 } RppStatus; /*! \brief RPP rppStatus_t type enums @@ -446,6 +455,16 @@ typedef struct } RpptRoiLtrb; +/*! \brief RPPT Tensor Channel Offsets struct + * \ingroup group_rppdefs + */ +typedef struct +{ + RppiPoint r; + RppiPoint g; + RppiPoint b; +} RpptChannelOffsets; + /*! \brief RPPT Tensor 3D ROI LTFRBB struct * \ingroup group_rppdefs */ diff --git a/include/rppt_tensor_arithmetic_operations.h b/include/rppt_tensor_arithmetic_operations.h index d34bdd1dd..d091f50ba 100644 --- a/include/rppt_tensor_arithmetic_operations.h +++ b/include/rppt_tensor_arithmetic_operations.h @@ -47,8 +47,8 @@ extern "C" { * It multiplies each element of the source tensor by a corresponding element in the 'mulTensor', * adds a corresponding element from the 'addTensor', and stores the result in the destination tensor. * Support added for f32 -> f32 dataype. - * \image html input150x150x4.gif Sample Input - * \image html arithmetic_operations_fused_multiply_add_scalar_150x150x4.gif Sample Output + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/arithmetic_operations_fused_multiply_add_scalar_150x150x4.gif Sample Output * \param [in] srcPtr source tensor in HOST memory * \param[in] srcGenericDescPtr source tensor descriptor * \param[out] dstPtr destination tensor in HOST memory @@ -70,8 +70,8 @@ RppStatus rppt_fused_multiply_add_scalar_host(RppPtr_t srcPtr, RpptGenericDescPt * It multiplies each element of the source tensor by a corresponding element in the 'mulTensor', * adds a corresponding element from the 'addTensor', and stores the result in the destination tensor. * Support added for f32 -> f32 dataype. - * \image html input150x150x4.gif Sample Input - * \image html arithmetic_operations_fused_multiply_add_scalar_150x150x4.gif Sample Output + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/arithmetic_operations_fused_multiply_add_scalar_150x150x4.gif Sample Output * \param [in] srcPtr source tensor in HIP memory * \param[in] srcGenericDescPtr source tensor descriptor * \param[out] dstPtr destination tensor in HIP memory @@ -92,8 +92,8 @@ RppStatus rppt_fused_multiply_add_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr * \details This function performs the addition operation on a batch of 4D tensors. * It adds a corresponding element from the 'addTensor' to source tensor, and stores the result in the destination tensor. * Support added for f32 -> f32 dataype. 
- * \image html input150x150x4.gif Sample Input - * \image html arithmetic_operations_add_scalar_150x150x4.gif Sample Output + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/arithmetic_operations_add_scalar_150x150x4.gif Sample Output * \param [in] srcPtr source tensor in HOST memory * \param[in] srcGenericDescPtr source tensor descriptor * \param[out] dstPtr destination tensor in HOST memory @@ -113,8 +113,8 @@ RppStatus rppt_add_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDes * \details This function performs the addition operation on a batch of 4D tensors. * It adds a corresponding element from the 'addTensor' to source tensor, and stores the result in the destination tensor. * Support added for f32 -> f32 dataype. - * \image html input150x150x4.gif Sample Input - * \image html arithmetic_operations_add_scalar_150x150x4.gif Sample Output + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/arithmetic_operations_add_scalar_150x150x4.gif Sample Output * \param [in] srcPtr source tensor in HIP memory * \param[in] srcGenericDescPtr source tensor descriptor * \param[out] dstPtr destination tensor in HIP memory @@ -134,8 +134,8 @@ RppStatus rppt_add_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDesc * \details This function performs the subtraction operation on a batch of 4D tensors. * It takes a corresponding element from 'subtractTensor' and subtracts it from source tensor. Result is stored in the destination tensor. * Support added for f32 -> f32 dataype. - * \image html input150x150x4.gif Sample Input - * \image html arithmetic_operations_subtract_scalar_150x150x4.gif Sample Output + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/arithmetic_operations_subtract_scalar_150x150x4.gif Sample Output * \param [in] srcPtr source tensor in HOST memory * \param[in] srcGenericDescPtr source tensor descriptor * \param[out] dstPtr destination tensor in HOST memory @@ -155,8 +155,8 @@ RppStatus rppt_subtract_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGener * \details This function performs the subtraction operation on a batch of 4D tensors. * It takes a corresponding element from 'subtractTensor' and subtracts it from source tensor. Result is stored in the destination tensor. * Support added for f32 -> f32 dataype. - * \image html input150x150x4.gif Sample Input - * \image html arithmetic_operations_subtract_scalar_150x150x4.gif Sample Output + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/arithmetic_operations_subtract_scalar_150x150x4.gif Sample Output * \param [in] srcPtr source tensor in HIP memory * \param[in] srcGenericDescPtr source tensor descriptor * \param[out] dstPtr destination tensor in HIP memory @@ -176,8 +176,8 @@ RppStatus rppt_subtract_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGeneri * \details This function performs the multiplication operation on a batch of 4D tensors. 
* It takes a corresponding element from 'multiplyTensor' and multiplies it with source tensor. Result is stored in the destination tensor. * Support added for f32 -> f32 dataype. - * \image html input150x150x4.gif Sample Input - * \image html arithmetic_operations_multiply_scalar_150x150x4.gif Sample Output + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/arithmetic_operations_multiply_scalar_150x150x4.gif Sample Output * \param [in] srcPtr source tensor in HOST memory * \param[in] srcGenericDescPtr source tensor descriptor * \param[out] dstPtr destination tensor in HOST memory @@ -190,15 +190,15 @@ RppStatus rppt_subtract_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGeneri * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. */ -RppStatus rppt_multiply_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *subtractTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); +RppStatus rppt_multiply_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); #ifdef GPU_SUPPORT /*! \brief Multiply scalar augmentation on HIP backend * \details This function performs the multiplication operation on a batch of 4D tensors. * It takes a corresponding element from 'multiplyTensor' and multiplies it with source tensor. Result is stored in the destination tensor. * Support added for f32 -> f32 dataype. - * \image html input150x150x4.gif Sample Input - * \image html arithmetic_operations_multiply_scalar_150x150x4.gif Sample Output + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/arithmetic_operations_multiply_scalar_150x150x4.gif Sample Output * \param [in] srcPtr source tensor in HIP memory * \param[in] srcGenericDescPtr source tensor descriptor * \param[out] dstPtr destination tensor in HIP memory @@ -226,7 +226,7 @@ RppStatus rppt_multiply_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGeneri * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
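The corrected rppt_multiply_scalar_host declaration above now takes mulTensor rather than subtractTensor. As a hedged illustration (not part of the patch), the sketch below wraps that exact signature; the descriptors, f32 buffers, mulTensor and 3D ROI array are assumed to be prepared by the caller as described in rppdefs.h, and the "rpp.h" include path is an assumption about how the installed headers are consumed.

```cpp
// Minimal illustrative sketch, not part of this patch: invoking the corrected
// rppt_multiply_scalar_host() declared above. Descriptor and buffer setup is
// assumed to be done by the caller per rppdefs.h (f32 -> f32 only).
#include "rpp.h"
#include <cstdio>

bool multiplyScalarHost(RppPtr_t src, RpptGenericDescPtr srcDesc,
                        RppPtr_t dst, RpptGenericDescPtr dstDesc,
                        Rpp32f *mulTensor,        // one multiplier per 4D tensor in the batch
                        RpptROI3DPtr roi,         // per-tensor 3D ROI (HOST memory)
                        RpptRoi3DType roiType,    // ROI format selector from rppdefs.h
                        rppHandle_t handle)       // created with rppCreateWithBatchSize()
{
    RppStatus status = rppt_multiply_scalar_host(src, srcDesc, dst, dstDesc,
                                                 mulTensor, roi, roiType, handle);
    if (status != RPP_SUCCESS)
        std::fprintf(stderr, "rppt_multiply_scalar_host failed (%d)\n", static_cast<int>(status));
    return status == RPP_SUCCESS;
}
```

rppt_add_scalar_host and rppt_subtract_scalar_host documented above take the same argument shape, with addTensor and subtractTensor in place of mulTensor.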
@@ -248,7 +248,7 @@ RppStatus rppt_magnitude_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr sr * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -258,6 +258,40 @@ RppStatus rppt_magnitude_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr sr RppStatus rppt_magnitude_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/*! \brief Logarithm operation on HOST backend + * \details Computes Log to base e(natural log) of the input for a given ND Tensor. + * Supports u8->f32, i8->f32, f16->f16 and f32->f32 datatypes. + * Uses Absolute of input for log computation and uses nextafter() if input is 0 to avoid undefined result. + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcGenericDescPtr source tensor descriptor + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstGenericDescPtr destination tensor descriptor + * \param [in] roiTensor values to represent dimensions of input tensor + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_log_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u *roiTensor, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Logarithm operation on HIP backend + * \details Computes Log to base e(natural log) of the input for a given ND Tensor. + * Supports u8->f32, i8->f32, f16->f16 and f32->f32 datatypes. + * Uses Absolute of input for log computation and uses nextafter() if input is 0 to avoid undefined result. + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcGenericDescPtr source tensor descriptor + * \param [out] dstPtr destination tensor in HIP memory + * \param [in] dstGenericDescPtr destination tensor descriptor + * \param [in] roiTensor values to represent dimensions of input tensor + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_log_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u *roiTensor, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! 
@} */ diff --git a/include/rppt_tensor_audio_augmentations.h b/include/rppt_tensor_audio_augmentations.h index f6349ae95..db52b073f 100644 --- a/include/rppt_tensor_audio_augmentations.h +++ b/include/rppt_tensor_audio_augmentations.h @@ -25,6 +25,8 @@ SOFTWARE. #ifndef RPPT_TENSOR_AUDIO_AUGMENTATIONS_H #define RPPT_TENSOR_AUDIO_AUGMENTATIONS_H +#ifdef AUDIO_SUPPORT + #include "rpp.h" #include "rppdefs.h" #ifdef __cplusplus @@ -46,49 +48,90 @@ extern "C" { * \details Non Silent Region Detection augmentation for 1D audio buffer \n Finds the starting index and length of non silent region in the audio buffer by comparing the calculated short-term power with cutoff value passed - * \param[in] srcPtr source tensor in HOST memory - * \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) - * \param[in] srcLengthTensor source audio buffer length (1D tensor in HOST memory, of size batchSize) - * \param[out] detectedIndexTensor beginning index of non silent region (1D tensor in HOST memory, of size batchSize) - * \param[out] detectionLengthTensor length of non silent region (1D tensor in HOST memory, of size batchSize) - * \param[in] cutOffDB cutOff in dB below which the signal is considered silent - * \param[in] windowLength window length used for computing short-term power of the signal - * \param[in] referencePower reference power that is used to convert the signal to dB - * \param[in] resetInterval number of samples after which the moving mean average is recalculated to avoid precision loss - * \param[in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) + * \param [in] srcLengthTensor source audio buffer length (1D tensor in HOST memory, of size batchSize) + * \param [out] detectedIndexTensor beginning index of non silent region (1D tensor in HOST memory, of size batchSize) + * \param [out] detectionLengthTensor length of non silent region (1D tensor in HOST memory, of size batchSize) + * \param [in] cutOffDB cutOff in dB below which the signal is considered silent + * \param [in] windowLength window length used for computing short-term power of the signal + * \param [in] referencePower reference power that is used to convert the signal to dB + * \param [in] resetInterval number of samples after which the moving mean average is recalculated to avoid precision loss + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. */ RppStatus rppt_non_silent_region_detection_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcLengthTensor, Rpp32s *detectedIndexTensor, Rpp32s *detectionLengthTensor, Rpp32f cutOffDB, Rpp32s windowLength, Rpp32f referencePower, Rpp32s resetInterval, rppHandle_t rppHandle); +#ifdef GPU_SUPPORT +/*! 
\brief Non Silent Region Detection augmentation on HIP backend + * \details Non Silent Region Detection augmentation for 1D audio buffer + \n Finds the starting index and length of non silent region in the audio buffer by comparing the + calculated short-term power with cutoff value passed + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) + * \param [in] srcLengthTensor source audio buffer length (1D tensor in Pinned/HIP memory, of size batchSize) + * \param [out] detectedIndexTensor beginning index of non silent region (1D tensor in Pinned/HIP memory, of size batchSize) + * \param [out] detectionLengthTensor length of non silent region (1D tensor in Pinned/HIP memory, of size batchSize) + * \param [in] cutOffDB cutOff in dB below which the signal is considered silent + * \param [in] windowLength window length used for computing short-term power of the signal + * \param [in] referencePower reference power that is used to convert the signal to dB + * \param [in] resetInterval number of samples after which the moving mean average is recalculated to avoid precision loss + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_non_silent_region_detection_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcLengthTensor, Rpp32s *detectedIndexTensor, Rpp32s *detectionLengthTensor, Rpp32f cutOffDB, Rpp32s windowLength, Rpp32f referencePower, Rpp32s resetInterval, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! \brief To Decibels augmentation on HOST backend - * \details To Decibels augmentation for 1D audio buffer converts magnitude values to decibel values - * \param[in] srcPtr source tensor in HOST memory - * \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) - * \param[out] dstPtr destination tensor in HOST memory - * \param[in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) - * \param[in] srcDims source tensor sizes for each element in batch (2D tensor in HOST memory, of size batchSize * 2) - * \param[in] cutOffDB minimum or cut-off ratio in dB - * \param[in] multiplier factor by which the logarithm is multiplied - * \param[in] referenceMagnitude Reference magnitude if not provided maximum value of input used as reference - * \param[in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \details To Decibels augmentation for 1D/2D audio buffer converts magnitude values to decibel values + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel/2D audio tensor with 1 channel), offsetInBytes >= 0, dataType = F32) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel/2D audio tensor with 1 channel), offsetInBytes >= 0, dataType = F32) + * \param [in] srcDims source tensor sizes for each element in batch (2D tensor in HOST memory, of size batchSize * 2) + * \param [in] cutOffDB minimum or cut-off ratio in dB + * \param [in] multiplier factor by which the logarithm is multiplied + * \param [in] 
referenceMagnitude Reference magnitude if not provided maximum value of input used as reference + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. */ RppStatus rppt_to_decibels_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptImagePatchPtr srcDims, Rpp32f cutOffDB, Rpp32f multiplier, Rpp32f referenceMagnitude, rppHandle_t rppHandle); +#ifdef GPU_SUPPORT +/*! \brief To Decibels augmentation on HIP backend + * \details To Decibels augmentation for 1D/2D audio buffer converts magnitude values to decibel values + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel/2D audio tensor with 1 channel), offsetInBytes >= 0, dataType = F32) + * \param [out] dstPtr destination tensor in HIP memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel/2D audio tensor with 1 channel), offsetInBytes >= 0, dataType = F32) + * \param [in] srcDims source tensor sizes for each element in batch (2D tensor in Pinned/HIP memory, of size batchSize * 2) + * \param [in] cutOffDB minimum or cut-off ratio in dB + * \param [in] multiplier factor by which the logarithm is multiplied + * \param [in] referenceMagnitude Reference magnitude if not provided maximum value of input used as reference + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_to_decibels_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptImagePatchPtr srcDims, Rpp32f cutOffDB, Rpp32f multiplier, Rpp32f referenceMagnitude, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! 
\brief Pre Emphasis Filter augmentation on HOST backend * \details Pre Emphasis Filter augmentation for audio data - * \param[in] srcPtr source tensor in HOST memory - * \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) - * \param[out] dstPtr destination tensor in HOST memory - * \param[in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) - * \param[in] srcLengthTensor source audio buffer length (1D tensor in HOST memory, of size batchSize) - * \param[in] coeffTensor preemphasis coefficient (1D tensor in HOST memory, of size batchSize) - * \param[in] borderType border value policy - * \param[in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) + * \param [in] srcLengthTensor source audio buffer length (1D tensor in HOST memory, of size batchSize) + * \param [in] coeffTensor preemphasis coefficient (1D tensor in HOST memory, of size batchSize) + * \param [in] borderType border value policy + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. @@ -97,19 +140,36 @@ RppStatus rppt_pre_emphasis_filter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, /*! \brief Down Mixing augmentation on HOST backend * \details Down Mixing augmentation for audio data -* \param[in] srcPtr source tensor in HOST memory -* \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) -* \param[out] dstPtr destination tensor in HOST memory -* \param[in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) -* \param[in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HOST memory, of size batchSize * 2) -* \param[in] normalizeWeights bool flag to specify if normalization of weights is needed -* \param[in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() +* \param [in] srcPtr source tensor in HOST memory +* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel audio tensor), offsetInBytes >= 0, dataType = F32) +* \param [out] dstPtr destination tensor in HOST memory +* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 2, offsetInBytes >= 0, dataType = F32) +* \param [in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HOST memory, of size batchSize * 2) +* \param [in] normalizeWeights bool flag to specify if normalization of weights is needed +* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. */ RppStatus rppt_down_mixing_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcDimsTensor, bool normalizeWeights, rppHandle_t rppHandle); +#ifdef GPU_SUPPORT +/*! 
\brief Down Mixing augmentation on HIP backend +* \details Down Mixing augmentation for audio data +* \param [in] srcPtr source tensor in HIP memory +* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel audio tensor), offsetInBytes >= 0, dataType = F32) +* \param [out] dstPtr destination tensor in HIP memory +* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 2, offsetInBytes >= 0, dataType = F32) +* \param [in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HIP/Pinned memory, of size batchSize * 2) +* \param [in] normalizeWeights bool flag to specify if normalization of weights is needed +* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() +* \return A \ref RppStatus enumeration. +* \retval RPP_SUCCESS Successful completion. +* \retval RPP_ERROR* Unsuccessful completion. +*/ +RppStatus rppt_down_mixing_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcDimsTensor, bool normalizeWeights, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! \brief Produces a spectrogram from a 1D audio buffer on HOST backend * \details Spectrogram for 1D audio buffer * \param [in] srcPtr source tensor in HOST memory @@ -153,15 +213,15 @@ RppStatus rppt_mel_filter_bank_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp /*! \brief Resample augmentation on HOST backend * \details Resample augmentation for audio data -* \param[in] srcPtr source tensor in HOST memory -* \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) -* \param[out] dstPtr destination tensor in HOST memory -* \param[in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) -* \param[in] inRate Input sampling rate (1D tensor in HOST memory, of size batchSize) -* \param[in] outRate Output sampling rate (1D tensor in HOST memory, of size batchSize) -* \param[in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HOST memory, of size batchSize * 2) -* \param[in] window Resampling window (struct of type RpptRpptResamplingWindow) -* \param[in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() +* \param [in] srcPtr source tensor in HOST memory +* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) +* \param [out] dstPtr destination tensor in HOST memory +* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) +* \param [in] inRate Input sampling rate (1D tensor in HOST memory, of size batchSize) +* \param [in] outRate Output sampling rate (1D tensor in HOST memory, of size batchSize) +* \param [in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HOST memory, of size batchSize * 2) +* \param [in] window Resampling window (struct of type RpptRpptResamplingWindow) +* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. 
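For the newly added HIP audio entry points, a hedged usage sketch (not from the patch) follows. It assumes the library was built with RPP_AUDIO_SUPPORT and the HIP backend, that src/dst are F32 tensors in HIP memory matching their descriptors, and that srcDimsTensor lives in HIP or pinned memory as the parameter notes above require.

```cpp
// Illustrative sketch only: calling the new rppt_down_mixing_gpu() declared
// above. AUDIO_SUPPORT and GPU_SUPPORT guard the declaration in the header,
// so the call is guarded the same way here.
#include "rpp.h"

#if defined(AUDIO_SUPPORT) && defined(GPU_SUPPORT)
RppStatus downMixBatch(RppPtr_t src, RpptDescPtr srcDesc,
                       RppPtr_t dst, RpptDescPtr dstDesc,
                       Rpp32s *srcDimsTensor,   // batchSize * 2 entries: {numSamples, numChannels}
                       rppHandle_t handle)      // created with rppCreateWithStreamAndBatchSize()
{
    // normalizeWeights = false leaves the down-mixing weights un-normalized
    return rppt_down_mixing_gpu(src, srcDesc, dst, dstDesc,
                                srcDimsTensor, /*normalizeWeights=*/false, handle);
}
#endif // AUDIO_SUPPORT && GPU_SUPPORT
```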
@@ -174,4 +234,7 @@ RppStatus rppt_resample_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d #ifdef __cplusplus } #endif + +#endif // AUDIO_SUPPORT + #endif // RPPT_TENSOR_AUDIO_AUGMENTATIONS_H diff --git a/include/rppt_tensor_color_augmentations.h b/include/rppt_tensor_color_augmentations.h index b01a12dca..62ef13715 100644 --- a/include/rppt_tensor_color_augmentations.h +++ b/include/rppt_tensor_color_augmentations.h @@ -54,7 +54,7 @@ extern "C" { * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] alphaTensor alpha values for brightness calculation (1D tensor in HOST memory, of size batchSize, with 0 <= alpha <= 20 for each image in batch) * \param [in] betaTensor beta values for brightness calculation (1D tensor in HOST memory, of size batchSize, with 0 <= beta <= 255 for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -76,7 +76,7 @@ RppStatus rppt_brightness_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] alphaTensor alpha values for brightness calculation (1D tensor in pinned/HOST memory, of size batchSize, with 0 <= alpha <= 20 for each image in batch) * \param [in] betaTensor beta values for brightness calculation (1D tensor in pinned/HOST memory, of size batchSize, with 0 <= beta <= 255 for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
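Because rppt_tensor_audio_augmentations.h is now wrapped in AUDIO_SUPPORT and a generated rpp_audio_augmentations_support.h is installed alongside rpp_backend.h, applications can check at compile time whether the RPP build they link against was configured with RPP_AUDIO_SUPPORT. A minimal hedged sketch (the include path is an assumption):

```cpp
// Illustrative sketch, not part of this patch: compile-time detection of
// audio support via the new generated header. #cmakedefine01 expands
// RPP_AUDIO_AUGMENTATIONS_SUPPORT to 1 when RPP_AUDIO_SUPPORT was ON at
// build time, and to 0 otherwise.
#include "rpp_audio_augmentations_support.h"

constexpr bool rppHasAudioAugmentations()
{
    return RPP_AUDIO_AUGMENTATIONS_SUPPORT == 1;
}
```

Since #cmakedefine01 always defines the macro as 0 or 1, plain #if checks on RPP_AUDIO_AUGMENTATIONS_SUPPORT also work.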
@@ -97,7 +97,7 @@ RppStatus rppt_brightness_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] gammaTensor gamma values for gamma correction calculation (1D tensor in HOST memory, of size batchSize with gamma >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -118,7 +118,7 @@ RppStatus rppt_gamma_correction_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rp * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] gammaTensor gamma values for gamma correction calculation (1D tensor in pinned/HOST memory, of size batchSize with gamma >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -141,7 +141,7 @@ RppStatus rppt_gamma_correction_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] alphaTensor alpha values for alpha-blending (1D tensor in HOST memory, of size batchSize with the transparency factor transparency factor 0 <= alpha <= 1 for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -164,7 +164,7 @@ RppStatus rppt_blend_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDes * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] alphaTensor alpha values for alpha-blending (1D tensor in pinned/HOST memory, of size batchSize with the transparency factor transparency factor 0 <= alpha <= 1 for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -188,7 +188,7 @@ RppStatus rppt_blend_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDesc * \param [in] contrastTensor contrast modification parameter for color_twist calculation (1D tensor in HOST memory, of size batchSize with 0 < contrastTensor[i] <= 255 for each image in batch) * \param [in] hueTensor hue modification parameter for color_twist calculation (1D tensor in HOST memory, of size batchSize with 0 <= hueTensor[i] <= 359 for each image in batch) * \param [in] saturationTensor saturation modification parameter for color_twist calculation (1D tensor in HOST memory, of size batchSize with saturationTensor[i] >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
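The blend documentation above (host and HIP variants) describes two source tensors, a per-image alpha tensor, and the renamed roiTensorPtrSrc parameter. As a hedged illustration, the sketch below calls rppt_blend_gpu with an argument order inferred from that parameter documentation; the exact declaration should be verified against rppt_tensor_color_augmentations.h.

```cpp
// Illustrative sketch only: alpha-blending two image batches with
// rppt_blend_gpu(). The argument order (srcPtr1, srcPtr2, shared srcDescPtr,
// dstPtr, dstDescPtr, alphaTensor, roiTensorPtrSrc, roiType, rppHandle) is
// inferred from the parameter documentation above, not copied from the header.
#include "rpp.h"

#ifdef GPU_SUPPORT
RppStatus blendBatch(RppPtr_t src1, RppPtr_t src2, RpptDescPtr srcDesc,
                     RppPtr_t dst, RpptDescPtr dstDesc,
                     Rpp32f *alphaTensor,          // pinned/HOST memory, 0 <= alpha <= 1 per image
                     RpptROIPtr roiTensorPtrSrc,   // per-image ROIs in HIP memory
                     rppHandle_t handle)           // created with rppCreateWithStreamAndBatchSize()
{
    return rppt_blend_gpu(src1, src2, srcDesc, dst, dstDesc, alphaTensor,
                          roiTensorPtrSrc, RpptRoiType::XYWH, handle);
}
#endif // GPU_SUPPORT
```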
@@ -212,7 +212,7 @@ RppStatus rppt_color_twist_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_ * \param [in] contrastTensor contrast modification parameter for color_twist calculation (1D tensor in pinned/HOST memory, of size batchSize with 0 < contrastTensor[i] <= 255 for each image in batch) * \param [in] hueTensor hue modification parameter for color_twist calculation (1D tensor in pinned/HOST memory, of size batchSize with 0 <= hueTensor[i] <= 359 for each image in batch) * \param [in] saturationTensor saturation modification parameter for color_twist calculation (1D tensor in pinned/HOST memory, of size batchSize with saturationTensor[i] >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -236,7 +236,7 @@ RppStatus rppt_color_twist_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] contrastTensor contrast modification parameter for color_jitter calculation (1D tensor in HOST memory, of size batchSize with 0 < contrastTensor[i] <= 255 for each image in batch) * \param [in] hueTensor hue modification parameter for color_jitter calculation (1D tensor in HOST memory, of size batchSize with 0 <= hueTensor[i] <= 359 for each image in batch) * \param [in] saturationTensor saturation modification parameter for color_jitter calculation (1D tensor in HOST memory, of size batchSize with saturationTensor[i] >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -257,7 +257,7 @@ RppStatus rppt_color_jitter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] rgbTensor R/G/B values for color casting calculation (2D tensor in HOST memory, of size sizeof(RpptRGB) * batchSize with 0 <= rgbTensor[n]. 
<= 255 for each image in batch) * \param [in] alphaTensor alpha values for color casting calculation (1D tensor in HOST memory, of size sizeof(Rpp32f) * batchSize with alphaTensor[i] >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -279,7 +279,7 @@ RppStatus rppt_color_cast_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] rgbTensor R/G/B values for color casting calculation (2D tensor in pinned/HOST memory, of size sizeof(RpptRGB) * batchSize with 0 <= rgbTensor[n]. <= 255 for each image in batch) * \param [in] alphaTensor alpha values for color casting calculation (1D tensor in pinned/HOST memory, of size sizeof(Rpp32f) * batchSize with alphaTensor[i] >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -300,7 +300,7 @@ RppStatus rppt_color_cast_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] exposureFactorTensor exposure factor values for exposure adjustment (1D tensor in HOST memory, of size batchSize, with exposureFactorTensor[n] >= 0 for each image in the batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -321,7 +321,7 @@ RppStatus rppt_exposure_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] exposureFactorTensor exposure factor values for exposure adjustment (1D tensor in pinned/HOST memory, of size batchSize, with exposureFactorTensor[n] >= 0 for each image in the batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -343,7 +343,7 @@ RppStatus rppt_exposure_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t ds * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] contrastFactorTensor contrast factor values for contrast calculation (1D tensor in HOST memory, of size batchSize with contrastFactorTensor[n] > 0 for each image in a batch)) * \param [in] contrastCenterTensor contrast center values for contrast calculation (1D tensor in HOST memory, of size batchSize) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -365,7 +365,7 @@ RppStatus rppt_contrast_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] contrastFactorTensor contrast factor values for contrast calculation (1D tensor in pinned/HOST memory, of size batchSize with contrastFactorTensor[n] > 0 for each image in a batch)) * \param [in] contrastCenterTensor contrast center values for contrast calculation (1D tensor in pinned/HOST memory, of size batchSize) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -386,7 +386,7 @@ RppStatus rppt_contrast_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t ds * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] lutPtr lut Array in HOST memory, containing a single integer look up table of length 65536, to be used for all images in the batch - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -407,7 +407,7 @@ RppStatus rppt_lut_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] lutPtr lut Array in pinned/HOST memory, containing a single integer look up table of length 65536, to be used for all images in the batch - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -427,15 +427,15 @@ RppStatus rppt_lut_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] adjustmentValueTensor adjustment values for color temperature calculation (1D tensor of size sizeof(Rpp8s) * batchSize with -100 <= adjustmentValueTensor[i] >= 100 for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] adjustmentValueTensor adjustment values for color temperature calculation (1D tensor of size batchSize with -100 <= adjustmentValueTensor[i] <= 100 for each image in batch) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. */ -RppStatus rppt_color_temperature_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp8s *adjustmentValueTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +RppStatus rppt_color_temperature_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *adjustmentValueTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #ifdef GPU_SUPPORT /*!
\brief Color Temperature augmentation on HIP backend for a NCHW/NHWC layout tensor @@ -448,8 +448,8 @@ RppStatus rppt_color_temperature_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, R * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] adjustmentValueTensor adjustment values for color temperature calculation (1D tensor of size sizeof(Rpp8s) * batchSize with -100 <= adjustmentValueTensor[i] >= 100 for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] adjustmentValueTensor adjustment values for color temperature calculation (1D tensor of size batchSize with -100 <= adjustmentValueTensor[i] <= 100 for each image in batch) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. diff --git a/include/rppt_tensor_effects_augmentations.h b/include/rppt_tensor_effects_augmentations.h index 708f318bf..a4c2b41ba 100644 --- a/include/rppt_tensor_effects_augmentations.h +++ b/include/rppt_tensor_effects_augmentations.h @@ -56,7 +56,7 @@ extern "C" { * \param [in] gridRatio gridRatio value for gridmask calculation = black square width / tileWidth (a single Rpp32f number with 0 <= gridRatio <= 1 that applies to all images in the batch) * \param [in] gridAngle gridAngle value for gridmask calculation = grid rotation angle in radians (a single Rpp32f number that applies to all images in the batch) * \param [in] translateVector translateVector for gridmask calculation = grid X and Y translation lengths in pixels (a single RpptUintVector2D x,y value pair that applies to all images in the batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration.
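The rppt_color_temperature_* hunks above switch adjustmentValueTensor from Rpp8s to Rpp32s values in the documented -100..100 range. Below is a minimal hedged sketch of driving the updated HOST signature; the buffers, descriptors, ROI tensor and handle are assumed to have been created elsewhere as described in the parameter docs, and the chosen values and rpp.h include path are illustrative assumptions.

```cpp
// Hedged sketch: per-image colour-temperature adjustments for the updated Rpp32s signature.
#include <vector>
#include <rpp.h>

RppStatus colorTemperatureExample(RppPtr_t srcPtr, RpptDescPtr srcDescPtr,
                                  RppPtr_t dstPtr, RpptDescPtr dstDescPtr,
                                  RpptROIPtr roiTensorPtrSrc, rppHandle_t handle,
                                  int batchSize)
{
    // One adjustment value per image, kept within the documented -100..100 range.
    std::vector<Rpp32s> adjustmentValueTensor(batchSize);
    for (int i = 0; i < batchSize; i++)
        adjustmentValueTensor[i] = (i % 2 == 0) ? 70 : -70;   // alternate warm / cool shifts

    return rppt_color_temperature_host(srcPtr, srcDescPtr, dstPtr, dstDescPtr,
                                       adjustmentValueTensor.data(),
                                       roiTensorPtrSrc, RpptRoiType::XYWH, handle);
}
```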
@@ -80,7 +80,7 @@ RppStatus rppt_gridmask_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d * \param [in] gridRatio gridRatio value for gridmask calculation = black square width / tileWidth (a single Rpp32f number with 0 <= gridRatio <= 1 that applies to all images in the batch) * \param [in] gridAngle gridAngle value for gridmask calculation = grid rotation angle in radians (a single Rpp32f number that applies to all images in the batch) * \param [in] translateVector translateVector for gridmask calculation = grid X and Y translation lengths in pixels (a single RpptUintVector2D x,y value pair that applies to all images in the batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -103,7 +103,7 @@ RppStatus rppt_gridmask_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t ds * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] spatterColor RGB values to use for the spatter augmentation (A single set of 3 Rpp8u values as RpptRGB that applies to all images in the batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 1920 and roiTensorSrc[i].xywhROI.roiHeight <= 1080) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 1920 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 1080) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration.
@@ -126,7 +126,7 @@ RppStatus rppt_spatter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t ds * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] spatterColor RGB values to use for the spatter augmentation (A single set of 3 Rpp8u values as RpptRGB that applies to all images in the batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 1920 and roiTensorSrc[i].xywhROI.roiHeight <= 1080) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 1920 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 1080) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -151,7 +151,7 @@ RppStatus rppt_spatter_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dst * \param [in] saltValueTensor A user-defined salt noise value (1D tensor in HOST memory, of size batchSize with 0 <= saltValueTensor[i] <= 1 for each image in batch) * \param [in] pepperValueTensor A user-defined pepper noise value (1D tensor in HOST memory, of size batchSize with 0 <= pepperValueTensor[i] <= 1 for each image in batch) * \param [in] seed A user-defined seed value (single Rpp32u value) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration.
@@ -176,7 +176,7 @@ RppStatus rppt_salt_and_pepper_noise_host(RppPtr_t srcPtr, RpptDescPtr srcDescPt * \param [in] saltValueTensor A user-defined salt noise value (1D tensor in pinned/HOST memory, of size batchSize with 0 <= saltValueTensor[i] <= 1 for each image in batch) * \param [in] pepperValueTensor A user-defined pepper noise value (1D tensor in pinned/HOST memory, of size batchSize with 0 <= pepperValueTensor[i] <= 1 for each image in batch) * \param [in] seed A user-defined seed value (single Rpp32u value) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -198,7 +198,7 @@ RppStatus rppt_salt_and_pepper_noise_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] shotNoiseFactorTensor shotNoiseFactor values for each image, which are used to compute the lambda values in a poisson distribution (1D tensor in HOST memory, of size batchSize with shotNoiseFactorTensor[i] >= 0 for each image in batch) * \param [in] seed A user-defined seed value (single Rpp32u value) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -220,7 +220,7 @@ RppStatus rppt_shot_noise_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] shotNoiseFactorTensor shotNoiseFactor values for each image, which are used to compute the lambda values in a poisson distribution (1D tensor in pinned/HOST memory, of size batchSize with shotNoiseFactorTensor[i] >= 0 for each image in batch) * \param [in] seed A user-defined seed value (single Rpp32u value) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -243,7 +243,7 @@ RppStatus rppt_shot_noise_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] meanTensor mean values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in HOST memory, of size batchSize with meanTensor[i] >= 0 for each image in batch) * \param [in] stdDevTensor stdDev values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in HOST memory, of size batchSize with stdDevTensor[i] >= 0 for each image in batch) * \param [in] seed A user-defined seed value (single Rpp32u value) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -266,7 +266,7 @@ RppStatus rppt_gaussian_noise_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppP * \param [in] meanTensor mean values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in pinned/HOST memory, of size batchSize with meanTensor[i] >= 0 for each image in batch) * \param [in] stdDevTensor stdDev values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in pinned/HOST memory, of size batchSize with stdDevTensor[i] >= 0 for each image in batch) * \param [in] seed A user-defined seed value (single Rpp32u value) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -289,7 +289,7 @@ RppStatus rppt_gaussian_noise_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPt * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] stdDevTensor stdDev values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in HOST memory, of size batchSize with stdDevTensor[i] >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -312,7 +312,7 @@ RppStatus rppt_non_linear_blend_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDes * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] stdDevTensor stdDev values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in pinned/HOST memory, of size batchSize with stdDevTensor[i] >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -338,7 +338,7 @@ RppStatus rppt_non_linear_blend_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDesc * \param[in] freqYTensor freqY values for water effect (1D tensor in HOST memory, of size batchSize) * \param[in] phaseXTensor amplitudeY values for water effect (1D tensor in HOST memory, of size batchSize) * \param[in] phaseYTensor amplitudeY values for water effect (1D tensor in HOST memory, of size batchSize) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -364,7 +364,7 @@ RppStatus rppt_water_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP * \param[in] freqYTensor freqY values for water effect (1D tensor in pinned/HOST memory, of size batchSize) * \param[in] phaseXTensor amplitudeY values for water effect (1D tensor in pinned/HOST memory, of size batchSize) * \param[in] phaseYTensor amplitudeY values for water effect (1D tensor in pinned/HOST memory, of size batchSize) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -433,7 +433,7 @@ RppStatus rppt_ricap_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param[in] vignetteIntensityTensor intensity values to quantify vignette effect (1D tensor of size batchSize with 0 < vignetteIntensityTensor[n] for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -455,7 +455,7 @@ RppStatus rppt_vignette_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param[in] vignetteIntensityTensor intensity values to quantify vignette effect (1D tensor of size batchSize with 0 < vignetteIntensityTensor[n] for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -465,11 +465,55 @@ RppStatus rppt_vignette_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d RppStatus rppt_vignette_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *vignetteIntensityTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/******************** jitter ********************/ + +/*! \brief Jitter augmentation on HOST backend for a NCHW/NHWC layout tensor + * \details The jitter augmentation adds a jitter effect for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be same depth as srcPtr. + * \image html img150x150.png Sample Input + * \image html effects_augmentations_jitter_img150x150.png Sample Output + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] kernelSizeTensor kernel size values for jitter calculation (1D tensor in HOST memory, of size batchSize, with kernelSizeTensor[i] = 3/5/7 for optimal use) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_jitter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32u *kernelSizeTensor, Rpp32u seed, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Jitter augmentation on HIP backend for a NCHW/NHWC layout tensor + * \details The jitter augmentation adds a jitter effect for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be same depth as srcPtr. + * \image html img150x150.png Sample Input + * \image html effects_augmentations_jitter_img150x150.png Sample Output + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HIP memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] kernelSizeTensor kernel size values for jitter calculation (1D tensor of size batchSize, with kernelSizeTensor[i] = 3/5/7 for optimal use) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_jitter_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32u *kernelSizeTensor, Rpp32u seed, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! \brief Gaussian noise augmentation on HOST backend * \details This function adds gaussian noise to a batch of 4D tensors. * Support added for u8 -> u8, f32 -> f32 datatypes. - * \image html input150x150x4.gif Sample Input - * \image html effects_augmentations_gaussian_noise_150x150x4.gif Sample Output + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/effects_augmentations_gaussian_noise_150x150x4.gif Sample Output * \param [in] srcPtr source tensor in HOST memory * \param [in] srcGenericDescPtr source tensor descriptor * \param [out] dstPtr destination tensor in HOST memory @@ -490,8 +534,8 @@ RppStatus rppt_gaussian_noise_voxel_host(RppPtr_t srcPtr, RpptGenericDescPtr src /*! \brief Gaussian noise augmentation on HIP backend * \details This function adds gaussian noise to a batch of 4D tensors. * Support added for u8 -> u8, f32 -> f32 datatypes. - * \image html input150x150x4.gif Sample Input - * \image html effects_augmentations_gaussian_noise_150x150x4.gif Sample Output + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/effects_augmentations_gaussian_noise_150x150x4.gif Sample Output * \param [in] srcPtr source tensor in HIP memory * \param [in] srcGenericDescPtr source tensor descriptor * \param [out] dstPtr destination tensor in HIP memory @@ -524,7 +568,7 @@ RppStatus rppt_gaussian_noise_voxel_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcD - Erase-region anchor boxes on each image given by the user must not overlap * \param [in] colorsTensor RGB values to use for each erase-region inside each image in the batch.
(colors[i] will have range equivalent of srcPtr) * \param [in] numBoxesTensor number of erase-regions per image, for each image in the batch. (numBoxesTensor[n] >= 0) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -549,7 +593,7 @@ RppStatus rppt_erase_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP - Erase-region anchor boxes on each image given by the user must not overlap * \param [in] colorsTensor RGB values to use for each erase-region inside each image in the batch. (colors[i] will have range equivalent of srcPtr) * \param [in] numBoxesTensor number of erase-regions per image, for each image in the batch. (numBoxesTensor[n] >= 0) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -559,6 +603,50 @@ RppStatus rppt_erase_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP RppStatus rppt_erase_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptRoiLtrb *anchorBoxInfoTensor, RppPtr_t colorsTensor, Rpp32u *numBoxesTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/*! \brief Glitch augmentation on HOST backend for a NCHW/NHWC layout tensor + * \details The glitch augmentation adds a glitch effect for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be same depth as srcPtr. + * \image html img150x150.jpg Sample Input + * \image html effects_augmentations_glitch_img150x150.jpg Sample Output + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] rgbOffsets RGB offset values to use for the glitch augmentation (A single set of 3 Rppi point values that applies to all images in the batch. + * For each point and for each image in the batch: 0 < point.x < width, 0 < point.y < height) + * \param [in] roiTensorPtrSrc ROI data for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_glitch_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptChannelOffsets *rgbOffsets, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Glitch augmentation on HIP backend for a NCHW/NHWC layout tensor + * \details The glitch augmentation adds a glitch effect for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be same depth as srcPtr. + * \image html img150x150.jpg Sample Input + * \image html effects_augmentations_glitch_img150x150.jpg Sample Output + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HIP memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] rgbOffsets RGB offset values to use for the glitch augmentation (A 1D tensor in pinned/HOST memory containing a single set of 3 Rppi point values that applies to all images in the batch. + * For each point and for each image in the batch: 0 < point.x < width, 0 < point.y < height) + * \param [in] roiTensorPtrSrc ROI data for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_glitch_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptChannelOffsets *rgbOffsets, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! @} */ diff --git a/include/rppt_tensor_filter_augmentations.h b/include/rppt_tensor_filter_augmentations.h index 7ea8d00c6..992631c49 100644 --- a/include/rppt_tensor_filter_augmentations.h +++ b/include/rppt_tensor_filter_augmentations.h @@ -57,7 +57,7 @@ extern "C" { * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] kernelSize kernel size for box filter (a single Rpp32u odd number with kernelSize = 3/5/7/9 that applies to all images in the batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration.
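The newly added jitter entry points above take a per-image kernel-size tensor plus a user-defined seed. A minimal hedged sketch of a HOST-side call follows; the signature is taken directly from the diff, while the batch size, kernel sizes, seed and the surrounding buffer/descriptor/ROI/handle setup are illustrative assumptions.

```cpp
// Hedged sketch: calling the new rppt_jitter_host entry point documented above.
#include <vector>
#include <rpp.h>

RppStatus jitterExample(RppPtr_t srcPtr, RpptDescPtr srcDescPtr,
                        RppPtr_t dstPtr, RpptDescPtr dstDescPtr,
                        RpptROIPtr roiTensorPtrSrc, rppHandle_t handle,
                        int batchSize)
{
    std::vector<Rpp32u> kernelSizeTensor(batchSize, 5);   // 3/5/7 are the suggested kernel sizes
    Rpp32u seed = 1234;                                   // user-defined seed for the jitter pattern

    return rppt_jitter_host(srcPtr, srcDescPtr, dstPtr, dstDescPtr,
                            kernelSizeTensor.data(), seed,
                            roiTensorPtrSrc, RpptRoiType::XYWH, handle);
}
```

The HIP variant rppt_jitter_gpu takes the same argument list but, per the docs above, expects device-side src/dst tensors and a handle created with rppCreateWithStreamAndBatchSize(); the new glitch entry points follow the same calling pattern with an RpptChannelOffsets argument in place of the kernel-size tensor and seed.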
@@ -83,7 +83,7 @@ RppStatus rppt_box_filter_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] stdDevTensor stdDev values for gaussian calculation (1D tensor in pinned/HOST memory, of size batchSize, for each image in batch) * \param [in] kernelSize kernel size for gaussian filter (a single Rpp32u odd number with kernelSize = 3/5/7/9 that applies to all images in the batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. diff --git a/include/rppt_tensor_geometric_augmentations.h b/include/rppt_tensor_geometric_augmentations.h index a3e6d2d7f..28dd516e6 100644 --- a/include/rppt_tensor_geometric_augmentations.h +++ b/include/rppt_tensor_geometric_augmentations.h @@ -52,7 +52,7 @@ extern "C" { * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -72,7 +72,7 @@ RppStatus rppt_crop_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -95,7 +95,7 @@ RppStatus rppt_crop_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr * \param [in] offsetTensor offset values for normalization (1D tensor in HOST memory, of size batchSize, with offsetTensor[n] <= 0) * \param [in] multiplierTensor multiplier values for normalization (1D tensor in HOST memory, of size batchSize, with multiplierTensor[n] > 0) * \param [in] mirrorTensor mirror flag values to set mirroring on/off (1D tensor in HOST memory, of size batchSize, with mirrorTensor[n] = 0/1) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -118,7 +118,7 @@ RppStatus rppt_crop_mirror_normalize_host(RppPtr_t srcPtr, RpptDescPtr srcDescPt * \param [in] offsetTensor offset values for normalization (1D tensor in pinned/HOST memory, of size batchSize, with offsetTensor[n] <= 0) * \param [in] multiplierTensor multiplier values for normalization (1D tensor in pinned/HOST memory, of size batchSize, with multiplierTensor[n] > 0) * \param [in] mirrorTensor mirror flag values to set mirroring on/off (1D tensor in pinned/HOST memory, of size batchSize, with mirrorTensor[n] = 0/1) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
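The crop_mirror_normalize hunks above document three per-image parameter tensors with explicit constraints (offsetTensor[n] <= 0, multiplierTensor[n] > 0, mirrorTensor[n] = 0/1). As a hedged sketch, this is one way a caller might prepare them on the host; the Rpp32f/Rpp32u element types are assumptions (the full signature is not shown in these hunks), and the concrete values are illustrative only.

```cpp
// Hedged sketch: per-image normalization parameters respecting the documented constraints.
#include <vector>
#include <rpp.h>

void fillCmnParams(std::vector<Rpp32f> &offsetTensor,
                   std::vector<Rpp32f> &multiplierTensor,
                   std::vector<Rpp32u> &mirrorTensor,
                   int batchSize)
{
    offsetTensor.assign(batchSize, -128.0f);            // shift term, must be <= 0
    multiplierTensor.assign(batchSize, 1.0f / 255.0f);  // scale factor, must be > 0
    mirrorTensor.assign(batchSize, 0);                  // 1 enables mirroring for that image
}
```

For the *_gpu variant, the docs above state these tensors are read from pinned/HOST memory while the ROI data lives in HIP memory.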
@@ -140,7 +140,7 @@ RppStatus rppt_crop_mirror_normalize_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] affineTensor affine matrix values for transformation calculation (2D tensor in HOST memory, of size batchSize * 6 for each image in batch) * \param [in] interpolationType Interpolation type used (RpptInterpolationType::XYWH or RpptRoiType::LTRB) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -162,7 +162,7 @@ RppStatus rppt_warp_affine_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] affineTensor affine matrix values for transformation calculation (2D tensor in pinned/HOST memory, of size batchSize * 6 for each image in batch) * \param [in] interpolationType Interpolation type used (RpptInterpolationType::XYWH or RpptRoiType::LTRB) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -184,7 +184,7 @@ RppStatus rppt_warp_affine_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] horizontalTensor horizontal flag values to set horizontal flip on/off (1D tensor in HOST memory, of size batchSize, with horizontalTensor[i] = 0/1) * \param [in] verticalTensor vertical flag values to set vertical flip on/off (1D tensor in HOST memory, of size batchSize, with verticalTensor[i] = 0/1) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -206,7 +206,7 @@ RppStatus rppt_flip_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] horizontalTensor horizontal flag values to set horizontal flip on/off (1D tensor in pinned/HOST memory, of size batchSize, with horizontalTensor[i] = 0/1) * \param [in] verticalTensor vertical flag values to set vertical flip on/off (1D tensor in pinned/HOST memory, of size batchSize, with verticalTensor[i] = 0/1) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -228,7 +228,7 @@ RppStatus rppt_flip_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] dstImgSizes destination image sizes ( \ref RpptImagePatchPtr type pointer to array, in HOST memory, of size batchSize) * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -250,7 +250,7 @@ RppStatus rppt_resize_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dst * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] dstImgSizes destination image sizes ( \ref RpptImagePatchPtr type pointer to array, in pinned/HOST memory, of size batchSize) * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -275,7 +275,7 @@ RppStatus rppt_resize_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP * \param [in] meanTensor mean value for each image in the batch (meanTensor[n] >= 0, 1D tensor in HOST memory, of size = batchSize for greyscale images, size = batchSize * 3 for RGB images)) * \param [in] stdDevTensor standard deviation value for each image in the batch (stdDevTensor[n] >= 0, 1D tensor in HOST memory, of size = batchSize for greyscale images, size = batchSize * 3 for RGB images) * \param [in] mirrorTensor mirror flag value to set mirroring on/off (1D tensor in HOST memory, of size batchSize, with mirrorTensor[n] = 0/1) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -300,7 +300,7 @@ RppStatus rppt_resize_mirror_normalize_host(RppPtr_t srcPtr, RpptDescPtr srcDesc * \param [in] meanTensor mean value for each image in the batch (meanTensor[n] >= 0, 1D tensor in pinned/HOST memory, of size = batchSize for greyscale images, size = batchSize * 3 for RGB images)) * \param [in] stdDevTensor standard deviation value for each image in the batch (stdDevTensor[n] >= 0, 1D tensor in pinned/HOST memory, of size = batchSize for greyscale images, size = batchSize * 3 for RGB images) * \param [in] mirrorTensor mirror flag value to set mirroring on/off (1D tensor in pinned/HOST memory, of size batchSize, with mirrorTensor[n] = 0/1) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -323,7 +323,7 @@ RppStatus rppt_resize_mirror_normalize_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescP * \param [in] dstImgSizes destination image sizes ( \ref RpptImagePatchPtr type pointer to array, in HOST memory, of size batchSize) * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType * \param [in] mirrorTensor mirror flag value to set mirroring on/off (1D tensor in HOST memory, of size batchSize, with mirrorTensor[n] = 0/1) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -346,7 +346,7 @@ RppStatus rppt_resize_crop_mirror_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, * \param [in] dstImgSizes destination image sizes ( \ref RpptImagePatchPtr type pointer to array, in pinned/HOST memory, of size batchSize) * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType * \param [in] mirrorTensor mirror flag value to set mirroring on/off (1D tensor in pinned/HOST memory, of size batchSize, with mirrorTensor[n] = 0/1) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -368,7 +368,7 @@ RppStatus rppt_resize_crop_mirror_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, R * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] angle image rotation angle in degrees - positive deg-anticlockwise/negative deg-clockwise (1D tensor in HOST memory, of size batchSize) * \param [in] interpolationType Interpolation type used (RpptInterpolationType::XYWH or RpptRoiType::LTRB) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -390,7 +390,7 @@ RppStatus rppt_rotate_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dst * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] angle image rotation angle in degrees - positive deg-anticlockwise/negative deg-clockwise (1D tensor in pinned/HOST memory, of size batchSize) * \param [in] interpolationType Interpolation type used (RpptInterpolationType::XYWH or RpptRoiType::LTRB) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -412,7 +412,7 @@ RppStatus rppt_rotate_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
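The rotate variants documented above follow the same batched-parameter pattern with one rotation angle per image; a small sketch using the sign convention stated in the parameter description:

    // Hypothetical sketch, not part of this patch
    std::vector<Rpp32f> angle(batchSize);
    for (int i = 0; i < batchSize; i++)
        angle[i] = (i & 1) ? -15.0f : 15.0f;   // positive = anticlockwise, negative = clockwise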
@@ -434,7 +434,7 @@ RppStatus rppt_phase_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDes * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -500,7 +500,7 @@ RppStatus rppt_slice_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] cropRoiTensor crop co-ordinates in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] patchRoiTensor patch co-ordinates in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) @@ -526,7 +526,7 @@ RppStatus rppt_crop_and_patch_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescP * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] cropRoiTensor crop co-ordinates in HIP memory, for each image in source 
tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] patchRoiTensor patch co-ordinates in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) @@ -541,8 +541,8 @@ RppStatus rppt_crop_and_patch_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPt /*! \brief Flip voxel augmentation HOST * \details The flip voxel augmentation performs a mask-controlled horizontal/vertical/depth flip on a generic 4D tensor.
Support added for f32 -> f32 and u8 -> u8 datatypes. - * \image html input150x150x4.gif Sample Input - * \image html geometric_augmentations_flip_150x150x4.gif Sample Output + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/geometric_augmentations_flip_150x150x4.gif Sample Output * \param [in] srcPtr source tensor in HOST memory * \param [in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3) * \param [out] dstPtr destination tensor in HOST memory @@ -564,8 +564,8 @@ RppStatus rppt_flip_voxel_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDes /*! \brief Flip voxel augmentation GPU * \details The flip voxel augmentation performs a mask-controlled horizontal/vertical/depth flip on a generic 4D tensor.
Support added for f32 -> f32 and u8 -> u8 datatypes. - * \image html input150x150x4.gif Sample Input - * \image html geometric_augmentations_flip_150x150x4.gif Sample Output + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/geometric_augmentations_flip_150x150x4.gif Sample Output * \param [in] srcPtr source tensor in HIP memory * \param [in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory @@ -598,7 +598,7 @@ RppStatus rppt_flip_voxel_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDesc * \param [in] colRemapTable Rpp32f column numbers in HOST memory for every pixel in the input batch of images (Restrictions - rois in the colRemapTable data for each image in batch must match roiTensorSrc) * \param [in] tableDescPtr rowRemapTable and colRemapTable common tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = F32, layout = NHWC, c = 1) * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType (Restrictions - Supports only NEAREST_NEIGHBOR and BILINEAR) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -623,7 +623,7 @@ RppStatus rppt_remap_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP * \param [in] colRemapTable Rpp32f column numbers in HIP memory for every pixel in the input batch of images (Restrictions - rois in the colRemapTable data for each image in batch must match roiTensorSrc) * \param [in] tableDescPtr rowRemapTable and colRemapTable common tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = F32, layout = NHWC, c = 1) * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType (Restrictions - Supports only NEAREST_NEIGHBOR and BILINEAR) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration.
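The remap tables above carry one source (row, column) pair per output pixel; a sketch of building tables that mirror every image in the batch horizontally. A width * height * batchSize layout is assumed here, matching the table size stated for the lens-correction functions added later in this patch:

    // Hypothetical sketch, not part of this patch
    std::vector<Rpp32f> rowRemapTable((size_t)width * height * batchSize);
    std::vector<Rpp32f> colRemapTable((size_t)width * height * batchSize);
    for (int n = 0; n < batchSize; n++)
        for (int r = 0; r < height; r++)
            for (int c = 0; c < width; c++)
            {
                size_t idx = ((size_t)n * height + r) * width + c;
                rowRemapTable[idx] = (Rpp32f)r;                 // keep the source row
                colRemapTable[idx] = (Rpp32f)(width - 1 - c);   // read pixels right to left
            }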
@@ -634,6 +634,94 @@ RppStatus rppt_remap_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP RppStatus rppt_remap_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *rowRemapTable, Rpp32f *colRemapTable, RpptDescPtr tableDescPtr, RpptInterpolationType interpolationType, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/*! \brief Lens correction transformation on HOST backend for an NCHW/NHWC layout tensor + * \details Performs lens correction transforms on an image to compensate for barrel lens distortion of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be the same depth as srcPtr. + * Note: Returns a black image if the passed camera matrix has a 0 determinant. + * \image html lens_img640x480.png Sample Input + * \image html geometric_augmentations_lens_correction_img_640x480.png Sample Output + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] rowRemapTable Rpp32f row numbers in HOST memory for every pixel in the input batch of images (1D tensor of size width * height * batchSize) + * \param [in] colRemapTable Rpp32f column numbers in HOST memory for every pixel in the input batch of images (1D tensor of size width * height * batchSize) + * \param [in] tableDescPtr table tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = F32, layout = NHWC, c = 1) + * \param [in] cameraMatrixTensor contains camera intrinsic parameters required to compute lens corrected image. (1D tensor of size 9 * batchSize) + * \param [in] distortionCoeffsTensor contains distortion coefficients required to compute lens corrected image. (1D tensor of size 8 * batchSize) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + * \ingroup group_tensor_geometric + */ +RppStatus rppt_lens_correction_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *rowRemapTable, Rpp32f *colRemapTable, RpptDescPtr tableDescPtr, Rpp32f *cameraMatrixTensor, Rpp32f *distortionCoeffsTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Lens correction transformation on HIP backend for an NCHW/NHWC layout tensor + * \details Performs lens correction transforms on an image to compensate for barrel lens distortion of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be the same depth as srcPtr. + * Note: Returns a black image if the passed camera matrix has a 0 determinant. + * \image html lens_img640x480.png Sample Input + * \image html geometric_augmentations_lens_correction_img_640x480.png Sample Output + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HIP memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] rowRemapTable Rpp32f row numbers in HIP memory for every pixel in the input batch of images (1D tensor of size width * height * batchSize) + * \param [in] colRemapTable Rpp32f column numbers in HIP memory for every pixel in the input batch of images (1D tensor of size width * height * batchSize) + * \param [in] tableDescPtr table tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = F32, layout = NHWC, c = 1) + * \param [in] cameraMatrixTensor contains camera intrinsic parameters required to compute lens corrected image. (1D tensor of size 9 * batchSize) + * \param [in] distortionCoeffsTensor contains distortion coefficients required to compute lens corrected image. (1D tensor of size 8 * batchSize) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + * \ingroup group_tensor_geometric + */ +RppStatus rppt_lens_correction_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *rowRemapTable, Rpp32f *colRemapTable, RpptDescPtr tableDescPtr, Rpp32f *cameraMatrixTensor, Rpp32f *distortionCoeffsTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + +/*! \brief Transpose Generic augmentation on HOST backend + * \details The transpose augmentation performs an input-permutation based transpose on a generic ND Tensor. + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcGenericDescPtr source tensor descriptor + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstGenericDescPtr destination tensor descriptor + * \param [in] permTensor permutation tensor for transpose operation + * \param [in] roiTensor ROI data for each element in source tensor (tensor of batchSize * number of dimensions * 2 values) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion.
+ * \ingroup group_tensor_geometric + */ +RppStatus rppt_transpose_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u *permTensor, Rpp32u *roiTensor, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Transpose Generic augmentation on HIP backend + * \details The transpose augmentation performs an input-permutation based transpose on a generic ND Tensor. + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcGenericDescPtr source tensor descriptor + * \param [out] dstPtr destination tensor in HIP memory + * \param [in] dstGenericDescPtr destination tensor descriptor + * \param [in] permTensor permutation tensor for transpose operation in pinned memory + * \param [in] roiTensor ROI data for each element in source tensor (tensor of batchSize * number of dimensions * 2 values) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + * \ingroup group_tensor_geometric + */ +RppStatus rppt_transpose_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u *permTensor, Rpp32u *roiTensor, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! @} */ diff --git a/include/rppt_tensor_logical_operations.h b/include/rppt_tensor_logical_operations.h index 3a4685167..28dff69ce 100644 --- a/include/rppt_tensor_logical_operations.h +++ b/include/rppt_tensor_logical_operations.h @@ -54,7 +54,7 @@ extern "C" { * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration.
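For the lens-correction functions introduced above, a sketch of the per-image parameter layout. Only the 9- and 8-element sizes come from the parameter descriptions; the row-major 3x3 camera-matrix ordering and the OpenCV-style distortion-coefficient order are assumptions, and fx, fy, cx, cy, k1, k2 are placeholders:

    // Hypothetical sketch, not part of this patch
    std::vector<Rpp32f> cameraMatrixTensor(9 * batchSize);
    std::vector<Rpp32f> distortionCoeffsTensor(8 * batchSize, 0.0f);
    for (int i = 0; i < batchSize; i++)
    {
        Rpp32f *K = &cameraMatrixTensor[i * 9];
        K[0] = fx;   K[1] = 0.0f; K[2] = cx;      // assumed row-major 3x3 intrinsic matrix;
        K[3] = 0.0f; K[4] = fy;   K[5] = cy;      // a singular matrix produces a black image
        K[6] = 0.0f; K[7] = 0.0f; K[8] = 1.0f;    // per the note above
        distortionCoeffsTensor[i * 8 + 0] = k1;   // radial terms; unused entries stay zero
        distortionCoeffsTensor[i * 8 + 1] = k2;
    }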
@@ -76,7 +76,7 @@ RppStatus rppt_bitwise_and_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -98,7 +98,7 @@ RppStatus rppt_bitwise_and_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr s * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -120,7 +120,7 @@ RppStatus rppt_bitwise_or_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr s * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -136,4 +136,4 @@ RppStatus rppt_bitwise_or_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr sr #ifdef __cplusplus } #endif -#endif // RPPT_TENSOR_LOGICAL_OPERATIONS_H \ No newline at end of file +#endif // RPPT_TENSOR_LOGICAL_OPERATIONS_H diff --git a/include/rppt_tensor_morphological_operations.h b/include/rppt_tensor_morphological_operations.h index eb879af5c..126c4757a 100644 --- a/include/rppt_tensor_morphological_operations.h +++ b/include/rppt_tensor_morphological_operations.h @@ -57,7 +57,7 @@ extern "C" { * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] kernelSize kernel size for box filter (a single Rpp32u odd number with kernelSize = 3/5/7/9 that applies to all images in the batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -82,7 +82,7 @@ RppStatus rppt_erode_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] kernelSize kernel size for box filter (a single Rpp32u odd number with kernelSize = 3/5/7/9 that applies to all images in the batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -98,4 +98,4 @@ RppStatus rppt_dilate_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP #ifdef __cplusplus } #endif -#endif // RPPT_TENSOR_MORPHOLOGICAL_OPERATIONS_H \ No newline at end of file +#endif // RPPT_TENSOR_MORPHOLOGICAL_OPERATIONS_H diff --git a/include/rppt_tensor_statistical_operations.h b/include/rppt_tensor_statistical_operations.h index 441816ea3..ca464340b 100644 --- a/include/rppt_tensor_statistical_operations.h +++ b/include/rppt_tensor_statistical_operations.h @@ -50,7 +50,7 @@ extern "C" { * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] tensorSumArr destination array in HOST memory * \param [in] tensorSumArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -68,7 +68,7 @@ RppStatus rppt_tensor_sum_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] tensorSumArr destination array in HIP memory * \param [in] tensorSumArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
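The array-length restriction above reflects that the reduction writes one value per channel plus an image-level result for 3-channel inputs, analogous to the (MeanR, MeanG, MeanB, MeanImage) layout documented for the mean and stddev functions later in this header; as a sketch:

    // Hypothetical sketch, not part of this patch
    Rpp32u tensorSumArrLength = (srcDescPtr->c == 1) ? srcDescPtr->n        // one sum per image
                                                     : srcDescPtr->n * 4;  // per-channel sums plus an overall sum per image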
@@ -86,7 +86,7 @@ RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] minArr destination array in HOST memory * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -104,7 +104,7 @@ RppStatus rppt_tensor_min_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] minArr destination array in HIP memory * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -122,7 +122,7 @@ RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] maxArr destination array in HOST memory * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -140,7 +140,7 @@ RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] maxArr destination array in HIP memory * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -201,7 +201,7 @@ RppStatus rppt_normalize_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescP * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] tensorMeanArr destination array in HOST memory * \param [in] tensorMeanArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorMeanArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorMeanArrLength = srcDescPtr->n * 4) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -219,7 +219,7 @@ RppStatus rppt_tensor_mean_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] tensorMeanArr destination array in HIP memory * \param [in] tensorMeanArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorMeanArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorMeanArrLength = srcDescPtr->n * 4) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -238,7 +238,7 @@ RppStatus rppt_tensor_mean_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [out] tensorStddevArr destination array in HOST memory * \param [in] tensorStddevArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorStddevArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorStddevArrLength = srcDescPtr->n * 4) * \param [in] meanTensor mean values for stddev calculation (1D tensor of size batchSize * 4 in format (MeanR, MeanG, MeanB, MeanImage) for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -257,7 +257,7 @@ RppStatus rppt_tensor_stddev_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPt * \param [out] tensorStddevArr destination array in HIP memory * \param [in] tensorStddevArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorStddevArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorStddevArrLength = srcDescPtr->n * 4) * \param [in] meanTensor mean values for stddev calculation (1D tensor of size batchSize * 4 in format (MeanR, MeanG, MeanB, MeanImage) for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
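Because rppt_tensor_stddev consumes means in the same (MeanR, MeanG, MeanB, MeanImage) layout that rppt_tensor_mean produces, the two host calls chain naturally for a 3-channel batch; a sketch, with the Rpp32f element type of both arrays assumed and the ROI/handle setup taken as given:

    // Hypothetical sketch, not part of this patch
    std::vector<Rpp32f> tensorMeanArr(batchSize * 4), tensorStddevArr(batchSize * 4);
    rppt_tensor_mean_host(srcPtr, srcDescPtr, tensorMeanArr.data(), tensorMeanArr.size(),
                          roiTensorPtrSrc, RpptRoiType::XYWH, handle);
    rppt_tensor_stddev_host(srcPtr, srcDescPtr, tensorStddevArr.data(), tensorStddevArr.size(),
                            tensorMeanArr.data(), roiTensorPtrSrc, RpptRoiType::XYWH, handle);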
@@ -273,4 +273,4 @@ RppStatus rppt_tensor_stddev_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr #ifdef __cplusplus } #endif -#endif // RPPT_TENSOR_STATISTICAL_OPERATIONS_H \ No newline at end of file +#endif // RPPT_TENSOR_STATISTICAL_OPERATIONS_H diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp index bb06713b9..be8eaeeaa 100644 --- a/src/include/cpu/rpp_cpu_common.hpp +++ b/src/include/cpu/rpp_cpu_common.hpp @@ -177,6 +177,21 @@ struct RPPTensorFunctionMetaData }; #endif // GPU_SUPPORT +// Computes strides for ND Tensor +inline void compute_strides(Rpp32u *strides, Rpp32u *shape, Rpp32u tensorDim) +{ + if (tensorDim > 0) + { + Rpp32u v = 1; + for (Rpp32u i = tensorDim - 1; i > 0; i--) + { + strides[i] = v; + v *= shape[i]; + } + strides[0] = v; + } +} + // Uses fast inverse square root algorithm from Lomont, C., 2003. FAST INVERSE SQUARE ROOT. [online] lomont.org. Available at: inline float rpp_host_math_inverse_sqrt_1(float x) { @@ -6096,6 +6111,25 @@ inline void compute_separable_horizontal_resample(Rpp32f *inputPtr, T *outputPtr } } +inline void compute_jitter_src_loc_avx(__m256i *pxXorwowStateX, __m256i *pxXorwowStateCounter, __m256 &pRow, __m256 &pCol, __m256 &pKernelSize, __m256 &pBound, __m256 &pHeightLimit, __m256 &pWidthLimit, __m256 &pStride, __m256 &pChannel, Rpp32s *srcLoc) +{ + __m256 pRngX = rpp_host_rng_xorwow_8_f32_avx(pxXorwowStateX, pxXorwowStateCounter); + __m256 pRngY = rpp_host_rng_xorwow_8_f32_avx(pxXorwowStateX, pxXorwowStateCounter); + __m256 pX = _mm256_mul_ps(pRngX, pKernelSize); + __m256 pY = _mm256_mul_ps(pRngY, pKernelSize); + pX = _mm256_max_ps(_mm256_min_ps(_mm256_floor_ps(_mm256_add_ps(pRow, _mm256_sub_ps(pX, pBound))), pHeightLimit), avx_p0); + pY = _mm256_max_ps(_mm256_min_ps(_mm256_floor_ps(_mm256_add_ps(pCol, _mm256_sub_ps(pY, pBound))), pWidthLimit), avx_p0); + __m256i pxSrcLoc = _mm256_cvtps_epi32(_mm256_fmadd_ps(pX, pStride, _mm256_mul_ps(pY, pChannel))); + _mm256_storeu_si256((__m256i*) srcLoc, pxSrcLoc); +} + +inline void compute_jitter_src_loc(RpptXorwowStateBoxMuller *xorwowState, Rpp32s row, Rpp32s col, Rpp32s kSize, Rpp32s heightLimit, Rpp32s widthLimit, Rpp32s stride, Rpp32s bound, Rpp32s channels, Rpp32s &loc) +{ + Rpp32u heightIncrement = rpp_host_rng_xorwow_f32(xorwowState) * kSize; + Rpp32u widthIncrement = rpp_host_rng_xorwow_f32(xorwowState) * kSize; + loc = std::max(std::min(static_cast(row + heightIncrement - bound), heightLimit), 0) * stride; + loc += std::max(std::min(static_cast(col + widthIncrement - bound), (widthLimit - 1)), 0) * channels; +} inline void compute_sum_16_host(__m256i *p, __m256i *pSum) { pSum[0] = _mm256_add_epi32(_mm256_add_epi32(p[0], p[1]), pSum[0]); //add 16 values to 8 @@ -6501,4 +6535,32 @@ inline void compute_remap_src_loc(Rpp32f rowLoc, Rpp32f colLoc, Rpp32s &srcLoc, srcLoc = (rowLoc * stride) + colLoc * channels; } +inline void compute_log_16_host(__m256 *p) +{ + p[0] = log_ps(p[0]); // log compute + p[1] = log_ps(p[1]); // log compute +} + +inline void compute_transpose4x8_avx(__m256 *pSrc, __m128 *pDst) +{ + __m256 tmp0, tmp1, tmp2, tmp3; + tmp0 = _mm256_shuffle_ps(pSrc[0], pSrc[1], 0x44); /* shuffle to get [P01|P02|P09|P10|P05|P06|P13|P14] */ + tmp2 = _mm256_shuffle_ps(pSrc[0], pSrc[1], 0xEE); /* shuffle to get [P03|P04|P11|P12|P07|P08|P15|P16] */ + tmp1 = _mm256_shuffle_ps(pSrc[2], pSrc[3], 0x44); /* shuffle to get [P17|P18|P25|P26|P21|P22|P29|P30] */ + tmp3 = _mm256_shuffle_ps(pSrc[2], pSrc[3], 0xEE); /* shuffle to get [P19|P20|P27|P28|P23|P24|P31|P32] */ + 
pSrc[0] = _mm256_shuffle_ps(tmp0, tmp1, 0x88); /* shuffle to get [P01|P09|P17|P25|P05|P13|P21|P29] */ + pSrc[1] = _mm256_shuffle_ps(tmp0, tmp1, 0xDD); /* shuffle to get [P02|P10|P18|P26|P06|P14|P22|P30] */ + pSrc[2] = _mm256_shuffle_ps(tmp2, tmp3, 0x88); /* shuffle to get [P03|P11|P19|P27|P07|P15|P23|P31] */ + pSrc[3] = _mm256_shuffle_ps(tmp2, tmp3, 0xDD); /* shuffle to get [P04|P12|P20|P28|P08|P16|P24|P32] */ + + pDst[0] = _mm256_castps256_ps128(pSrc[0]); /* extract [P01|P09|P17|P25] */ + pDst[1] = _mm256_castps256_ps128(pSrc[1]); /* extract [P02|P10|P18|P26] */ + pDst[2] = _mm256_castps256_ps128(pSrc[2]); /* extract [P03|P11|P19|P27] */ + pDst[3] = _mm256_castps256_ps128(pSrc[3]); /* extract [P04|P12|P20|P28] */ + pDst[4] = _mm256_extractf128_ps(pSrc[0], 1); /* extract [P05|P13|P21|P29] */ + pDst[5] = _mm256_extractf128_ps(pSrc[1], 1); /* extract [P06|P14|P22|P30] */ + pDst[6] = _mm256_extractf128_ps(pSrc[2], 1); /* extract [P07|P15|P23|P31] */ + pDst[7] = _mm256_extractf128_ps(pSrc[3], 1); /* extract [P08|P16|P24|P32] */ +} + #endif //RPP_CPU_COMMON_H diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp index babc6f55c..b9e79c146 100644 --- a/src/include/cpu/rpp_cpu_simd.hpp +++ b/src/include/cpu/rpp_cpu_simd.hpp @@ -185,6 +185,10 @@ const __m256i avx_pxShufflePkd = _mm256_setr_m128(xmm_pxStore4Pkd, xmm_pxStore4P const __m128i xmm_pxMask00 = _mm_setr_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0, 1, 2, 3); const __m128i xmm_pxMask04To11 = _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); +const __m256i avx_pxMaskR = _mm256_setr_epi8(0, 0x80, 0x80, 3, 0x80, 0x80, 6, 0x80, 0x80, 9, 0x80, 0x80, 12, 0x80, 0x80, 15, 0x80, 0x80, 18, 0x80, 0x80, 21, 0x80, 0x80, 24, 0x80, 0x80, 27, 0x80, 0x80, 0x80, 0x80); +const __m256i avx_pxMaskG = _mm256_setr_epi8(0x80, 1, 0x80, 0x80, 4, 0x80, 0x80, 7, 0x80, 0x80, 10, 0x80, 0x80, 13, 0x80, 0x80, 16, 0x80, 0x80, 19, 0x80, 0x80, 22, 0x80, 0x80, 25, 0x80, 0x80, 28, 0x80, 0x80, 0x80); +const __m256i avx_pxMaskB = _mm256_setr_epi8(0x80, 0x80, 2, 0x80, 0x80, 5, 0x80, 0x80, 8, 0x80, 0x80, 11, 0x80, 0x80, 14, 0x80, 0x80, 17, 0x80, 0x80, 20, 0x80, 0x80, 23, 0x80, 0x80, 26, 0x80, 0x80, 29, 0x80, 0x80); + // Print helpers inline void rpp_mm_print_epi8(__m128i vPrintArray) @@ -1021,6 +1025,99 @@ inline void rpp_load48_u8pkd3_to_f32pln3_avx(Rpp8u *srcPtr, __m256 *p) p[5] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[2], xmm_pxMaskB), _mm_shuffle_epi8(px[3], xmm_pxMaskB))); /* Contains B09-16 */ } +inline void rpp_glitch_load24_u8pkd3_to_f32pln3_avx(Rpp8u *srcPtr, __m256 *p, int *srcLocs) +{ + __m128i px[2]; + px[0] = _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[0])); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need R01-04 */ + px[1] = _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[0] + 12)); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need R05-08 */ + p[0] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[0], xmm_pxMaskR), _mm_shuffle_epi8(px[1], xmm_pxMaskR))); /* Contains R01-08 */ + + px[0] = _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[1])); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need G01-04 */ + px[1] = _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[1] + 12)); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need G05-08 */ + p[1] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[0], xmm_pxMaskG), 
_mm_shuffle_epi8(px[1], xmm_pxMaskG))); /* Contains G01-08 */ + + px[0] = _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[2])); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need B01-04 */ + px[1] = _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[2] + 12)); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need B05-08 */ + p[2] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[0], xmm_pxMaskB), _mm_shuffle_epi8(px[1], xmm_pxMaskB))); /* Contains B01-08 */ +} + +inline void rpp_glitch_load24_f32pkd3_to_f32pln3_avx(Rpp32f *srcPtr, __m256 *p, int *srcLocs) +{ + __m128 p128[8]; + Rpp32f *srcPtrTemp = srcPtr + srcLocs[0]; + p[0] = _mm256_setr_ps(*srcPtrTemp, *(srcPtrTemp + 3), *(srcPtrTemp + 6), *(srcPtrTemp + 9), + *(srcPtrTemp + 12), *(srcPtrTemp + 15), *(srcPtrTemp + 18), *(srcPtrTemp + 21)); + srcPtrTemp = srcPtr + srcLocs[1]; + p[1] = _mm256_setr_ps(*(srcPtrTemp + 1), *(srcPtrTemp + 4), *(srcPtrTemp + 7), *(srcPtrTemp + 10), + *(srcPtrTemp + 13), *(srcPtrTemp + 16), *(srcPtrTemp + 19), *(srcPtrTemp + 22)); + srcPtrTemp = srcPtr + srcLocs[2]; + p[2] = _mm256_setr_ps(*(srcPtrTemp + 2), *(srcPtrTemp + 5), *(srcPtrTemp + 8), *(srcPtrTemp + 11), + *(srcPtrTemp + 14), *(srcPtrTemp + 17), *(srcPtrTemp + 20), *(srcPtrTemp + 23)); +} + +inline void rpp_glitch_load24_i8pkd3_to_f32pln3_avx(Rpp8s *srcPtr, __m256 *p, int *srcLocs) +{ + __m128i px[2]; + px[0] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[0]))); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need R01-04 */ + px[1] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[0] + 12))); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need R05-08 */ + p[0] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[0], xmm_pxMaskR), _mm_shuffle_epi8(px[1], xmm_pxMaskR))); /* Contains R01-08 */ + + px[0] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[1]))); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need G01-04 */ + px[1] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[1] + 12))); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need G05-08 */ + p[1] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[0], xmm_pxMaskG), _mm_shuffle_epi8(px[1], xmm_pxMaskG))); /* Contains G01-08 */ + + px[0] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[2]))); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need B01-04 */ + px[1] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[2] + 12))); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need B05-08 */ + p[2] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[0], xmm_pxMaskB), _mm_shuffle_epi8(px[1], xmm_pxMaskB))); /* Contains B01-08 */ +} + +inline void rpp_glitch_load30_u8pkd3_to_u8pkd3_avx(Rpp8u *srcPtr, int *srcLocs, __m256i &p) +{ + __m256i px[3]; + px[0] = _mm256_loadu_si256((__m256i *)(srcPtr + srcLocs[0])); // Load the source location1 values passed + px[1] = _mm256_loadu_si256((__m256i *)(srcPtr + srcLocs[1])); // Load the source location2 values passed + px[2] = _mm256_loadu_si256((__m256i *)(srcPtr + srcLocs[2])); // Load the source location3 values passed + px[0] = _mm256_shuffle_epi8(px[0], avx_pxMaskR); /* Shuffle to obtain R channel values */ + px[1] = _mm256_shuffle_epi8(px[1], 
avx_pxMaskG); /* Shuffle to obtain G channel values */ + px[2] = _mm256_shuffle_epi8(px[2], avx_pxMaskB); /* Shuffle to obtain B channel values */ + px[0] = _mm256_or_si256(px[0], px[1]); /* Pack R and G channels to obtain RG format */ + p = _mm256_or_si256(px[0], px[2]); /* Pack RG values and B channel to obtain RGB format */ +} + +inline void rpp_glitch_load30_i8pkd3_to_i8pkd3_avx(Rpp8s *srcPtr, int * srcLocs, __m256i &p) +{ + __m256i px[3]; + px[0] = _mm256_loadu_si256((__m256i *)(srcPtr + srcLocs[0])); // Load the source location1 values passed + px[1] = _mm256_loadu_si256((__m256i *)(srcPtr + srcLocs[1])); // Load the source location2 values passed + px[2] = _mm256_loadu_si256((__m256i *)(srcPtr + srcLocs[2])); // Load the source location3 values passed + px[0] = _mm256_shuffle_epi8(px[0], avx_pxMaskR); /* Shuffle to obtain R channel values */ + px[1] = _mm256_shuffle_epi8(px[1], avx_pxMaskG); /* Shuffle to obtain G channel values */ + px[2] = _mm256_shuffle_epi8(px[2], avx_pxMaskB); /* Shuffle to obtain B channel values */ + px[0] = _mm256_or_si256(px[0], px[1]); /* Pack R and G channels to obtain RG format */ + p = _mm256_or_si256(px[0], px[2]); /* Pack RG values and B channel to obtain RGB format */ +} + +inline void rpp_glitch_load6_f32pkd3_to_f32pkd3_avx(Rpp32f *srcPtr, int * srcLocs, __m256 &p) +{ + p =_mm256_setr_ps(*(srcPtr + srcLocs[0]), *(srcPtr + srcLocs[1] + 1), *(srcPtr + srcLocs[2] + 2), *(srcPtr + srcLocs[0] + 3), + *(srcPtr + srcLocs[1] + 4), *(srcPtr + srcLocs[2] + 5), 0.0f, 0.0f); +} + +inline void rpp_glitch_load48_u8pln3_to_f32pln3_avx(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *srcPtrB, __m256 *p, int *srcLocs) +{ + __m128i px[3]; + + px[0] = _mm_loadu_si128((__m128i *)srcPtrR + srcLocs[0]); /* load [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ + px[1] = _mm_loadu_si128((__m128i *)srcPtrG + srcLocs[1]); /* load [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */ + px[2] = _mm_loadu_si128((__m128i *)srcPtrB + srcLocs[2]); /* load [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ + p[0] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[0], xmm_pxMask00To03), _mm_shuffle_epi8(px[0], xmm_pxMask04To07))); /* Contains R01-08 */ + p[1] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[0], xmm_pxMask08To11), _mm_shuffle_epi8(px[0], xmm_pxMask12To15))); /* Contains R09-16 */ + p[2] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[1], xmm_pxMask00To03), _mm_shuffle_epi8(px[1], xmm_pxMask04To07))); /* Contains G01-08 */ + p[3] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[1], xmm_pxMask08To11), _mm_shuffle_epi8(px[1], xmm_pxMask12To15))); /* Contains G09-16 */ + p[4] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[2], xmm_pxMask00To03), _mm_shuffle_epi8(px[2], xmm_pxMask04To07))); /* Contains B01-08 */ + p[5] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[2], xmm_pxMask08To11), _mm_shuffle_epi8(px[2], xmm_pxMask12To15))); /* Contains B09-16 */ +} + inline void rpp_load48_u8pkd3_to_f32pln3_mirror_avx(Rpp8u *srcPtr, __m256 *p) { __m128i px[4]; @@ -3762,6 +3859,20 @@ inline void rpp_resize_nn_load_u8pkd3(Rpp8u *srcRowPtrsForInterp, Rpp32s *loc, _ p = _mm_shuffle_epi8(px[0], xmm_pkd_mask); // Shuffle to obtain 4 RGB [R01|G01|B01|R11|G11|B11|R21|G21|B21|R31|G31|B31|00|00|00|00] } +template +inline void rpp_resize_nn_extract_pkd3_avx(T *srcRowPtrsForInterp, Rpp32s *loc, __m256i &p) +{ + p = _mm256_setr_epi8(*(srcRowPtrsForInterp + loc[0]), *(srcRowPtrsForInterp + 
loc[0] + 1), *(srcRowPtrsForInterp + loc[0] + 2), + *(srcRowPtrsForInterp + loc[1]), *(srcRowPtrsForInterp + loc[1] + 1), *(srcRowPtrsForInterp + loc[1] + 2), + *(srcRowPtrsForInterp + loc[2]), *(srcRowPtrsForInterp + loc[2] + 1), *(srcRowPtrsForInterp + loc[2] + 2), + *(srcRowPtrsForInterp + loc[3]), *(srcRowPtrsForInterp + loc[3] + 1), *(srcRowPtrsForInterp + loc[3] + 2), + *(srcRowPtrsForInterp + loc[4]), *(srcRowPtrsForInterp + loc[4] + 1), *(srcRowPtrsForInterp + loc[4] + 2), + *(srcRowPtrsForInterp + loc[5]), *(srcRowPtrsForInterp + loc[5] + 1), *(srcRowPtrsForInterp + loc[5] + 2), + *(srcRowPtrsForInterp + loc[6]), *(srcRowPtrsForInterp + loc[6] + 1), *(srcRowPtrsForInterp + loc[6] + 2), + *(srcRowPtrsForInterp + loc[7]), *(srcRowPtrsForInterp + loc[7] + 1), *(srcRowPtrsForInterp + loc[7] + 2), + 0, 0, 0, 0, 0, 0, 0, 0); +} + inline void rpp_resize_nn_load_u8pln1(Rpp8u *srcRowPtrsForInterp, Rpp32s *loc, __m128i &p) { __m128i px[4]; @@ -3774,6 +3885,16 @@ inline void rpp_resize_nn_load_u8pln1(Rpp8u *srcRowPtrsForInterp, Rpp32s *loc, _ p = _mm_unpacklo_epi8(px[0], px[1]); // unpack to obtain [R01|R11|R21|R31|00|00|00|00|00|00|00|00|00|00|00|00] } +template +inline void rpp_resize_nn_extract_pln1_avx(T *srcRowPtrsForInterp, Rpp32s *loc, __m256i &p) +{ + p = _mm256_setr_epi8(*(srcRowPtrsForInterp + loc[0]), *(srcRowPtrsForInterp + loc[1]), + *(srcRowPtrsForInterp + loc[2]), *(srcRowPtrsForInterp + loc[3]), + *(srcRowPtrsForInterp + loc[4]), *(srcRowPtrsForInterp + loc[5]), + *(srcRowPtrsForInterp + loc[6]), *(srcRowPtrsForInterp + loc[7]), + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} + inline void rpp_resize_nn_load_f32pkd3_to_f32pln3(Rpp32f *srcRowPtrsForInterp, Rpp32s *loc, __m128 *p) { p[0] = _mm_loadu_ps(srcRowPtrsForInterp + loc[0]); // LOC0 load [R01|G01|B01|R02] - Need RGB 01 @@ -3783,6 +3904,42 @@ inline void rpp_resize_nn_load_f32pkd3_to_f32pln3(Rpp32f *srcRowPtrsForInterp, R _MM_TRANSPOSE4_PS(p[0], p[1], p[2], pTemp); // Transpose to obtain RGB in each vector } +inline void rpp_resize_nn_load_f32pkd3_to_f32pln3_avx(Rpp32f *srcRowPtrsForInterp, Rpp32s *loc, __m256 *p) +{ + __m128 p128[8]; + p128[0] = _mm_loadu_ps(srcRowPtrsForInterp + loc[0]); + p128[1] = _mm_loadu_ps(srcRowPtrsForInterp + loc[1]); + p128[2] = _mm_loadu_ps(srcRowPtrsForInterp + loc[2]); + p128[3] = _mm_loadu_ps(srcRowPtrsForInterp + loc[3]); + _MM_TRANSPOSE4_PS(p128[0], p128[1], p128[2], p128[3]); + p128[4] = _mm_loadu_ps(srcRowPtrsForInterp + loc[4]); + p128[5] = _mm_loadu_ps(srcRowPtrsForInterp + loc[5]); + p128[6] = _mm_loadu_ps(srcRowPtrsForInterp + loc[6]); + p128[7] = _mm_loadu_ps(srcRowPtrsForInterp + loc[7]); + _MM_TRANSPOSE4_PS(p128[4], p128[5], p128[6], p128[7]); + p[0] = _mm256_setr_m128(p128[0], p128[4]); + p[1] = _mm256_setr_m128(p128[1], p128[5]); + p[2] = _mm256_setr_m128(p128[2], p128[6]); +} + +inline void rpp_resize_nn_load_f16pkd3_to_f32pln3_avx(Rpp16f *srcRowPtrsForInterp, Rpp32s *loc, __m256 *p) +{ + p[0] = _mm256_setr_ps((Rpp32f)*(srcRowPtrsForInterp + loc[0]), (Rpp32f)*(srcRowPtrsForInterp + loc[1]), + (Rpp32f)*(srcRowPtrsForInterp + loc[2]), (Rpp32f)*(srcRowPtrsForInterp + loc[3]), + (Rpp32f)*(srcRowPtrsForInterp + loc[4]), (Rpp32f)*(srcRowPtrsForInterp + loc[5]), + (Rpp32f)*(srcRowPtrsForInterp + loc[6]), (Rpp32f)*(srcRowPtrsForInterp + loc[7])); + + p[1] = _mm256_setr_ps((Rpp32f)*(srcRowPtrsForInterp + loc[0] + 1), (Rpp32f)*(srcRowPtrsForInterp + loc[1] + 1), + (Rpp32f)*(srcRowPtrsForInterp + loc[2] + 1), (Rpp32f)*(srcRowPtrsForInterp + loc[3] + 1), + 
(Rpp32f)*(srcRowPtrsForInterp + loc[4] + 1), (Rpp32f)*(srcRowPtrsForInterp + loc[5] + 1), + (Rpp32f)*(srcRowPtrsForInterp + loc[6] + 1), (Rpp32f)*(srcRowPtrsForInterp + loc[7] + 1)); + + p[2] = _mm256_setr_ps((Rpp32f)*(srcRowPtrsForInterp + loc[0] + 2), (Rpp32f)*(srcRowPtrsForInterp + loc[1] + 2), + (Rpp32f)*(srcRowPtrsForInterp + loc[2] + 2), (Rpp32f)*(srcRowPtrsForInterp + loc[3] + 2), + (Rpp32f)*(srcRowPtrsForInterp + loc[4] + 2), (Rpp32f)*(srcRowPtrsForInterp + loc[5] + 2), + (Rpp32f)*(srcRowPtrsForInterp + loc[6] + 2), (Rpp32f)*(srcRowPtrsForInterp + loc[7] + 2)); +} + inline void rpp_resize_nn_load_f32pln1(Rpp32f *srcRowPtrsForInterp, Rpp32s *loc, __m128 &p) { __m128 pTemp[4]; @@ -3795,6 +3952,22 @@ inline void rpp_resize_nn_load_f32pln1(Rpp32f *srcRowPtrsForInterp, Rpp32s *loc, p = _mm_unpacklo_ps(pTemp[0], pTemp[1]); // Unpack to obtain [R01|R11|R21|R31] } +inline void rpp_resize_nn_load_f32pln1_avx(Rpp32f *srcRowPtrsForInterp, Rpp32s *loc, __m256 &p) +{ + p = _mm256_setr_ps(*(srcRowPtrsForInterp + loc[0]), *(srcRowPtrsForInterp + loc[1]), + *(srcRowPtrsForInterp + loc[2]), *(srcRowPtrsForInterp + loc[3]), + *(srcRowPtrsForInterp + loc[4]), *(srcRowPtrsForInterp + loc[5]), + *(srcRowPtrsForInterp + loc[6]), *(srcRowPtrsForInterp + loc[7])); +} + +inline void rpp_resize_nn_load_f16pln1_avx(Rpp16f *srcRowPtrsForInterp, Rpp32s *loc, __m256 &p) +{ + p = _mm256_setr_ps((Rpp32f)*(srcRowPtrsForInterp + loc[0]), (Rpp32f)*(srcRowPtrsForInterp + loc[1]), + (Rpp32f)*(srcRowPtrsForInterp + loc[2]), (Rpp32f)*(srcRowPtrsForInterp + loc[3]), + (Rpp32f)*(srcRowPtrsForInterp + loc[4]), (Rpp32f)*(srcRowPtrsForInterp + loc[5]), + (Rpp32f)*(srcRowPtrsForInterp + loc[6]), (Rpp32f)*(srcRowPtrsForInterp + loc[7])); +} + inline void rpp_resize_nn_load_i8pkd3(Rpp8s *srcRowPtrsForInterp, Rpp32s *loc, __m128i &p) { __m128i px[4]; diff --git a/src/include/hip/rpp_hip_common.hpp b/src/include/hip/rpp_hip_common.hpp index 3f32dbc04..721800c80 100644 --- a/src/include/hip/rpp_hip_common.hpp +++ b/src/include/hip/rpp_hip_common.hpp @@ -55,7 +55,7 @@ typedef union { float f1[5]; typedef union { float f1[6]; float2 f2[3]; } d_float6; typedef union { float f1[7]; } d_float7; typedef union { float f1[8]; float2 f2[4]; float4 f4[2]; } d_float8; -typedef union { float f1[9]; } d_float9; +typedef union { float f1[9]; float3 f3[3]; } d_float9; typedef union { float f1[12]; float4 f4[3]; } d_float12; typedef union { float f1[16]; float4 f4[4]; d_float8 f8[2]; } d_float16; typedef union { float f1[24]; float2 f2[12]; float3 f3[8]; float4 f4[6]; d_float8 f8[3]; } d_float24; @@ -1776,6 +1776,22 @@ __device__ __forceinline__ void rpp_hip_math_multiply24_const(d_float24 *src_f24 dst_f24->f4[5] = src_f24->f4[5] * multiplier_f4; } +// d_float8 divide + +__device__ __forceinline__ void rpp_hip_math_divide8(d_float8 *src1Ptr_f8, d_float8 *src2Ptr_f8, d_float8 *dstPtr_f8) +{ + dstPtr_f8->f4[0] = src1Ptr_f8->f4[0] / src2Ptr_f8->f4[0]; + dstPtr_f8->f4[1] = src1Ptr_f8->f4[1] / src2Ptr_f8->f4[1]; +} + +// d_float8 divide with constant + +__device__ __forceinline__ void rpp_hip_math_divide8_const(d_float8 *src_f8, d_float8 *dst_f8, float4 divisor_f4) +{ + dst_f8->f4[0] = divisor_f4 / src_f8->f4[0]; + dst_f8->f4[1] = divisor_f4 / src_f8->f4[1]; +} + // d_float8 bitwiseAND __device__ __forceinline__ void rpp_hip_math_bitwiseAnd8(d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8) @@ -1869,6 +1885,21 @@ __device__ __forceinline__ float rpp_hip_math_sinc(float x) return (fabsf(x) < 1e-5f) ? 
(1.0f - x * x * ONE_OVER_6) : sinf(x) / x; } +__device__ __forceinline__ void rpp_hip_math_log(d_float8 *src_f8, d_float8 *dst_f8) +{ + for(int i = 0; i < 8; i++) + src_f8->f1[i] = (!src_f8->f1[i]) ? std::nextafter(0.0f, 1.0f) : fabsf(src_f8->f1[i]); + + dst_f8->f1[0] = __logf(src_f8->f1[0]); + dst_f8->f1[1] = __logf(src_f8->f1[1]); + dst_f8->f1[2] = __logf(src_f8->f1[2]); + dst_f8->f1[3] = __logf(src_f8->f1[3]); + dst_f8->f1[4] = __logf(src_f8->f1[4]); + dst_f8->f1[5] = __logf(src_f8->f1[5]); + dst_f8->f1[6] = __logf(src_f8->f1[6]); + dst_f8->f1[7] = __logf(src_f8->f1[7]); +} + // /******************** DEVICE RANDOMIZATION HELPER FUNCTIONS ********************/ template @@ -1913,7 +1944,8 @@ __device__ __forceinline__ float rpp_hip_rng_xorwow_f32(T *xorwowState) return outFloat - 1; // return 0 <= outFloat < 1 } -__device__ __forceinline__ void rpp_hip_rng_8_xorwow_f32(RpptXorwowState *xorwowState, d_float8 *randomNumbersPtr_f8) +template +__device__ __forceinline__ void rpp_hip_rng_8_xorwow_f32(T *xorwowState, d_float8 *randomNumbersPtr_f8) { randomNumbersPtr_f8->f1[0] = rpp_hip_rng_xorwow_f32(xorwowState); randomNumbersPtr_f8->f1[1] = rpp_hip_rng_xorwow_f32(xorwowState); diff --git a/src/modules/cpu/host_tensor_arithmetic_operations.hpp b/src/modules/cpu/host_tensor_arithmetic_operations.hpp index b98145be0..466e51e09 100644 --- a/src/modules/cpu/host_tensor_arithmetic_operations.hpp +++ b/src/modules/cpu/host_tensor_arithmetic_operations.hpp @@ -30,5 +30,6 @@ SOFTWARE. #include "kernel/subtract_scalar.hpp" #include "kernel/multiply_scalar.hpp" #include "kernel/magnitude.hpp" +#include "kernel/log.hpp" #endif // HOST_TENSOR_ARITHMETIC_OPERATIONS_HPP diff --git a/src/modules/cpu/host_tensor_effects_augmentations.hpp b/src/modules/cpu/host_tensor_effects_augmentations.hpp index 9388ed6bd..ce7450aab 100644 --- a/src/modules/cpu/host_tensor_effects_augmentations.hpp +++ b/src/modules/cpu/host_tensor_effects_augmentations.hpp @@ -31,6 +31,8 @@ SOFTWARE. #include "kernel/noise_shot.hpp" #include "kernel/noise_gaussian.hpp" #include "kernel/non_linear_blend.hpp" +#include "kernel/jitter.hpp" +#include "kernel/glitch.hpp" #include "kernel/water.hpp" #include "kernel/ricap.hpp" #include "kernel/vignette.hpp" diff --git a/src/modules/cpu/host_tensor_geometric_augmentations.hpp b/src/modules/cpu/host_tensor_geometric_augmentations.hpp index cc7a22c8f..9facb0d78 100644 --- a/src/modules/cpu/host_tensor_geometric_augmentations.hpp +++ b/src/modules/cpu/host_tensor_geometric_augmentations.hpp @@ -35,6 +35,8 @@ SOFTWARE. 
#include "kernel/warp_affine.hpp" #include "kernel/phase.hpp" #include "kernel/slice.hpp" +#include "kernel/lens_correction.hpp" +#include "kernel/transpose.hpp" #include "kernel/crop_and_patch.hpp" #include "kernel/flip_voxel.hpp" diff --git a/src/modules/cpu/kernel/color_temperature.hpp b/src/modules/cpu/kernel/color_temperature.hpp index 1358ac800..dbe33a51e 100644 --- a/src/modules/cpu/kernel/color_temperature.hpp +++ b/src/modules/cpu/kernel/color_temperature.hpp @@ -30,7 +30,7 @@ RppStatus color_temperature_u8_u8_host_tensor(Rpp8u *srcPtr, RpptDescPtr srcDescPtr, Rpp8u *dstPtr, RpptDescPtr dstDescPtr, - Rpp8s *adjustmentValueTensor, + Rpp32s *adjustmentValueTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, RppLayoutParams layoutParams) @@ -269,7 +269,7 @@ RppStatus color_temperature_f32_f32_host_tensor(Rpp32f *srcPtr, RpptDescPtr srcDescPtr, Rpp32f *dstPtr, RpptDescPtr dstDescPtr, - Rpp8s *adjustmentValueTensor, + Rpp32s *adjustmentValueTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, RppLayoutParams layoutParams) @@ -508,7 +508,7 @@ RppStatus color_temperature_f16_f16_host_tensor(Rpp16f *srcPtr, RpptDescPtr srcDescPtr, Rpp16f *dstPtr, RpptDescPtr dstDescPtr, - Rpp8s *adjustmentValueTensor, + Rpp32s *adjustmentValueTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, RppLayoutParams layoutParams) @@ -799,7 +799,7 @@ RppStatus color_temperature_i8_i8_host_tensor(Rpp8s *srcPtr, RpptDescPtr srcDescPtr, Rpp8s *dstPtr, RpptDescPtr dstDescPtr, - Rpp8s *adjustmentValueTensor, + Rpp32s *adjustmentValueTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, RppLayoutParams layoutParams) diff --git a/src/modules/cpu/kernel/glitch.hpp b/src/modules/cpu/kernel/glitch.hpp new file mode 100644 index 000000000..9a8e33410 --- /dev/null +++ b/src/modules/cpu/kernel/glitch.hpp @@ -0,0 +1,690 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +inline void compute_src_loc(int row , int col, Rpp32s *locArray, RpptDescPtr srcDescPtr, RpptChannelOffsets *rgbOffsets, RpptROI roi, int batchCount, int channelValue) +{ + int xR, yR, xG, yG, xB, yB; + xR = col + rgbOffsets[batchCount].r.x; + yR = row + rgbOffsets[batchCount].r.y; + xG = col + rgbOffsets[batchCount].g.x; + yG = row + rgbOffsets[batchCount].g.y; + xB = col + rgbOffsets[batchCount].b.x; + yB = row + rgbOffsets[batchCount].b.y; + + if (xR >= roi.xywhROI.roiWidth || xR < roi.xywhROI.xy.x || yR >= roi.xywhROI.roiHeight || yR < roi.xywhROI.xy.y) + { + xR = col; + yR = row; + } + + if (xG >= roi.xywhROI.roiWidth || xG < roi.xywhROI.xy.x || yG >= roi.xywhROI.roiHeight || yG < roi.xywhROI.xy.y) + { + xG = col; + yG = row; + } + + if (xB >= roi.xywhROI.roiWidth || xB < roi.xywhROI.xy.x || yB >= roi.xywhROI.roiHeight || yB < roi.xywhROI.xy.y) + { + xB = col; + yB = row; + } + + locArray[0] = yR * srcDescPtr->strides.hStride + xR * channelValue; + locArray[1] = yG * srcDescPtr->strides.hStride + xG * channelValue; + locArray[2] = yB * srcDescPtr->strides.hStride + xB * channelValue; +} + +RppStatus glitch_u8_u8_host_tensor(Rpp8u *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8u *dstPtr, + RpptDescPtr dstDescPtr, + RpptChannelOffsets *rgbOffsets, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for (int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32s glitchSrcLocArray[3] = {0}; // Since 3 destination pixels, one for each channel, are processed per iteration. 
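+        // Note on the helper above: compute_src_loc() fills glitchSrcLocArray with one source offset per
+        // R/G/B channel by shifting (col, row) with the per-image rgbOffsets, and falls back to the
+        // unshifted (col, row) whenever a shifted coordinate lands outside the ROI.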
+ + Rpp8u *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + Rpp8u *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8u *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32u alignedLength = (((roi.xywhROI.roiWidth)/ 8) * 8) - 8; + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8u* dstRowPtrTempR = dstPtrRow; + Rpp8u* dstRowPtrTempG = dstPtrRow + dstDescPtr->strides.cStride; + Rpp8u* dstRowPtrTempB = dstPtrRow + 2 * dstDescPtr->strides.cStride; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += 8) + { + __m256 p[3]; + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + rpp_simd_load(rpp_glitch_load24_u8pkd3_to_f32pln3_avx, srcPtrChannel, p, glitchSrcLocArray); + rpp_simd_store(rpp_store24_f32pln3_to_u8pln3_avx, dstRowPtrTempR, dstRowPtrTempG, dstRowPtrTempB, p); // simd stores + + dstRowPtrTempR += 8; + dstRowPtrTempG += 8; + dstRowPtrTempB += 8; + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + *dstRowPtrTempR++ = *(srcPtrChannel + glitchSrcLocArray[0] + 0); + *dstRowPtrTempG++ = *(srcPtrChannel + glitchSrcLocArray[1] + 1); + *dstRowPtrTempB++ = *(srcPtrChannel + glitchSrcLocArray[2] + 2); + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8u *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32u vectorIncrement = 16; + Rpp32u alignedLength = (((roi.xywhROI.roiWidth)/ 16) * 16) - 16; + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8u* dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += 16) + { + __m256 p[6]; + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + Rpp32u rLoc = glitchSrcLocArray[0]; + Rpp32u gLoc = srcDescPtr->strides.cStride + glitchSrcLocArray[1]; + Rpp32u bLoc = 2 * srcDescPtr->strides.cStride + glitchSrcLocArray[2]; + rpp_simd_load(rpp_load48_u8pln3_to_f32pln3_avx, srcPtrChannel + rLoc, srcPtrChannel + gLoc, srcPtrChannel + bLoc, p); + rpp_simd_store(rpp_store48_f32pln3_to_u8pkd3_avx, dstPtrTemp, p); // simd stores + dstPtrTemp += 48; + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + for (int c = 0; c < 3; c++) + *(dstPtrTemp + c) = *(srcPtrChannel + glitchSrcLocArray[c] + c *srcDescPtr->strides.cStride); + dstPtrTemp += 3; + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW )) + { + Rpp8u *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32u vectorIncrement = 32; + Rpp32u 
alignedLength = (((roi.xywhROI.roiWidth)/ 32) * 32) - 32; + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8u* dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + for (int c = 0; c < 3; c++) + { + __m256i p; + p = _mm256_loadu_si256((__m256i *)(srcPtrChannel + (glitchSrcLocArray[c] + (c * srcDescPtr->strides.cStride)))); + _mm256_storeu_si256((__m256i *)(dstPtrTemp + (c * srcDescPtr->strides.cStride)), p); + } + dstPtrTemp += 32; + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + for (int c = 0; c < 3; c++) + *(dstPtrTemp + c * dstDescPtr->strides.cStride) = *(srcPtrChannel + glitchSrcLocArray[c] + c *srcDescPtr->strides.cStride); + dstPtrTemp += 1; + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + + } + else if((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8u *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32u alignedLength = (((roi.xywhROI.roiWidth)/ 10) * 10) - 10; + Rpp32s vectorIncrement = 10; + Rpp32s vectorIncrementPkd = 30; + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8u* dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += 10) + { + __m256i p; + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + rpp_simd_load(rpp_glitch_load30_u8pkd3_to_u8pkd3_avx, srcPtrChannel, glitchSrcLocArray, p); + _mm256_storeu_si256((__m256i *)(dstPtrTemp), p); + dstPtrTemp += 30; + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + for (int c = 0; c < 3; c++) + *dstPtrTemp++ = *(srcPtrChannel + glitchSrcLocArray[c] + c); + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + + } + } + return RPP_SUCCESS; +} + +RppStatus glitch_f32_f32_host_tensor(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *dstPtr, + RpptDescPtr dstDescPtr, + RpptChannelOffsets *rgbOffsets, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for (int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32s glitchSrcLocArray[3] = {0}; // Since 3 destination pixels, one for each channel, are processed per iteration. 
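+        // The float path below mirrors the U8 kernel: the planar/packed conversion branches process
+        // 8 pixels per AVX2 iteration (a __m256 holds eight floats), the packed-to-packed branch
+        // handles 2 pixels at a time, and the scalar tail loops cover the remaining columns.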
+ + Rpp32f *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + Rpp32f *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32f *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32u alignedLength = (((roi.xywhROI.roiWidth)/ 8) * 8) - 8; + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp32f* dstRowPtrTempR = dstPtrRow; + Rpp32f* dstRowPtrTempG = dstPtrRow + dstDescPtr->strides.cStride; + Rpp32f* dstRowPtrTempB = dstPtrRow + 2 * dstDescPtr->strides.cStride; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += 8) + { + __m256 p[3]; + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + rpp_simd_load(rpp_glitch_load24_f32pkd3_to_f32pln3_avx, srcPtrChannel, p, glitchSrcLocArray); + rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstRowPtrTempR, dstRowPtrTempG, dstRowPtrTempB, p); // simd stores + + dstRowPtrTempR += 8; + dstRowPtrTempG += 8; + dstRowPtrTempB += 8; + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + *dstRowPtrTempR++ = *(srcPtrChannel + glitchSrcLocArray[0] + 0); + *dstRowPtrTempG++ = *(srcPtrChannel + glitchSrcLocArray[1] + 1); + *dstRowPtrTempB++ = *(srcPtrChannel + glitchSrcLocArray[2] + 2); + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32f *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32u vectorIncrement = 8; + Rpp32u alignedLength = (((roi.xywhROI.roiWidth)/ 8) * 8) - 8; + + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp32f* dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += 8) + { + __m256 p[3]; + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + p[0] = _mm256_loadu_ps(srcPtrChannel + glitchSrcLocArray[0]); + p[1] = _mm256_loadu_ps(srcPtrChannel + srcDescPtr->strides.cStride + glitchSrcLocArray[1]); + p[2] = _mm256_loadu_ps(srcPtrChannel + 2 * srcDescPtr->strides.cStride + glitchSrcLocArray[2]); + rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, p); // simd stores + dstPtrTemp += 24; + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + for (int c = 0; c < 3; c++) + *(dstPtrTemp + c) = *(srcPtrChannel + glitchSrcLocArray[c] + c *srcDescPtr->strides.cStride); + dstPtrTemp += 3; + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW )) + { + Rpp32f *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32u vectorIncrement = 8; + Rpp32u alignedLength = 
(((roi.xywhROI.roiWidth)/ 8) * 8) - 8; + + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp32f* dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + for (int c = 0; c < 3; c++) + { + __m256 p; + p = _mm256_loadu_ps(srcPtrChannel + (glitchSrcLocArray[c] + c * srcDescPtr->strides.cStride)); + _mm256_storeu_ps((dstPtrTemp + c * srcDescPtr->strides.cStride), p); + } + dstPtrTemp += 8; + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + for (int c = 0; c < 3; c++) + *(dstPtrTemp + c * dstDescPtr->strides.cStride) = *(srcPtrChannel + glitchSrcLocArray[c] + c *srcDescPtr->strides.cStride); + dstPtrTemp += 1; + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + + } + else if((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32f *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32u alignedLength = (((roi.xywhROI.roiWidth)/ 2) * 2) - 2; + Rpp32s vectorIncrement = 2; + Rpp32s vectorIncrementPkd = 6; + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp32f* dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += 2) + { + __m256 p; + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + rpp_simd_load(rpp_glitch_load6_f32pkd3_to_f32pkd3_avx, srcPtrChannel, glitchSrcLocArray, p); + _mm256_storeu_si256((__m256i *)(dstPtrTemp), p); + dstPtrTemp += 6; + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + for (int c = 0; c < 3; c++) + *dstPtrTemp++ = *(srcPtrChannel + glitchSrcLocArray[c] + c); + } + dstPtrRow += dstDescPtr->strides.hStride; + } + + } + } + return RPP_SUCCESS; +} + +RppStatus glitch_f16_f16_host_tensor(Rpp16f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp16f *dstPtr, + RpptDescPtr dstDescPtr, + RpptChannelOffsets *rgbOffsets, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for (int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32s glitchSrcLocArray[3] = {0}; // Since 3 destination pixels, one for each channel, are processed per iteration. 
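+        // The half-precision variant has no vectorised path; every pixel goes through the scalar
+        // per-channel gather below for all four layout combinations.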
+ + Rpp16f *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + Rpp16f *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp16f *dstPtrRow; + dstPtrRow = dstPtrChannel; + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp16f* dstRowPtrTempR = dstPtrRow; + Rpp16f* dstRowPtrTempG = dstPtrRow + dstDescPtr->strides.cStride; + Rpp16f* dstRowPtrTempB = dstPtrRow + 2 * dstDescPtr->strides.cStride; + for (int vectorLoopCount = 0; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + *dstRowPtrTempR++ = *(srcPtrChannel + glitchSrcLocArray[0] + 0); + *dstRowPtrTempG++ = *(srcPtrChannel + glitchSrcLocArray[1] + 1); + *dstRowPtrTempB++ = *(srcPtrChannel + glitchSrcLocArray[2] + 2); + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp16f *dstPtrRow; + dstPtrRow = dstPtrChannel; + + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp16f* dstPtrTemp = dstPtrRow; + for (int vectorLoopCount = 0; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + for (int c = 0; c < 3; c++) + *(dstPtrTemp + c) = *(srcPtrChannel + glitchSrcLocArray[c] + c *srcDescPtr->strides.cStride); + dstPtrTemp += 3; + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW )) + { + Rpp16f *dstPtrRow; + dstPtrRow = dstPtrChannel; + + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp16f* dstPtrTemp = dstPtrRow; + for (int i = 0; i < roi.xywhROI.roiWidth; i++) + { + compute_src_loc(dstLocRow, i, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + for (int c = 0; c < 3; c++) + *(dstPtrTemp + c * dstDescPtr->strides.cStride) = *(srcPtrChannel + glitchSrcLocArray[c] + c *srcDescPtr->strides.cStride); + dstPtrTemp += 1; + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + + } + else if((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp16f *dstPtrRow; + dstPtrRow = dstPtrChannel; + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp16f* dstPtrTemp = dstPtrRow; + for (int i = 0; i < roi.xywhROI.roiWidth; i++) + { + compute_src_loc(dstLocRow, i, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + for (int c = 0; c < 3; c++) + *dstPtrTemp++ = *(srcPtrChannel + glitchSrcLocArray[c] + c); + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + + } + } + return RPP_SUCCESS; +} + +RppStatus glitch_i8_i8_host_tensor(Rpp8s *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8s *dstPtr, + RpptDescPtr dstDescPtr, + RpptChannelOffsets *rgbOffsets, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + 
RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for (int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32s glitchSrcLocArray[3] = {0}; // Since 3 destination pixels, one for each channel, are processed per iteration. + + Rpp8s *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + Rpp8s *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8s *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32u alignedLength = (((roi.xywhROI.roiWidth)/ 8) * 8) - 8; + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8s* dstRowPtrTempR = dstPtrRow; + Rpp8s* dstRowPtrTempG = dstPtrRow + dstDescPtr->strides.cStride; + Rpp8s* dstRowPtrTempB = dstPtrRow + 2 * dstDescPtr->strides.cStride; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += 8) + { + __m256 p[3]; + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + rpp_simd_load(rpp_glitch_load24_i8pkd3_to_f32pln3_avx, srcPtrChannel, p, glitchSrcLocArray); + rpp_simd_store(rpp_store24_f32pln3_to_i8pln3_avx, dstRowPtrTempR, dstRowPtrTempG, dstRowPtrTempB, p); // simd stores + + dstRowPtrTempR += 8; + dstRowPtrTempG += 8; + dstRowPtrTempB += 8; + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + *dstRowPtrTempR++ = *(srcPtrChannel + glitchSrcLocArray[0] + 0); + *dstRowPtrTempG++ = *(srcPtrChannel + glitchSrcLocArray[1] + 1); + *dstRowPtrTempB++ = *(srcPtrChannel + glitchSrcLocArray[2] + 2); + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8s *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32u vectorIncrement = 16; + Rpp32u alignedLength = (((roi.xywhROI.roiWidth)/ 16) * 16) - 16; + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8s* dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += 16) + { + __m256 p[6]; + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + Rpp32u rLoc = glitchSrcLocArray[0]; + Rpp32u gLoc = srcDescPtr->strides.cStride + glitchSrcLocArray[1]; + Rpp32u bLoc = 2 * srcDescPtr->strides.cStride + glitchSrcLocArray[2]; + rpp_simd_load(rpp_load48_i8pln3_to_f32pln3_avx, srcPtrChannel + rLoc, srcPtrChannel + gLoc, srcPtrChannel + bLoc, p); + rpp_simd_store(rpp_store48_f32pln3_to_i8pkd3_avx, dstPtrTemp, p); // simd stores + dstPtrTemp += 48; + } +#endif + for (; 
vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + for (int c = 0; c < 3; c++) + *(dstPtrTemp + c) = *(srcPtrChannel + glitchSrcLocArray[c] + c *srcDescPtr->strides.cStride); + dstPtrTemp += 3; + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW )) + { + Rpp8s *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32u vectorIncrement = 32; + Rpp32u alignedLength = (((roi.xywhROI.roiWidth)/ 32) * 32) - 32; + + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8s* dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + for (int c = 0; c < 3; c++) + { + __m256i p; + p = _mm256_loadu_si256((__m256i *)(srcPtrChannel + (glitchSrcLocArray[c] + (c * srcDescPtr->strides.cStride)))); + _mm256_storeu_si256((__m256i *)(dstPtrTemp + (c * srcDescPtr->strides.cStride)), p); + } + dstPtrTemp += 32; + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + for (int c = 0; c < 3; c++) + *(dstPtrTemp + c * dstDescPtr->strides.cStride) = *(srcPtrChannel + glitchSrcLocArray[c] + c *srcDescPtr->strides.cStride); + dstPtrTemp += 1; + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + + } + else if((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8s *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32u alignedLength = (((roi.xywhROI.roiWidth)/ 10) * 10) - 10; + Rpp32s vectorIncrement = 10; + Rpp32s vectorIncrementPkd = 30; + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8s* dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += 10) + { + __m256i p; + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + rpp_simd_load(rpp_glitch_load30_i8pkd3_to_i8pkd3_avx, srcPtrChannel, glitchSrcLocArray, p); + _mm256_storeu_si256((__m256i *)(dstPtrTemp), p); + dstPtrTemp += 30; + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + for (int c = 0; c < 3; c++) + *dstPtrTemp++ = *(srcPtrChannel + glitchSrcLocArray[c] + c); + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + + } + } + return RPP_SUCCESS; +} diff --git a/src/modules/cpu/kernel/jitter.hpp b/src/modules/cpu/kernel/jitter.hpp new file mode 100644 index 000000000..ec717150a --- /dev/null +++ b/src/modules/cpu/kernel/jitter.hpp @@ -0,0 +1,929 @@ +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +RppStatus jitter_u8_u8_host_tensor(Rpp8u *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8u *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32u *kernelSizeTensor, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, 
(Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32u kernelSize = kernelSizeTensor[batchCount]; + Rpp32u bound = (kernelSize - 1) / 2; + Rpp32u heightLimit = roi.xywhROI.roiHeight - bound; + Rpp32u offset = batchCount * srcDescPtr->strides.nStride; + + Rpp8u *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp8u *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + Rpp32u alignedLength = roi.xywhROI.roiWidth & ~7; // Align dst width to process 4 dst pixels per iteration + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + RpptXorwowStateBoxMuller xorwowState; + Rpp32s srcLocArray[8] = {0}; + + __m256i pxXorwowStateX[5], pxXorwowStateCounter; + rpp_host_rng_xorwow_state_offsetted_avx(xorwowInitialStatePtr, xorwowState, offset, pxXorwowStateX, &pxXorwowStateCounter); + __m256 pKernelSize = _mm256_set1_ps(kernelSize); + __m256 pChannel = _mm256_set1_ps(layoutParams.bufferMultiplier); + __m256 pHStride = _mm256_set1_ps(srcDescPtr->strides.hStride); + __m256 pHeightLimit = _mm256_set1_ps(heightLimit); + __m256 pWidthLimit = _mm256_set1_ps(roi.xywhROI.roiWidth - 1); + __m256 pBound = _mm256_set1_ps(bound); + + // Jitter with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8u *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8u *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i pxRow; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_resize_nn_extract_pkd3_avx(srcPtrChannel, srcLocArray, pxRow); + rpp_simd_store(rpp_store24_u8pkd3_to_u8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, pxRow); + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, srcDescPtr->c, loc); + *dstPtrTempR++ = *(srcPtrChannel + loc); + *dstPtrTempG++ = *(srcPtrChannel + 1 + loc); + *dstPtrTempB++ = *(srcPtrChannel + 2 + loc); + } + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += 
dstDescPtr->strides.hStride; + } + } + + // Jitter with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8u *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8u *dstPtrTemp; + dstPtrTemp = dstPtrRow; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i pxRow[3]; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_resize_nn_extract_pln1_avx(srcPtrRowR, srcLocArray, pxRow[0]); + rpp_resize_nn_extract_pln1_avx(srcPtrRowG, srcLocArray, pxRow[1]); + rpp_resize_nn_extract_pln1_avx(srcPtrRowB, srcLocArray, pxRow[2]); + rpp_simd_store(rpp_store24_u8pln3_to_u8pkd3_avx, dstPtrTemp, pxRow); + dstPtrTemp += vectorIncrement; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + *dstPtrTemp++ = *(srcPtrRowR + loc); + *dstPtrTemp++ = *(srcPtrRowG + loc); + *dstPtrTemp++ = *(srcPtrRowB + loc); + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Jitter without fused output-layout toggle (NHWC -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8u *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrChannel; + dstPtrRow = dstPtrChannel; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8u *dstPtrTemp; + dstPtrTemp = dstPtrRow; + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i pxRow; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_resize_nn_extract_pkd3_avx(srcPtrRow, srcLocArray, pxRow); + rpp_simd_store(rpp_store24_u8_to_u8_avx, dstPtrTemp, pxRow); + dstPtrTemp += vectorIncrement; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + *dstPtrTemp++ = *(srcPtrRow + loc); + *dstPtrTemp++ = *(srcPtrRow + 1 + loc); + *dstPtrTemp++ = *(srcPtrRow + 2 + loc); + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Jitter with fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8u *dstPtrRow; + dstPtrRow = dstPtrChannel; + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8u *dstPtrTemp; 
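+                // The eight random source offsets in srcLocArray are computed once per group and reused
+                // for every channel below, so the jitter displacement is identical across channels.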
+ dstPtrTemp = dstPtrRow; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp8u *dstPtrTempChn, *srcPtrTempChn; + srcPtrTempChn = srcPtrChannel; + dstPtrTempChn = dstPtrTemp; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + for(int c = 0; c < srcDescPtr->c; c++) + { + __m256i pxRow; + rpp_resize_nn_extract_pln1_avx(srcPtrTempChn, srcLocArray, pxRow); + rpp_storeu_si64((__m128i *)(dstPtrTempChn), _mm256_castsi256_si128(pxRow)); + srcPtrTempChn += srcDescPtr->strides.cStride; + dstPtrTempChn += dstDescPtr->strides.cStride; + } + dstPtrTemp += vectorIncrementPerChannel; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (;vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp8u *dstPtrTempChn = dstPtrTemp; + Rpp8u *srcPtrTempChn = srcPtrChannel; + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + for(int c = 0; c < srcDescPtr->c; c++) + { + *dstPtrTempChn = *(srcPtrTempChn + loc); + srcPtrTempChn += srcDescPtr->strides.cStride; + dstPtrTempChn += dstDescPtr->strides.cStride; + } + dstPtrTemp++; + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + } + + return RPP_SUCCESS; +} + +RppStatus jitter_f32_f32_host_tensor(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32u *kernelSizeTensor, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32u kernelSize = kernelSizeTensor[batchCount]; + Rpp32u bound = (kernelSize - 1) / 2; + Rpp32u heightLimit = roi.xywhROI.roiHeight - bound; + Rpp32u offset = batchCount * srcDescPtr->strides.nStride; + + Rpp32f *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32f *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + Rpp32u alignedLength = roi.xywhROI.roiWidth & ~7; // Align dst width to process 4 dst pixels per iteration + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + RpptXorwowStateBoxMuller xorwowState; + Rpp32s srcLocArray[8] = {0}; + + __m256i pxXorwowStateX[5], pxXorwowStateCounter; + rpp_host_rng_xorwow_state_offsetted_avx(xorwowInitialStatePtr, xorwowState, offset, pxXorwowStateX, &pxXorwowStateCounter); + __m256 pKernelSize = _mm256_set1_ps(kernelSize); + __m256 pChannel = _mm256_set1_ps(layoutParams.bufferMultiplier); + __m256 pHStride = _mm256_set1_ps(srcDescPtr->strides.hStride); + __m256 pHeightLimit = _mm256_set1_ps(heightLimit); + __m256 pWidthLimit = 
_mm256_set1_ps(roi.xywhROI.roiWidth-1); + __m256 pBound = _mm256_set1_ps(bound); + + + // Jitter with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32f *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp32f *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 pxRow[3]; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_simd_load(rpp_resize_nn_load_f32pkd3_to_f32pln3_avx, srcPtrChannel, srcLocArray, pxRow); + rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, pxRow); + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + *dstPtrTempR++ = *(srcPtrChannel + loc); + *dstPtrTempG++ = *(srcPtrChannel + 1 + loc); + *dstPtrTempB++ = *(srcPtrChannel + 2 + loc); + } + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Jitter with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32f *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp32f *dstPtrTemp; + dstPtrTemp = dstPtrRow; + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 pxRow[4]; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_simd_load(rpp_resize_nn_load_f32pln1_avx, srcPtrRowR, srcLocArray, pxRow[0]); + rpp_simd_load(rpp_resize_nn_load_f32pln1_avx, srcPtrRowG, srcLocArray, pxRow[1]); + rpp_simd_load(rpp_resize_nn_load_f32pln1_avx, srcPtrRowB, srcLocArray, pxRow[2]); + rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, pxRow); + dstPtrTemp += vectorIncrement; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, 
srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + *dstPtrTemp++ = *(srcPtrRowR + loc); + *dstPtrTemp++ = *(srcPtrRowG + loc); + *dstPtrTemp++ = *(srcPtrRowB + loc); + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Jitter without fused output-layout toggle (NHWC -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrChannel; + dstPtrRow = dstPtrChannel; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp32f *dstPtrTemp; + dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + __m256 pRow; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + rpp_simd_load(rpp_load8_f32_to_f32_avx, (srcPtrChannel + loc), &pRow); + rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp, &pRow); + dstPtrTemp += 3; + } +#endif + dstPtrRow += dstDescPtr->strides.hStride; + } + } + // Jitter with fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32f *dstPtrRow; + dstPtrRow = dstPtrChannel; + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp32f *dstPtrTemp; + dstPtrTemp = dstPtrRow; + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f *srcPtrTempChn, *dstPtrTempChn; + srcPtrTempChn = srcPtrChannel; + dstPtrTempChn = dstPtrTemp; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + + for (int c = 0; c < dstDescPtr->c; c++) + { + __m256 pxRow; + rpp_simd_load(rpp_resize_nn_load_f32pln1_avx, srcPtrTempChn, srcLocArray, pxRow); + rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTempChn, &pxRow); + srcPtrTempChn += srcDescPtr->strides.cStride; + dstPtrTempChn += dstDescPtr->strides.cStride; + } + dstPtrTemp += vectorIncrementPerChannel; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (;vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32f *dstPtrTempChn = dstPtrTemp; + Rpp32f *srcPtrTempChn = srcPtrChannel; + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + for(int c = 0; c < srcDescPtr->c; c++) + { + *dstPtrTempChn = (Rpp32f)*(srcPtrTempChn + loc); + srcPtrTempChn += srcDescPtr->strides.cStride; + dstPtrTempChn += dstDescPtr->strides.cStride; + } + dstPtrTemp++; + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + } + + return RPP_SUCCESS; +} + +RppStatus jitter_f16_f16_host_tensor(Rpp16f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp16f *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32u *kernelSizeTensor, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp 
parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32u kernelSize = kernelSizeTensor[batchCount]; + Rpp32u bound = (kernelSize - 1) / 2; + Rpp32u heightLimit = roi.xywhROI.roiHeight - bound; + Rpp32u offset = batchCount * srcDescPtr->strides.nStride; + + Rpp16f *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp16f *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + Rpp32u alignedLength = roi.xywhROI.roiWidth & ~7; // Align dst width to process 4 dst pixels per iteration + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + RpptXorwowStateBoxMuller xorwowState; + Rpp32s srcLocArray[8] = {0}; + + __m256i pxXorwowStateX[5], pxXorwowStateCounter; + rpp_host_rng_xorwow_state_offsetted_avx(xorwowInitialStatePtr, xorwowState, offset, pxXorwowStateX, &pxXorwowStateCounter); + __m256 pKernelSize = _mm256_set1_ps(kernelSize); + __m256 pChannel = _mm256_set1_ps(layoutParams.bufferMultiplier); + __m256 pHStride = _mm256_set1_ps(srcDescPtr->strides.hStride); + __m256 pHeightLimit = _mm256_set1_ps(heightLimit); + __m256 pWidthLimit = _mm256_set1_ps(roi.xywhROI.roiWidth-1); + __m256 pBound = _mm256_set1_ps(bound); + + + // Jitter with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp16f *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp16f *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f dstPtrTempR_ps[8], dstPtrTempG_ps[8], dstPtrTempB_ps[8]; + __m256 pxRow[3]; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_simd_load(rpp_resize_nn_load_f16pkd3_to_f32pln3_avx, srcPtrChannel, srcLocArray, pxRow); + rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR_ps, dstPtrTempG_ps, dstPtrTempB_ps, pxRow); + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + dstPtrTempR[cnt] = (Rpp16f) dstPtrTempR_ps[cnt]; + dstPtrTempG[cnt] = (Rpp16f) dstPtrTempG_ps[cnt]; + dstPtrTempB[cnt] = (Rpp16f) dstPtrTempB_ps[cnt]; + } + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + 
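// loc indexes the packed NHWC source row, so loc, loc + 1 and loc + 2 are the R, G and B values of the jittered source pixel
+                    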
*dstPtrTempR++ = *(srcPtrChannel + loc);
+                    *dstPtrTempG++ = *(srcPtrChannel + 1 + loc);
+                    *dstPtrTempB++ = *(srcPtrChannel + 2 + loc);
+                }
+                dstPtrRowR += dstDescPtr->strides.hStride;
+                dstPtrRowG += dstDescPtr->strides.hStride;
+                dstPtrRowB += dstDescPtr->strides.hStride;
+            }
+        }
+
+        // Jitter with fused output-layout toggle (NCHW -> NHWC)
+        else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+        {
+            Rpp16f *dstPtrRow;
+            dstPtrRow = dstPtrChannel;
+            Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+            srcPtrRowR = srcPtrChannel;
+            srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+            srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+
+            for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+            {
+                Rpp16f *dstPtrTemp;
+                dstPtrTemp = dstPtrRow;
+                __m256 pRow = _mm256_set1_ps(dstLocRow);
+                __m256 pCol = avx_pDstLocInit;
+                int vectorLoopCount = 0;
+#if __AVX2__
+                for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+                {
+                    Rpp32f dstPtrTemp_ps[25];
+                    __m256 pxRow[4];
+                    compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+                    rpp_simd_load(rpp_resize_nn_load_f16pln1_avx, srcPtrRowR, srcLocArray, pxRow[0]);
+                    rpp_simd_load(rpp_resize_nn_load_f16pln1_avx, srcPtrRowG, srcLocArray, pxRow[1]);
+                    rpp_simd_load(rpp_resize_nn_load_f16pln1_avx, srcPtrRowB, srcLocArray, pxRow[2]);
+                    rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp_ps, pxRow);
+                    for(int cnt = 0; cnt < vectorIncrement; cnt++)
+                        dstPtrTemp[cnt] = (Rpp16f) dstPtrTemp_ps[cnt];
+                    dstPtrTemp += vectorIncrement;
+                    pCol = _mm256_add_ps(avx_p8, pCol);
+                }
+#endif
+                for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+                {
+                    Rpp32s loc;
+                    compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+                    *dstPtrTemp++ = *(srcPtrRowR + loc);
+                    *dstPtrTemp++ = *(srcPtrRowG + loc);
+                    *dstPtrTemp++ = *(srcPtrRowB + loc);
+                }
+                dstPtrRow += dstDescPtr->strides.hStride;
+            }
+        }
+
+        // Jitter without fused output-layout toggle (NHWC -> NHWC)
+        else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+        {
+            Rpp16f *srcPtrRow, *dstPtrRow;
+            srcPtrRow = srcPtrChannel;
+            dstPtrRow = dstPtrChannel;
+
+            for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+            {
+                Rpp16f *dstPtrTemp;
+                dstPtrTemp = dstPtrRow;
+                int vectorLoopCount = 0;
+#if __AVX2__
+                for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+                {
+                    Rpp32f srcPtrTemp_ps[8], dstPtrTemp_ps[8];
+                    Rpp32s loc;
+                    __m256 pRow;
+
+                    // Compute the jittered source location before the gather below, so loc is initialized when it is read
+                    compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+
+                    for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++)
+                    {
+                        srcPtrTemp_ps[cnt] = (Rpp32f)srcPtrChannel[loc + cnt];
+                    }
+
+                    rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp_ps, &pRow);
+                    rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp_ps, &pRow);
+
+                    for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++)
+                    {
+                        dstPtrTemp[cnt] = (Rpp16f) dstPtrTemp_ps[cnt];
+                    }
+                    dstPtrTemp += 3;
+                }
+#endif
+                dstPtrRow += dstDescPtr->strides.hStride;
+            }
+        }
+        // Jitter without fused output-layout toggle (NCHW -> NCHW)
+        else if ((srcDescPtr->layout == 
RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp16f *dstPtrRow; + dstPtrRow = dstPtrChannel; + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp16f *dstPtrTemp; + dstPtrTemp = dstPtrRow; + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp16f *srcPtrTempChn, *dstPtrTempChn; + srcPtrTempChn = srcPtrChannel; + dstPtrTempChn = dstPtrTemp; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + + for (int c = 0; c < dstDescPtr->c; c++) + { + Rpp32f dstPtrTemp_ps[8]; + __m256 pxRow; + rpp_simd_load(rpp_resize_nn_load_f16pln1_avx, srcPtrTempChn, srcLocArray, pxRow); + rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp_ps, &pxRow); + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + dstPtrTempChn[cnt] = (Rpp16f) dstPtrTemp_ps[cnt]; + } + srcPtrTempChn += srcDescPtr->strides.cStride; + dstPtrTempChn += dstDescPtr->strides.cStride; + } + dstPtrTemp += vectorIncrementPerChannel; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (;vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp16f *dstPtrTempChn = dstPtrTemp; + Rpp16f *srcPtrTempChn = srcPtrChannel; + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + for(int c = 0; c < srcDescPtr->c; c++) + { + *dstPtrTempChn = (Rpp16f)*(srcPtrTempChn + loc); + srcPtrTempChn += srcDescPtr->strides.cStride; + dstPtrTempChn += dstDescPtr->strides.cStride; + } + dstPtrTemp++; + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + } + + return RPP_SUCCESS; +} + +RppStatus jitter_i8_i8_host_tensor(Rpp8s *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8s *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32u *kernelSizeTensor, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32u kernelSize = kernelSizeTensor[batchCount]; + Rpp32u bound = (kernelSize - 1) / 2; + Rpp32u heightLimit = roi.xywhROI.roiHeight - bound; + Rpp32u offset = batchCount * srcDescPtr->strides.nStride; + + Rpp8s *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp8s *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + Rpp32u alignedLength = roi.xywhROI.roiWidth & ~7; // Align dst width to process 4 dst pixels per iteration + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + RpptXorwowStateBoxMuller xorwowState; + Rpp32s srcLocArray[8] = {0}; + + __m256i pxXorwowStateX[5], pxXorwowStateCounter; + 
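+        // Seed the xorwow RNG state for this image with a per-image offset (batchCount * nStride),
+        // presumably so each image in the batch draws an independent jitter sequence; the kernel size,
+        // channel multiplier, row stride and ROI limits are then broadcast into AVX registers for the
+        // vectorized paths below.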
rpp_host_rng_xorwow_state_offsetted_avx(xorwowInitialStatePtr, xorwowState, offset, pxXorwowStateX, &pxXorwowStateCounter); + __m256 pKernelSize = _mm256_set1_ps(kernelSize); + __m256 pChannel = _mm256_set1_ps(layoutParams.bufferMultiplier); + __m256 pHStride = _mm256_set1_ps(srcDescPtr->strides.hStride); + __m256 pHeightLimit = _mm256_set1_ps(heightLimit); + __m256 pWidthLimit = _mm256_set1_ps(roi.xywhROI.roiWidth-1); + __m256 pBound = _mm256_set1_ps(bound); + + // Jitter with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8s *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8s *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i pxRow; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_resize_nn_extract_pkd3_avx(srcPtrChannel, srcLocArray, pxRow); + rpp_simd_store(rpp_store24_i8pkd3_to_i8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, pxRow); + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + *dstPtrTempR++ = *(srcPtrChannel + loc); + *dstPtrTempG++ = *(srcPtrChannel + 1 + loc); + *dstPtrTempB++ = *(srcPtrChannel + 2 + loc); + } + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Jitter with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8s *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8s *dstPtrTemp; + dstPtrTemp = dstPtrRow; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i pxRow[3]; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_resize_nn_extract_pln1_avx(srcPtrRowR, srcLocArray, pxRow[0]); + rpp_resize_nn_extract_pln1_avx(srcPtrRowG, srcLocArray, pxRow[1]); + rpp_resize_nn_extract_pln1_avx(srcPtrRowB, srcLocArray, pxRow[2]); + rpp_simd_store(rpp_store24_i8pln3_to_i8pkd3_avx, 
dstPtrTemp, pxRow); + dstPtrTemp += vectorIncrement; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + *dstPtrTemp++ = *(srcPtrRowR + loc); + *dstPtrTemp++ = *(srcPtrRowG + loc); + *dstPtrTemp++ = *(srcPtrRowB + loc); + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Jitter without fused output-layout toggle (NHWC -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8s *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrChannel; + dstPtrRow = dstPtrChannel; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8s *dstPtrTemp; + dstPtrTemp = dstPtrRow; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i pxRow; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_resize_nn_extract_pkd3_avx(srcPtrRow, srcLocArray, pxRow); + rpp_simd_store(rpp_store24_i8_to_i8_avx, dstPtrTemp, pxRow); + dstPtrTemp += vectorIncrement; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + *dstPtrTemp++ = (Rpp8s)*(srcPtrRow + loc); + *dstPtrTemp++ = (Rpp8s)*(srcPtrRow + 1 + loc); + *dstPtrTemp++ = (Rpp8s)*(srcPtrRow + 2 + loc); + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + // Jitter with fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8s *dstPtrRow; + dstPtrRow = dstPtrChannel; + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8s *dstPtrTemp; + dstPtrTemp = dstPtrRow; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp8s *dstPtrTempChn, *srcPtrTempChn; + srcPtrTempChn = srcPtrChannel; + dstPtrTempChn = dstPtrTemp; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + for(int c = 0; c < srcDescPtr->c; c++) + { + __m256i pxRow; + rpp_resize_nn_extract_pln1_avx(srcPtrTempChn, srcLocArray, pxRow); + rpp_storeu_si64((__m128i *)(dstPtrTempChn), _mm256_castsi256_si128(pxRow)); + srcPtrTempChn += srcDescPtr->strides.cStride; + dstPtrTempChn += dstDescPtr->strides.cStride; + } + dstPtrTemp += vectorIncrementPerChannel; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (;vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp8s *dstPtrTempChn = dstPtrTemp; + Rpp8s *srcPtrTempChn = srcPtrChannel; + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, 
layoutParams.bufferMultiplier, loc); + for(int c = 0; c < srcDescPtr->c; c++) + { + *dstPtrTempChn = (Rpp8s)*(srcPtrTempChn + loc); + srcPtrTempChn += srcDescPtr->strides.cStride; + dstPtrTempChn += dstDescPtr->strides.cStride; + } + dstPtrTemp++; + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/cpu/kernel/lens_correction.hpp b/src/modules/cpu/kernel/lens_correction.hpp new file mode 100644 index 000000000..1632568a5 --- /dev/null +++ b/src/modules/cpu/kernel/lens_correction.hpp @@ -0,0 +1,178 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" +#include + +// Compute Inverse matrix (3x3) +inline void get_inverse(float *mat, float *invMat) +{ + float det = mat[0] * (mat[4] * mat[8] - mat[7] * mat[5]) - mat[1] * (mat[3] * mat[8] - mat[5] * mat[6]) + mat[2] * (mat[3] * mat[7] - mat[4] * mat[6]); + if(det != 0) + { + float invDet = 1 / det; + invMat[0] = (mat[4] * mat[8] - mat[7] * mat[5]) * invDet; + invMat[1] = (mat[2] * mat[7] - mat[1] * mat[8]) * invDet; + invMat[2] = (mat[1] * mat[5] - mat[2] * mat[4]) * invDet; + invMat[3] = (mat[5] * mat[6] - mat[3] * mat[8]) * invDet; + invMat[4] = (mat[0] * mat[8] - mat[2] * mat[6]) * invDet; + invMat[5] = (mat[3] * mat[2] - mat[0] * mat[5]) * invDet; + invMat[6] = (mat[3] * mat[7] - mat[6] * mat[4]) * invDet; + invMat[7] = (mat[6] * mat[1] - mat[0] * mat[7]) * invDet; + invMat[8] = (mat[0] * mat[4] - mat[3] * mat[1]) * invDet; + } +} + +inline void compute_lens_correction_remap_tables_host_tensor(RpptDescPtr srcDescPtr, + Rpp32f *rowRemapTable, + Rpp32f *colRemapTable, + RpptDescPtr tableDescPtr, + Rpp32f *cameraMatrixTensor, + Rpp32f *distortionCoeffsTensor, + RpptROIPtr roiTensorPtrSrc, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + Rpp32f *rowRemapTableTemp, *colRemapTableTemp; + rowRemapTableTemp = rowRemapTable + batchCount * tableDescPtr->strides.nStride; + colRemapTableTemp = colRemapTable + batchCount * tableDescPtr->strides.nStride; + + // cameraMatrix is a 3x3 matrix thus increment by 9 to iterate from one tensor in a batch to another + Rpp32f *cameraMatrix = cameraMatrixTensor + batchCount * 9; + Rpp32f *distortionCoeffs = distortionCoeffsTensor + batchCount * 8; + Rpp32s 
height = roiTensorPtrSrc[batchCount].xywhROI.roiHeight; + Rpp32s width = roiTensorPtrSrc[batchCount].xywhROI.roiWidth; + Rpp32u alignedLength = width & ~7; + Rpp32s vectorIncrement = 8; + + Rpp32f invCameraMatrix[9]; + std::fill(invCameraMatrix, invCameraMatrix + 9, 0.0f); // initialize all values in invCameraMatrix to zero + get_inverse(cameraMatrix, invCameraMatrix); + Rpp32f *invMat = &invCameraMatrix[0]; + + // Get radial and tangential distortion coefficients + Rpp32f rCoeff[6] = { distortionCoeffs[0], distortionCoeffs[1], distortionCoeffs[4], distortionCoeffs[5], distortionCoeffs[6], distortionCoeffs[7] }; + Rpp32f tCoeff[2] = { distortionCoeffs[2], distortionCoeffs[3] }; + + __m256 pRCoeff[6], pTCoeff[2]; + pRCoeff[0] = _mm256_set1_ps(rCoeff[0]); + pRCoeff[1] = _mm256_set1_ps(rCoeff[1]); + pRCoeff[2] = _mm256_set1_ps(rCoeff[2]); + pRCoeff[3] = _mm256_set1_ps(rCoeff[3]); + pRCoeff[4] = _mm256_set1_ps(rCoeff[4]); + pRCoeff[5] = _mm256_set1_ps(rCoeff[5]); + pTCoeff[0] = _mm256_set1_ps(tCoeff[0]); + pTCoeff[1] = _mm256_set1_ps(tCoeff[1]); + + Rpp32f u0 = cameraMatrix[2], v0 = cameraMatrix[5]; + Rpp32f fx = cameraMatrix[0], fy = cameraMatrix[4]; + __m256 pFx, pFy, pU0, pV0; + pFx = _mm256_set1_ps(fx); + pFy = _mm256_set1_ps(fy); + pU0 = _mm256_set1_ps(u0); + pV0 = _mm256_set1_ps(v0); + + __m256 pInvMat0, pInvMat3, pInvMat6; + pInvMat0 = _mm256_set1_ps(invMat[0]); + pInvMat3 = _mm256_set1_ps(invMat[3]); + pInvMat6 = _mm256_set1_ps(invMat[6]); + + __m256 pXCameraInit, pYCameraInit, pZCameraInit; + __m256 pXCameraIncrement, pYCameraIncrement, pZCameraIncrement; + pXCameraInit = _mm256_mul_ps(avx_pDstLocInit, pInvMat0); + pYCameraInit = _mm256_mul_ps(avx_pDstLocInit, pInvMat3); + pZCameraInit = _mm256_mul_ps(avx_pDstLocInit, pInvMat6); + pXCameraIncrement = _mm256_mul_ps(pInvMat0, avx_p8); + pYCameraIncrement = _mm256_mul_ps(pInvMat3, avx_p8); + pZCameraIncrement = _mm256_mul_ps(pInvMat6, avx_p8); + for(int i = 0; i < height; i++) + { + Rpp32f *rowRemapTableRow = rowRemapTableTemp + i * tableDescPtr->strides.hStride; + Rpp32f *colRemapTableRow = colRemapTableTemp + i * tableDescPtr->strides.hStride; + Rpp32f xCamera = i * invMat[1] + invMat[2]; + Rpp32f yCamera = i * invMat[4] + invMat[5]; + Rpp32f zCamera = i * invMat[7] + invMat[8]; + __m256 pXCamera = _mm256_add_ps(_mm256_set1_ps(xCamera), pXCameraInit); + __m256 pYCamera = _mm256_add_ps(_mm256_set1_ps(yCamera), pYCameraInit); + __m256 pZCamera = _mm256_add_ps(_mm256_set1_ps(zCamera), pZCameraInit); + int vectorLoopCount = 0; + for(; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + // float z = 1./zCamera, x = xCamera*z, y = yCamera*z; + __m256 pZ = _mm256_div_ps(avx_p1, pZCamera); + __m256 pX = _mm256_mul_ps(pXCamera, pZ); + __m256 pY = _mm256_mul_ps(pYCamera, pZ); + + // float xSquare = x*x, ySquare = y*y, r2 = xSquare + ySquare; + __m256 pXSquare = _mm256_mul_ps(pX, pX); + __m256 pYSquare = _mm256_mul_ps(pY, pY); + __m256 pR2 = _mm256_add_ps(pXSquare, pYSquare); + + // float xyMul2 = 2*x*y; + __m256 p2xy = _mm256_mul_ps(avx_p2, _mm256_mul_ps(pX, pY)); + + // float kr = std::fmaf(std::fmaf(std::fmaf(rCoeff[2], r2, rCoeff[1]), r2, rCoeff[0]), r2, 1) / std::fmaf(std::fmaf(std::fmaf(rCoeff[5], r2, rCoeff[4]), r2, rCoeff[3]), r2, 1); + __m256 pNum = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(pRCoeff[2], pR2, pRCoeff[1]), pR2, pRCoeff[0]), pR2, avx_p1); + __m256 pDen = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(pRCoeff[5], pR2, pRCoeff[4]), pR2, pRCoeff[3]), pR2, avx_p1); + __m256 pKR = _mm256_div_ps(pNum, 
pDen); + + // float colLoc = std::fmaf(fx, (std::fmaf(tCoeff[1], (std::fmaf(2, xSquare, r2)), std::fmaf(x, kr, (tCoeff[0] * xyMul2)))), u0); + __m256 pColLoc = _mm256_fmadd_ps(pFx, _mm256_fmadd_ps(pTCoeff[1], _mm256_fmadd_ps(avx_p2, pXSquare, pR2), _mm256_fmadd_ps(pX, pKR, _mm256_mul_ps(pTCoeff[0], p2xy))), pU0); + + // float rowLoc = std::fmaf(fy, (std::fmaf(tCoeff[0], (std::fmaf(2, ySquare, r2)), std::fmaf(y, kr, (tCoeff[1] * xyMul2)))), v0); + __m256 pRowLoc = _mm256_fmadd_ps(pFy, _mm256_fmadd_ps(pTCoeff[0], _mm256_fmadd_ps(avx_p2, pYSquare, pR2), _mm256_fmadd_ps(pY, pKR, _mm256_mul_ps(pTCoeff[1], p2xy))), pV0); + + _mm256_storeu_ps(rowRemapTableRow, pRowLoc); + _mm256_storeu_ps(colRemapTableRow, pColLoc); + rowRemapTableRow += vectorIncrement; + colRemapTableRow += vectorIncrement; + + // xCamera += invMat[0], yCamera += invMat[3], zCamera += invMat[6] + pXCamera = _mm256_add_ps(pXCamera, pXCameraIncrement); + pYCamera = _mm256_add_ps(pYCamera, pYCameraIncrement); + pZCamera = _mm256_add_ps(pZCamera, pZCameraIncrement); + } + for(; vectorLoopCount < width; vectorLoopCount++) + { + Rpp32f z = 1./zCamera, x = xCamera * z, y = yCamera * z; + Rpp32f xSquare = x * x, ySquare = y * y, r2 = xSquare + ySquare; + Rpp32f xyMul2 = 2 * x * y; + Rpp32f kr = std::fmaf(std::fmaf(std::fmaf(rCoeff[2], r2, rCoeff[1]), r2, rCoeff[0]), r2, 1) / std::fmaf(std::fmaf(std::fmaf(rCoeff[5], r2, rCoeff[4]), r2, rCoeff[3]), r2, 1); + Rpp32f colLoc = std::fmaf(fx, (std::fmaf(tCoeff[1], (std::fmaf(2, xSquare, r2)), std::fmaf(x, kr, (tCoeff[0] * xyMul2)))), u0); + Rpp32f rowLoc = std::fmaf(fy, (std::fmaf(tCoeff[0], (std::fmaf(2, ySquare, r2)), std::fmaf(y, kr, (tCoeff[1] * xyMul2)))), v0); + *rowRemapTableRow++ = rowLoc; + *colRemapTableRow++ = colLoc; + xCamera += invMat[0]; + yCamera += invMat[3]; + zCamera += invMat[6]; + } + } + } +} \ No newline at end of file diff --git a/src/modules/cpu/kernel/log.hpp b/src/modules/cpu/kernel/log.hpp new file mode 100644 index 000000000..5ec79b21c --- /dev/null +++ b/src/modules/cpu/kernel/log.hpp @@ -0,0 +1,563 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "rppdefs.h" +#include "rpp_cpu_common.hpp" + +// 1 pixel log helper functions +// NOTE: log(0) leads to undefined thus using nextafter() to avoid this result +// Also negative values are converted to positive by taking absolute of inputs +inline void compute_log(Rpp8u *src, Rpp32f *dst) { *dst = (!*src) ? 
std::log(std::nextafter(0.0f, 1.0f)) : std::log(*src); } +inline void compute_log(Rpp8s *src, Rpp32f *dst) { *dst = (!*src) ? std::log(std::nextafter(0.0f, 1.0f)) : std::log(*src + 128); } +inline void compute_log(Rpp16f *src, Rpp16f *dst) { *dst = (!*src) ? log(std::nextafter(0.0f, 1.0f)) : log(abs(*src)); } +inline void compute_log(Rpp32f *src, Rpp32f *dst) { *dst = (!*src) ? std::log(std::nextafter(0.0f, 1.0f)) : std::log(abs(*src)); } + +// Computes ND log recursively +template +void log_recursive(T1 *src, Rpp32u *srcStrides, T2 *dst, Rpp32u *dstStrides, Rpp32u *dstShape, Rpp32u nDim) +{ + if (!nDim) + compute_log(src, dst); + else + { + for (int i = 0; i < *dstShape; i++) + { + log_recursive(src, srcStrides + 1, dst, dstStrides + 1, dstShape + 1, nDim - 1); + dst += *dstStrides; + src += *srcStrides; + } + } +} + +RppStatus log_generic_host_tensor(Rpp8u *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *roiTensor, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u nDim = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension. + Rpp32u batchSize = dstGenericDescPtr->dims[0]; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + Rpp32u *roi = roiTensor + batchCount * nDim * 2; + Rpp32u *begin = roi; + Rpp32u *length = &roi[nDim]; + + Rpp8u *srcPtr1 = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + Rpp32f *dstPtr1 = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + + for(int i = 0; i < nDim; i++) + srcPtr1 += begin[i] * srcGenericDescPtr->strides[i + 1]; + Rpp32u alignedLength; + Rpp32u vectorIncrement = 16; + if (nDim == 1) + { + alignedLength = length[0] & ~15; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + + rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtr1, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtr1, p); // simd stores + srcPtr1 += vectorIncrement; + dstPtr1 += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[0]; vectorLoopCount++) + { + compute_log(srcPtr1, dstPtr1); + srcPtr1++; + dstPtr1++; + } + } + else if(nDim == 2) + { + alignedLength = length[1] & ~15; + for(int i = 0; i < length[0]; i++) + { + Rpp8u *srcPtrTemp = srcPtr1; + Rpp32f *dstPtrTemp = dstPtr1; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + + rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtrTemp, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[1]; vectorLoopCount++) + { + compute_log(srcPtrTemp, dstPtrTemp); + srcPtrTemp++; + dstPtrTemp++; + } + srcPtr1 += srcGenericDescPtr->strides[1]; + dstPtr1 += dstGenericDescPtr->strides[1]; + } + } + else if(nDim == 3) + { + alignedLength = length[2] & ~15; + for(int i = 0; i < length[0]; i++) + { + Rpp8u *srcPtrRow = srcPtr1; + Rpp32f *dstPtrRow = dstPtr1; + + for(int j = 0; j < length[1]; j++) + { + Rpp8u *srcPtrTemp = srcPtrRow; + Rpp32f *dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 
p[2]; + + rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtrTemp, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[2]; vectorLoopCount++) + { + compute_log(srcPtrTemp, dstPtrTemp); + srcPtrTemp++; + dstPtrTemp++; + } + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; + } + srcPtr1 += srcGenericDescPtr->strides[1]; + dstPtr1 += dstGenericDescPtr->strides[1]; + } + } + else + log_recursive(srcPtr1, srcGenericDescPtr->strides, dstPtr1, dstGenericDescPtr->strides, length, nDim); + } + + return RPP_SUCCESS; +} + +RppStatus log_generic_host_tensor(Rpp8s *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *roiTensor, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u nDim = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension. + Rpp32u batchSize = dstGenericDescPtr->dims[0]; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + Rpp32u *roi = roiTensor + batchCount * nDim * 2; + Rpp32u *begin = roi; + Rpp32u *length = &roi[nDim]; + + Rpp8s *srcPtr1 = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + Rpp32f *dstPtr1 = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + + for(int i = 0; i < nDim; i++) + srcPtr1 += begin[i] * srcGenericDescPtr->strides[i + 1]; + Rpp32u alignedLength; + Rpp32u vectorIncrement = 16; + if (nDim == 1) + { + alignedLength = length[0] & ~15; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + + rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtr1, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtr1, p); // simd stores + srcPtr1 += vectorIncrement; + dstPtr1 += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[0]; vectorLoopCount++) + { + compute_log(srcPtr1, dstPtr1); + srcPtr1++; + dstPtr1++; + } + } + else if(nDim == 2) + { + alignedLength = length[1] & ~15; + for(int i = 0; i < length[0]; i++) + { + Rpp8s *srcPtrTemp = srcPtr1; + Rpp32f *dstPtrTemp = dstPtr1; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + + rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtrTemp, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[1]; vectorLoopCount++) + { + compute_log(srcPtrTemp, dstPtrTemp); + srcPtrTemp++; + dstPtrTemp++; + } + srcPtr1 += srcGenericDescPtr->strides[1]; + dstPtr1 += dstGenericDescPtr->strides[1]; + } + } + else if(nDim == 3) + { + alignedLength = length[2] & ~15; + for(int i = 0; i < length[0]; i++) + { + Rpp8s *srcPtrRow = srcPtr1; + Rpp32f *dstPtrRow = dstPtr1; + + for(int j = 0; j < length[1]; j++) + { + Rpp8s *srcPtrTemp = srcPtrRow; + Rpp32f *dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + + rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtrTemp, p); // simd loads + 
compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[2]; vectorLoopCount++) + { + compute_log(srcPtrTemp, dstPtrTemp); + srcPtrTemp++; + dstPtrTemp++; + } + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; + } + srcPtr1 += srcGenericDescPtr->strides[1]; + dstPtr1 += dstGenericDescPtr->strides[1]; + } + } + else + log_recursive(srcPtr1, srcGenericDescPtr->strides, dstPtr1, dstGenericDescPtr->strides, length, nDim); + } + + return RPP_SUCCESS; +} + +RppStatus log_generic_host_tensor(Rpp32f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *roiTensor, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u nDim = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension. + Rpp32u batchSize = dstGenericDescPtr->dims[0]; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + Rpp32u *roi = roiTensor + batchCount * nDim * 2; + Rpp32u *begin = roi; + Rpp32u *length = &roi[nDim]; + + Rpp32f *srcPtr1 = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + Rpp32f *dstPtr1 = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + + for(int i = 0; i < nDim; i++) + srcPtr1 += begin[i] * srcGenericDescPtr->strides[i + 1]; + Rpp32u alignedLength; + Rpp32u vectorIncrement = 16; + if (nDim == 1) + { + alignedLength = length[0] & ~15; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtr1, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtr1, p); // simd stores + srcPtr1 += vectorIncrement; + dstPtr1 += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[0]; vectorLoopCount++) + { + compute_log(srcPtr1, dstPtr1); + srcPtr1++; + dstPtr1++; + } + } + else if(nDim == 2) + { + alignedLength = length[1] & ~15; + for(int i = 0; i < length[0]; i++) + { + Rpp32f *srcPtrTemp = srcPtr1; + Rpp32f *dstPtrTemp = dstPtr1; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[1]; vectorLoopCount++) + { + compute_log(srcPtrTemp, dstPtrTemp); + srcPtrTemp++; + dstPtrTemp++; + } + srcPtr1 += srcGenericDescPtr->strides[1]; + dstPtr1 += dstGenericDescPtr->strides[1]; + } + } + else if(nDim == 3) + { + alignedLength = length[2] & ~15; + for(int i = 0; i < length[0]; i++) + { + Rpp32f *srcPtrRow = srcPtr1; + Rpp32f *dstPtrRow = dstPtr1; + + for(int j = 0; j < length[1]; j++) + { + Rpp32f *srcPtrTemp = srcPtrRow; + Rpp32f *dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads + compute_log_16_host(p); // log compute + 
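+                        // p[0] and p[1] each hold 8 of the 16 loaded lanes; compute_log_16_host appears
+                        // to replace them in place with their natural logarithms (matching the scalar
+                        // compute_log fallback below) before the 16-wide f32 store.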
rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[2]; vectorLoopCount++) + { + compute_log(srcPtrTemp, dstPtrTemp); + srcPtrTemp++; + dstPtrTemp++; + } + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; + } + srcPtr1 += srcGenericDescPtr->strides[1]; + dstPtr1 += dstGenericDescPtr->strides[1]; + } + } + else + log_recursive(srcPtr1, srcGenericDescPtr->strides, dstPtr1, dstGenericDescPtr->strides, length, nDim); + } + + return RPP_SUCCESS; +} + +RppStatus log_generic_host_tensor(Rpp16f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp16f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *roiTensor, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u nDim = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension. + Rpp32u batchSize = dstGenericDescPtr->dims[0]; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + Rpp32u *roi = roiTensor + batchCount * nDim * 2; + Rpp32u *begin = roi; + Rpp32u *length = &roi[nDim]; + + Rpp16f *srcPtr1 = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + Rpp16f *dstPtr1 = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + + for(int i = 0; i < nDim; i++) + srcPtr1 += begin[i] * srcGenericDescPtr->strides[i + 1]; + Rpp32u alignedLength; + Rpp32u vectorIncrement = 16; + if (nDim == 1) + { + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[16]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + srcPtrTemp_ps[cnt] = static_cast(srcPtr1[cnt]); + + __m256 p[2]; + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp_ps, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f16_avx, dstPtr1, p); // simd stores + srcPtr1 += vectorIncrement; + dstPtr1 += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[0]; vectorLoopCount++) + { + compute_log(srcPtr1, dstPtr1); + srcPtr1++; + dstPtr1++; + } + } + else if(nDim == 2) + { + alignedLength = length[1] & ~15; + for(int i = 0; i < length[0]; i++) + { + Rpp16f *srcPtrTemp = srcPtr1; + Rpp16f *dstPtrTemp = dstPtr1; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[16]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + srcPtrTemp_ps[cnt] = static_cast(srcPtrTemp[cnt]); + + __m256 p[2]; + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp_ps, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f16_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[1]; vectorLoopCount++) + { + compute_log(srcPtrTemp, dstPtrTemp); + srcPtrTemp++; + dstPtrTemp++; + } + srcPtr1 += srcGenericDescPtr->strides[1]; + dstPtr1 += dstGenericDescPtr->strides[1]; + } + } + else if(nDim == 3) + { + alignedLength = length[2] & ~15; + for(int i = 0; i < length[0]; i++) + { + Rpp16f *srcPtrRow = srcPtr1; + Rpp16f *dstPtrRow = dstPtr1; + + for(int j = 0; j < length[1]; j++) + { + Rpp16f *srcPtrTemp = srcPtrRow; + Rpp16f *dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += 
vectorIncrement) + { + Rpp32f srcPtrTemp_ps[16]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + srcPtrTemp_ps[cnt] = static_cast(srcPtrTemp[cnt]); + + __m256 p[2]; + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp_ps, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f16_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[2]; vectorLoopCount++) + { + compute_log(srcPtrTemp, dstPtrTemp); + srcPtrTemp++; + dstPtrTemp++; + } + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; + } + srcPtr1 += srcGenericDescPtr->strides[1]; + dstPtr1 += dstGenericDescPtr->strides[1]; + } + } + else + log_recursive(srcPtr1, srcGenericDescPtr->strides, dstPtr1, dstGenericDescPtr->strides, length, nDim); + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/cpu/kernel/normalize.hpp b/src/modules/cpu/kernel/normalize.hpp index dbe746d1a..94a1fd9fa 100644 --- a/src/modules/cpu/kernel/normalize.hpp +++ b/src/modules/cpu/kernel/normalize.hpp @@ -26,21 +26,6 @@ SOFTWARE. #include "rpp_cpu_simd.hpp" #include "rpp_cpu_common.hpp" -// Computes strides -void compute_strides(Rpp32u *strides, Rpp32u *shape, Rpp32u tensorDim) -{ - if (tensorDim > 0) - { - Rpp32u v = 1; - for (Rpp32u i = tensorDim - 1; i > 0; i--) - { - strides[i] = v; - v *= shape[i]; - } - strides[0] = v; - } -} - // Recursive reduction helper function to compute difference of input with mean and squares them up template void compute_diff_square_sum(Rpp32f &output, T *input, Rpp32s inputStride, Rpp32s numElements, Rpp32f mean) diff --git a/src/modules/cpu/kernel/transpose.hpp b/src/modules/cpu/kernel/transpose.hpp new file mode 100644 index 000000000..233db1044 --- /dev/null +++ b/src/modules/cpu/kernel/transpose.hpp @@ -0,0 +1,434 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" +using namespace std; + +inline void increment_ndim_ptr(Rpp32f **dstPtr, Rpp32u tensorDims, Rpp32u increment) +{ + for(int i = 0; i < tensorDims; i++) + dstPtr[i] += increment; +} + +inline void rpp_store16_f32_f32_channelwise(Rpp32f **dstPtr, __m128 *p) +{ + _mm_storeu_ps(dstPtr[0], p[0]); + _mm_storeu_ps(dstPtr[1], p[1]); + _mm_storeu_ps(dstPtr[2], p[2]); + _mm_storeu_ps(dstPtr[3], p[3]); +} + +inline void compute_2d_pln1_transpose(Rpp32f *srcPtrTemp, Rpp32f *dstPtrTemp, Rpp32u height, Rpp32u width, Rpp32u srcRowStride, Rpp32u dstRowStride) +{ + Rpp32u alignedRows = height & ~3; + Rpp32u alignedCols = width & ~7; + Rpp32u vectorIncrement = 8; + Rpp32u dstRowVectorStride = vectorIncrement * dstRowStride; + + Rpp32s i = 0; + for(Rpp32s k = 0; i < alignedRows; i += 4, k++) + { + Rpp32f *srcPtrRow[4], *dstPtrRow[8]; + for(int j = 0; j < 4; j++) + srcPtrRow[j] = srcPtrTemp + (i + j) * srcRowStride; + for(int j = 0; j < 8; j++) + dstPtrRow[j] = dstPtrTemp + j * dstRowStride + i; + + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for(; vectorLoopCount < alignedCols; vectorLoopCount += vectorIncrement) + { + __m256 pSrc[4]; + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow[0], &pSrc[0]); + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow[1], &pSrc[1]); + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow[2], &pSrc[2]); + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow[3], &pSrc[3]); + + __m128 pDst[8]; + compute_transpose4x8_avx(pSrc, pDst); + rpp_simd_store(rpp_store4_f32_to_f32, dstPtrRow[0], &pDst[0]); + rpp_simd_store(rpp_store4_f32_to_f32, dstPtrRow[1], &pDst[1]); + rpp_simd_store(rpp_store4_f32_to_f32, dstPtrRow[2], &pDst[2]); + rpp_simd_store(rpp_store4_f32_to_f32, dstPtrRow[3], &pDst[3]); + rpp_simd_store(rpp_store4_f32_to_f32, dstPtrRow[4], &pDst[4]); + rpp_simd_store(rpp_store4_f32_to_f32, dstPtrRow[5], &pDst[5]); + rpp_simd_store(rpp_store4_f32_to_f32, dstPtrRow[6], &pDst[6]); + rpp_simd_store(rpp_store4_f32_to_f32, dstPtrRow[7], &pDst[7]); + + srcPtrRow[0] += vectorIncrement; + srcPtrRow[1] += vectorIncrement; + srcPtrRow[2] += vectorIncrement; + srcPtrRow[3] += vectorIncrement; + dstPtrRow[0] += dstRowVectorStride; + dstPtrRow[1] += dstRowVectorStride; + dstPtrRow[2] += dstRowVectorStride; + dstPtrRow[3] += dstRowVectorStride; + dstPtrRow[4] += dstRowVectorStride; + dstPtrRow[5] += dstRowVectorStride; + dstPtrRow[6] += dstRowVectorStride; + dstPtrRow[7] += dstRowVectorStride; + } +#endif + } + + // handle remaining columns + for(Rpp32s k = 0; k < alignedRows; k++) + { + Rpp32f *srcPtrRowTemp = srcPtrTemp + k * srcRowStride + alignedCols; + Rpp32f *dstPtrRowTemp = dstPtrTemp + alignedCols * dstRowStride + k; + for(Rpp32s j = alignedCols; j < width; j++) + { + *dstPtrRowTemp = *srcPtrRowTemp++; + dstPtrRowTemp += dstRowStride; + } + } + + // handle remaining rows + for( ; i < height; i++) + { + Rpp32f *srcPtrRowTemp = srcPtrTemp + i * srcRowStride; + Rpp32f *dstPtrRowTemp = dstPtrTemp + i; + for(Rpp32s j = 0; j < width; j++) + { + *dstPtrRowTemp = *srcPtrRowTemp; + srcPtrRowTemp++; + dstPtrRowTemp += dstRowStride; + } + } +} + +template +void transpose_generic_nd_recursive(T *dst, Rpp32u *dstStrides, T *src, Rpp32u *srcStrides, Rpp32u *dstShape, Rpp32u tensorDims) +{ + // exit case for recursion + if (tensorDims == 0) + { + *dst = *src; + } + else + { + for (int i = 0; i < *dstShape; i++) + { + transpose_generic_nd_recursive(dst, dstStrides + 1, src, srcStrides + 1, dstShape + 1, tensorDims - 1); 
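+            // After the inner (tensorDims - 1)-dimensional block has been written, advance both
+            // pointers by their stride for the current dimension. dstStrides is the contiguous stride
+            // set of the permuted output shape, while srcStrides was reordered by perm in the caller;
+            // that reordering is what realizes the transpose. For example, a 3D input of shape
+            // {L0, L1, L2} with perm = {2, 0, 1} walks the output shape {L2, L0, L1} using
+            // srcStrides = {1, L1*L2, L2}.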
+ dst += *dstStrides; + src += *srcStrides; + } + } +} + +template +void transpose_generic_setup_and_run(T *srcPtrTemp, T *dstPtrTemp, Rpp32u *length, Rpp32u *perm, Rpp32u tensorDims) +{ + Rpp32u dstShape[RPPT_MAX_DIMS]; + Rpp32u srcStrides[RPPT_MAX_DIMS]; + Rpp32u dstStrides[RPPT_MAX_DIMS]; + + // compute output shape + for(Rpp32u i = 0; i < tensorDims; i++) + dstShape[i] = length[perm[i]]; + + // compute output strides + compute_strides(dstStrides, dstShape, tensorDims); + + // compute input strides and update as per the permute order + Rpp32u tempStrides[RPPT_MAX_DIMS]; + compute_strides(tempStrides, length, tensorDims); + for(int i = 0; i < tensorDims; i++) + srcStrides[i] = tempStrides[perm[i]]; + + // perform transpose as per the permute order + transpose_generic_nd_recursive(dstPtrTemp, dstStrides, srcPtrTemp, srcStrides, dstShape, tensorDims); +} + +RppStatus transpose_f32_f32_host_tensor(Rpp32f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *permTensor, + Rpp32u *roiTensor, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u tensorDims = dstGenericDescPtr->numDims - 1; // exclude batchsize from input dims + Rpp32u batchSize = dstGenericDescPtr->dims[0]; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + Rpp32f *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + dstPtrTemp = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + + // get the starting address of begin and length values from roiTensor + Rpp32u *roi = roiTensor + batchCount * tensorDims * 2; + Rpp32u *begin = roi; + Rpp32u *length = &roi[tensorDims]; + Rpp32u *perm = permTensor; + + bool copyInput = true; + for(int i = 0; i < tensorDims; i++) + copyInput *= (perm[i] == i); + + // do memcpy of input to output since output order is same as input order + if(copyInput) + { + memcpy(dstPtrTemp, srcPtrTemp, (size_t)(srcGenericDescPtr->strides[0] * sizeof(Rpp32f))); + } + else + { + for(int i = 1; i < tensorDims; i++) + srcPtrTemp += begin[i - 1] * srcGenericDescPtr->strides[i]; + + if (tensorDims == 2 && perm[0] == 1 && perm[1] == 0) + { + // Optimized AVX version for 2D PLN1 inputs + compute_2d_pln1_transpose(srcPtrTemp, dstPtrTemp, length[0], length[1], srcGenericDescPtr->strides[1], dstGenericDescPtr->strides[1]); + } + else if (tensorDims == 3) + { + // Optimized AVX version for 3D inputs of shape(x, y, 16) and permutation order (2, 0, 1) (usecases : Deepcam training) + if(perm[0] == 2 && perm[1] == 0 && perm[2] == 1 && length[2] == 16) + { + Rpp32u height = length[0]; + Rpp32u width = length[1]; + Rpp32u channels = 16; + Rpp32u bufferLength = width * channels; + Rpp32u alignedLength = bufferLength & ~63; + Rpp32u vectorIncrement = 64; + Rpp32u vectorIncrementPerChannel = 4; + + // initialize pointers for 16 channel + Rpp32f *dstPtrChannel[16]; + for(int i = 0; i < 16; i++) + dstPtrChannel[i] = dstPtrTemp + i * dstGenericDescPtr->strides[1]; + + // loop over rows + for(int i = 0; i < height; i++) + { + Rpp32f *srcPtrRow = srcPtrTemp; + + // update temporary pointers for 16 channel + Rpp32f *dstPtrTempChannel[16]; + for(int k = 0; k < 16; k++) + dstPtrTempChannel[k] = dstPtrChannel[k]; + + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for( ; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 pSrc[8]; + // load 64 values for source + 
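+                            // The 64 floats are 4 packed 16-channel pixels, loaded interleaved: offsets
+                            // 0, 16, 32, 48 pick up the first 8 channels of pixels 0-3 and offsets
+                            // 8, 24, 40, 56 the remaining 8 channels, so each 4x8 transpose below yields,
+                            // per channel, 4 consecutive values that rpp_store16_f32_f32_channelwise
+                            // writes out channel-wise.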
rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow, &pSrc[0]); + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow + 16, &pSrc[1]); + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow + 32, &pSrc[2]); + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow + 48, &pSrc[3]); + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow + 8, &pSrc[4]); + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow + 24, &pSrc[5]); + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow + 40, &pSrc[6]); + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow + 56, &pSrc[7]); + + __m128 pDst[16]; + compute_transpose4x8_avx(&pSrc[0], &pDst[0]); + compute_transpose4x8_avx(&pSrc[4], &pDst[8]); + + // store 4 values in output per channel + rpp_store16_f32_f32_channelwise(&dstPtrTempChannel[0], &pDst[0]); + rpp_store16_f32_f32_channelwise(&dstPtrTempChannel[4], &pDst[4]); + rpp_store16_f32_f32_channelwise(&dstPtrTempChannel[8], &pDst[8]); + rpp_store16_f32_f32_channelwise(&dstPtrTempChannel[12], &pDst[12]); + + srcPtrRow += vectorIncrement; + increment_ndim_ptr(dstPtrTempChannel, 16, vectorIncrementPerChannel); + } +#endif + for( ; vectorLoopCount < bufferLength; vectorLoopCount += 16) + { + for(int k = 0; k < 16; k++) + *dstPtrTempChannel[k] = srcPtrRow[k]; + + srcPtrRow += 16; + increment_ndim_ptr(dstPtrTempChannel, 16, 1); + } + srcPtrTemp += srcGenericDescPtr->strides[1]; + increment_ndim_ptr(dstPtrChannel, 16, dstGenericDescPtr->dims[3]); + } + } + // Optimized AVX version for 3D inputs and permutation order (1, 0, 2) + else if(perm[0] == 1 && perm[1] == 0 && perm[2] == 2) + { + Rpp32f *srcPtrRow = srcPtrTemp; + Rpp32f *dstPtrRow = dstPtrTemp; + Rpp32u height = length[0]; + Rpp32u width = length[1]; + Rpp32u channels = length[2]; + Rpp32u copySizeInBytes = channels * sizeof(Rpp32f); + for(int i = 0; i < height; i++) + { + Rpp32f *srcPtrRowTemp = srcPtrRow; + Rpp32f *dstPtrRowTemp = dstPtrRow; + for(int j = 0; j < width; j++) + { + memcpy(dstPtrRowTemp, srcPtrRowTemp, copySizeInBytes); + srcPtrRowTemp += srcGenericDescPtr->strides[2]; + dstPtrRowTemp += dstGenericDescPtr->strides[1]; + } + srcPtrRow += srcGenericDescPtr->strides[1]; + dstPtrRow += dstGenericDescPtr->strides[2]; + } + } + // Optimized AVX version for 3D inputs and permutation order (0, 2, 1) + else if(perm[0] == 0 && perm[1] == 2 && perm[2] == 1) + { + Rpp32f *srcPtrRow = srcPtrTemp; + Rpp32f *dstPtrRow = dstPtrTemp; + for(int i = 0; i < length[0]; i++) + { + compute_2d_pln1_transpose(srcPtrTemp, dstPtrTemp, length[1], length[2], srcGenericDescPtr->strides[2], dstGenericDescPtr->strides[2]); + + // increment src and dst pointers + srcPtrTemp += srcGenericDescPtr->strides[1]; + dstPtrTemp += dstGenericDescPtr->strides[1]; + } + } + else + { + transpose_generic_setup_and_run(srcPtrTemp, dstPtrTemp, length, perm, tensorDims); + } + } + else if (tensorDims == 4) + { + // Optimized AVX version for 4D inputs and permutation order (1, 2, 3, 0) + Rpp32u vectorIncrement = 8; + if(perm[0] == 1 && perm[1] == 2 && perm[2] == 3 && perm[3] == 0) + { + Rpp32u bufferLength = length[perm[3]]; + Rpp32u alignedLength = bufferLength & ~7; + Rpp32f *srcPtr0 = srcPtrTemp; + Rpp32f *dstPtr0 = dstPtrTemp; + Rpp32u stridesIncrement[8] = {0, srcGenericDescPtr->strides[1], 2 * srcGenericDescPtr->strides[1], 3 * srcGenericDescPtr->strides[1], + 4 * srcGenericDescPtr->strides[1], 5 * srcGenericDescPtr->strides[1], 6 * srcGenericDescPtr->strides[1], 7 * srcGenericDescPtr->strides[1]}; + Rpp32u srcIncrement = vectorIncrement * srcGenericDescPtr->strides[1]; + for(int i = 0; i < 
length[perm[0]]; i++) + { + Rpp32f *srcPtr1 = srcPtr0; + Rpp32f *dstPtr1 = dstPtr0; + for(int j = 0; j < length[perm[1]]; j++) + { + Rpp32f *srcPtr2 = srcPtr1; + Rpp32f *dstPtr2 = dstPtr1; + for(int k = 0; k < length[perm[2]]; k++) + { + Rpp32f *srcPtr3 = srcPtr2; + Rpp32f *dstPtr3 = dstPtr2; + + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for( ; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 pSrc = _mm256_setr_ps(srcPtr3[stridesIncrement[0]], srcPtr3[stridesIncrement[1]], srcPtr3[stridesIncrement[2]], srcPtr3[stridesIncrement[3]], + srcPtr3[stridesIncrement[4]], srcPtr3[stridesIncrement[5]], srcPtr3[stridesIncrement[6]], srcPtr3[stridesIncrement[7]]); + rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtr3, &pSrc); + srcPtr3 += srcIncrement; + dstPtr3 += vectorIncrement; + } +#endif + for( ; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtr3++ = *srcPtr3; + srcPtr3 += srcGenericDescPtr->strides[1]; + } + srcPtr2 += 1; + dstPtr2 += dstGenericDescPtr->strides[3]; + } + srcPtr1 += srcGenericDescPtr->strides[3]; + dstPtr1 += dstGenericDescPtr->strides[2]; + } + srcPtr0 += srcGenericDescPtr->strides[2]; + dstPtr0 += dstGenericDescPtr->strides[1]; + } + } + else + { + transpose_generic_setup_and_run(srcPtrTemp, dstPtrTemp, length, perm, tensorDims); + } + } + else + { + transpose_generic_setup_and_run(srcPtrTemp, dstPtrTemp, length, perm, tensorDims); + } + } + } + + return RPP_SUCCESS; +} + +template +RppStatus transpose_generic_host_tensor(T *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + T *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *permTensor, + Rpp32u *roiTensor, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u tensorDims = dstGenericDescPtr->numDims - 1; // exclude batchsize from input dims + Rpp32u batchSize = dstGenericDescPtr->dims[0]; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + T *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + dstPtrTemp = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + + // get the starting address of begin and length values from roiTensor + Rpp32u *roi = roiTensor + batchCount * tensorDims * 2; + Rpp32u *begin = roi; + Rpp32u *length = &roi[tensorDims]; + Rpp32u *perm = permTensor; + + bool copyInput = true; + for(int i = 0; i < tensorDims; i++) + copyInput *= (perm[i] == i); + + // do memcpy of input to output since output order is same as input order + if(copyInput) + { + memcpy(dstPtrTemp, srcPtrTemp, (size_t)(srcGenericDescPtr->strides[0] * sizeof(T))); + } + else + { + for(int i = 1; i < tensorDims; i++) + srcPtrTemp += begin[i - 1] * srcGenericDescPtr->strides[i]; + transpose_generic_setup_and_run(srcPtrTemp, dstPtrTemp, length, perm, tensorDims); + } + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/handlehip.cpp b/src/modules/hip/handlehip.cpp index 42e72db98..08eb93674 100644 --- a/src/modules/hip/handlehip.cpp +++ b/src/modules/hip/handlehip.cpp @@ -239,7 +239,12 @@ struct HandleImpl } hipMalloc(&(this->initHandle->mem.mgpu.rgbArr.rgbmem), sizeof(RpptRGB) * this->nBatchSize); - hipMalloc(&(this->initHandle->mem.mgpu.scratchBufferHip.floatmem), sizeof(Rpp32f) * 8294400); // 3840 x 2160 + + /* (600000 + 293 + 128) * 128 - Maximum scratch memory required for Non Silent Region Detection HIP kernel used in RNNT training (uses a batchsize 128) + - 600000 is the maximum size that 
will be required for MMS buffer based on Librispeech dataset + - 293 is the size required for storing reduction outputs for 600000 size sample + - 128 is the size required for storing cutOffDB values for batch size 128 */ + hipMalloc(&(this->initHandle->mem.mgpu.scratchBufferHip.floatmem), sizeof(Rpp32f) * 76853888); } }; diff --git a/src/modules/hip/hip_tensor_arithmetic_operations.hpp b/src/modules/hip/hip_tensor_arithmetic_operations.hpp index 37d2220b2..59e4ba3f9 100644 --- a/src/modules/hip/hip_tensor_arithmetic_operations.hpp +++ b/src/modules/hip/hip_tensor_arithmetic_operations.hpp @@ -30,5 +30,6 @@ SOFTWARE. #include "kernel/subtract_scalar.hpp" #include "kernel/multiply_scalar.hpp" #include "kernel/magnitude.hpp" +#include "kernel/log.hpp" #endif // HIP_TENSOR_ARITHMETIC_OPERATIONS_HPP diff --git a/src/modules/hip/hip_tensor_audio_augmentations.hpp b/src/modules/hip/hip_tensor_audio_augmentations.hpp new file mode 100644 index 000000000..6db11e222 --- /dev/null +++ b/src/modules/hip/hip_tensor_audio_augmentations.hpp @@ -0,0 +1,32 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#ifndef HIP_TENSOR_AUDIO_AUGMENTATIONS_HPP +#define HIP_TENSOR_AUDIO_AUGMENTATIONS_HPP + +#include "kernel/non_silent_region_detection.hpp" +#include "kernel/down_mixing.hpp" +#include "kernel/to_decibels.hpp" + +#endif // HIP_TENSOR_AUDIO_AUGMENTATIONS_HPP diff --git a/src/modules/hip/hip_tensor_effects_augmentations.hpp b/src/modules/hip/hip_tensor_effects_augmentations.hpp index abdfd30ab..12e80a1f4 100644 --- a/src/modules/hip/hip_tensor_effects_augmentations.hpp +++ b/src/modules/hip/hip_tensor_effects_augmentations.hpp @@ -31,6 +31,8 @@ SOFTWARE. #include "kernel/noise_shot.hpp" #include "kernel/noise_gaussian.hpp" #include "kernel/non_linear_blend.hpp" +#include "kernel/jitter.hpp" +#include "kernel/glitch.hpp" #include "kernel/water.hpp" #include "kernel/ricap.hpp" #include "kernel/vignette.hpp" diff --git a/src/modules/hip/hip_tensor_geometric_augmentations.hpp b/src/modules/hip/hip_tensor_geometric_augmentations.hpp index dcd890139..102e7d686 100644 --- a/src/modules/hip/hip_tensor_geometric_augmentations.hpp +++ b/src/modules/hip/hip_tensor_geometric_augmentations.hpp @@ -35,6 +35,8 @@ SOFTWARE. 
#include "kernel/resize_crop_mirror.hpp" #include "kernel/phase.hpp" #include "kernel/slice.hpp" +#include "kernel/lens_correction.hpp" +#include "kernel/transpose.hpp" #include "kernel/crop_and_patch.hpp" #include "kernel/flip_voxel.hpp" diff --git a/src/modules/hip/kernel/down_mixing.hpp b/src/modules/hip/kernel/down_mixing.hpp new file mode 100644 index 000000000..041780e32 --- /dev/null +++ b/src/modules/hip/kernel/down_mixing.hpp @@ -0,0 +1,72 @@ +#include +#include "rpp_hip_common.hpp" + +__global__ void down_mixing_hip_tensor(float *srcPtr, + uint srcStride, + float *dstPtr, + uint dstStride, + int2 *srcDimsTensor) + +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + int srcLength = srcDimsTensor[id_z].x; + int channels = srcDimsTensor[id_z].y; + + if (id_x >= srcLength) + return; + + float outVal = 0.0f; + uint srcIdx = id_z * srcStride + id_x * channels; + int i = 0; + int alignedChannels = (channels / 8) * 8; + + // do 8 pixel load till alignedChannels value + if (alignedChannels) + { + d_float8 outVal_f8; + outVal_f8.f4[0] = static_cast(0.0f); + outVal_f8.f4[1] = outVal_f8.f4[0]; + for(; i < alignedChannels; i += 8, srcIdx += 8) + { + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + rpp_hip_math_add8(&outVal_f8, &src_f8, &outVal_f8); + } + outVal_f8.f4[0] += outVal_f8.f4[1]; + outVal += (outVal_f8.f1[0] + outVal_f8.f1[1] + outVal_f8.f1[2] + outVal_f8.f1[3]); + } + // process remaining channels + for(; i < channels; i++, srcIdx++) + outVal += srcPtr[srcIdx]; + outVal *= (1.f / channels); + + uint dstIdx = id_z * dstStride + id_x; + dstPtr[dstIdx] = outVal; +} + +RppStatus hip_exec_down_mixing_tensor(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32s *srcDimsTensor, + bool normalizeWeights, + rpp::Handle& handle) +{ + Rpp32s globalThreads_x = dstDescPtr->strides.nStride; + Rpp32s globalThreads_y = 1; + Rpp32s globalThreads_z = dstDescPtr->n; + + hipLaunchKernelGGL(down_mixing_hip_tensor, + dim3(ceil((Rpp32f)globalThreads_x/LOCAL_THREADS_X_1DIM), ceil((Rpp32f)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((Rpp32f)globalThreads_z/LOCAL_THREADS_Z_1DIM)), + dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), + 0, + handle.GetStream(), + srcPtr, + srcDescPtr->strides.nStride, + dstPtr, + dstDescPtr->strides.nStride, + reinterpret_cast(srcDimsTensor)); + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/kernel/erase.hpp b/src/modules/hip/kernel/erase.hpp index 2591b53f0..f18306a9d 100644 --- a/src/modules/hip/kernel/erase.hpp +++ b/src/modules/hip/kernel/erase.hpp @@ -117,12 +117,34 @@ RppStatus hip_exec_erase_tensor(T *srcPtr, int globalThreads_y = dstDescPtr->h; int globalThreads_z = handle.GetBatchSize(); - if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + if (dstDescPtr->layout == RpptLayout::NHWC) { - if (srcDescPtr->dataType == RpptDataType::U8) + // if src layout is NHWC, copy src to dst + if (srcDescPtr->layout == RpptLayout::NHWC) { - hipMemcpyAsync(dstPtr, srcPtr, static_cast(srcDescPtr->n * srcDescPtr->strides.nStride * sizeof(Rpp8u)), hipMemcpyDeviceToDevice, handle.GetStream()); + hipMemcpyAsync(dstPtr, srcPtr, static_cast(srcDescPtr->n * srcDescPtr->strides.nStride * sizeof(T)), hipMemcpyDeviceToDevice, handle.GetStream()); hipStreamSynchronize(handle.GetStream()); + } + // if src layout is NCHW, convert src from NCHW to NHWC + else 
if (srcDescPtr->layout == RpptLayout::NCHW) + { + globalThreads_x = (dstDescPtr->w + 7) >> 3; + hipLaunchKernelGGL(convert_pln3_pkd3_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + roiTensorPtrSrc); + globalThreads_x = dstDescPtr->w; + hipStreamSynchronize(handle.GetStream()); + } + + if (srcDescPtr->dataType == RpptDataType::U8) + { hipLaunchKernelGGL(erase_pkd_hip_tensor, dim3(ceil((float)globalThreads_x / LOCAL_THREADS_X), ceil((float)globalThreads_y / LOCAL_THREADS_Y), ceil((float)globalThreads_z / LOCAL_THREADS_Z)), dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), @@ -137,8 +159,6 @@ RppStatus hip_exec_erase_tensor(T *srcPtr, } else if (srcDescPtr->dataType == RpptDataType::F16) { - hipMemcpyAsync(dstPtr, srcPtr, static_cast(srcDescPtr->n * srcDescPtr->strides.nStride * sizeof(Rpp16f)), hipMemcpyDeviceToDevice, handle.GetStream()); - hipStreamSynchronize(handle.GetStream()); hipLaunchKernelGGL(erase_pkd_hip_tensor, dim3(ceil((float)globalThreads_x / LOCAL_THREADS_X), ceil((float)globalThreads_y / LOCAL_THREADS_Y), ceil((float)globalThreads_z / LOCAL_THREADS_Z)), dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), @@ -153,8 +173,6 @@ RppStatus hip_exec_erase_tensor(T *srcPtr, } else if (srcDescPtr->dataType == RpptDataType::F32) { - hipMemcpyAsync(dstPtr, srcPtr, static_cast(srcDescPtr->n * srcDescPtr->strides.nStride * sizeof(Rpp32f)), hipMemcpyDeviceToDevice, handle.GetStream()); - hipStreamSynchronize(handle.GetStream()); hipLaunchKernelGGL(erase_pkd_hip_tensor, dim3(ceil((float)globalThreads_x / LOCAL_THREADS_X), ceil((float)globalThreads_y / LOCAL_THREADS_Y), ceil((float)globalThreads_z / LOCAL_THREADS_Z)), dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), @@ -169,8 +187,6 @@ RppStatus hip_exec_erase_tensor(T *srcPtr, } else if (srcDescPtr->dataType == RpptDataType::I8) { - hipMemcpyAsync(dstPtr, srcPtr, static_cast(srcDescPtr->n * srcDescPtr->strides.nStride * sizeof(Rpp8s)), hipMemcpyDeviceToDevice, handle.GetStream()); - hipStreamSynchronize(handle.GetStream()); hipLaunchKernelGGL(erase_pkd_hip_tensor, dim3(ceil((float)globalThreads_x / LOCAL_THREADS_X), ceil((float)globalThreads_y / LOCAL_THREADS_Y), ceil((float)globalThreads_z / LOCAL_THREADS_Z)), dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), @@ -245,33 +261,6 @@ RppStatus hip_exec_erase_tensor(T *srcPtr, numBoxesTensor, roiTensorPtrSrc); } - else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) - { - globalThreads_x = (dstDescPtr->w + 7) >> 3; - hipLaunchKernelGGL(convert_pln3_pkd3_hip_tensor, - dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), - dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), - 0, - handle.GetStream(), - srcPtr, - make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), - dstPtr, - make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), - roiTensorPtrSrc); - hipStreamSynchronize(handle.GetStream()); - globalThreads_x = dstDescPtr->w; - hipLaunchKernelGGL(erase_pkd_hip_tensor, - 
dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), - dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), - 0, - handle.GetStream(), - dstPtr, - make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), - anchorBoxInfoTensor, - colorsTensor, - numBoxesTensor, - roiTensorPtrSrc); - } } return RPP_SUCCESS; diff --git a/src/modules/hip/kernel/glitch.hpp b/src/modules/hip/kernel/glitch.hpp new file mode 100644 index 000000000..81c7013c0 --- /dev/null +++ b/src/modules/hip/kernel/glitch.hpp @@ -0,0 +1,278 @@ +#include +#include "rpp_hip_common.hpp" + +template +__device__ __forceinline__ void rpp_hip_load1_glitch(T *srcPtr, uint2 srcStrideCH, float &locSrcX, float &locSrcY, float *dst, int channels) +{ + int srcIdx = locSrcY * srcStrideCH.y + locSrcX * srcStrideCH.x + channels; + rpp_hip_interpolate1_nearest_neighbor_load_pln1(srcPtr + srcIdx, dst); +} + +template +__device__ __forceinline__ void rpp_hip_load8_glitch(T *srcPtr, uint2 srcStrideCH, d_float8 *srcX_f8, d_float8 *srcY_f8, d_float8 *dst_f8, int channels) +{ + rpp_hip_load1_glitch(srcPtr, srcStrideCH, srcX_f8->f1[0], srcY_f8->f1[0], &(dst_f8->f1[0]), channels); + rpp_hip_load1_glitch(srcPtr, srcStrideCH, srcX_f8->f1[1], srcY_f8->f1[1], &(dst_f8->f1[1]), channels); + rpp_hip_load1_glitch(srcPtr, srcStrideCH, srcX_f8->f1[2], srcY_f8->f1[2], &(dst_f8->f1[2]), channels); + rpp_hip_load1_glitch(srcPtr, srcStrideCH, srcX_f8->f1[3], srcY_f8->f1[3], &(dst_f8->f1[3]), channels); + rpp_hip_load1_glitch(srcPtr, srcStrideCH, srcX_f8->f1[4], srcY_f8->f1[4], &(dst_f8->f1[4]), channels); + rpp_hip_load1_glitch(srcPtr, srcStrideCH, srcX_f8->f1[5], srcY_f8->f1[5], &(dst_f8->f1[5]), channels); + rpp_hip_load1_glitch(srcPtr, srcStrideCH, srcX_f8->f1[6], srcY_f8->f1[6], &(dst_f8->f1[6]), channels); + rpp_hip_load1_glitch(srcPtr, srcStrideCH, srcX_f8->f1[7], srcY_f8->f1[7], &(dst_f8->f1[7]), channels); +} + +__device__ void check_locs(d_float8 &xLocVals, d_float8 &yLocVals, RppiPoint offset, RpptROI roiTensorPtrSrc) +{ + for(int i = 0; i < 8; i++) + { + if (xLocVals.f1[i] >= roiTensorPtrSrc.ltrbROI.rb.x || xLocVals.f1[i] < roiTensorPtrSrc.ltrbROI.lt.x || yLocVals.f1[i] >= roiTensorPtrSrc.ltrbROI.rb.y || yLocVals.f1[i] < roiTensorPtrSrc.ltrbROI.lt.y) + { + xLocVals.f1[i] -= offset.x; + yLocVals.f1[i] -= offset.y; + } + } +} + +__device__ void compute_glitch_locs_hip(int id_x, int id_y, RpptChannelOffsets rgbOffsets, RpptROI roiTensorPtrSrc, d_float24 *srcLocsX_f24, d_float24 *srcLocsY_f24) +{ + float4 increment_f4; + increment_f4 = make_float4(0.0f, 1.0f, 2.0f, 3.0f); // 8 element vectorized kernel needs 8 increments - creating uint4 for increments 0, 1, 2, 3 here, and adding (float4)4 later to get 4, 5, 6, 7 incremented srcLocs + + srcLocsX_f24->f4[0] = static_cast(id_x + rgbOffsets.r.x) + increment_f4; // find R channel srcLocsX 0, 1, 2, 3 + srcLocsX_f24->f4[1] = srcLocsX_f24->f4[0] + (float4) 4; // find R channel srcLocsX 4, 5, 6, 7 + srcLocsY_f24->f4[0] = srcLocsY_f24->f4[1] = static_cast(id_y + rgbOffsets.r.y); // find R channel srcLocsY 0, 1, 2, 3 and 4, 5, 6, 7 + check_locs(srcLocsX_f24->f8[0], srcLocsY_f24->f8[0], rgbOffsets.r, roiTensorPtrSrc); // check if all srcLocs in roi bounds + + srcLocsX_f24->f4[2] = static_cast(id_x + rgbOffsets.g.x) + increment_f4; // find G channel srcLocsX 0, 1, 2, 3 + srcLocsX_f24->f4[3] = srcLocsX_f24->f4[2] +(float4) 4; // find G channel srcLocsX 4, 5, 6, 7 + srcLocsY_f24->f4[2] = srcLocsY_f24->f4[3] = 
static_cast(id_y + rgbOffsets.g.y); // find G channel srcLocsY 0, 1, 2, 3 and 4, 5, 6, 7 + check_locs(srcLocsX_f24->f8[1], srcLocsY_f24->f8[1], rgbOffsets.g, roiTensorPtrSrc); // check if all srcLocs in roi bounds + + srcLocsX_f24->f4[4] = static_cast(id_x + rgbOffsets.b.x) + increment_f4; // find B channel srcLocsX 0, 1, 2, 3 + srcLocsX_f24->f4[5] = srcLocsX_f24->f4[4] + (float4) 4; // find B channel srcLocsX 4, 5, 6, 7 + srcLocsY_f24->f4[4] = srcLocsY_f24->f4[5] = static_cast(id_y + rgbOffsets.b.y); // find B channel srcLocsY 0, 1, 2, 3 and 4, 5, 6, 7 + check_locs(srcLocsX_f24->f8[2], srcLocsY_f24->f8[2], rgbOffsets.b, roiTensorPtrSrc); // check if all srcLocs in roi bounds +} + +template +__global__ void glitch_pkd_hip_tensor(T *srcPtr, + uint2 srcStridesNH, + T *dstPtr, + uint2 dstStridesNH, + RpptChannelOffsets *rgbOffsetsPtr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x); + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3; + + RpptChannelOffsets rgbOffsets = rgbOffsetsPtr[id_z]; + uint2 srcStrideCH = make_uint2(3, srcStridesNH.y); + d_float24 dst_f24, srcLocsX_f24, srcLocsY_f24; + + compute_glitch_locs_hip(id_x, id_y, rgbOffsets, roiTensorPtrSrc[id_z], &srcLocsX_f24, &srcLocsY_f24); + rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[0], &srcLocsY_f24.f8[0], &(dst_f24.f8[0]), 0); + rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[1], &srcLocsY_f24.f8[1], &(dst_f24.f8[1]), 1); + rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[2], &srcLocsY_f24.f8[2], &(dst_f24.f8[2]), 2); + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24); +} + +template +__global__ void glitch_pln_hip_tensor(T *srcPtr, + uint3 srcStridesNCH, + T *dstPtr, + uint3 dstStridesNCH, + RpptChannelOffsets *rgbOffsetsPtr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + + RpptChannelOffsets rgbOffsets = rgbOffsetsPtr[id_z]; + uint2 srcStrideCH = make_uint2(1, srcStridesNCH.z); + + d_float24 srcLocsX_f24, srcLocsY_f24; + d_float8 dst_f8; + + compute_glitch_locs_hip(id_x, id_y, rgbOffsets, roiTensorPtrSrc[id_z], &srcLocsX_f24, &srcLocsY_f24); + rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[0], &srcLocsY_f24.f8[0], &dst_f8, 0); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + srcIdx += srcStridesNCH.y; + dstIdx += dstStridesNCH.y; + rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[1], &srcLocsY_f24.f8[1], &dst_f8, 0); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + srcIdx += srcStridesNCH.y; + dstIdx += dstStridesNCH.y; + rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[2], &srcLocsY_f24.f8[2], &dst_f8, 0); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); +} + +template 
+__global__ void glitch_pkd3_pln3_hip_tensor(T *srcPtr, + uint2 srcStridesNH, + T *dstPtr, + uint3 dstStridesNCH, + RpptChannelOffsets *rgbOffsetsPtr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + + RpptChannelOffsets rgbOffsets = rgbOffsetsPtr[id_z]; + uint2 srcStrideCH = make_uint2(3, srcStridesNH.y); + + d_float24 srcLocsX_f24, srcLocsY_f24; + d_float8 dst_f8; + + compute_glitch_locs_hip(id_x, id_y, rgbOffsets, roiTensorPtrSrc[id_z], &srcLocsX_f24, &srcLocsY_f24); + rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[0], &srcLocsY_f24.f8[0], &dst_f8, 0); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + dstIdx += dstStridesNCH.y; + rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[1], &srcLocsY_f24.f8[1], &dst_f8, 1); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + dstIdx += dstStridesNCH.y; + rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[2], &srcLocsY_f24.f8[2], &dst_f8, 2); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); +} + +template +__global__ void glitch_pln3_pkd3_hip_tensor(T *srcPtr, + uint3 srcStridesNCH, + T *dstPtr, + uint2 dstStridesNH, + RpptChannelOffsets *rgbOffsetsPtr, + RpptROIPtr roiTensorPtrSrc) +{ + + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x); + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3; + + RpptChannelOffsets rgbOffsets = rgbOffsetsPtr[id_z]; + uint2 srcStrideCH = make_uint2(1, srcStridesNCH.z); + + d_float24 dst_f24, srcLocsX_f24, srcLocsY_f24; + compute_glitch_locs_hip(id_x, id_y, rgbOffsets, roiTensorPtrSrc[id_z], &srcLocsX_f24, &srcLocsY_f24); + rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[0], &srcLocsY_f24.f8[0], &(dst_f24.f8[0]), 0); + + srcIdx += srcStridesNCH.y; + rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[1], &srcLocsY_f24.f8[1], &(dst_f24.f8[1]), 0); + + srcIdx += srcStridesNCH.y; + rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[2], &srcLocsY_f24.f8[2], &(dst_f24.f8[2]), 0); + + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24); +} + +template +RppStatus hip_exec_glitch_tensor(T *srcPtr, + RpptDescPtr srcDescPtr, + T *dstPtr, + RpptDescPtr dstDescPtr, + RpptChannelOffsets *rgbOffsets, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle& handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + int globalThreads_x = (dstDescPtr->strides.hStride + 7) >> 3; + int globalThreads_y = dstDescPtr->h; + int globalThreads_z = dstDescPtr->n; + + if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(glitch_pln_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), 
ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + rgbOffsets, + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + hipLaunchKernelGGL(glitch_pln3_pkd3_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + rgbOffsets, + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(glitch_pkd3_pln3_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + rgbOffsets, + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + hipLaunchKernelGGL(glitch_pkd_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + rgbOffsets, + roiTensorPtrSrc); + } + return RPP_SUCCESS; +} diff --git a/src/modules/hip/kernel/jitter.hpp b/src/modules/hip/kernel/jitter.hpp new file mode 100644 index 000000000..bbc407cda --- /dev/null +++ b/src/modules/hip/kernel/jitter.hpp @@ -0,0 +1,314 @@ +#include +#include "rpp_hip_common.hpp" +#include "rng_seed_stream.hpp" + +__device__ __forceinline__ void jitter_roi_and_srclocs_hip_compute(int4 *srcRoiPtr_i4, RpptXorwowStateBoxMuller *xorwowState, uint kernelSize, uint bound, int id_x, int id_y, d_float16 *locSrc_f16) +{ + d_float8 widthIncrement_f8, heightIncrement_f8; + rpp_hip_rng_8_xorwow_f32(xorwowState, &widthIncrement_f8); + rpp_hip_math_multiply8_const(&widthIncrement_f8, &widthIncrement_f8, static_cast(kernelSize)); + rpp_hip_rng_8_xorwow_f32(xorwowState, &heightIncrement_f8); + rpp_hip_math_multiply8_const(&heightIncrement_f8, &heightIncrement_f8, static_cast(kernelSize)); + + d_float8 increment_f8, locDst_f8x, locDst_f8y; + increment_f8.f4[0] = make_float4(0.0f, 1.0f, 2.0f, 3.0f); // 8 element vectorized kernel needs 8 increments - creating uint4 for increments 0, 1, 2, 3 here, and adding (float4)4 later to get 4, 5, 6, 7 incremented srcLocs + increment_f8.f4[1] = make_float4(4.0f, 5.0f, 6.0f, 7.0f); + locDst_f8x.f4[0] = static_cast(id_x) + increment_f8.f4[0]; + locDst_f8x.f4[1] = static_cast(id_x) 
+ increment_f8.f4[1]; + locDst_f8y.f4[0] = locDst_f8y.f4[1] = (float4)id_y; + + locSrc_f16->f8[0].f4[0] = static_cast(srcRoiPtr_i4->x) + locDst_f8x.f4[0] + widthIncrement_f8.f4[0] - static_cast(bound); + locSrc_f16->f8[0].f4[1] = static_cast(srcRoiPtr_i4->x) + locDst_f8x.f4[1] + widthIncrement_f8.f4[1] - static_cast(bound); + locSrc_f16->f8[1].f4[0] = static_cast(srcRoiPtr_i4->y) + locDst_f8y.f4[0] + heightIncrement_f8.f4[0] - static_cast(bound); + locSrc_f16->f8[1].f4[1] = static_cast(srcRoiPtr_i4->y) + locDst_f8y.f4[1] + heightIncrement_f8.f4[1] - static_cast(bound); + + // Apply boundary checks and adjustments + for(int i = 0; i < 8; ++i) + { + locSrc_f16->f1[i] = fmaxf(fminf(floorf(locSrc_f16->f1[i]), static_cast(srcRoiPtr_i4->z - 1)), 0.0f); + locSrc_f16->f1[i + 8] = fmaxf(fminf(floorf(locSrc_f16->f1[i + 8]), static_cast(srcRoiPtr_i4->w - bound)), 0.0f); + } +} + +template +__global__ void jitter_pkd_tensor(T *srcPtr, + uint2 srcStridesNH, + T *dstPtr, + uint2 dstStridesNH, + uint *kernelsize, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + uint *xorwowSeedStream, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x); + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + (id_x * 3); + uint seedStreamIdx = (id_y * dstStridesNH.y) + (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; + uint kernelSize = kernelsize[id_z]; + uint bound = (kernelSize - 1) / 2; + + RpptXorwowStateBoxMuller xorwowState; + uint xorwowSeed = xorwowSeedStream[seedStreamIdx % SEED_STREAM_MAX_SIZE]; + xorwowState.x[0] = xorwowInitialStatePtr->x[0] + xorwowSeed; + xorwowState.x[1] = xorwowInitialStatePtr->x[1] + xorwowSeed; + xorwowState.x[2] = xorwowInitialStatePtr->x[2] + xorwowSeed; + xorwowState.x[3] = xorwowInitialStatePtr->x[3] + xorwowSeed; + xorwowState.x[4] = xorwowInitialStatePtr->x[4] + xorwowSeed; + xorwowState.counter = xorwowInitialStatePtr->counter + xorwowSeed; + + int4 srcRoi_i4 = *(int4 *)&roiTensorPtrSrc[id_z]; + d_float16 locSrc_f16; + jitter_roi_and_srclocs_hip_compute(&srcRoi_i4, &xorwowState, kernelSize, bound, id_x, id_y, &locSrc_f16); + + d_float24 dst_f24; + rpp_hip_interpolate24_nearest_neighbor_pkd3(srcPtr + srcIdx, srcStridesNH.y, &locSrc_f16, &srcRoi_i4, &dst_f24); + rpp_hip_pack_float24_pkd3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24); +} + +template +__global__ void jitter_pln_tensor(T *srcPtr, + uint3 srcStridesNCH, + T *dstPtr, + uint3 dstStridesNCH, + int channelsDst, + uint *kernelsize, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + uint *xorwowSeedStream, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + uint seedStreamIdx = (id_y * dstStridesNCH.z) + (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; + uint kernelSize = kernelsize[id_z]; + uint bound = (kernelSize - 1) / 2; + + RpptXorwowStateBoxMuller 
xorwowState; + uint xorwowSeed = xorwowSeedStream[seedStreamIdx % SEED_STREAM_MAX_SIZE]; + xorwowState.x[0] = xorwowInitialStatePtr->x[0] + xorwowSeed; + xorwowState.x[1] = xorwowInitialStatePtr->x[1] + xorwowSeed; + xorwowState.x[2] = xorwowInitialStatePtr->x[2] + xorwowSeed; + xorwowState.x[3] = xorwowInitialStatePtr->x[3] + xorwowSeed; + xorwowState.x[4] = xorwowInitialStatePtr->x[4] + xorwowSeed; + xorwowState.counter = xorwowInitialStatePtr->counter + xorwowSeed; + + int4 srcRoi_i4 = *(int4 *)&roiTensorPtrSrc[id_z]; + d_float16 locSrc_f16; + jitter_roi_and_srclocs_hip_compute(&srcRoi_i4, &xorwowState, kernelSize, bound, id_x, id_y, &locSrc_f16); + + d_float8 dst_f8; + rpp_hip_interpolate8_nearest_neighbor_pln1(srcPtr + srcIdx, srcStridesNCH.z, &locSrc_f16, &srcRoi_i4, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + if (channelsDst == 3) + { + srcIdx += srcStridesNCH.y; + dstIdx += dstStridesNCH.y; + + rpp_hip_interpolate8_nearest_neighbor_pln1(srcPtr + srcIdx, srcStridesNCH.z, &locSrc_f16, &srcRoi_i4, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + srcIdx += srcStridesNCH.y; + dstIdx += dstStridesNCH.y; + + rpp_hip_interpolate8_nearest_neighbor_pln1(srcPtr + srcIdx, srcStridesNCH.z, &locSrc_f16, &srcRoi_i4, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + } +} + +template +__global__ void jitter_pkd3_pln3_tensor(T *srcPtr, + uint2 srcStridesNH, + T *dstPtr, + uint3 dstStridesNCH, + uint *kernelsize, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + uint *xorwowSeedStream, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + uint seedStreamIdx = (id_y * dstStridesNCH.z) + (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; + uint kernelSize = kernelsize[id_z]; + uint bound = (kernelSize - 1) / 2; + + RpptXorwowStateBoxMuller xorwowState; + uint xorwowSeed = xorwowSeedStream[seedStreamIdx % SEED_STREAM_MAX_SIZE]; + xorwowState.x[0] = xorwowInitialStatePtr->x[0] + xorwowSeed; + xorwowState.x[1] = xorwowInitialStatePtr->x[1] + xorwowSeed; + xorwowState.x[2] = xorwowInitialStatePtr->x[2] + xorwowSeed; + xorwowState.x[3] = xorwowInitialStatePtr->x[3] + xorwowSeed; + xorwowState.x[4] = xorwowInitialStatePtr->x[4] + xorwowSeed; + xorwowState.counter = xorwowInitialStatePtr->counter + xorwowSeed; + + int4 srcRoi_i4 = *(int4 *)&roiTensorPtrSrc[id_z]; + d_float16 locSrc_f16; + jitter_roi_and_srclocs_hip_compute(&srcRoi_i4, &xorwowState, kernelSize, bound, id_x, id_y, &locSrc_f16); + + d_float24 dst_f24; + rpp_hip_interpolate24_nearest_neighbor_pkd3(srcPtr + srcIdx, srcStridesNH.y, &locSrc_f16, &srcRoi_i4, &dst_f24); + rpp_hip_pack_float24_pkd3_and_store24_pln3(dstPtr + dstIdx, dstStridesNCH.y, &dst_f24); +} + +template +__global__ void jitter_pln3_pkd3_tensor(T *srcPtr, + uint3 srcStridesNCH, + T *dstPtr, + uint2 dstStridesNH, + uint *kernelsize, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + uint *xorwowSeedStream, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * 
hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x); + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + (id_x * 3); + uint seedStreamIdx = (id_y * dstStridesNH.y) + (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; + uint kernelSize = kernelsize[id_z]; + uint bound = (kernelSize - 1) / 2; + + RpptXorwowStateBoxMuller xorwowState; + uint xorwowSeed = xorwowSeedStream[seedStreamIdx % SEED_STREAM_MAX_SIZE]; + xorwowState.x[0] = xorwowInitialStatePtr->x[0] + xorwowSeed; + xorwowState.x[1] = xorwowInitialStatePtr->x[1] + xorwowSeed; + xorwowState.x[2] = xorwowInitialStatePtr->x[2] + xorwowSeed; + xorwowState.x[3] = xorwowInitialStatePtr->x[3] + xorwowSeed; + xorwowState.x[4] = xorwowInitialStatePtr->x[4] + xorwowSeed; + xorwowState.counter = xorwowInitialStatePtr->counter + xorwowSeed; + + int4 srcRoi_i4 = *(int4 *)&roiTensorPtrSrc[id_z]; + d_float16 locSrc_f16; + jitter_roi_and_srclocs_hip_compute(&srcRoi_i4, &xorwowState, kernelSize, bound, id_x, id_y, &locSrc_f16); + + d_float24 dst_f24; + rpp_hip_interpolate24_nearest_neighbor_pln3(srcPtr + srcIdx, &srcStridesNCH, &locSrc_f16, &srcRoi_i4, &dst_f24); + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24); +} + +template +RppStatus hip_exec_jitter_tensor(T *srcPtr, + RpptDescPtr srcDescPtr, + T *dstPtr, + RpptDescPtr dstDescPtr, + uint *kernelSizeTensor, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle& handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + + int globalThreads_x = (dstDescPtr->strides.hStride + 7) >> 3; + int globalThreads_y = dstDescPtr->h; + int globalThreads_z = dstDescPtr->n; + + Rpp32u *xorwowSeedStream; + xorwowSeedStream = (Rpp32u *)&xorwowInitialStatePtr[1]; + CHECK_RETURN_STATUS(hipMemcpyAsync(xorwowSeedStream, rngSeedStream4050, SEED_STREAM_MAX_SIZE * sizeof(Rpp32u), hipMemcpyHostToDevice, handle.GetStream())); + + if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + hipLaunchKernelGGL(jitter_pkd_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + kernelSizeTensor, + xorwowInitialStatePtr, + xorwowSeedStream, + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(jitter_pln_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + dstDescPtr->c, + kernelSizeTensor, + xorwowInitialStatePtr, + xorwowSeedStream, + roiTensorPtrSrc); + } + else if ((srcDescPtr->c == 3) && (dstDescPtr->c == 3)) + { + if ((srcDescPtr->layout == 
RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(jitter_pkd3_pln3_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + kernelSizeTensor, + xorwowInitialStatePtr, + xorwowSeedStream, + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + globalThreads_x = (srcDescPtr->strides.hStride + 7) >> 3; + hipLaunchKernelGGL(jitter_pln3_pkd3_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + kernelSizeTensor, + xorwowInitialStatePtr, + xorwowSeedStream, + roiTensorPtrSrc); + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/hip/kernel/lens_correction.hpp b/src/modules/hip/kernel/lens_correction.hpp new file mode 100644 index 000000000..0d53db7e1 --- /dev/null +++ b/src/modules/hip/kernel/lens_correction.hpp @@ -0,0 +1,183 @@ +#include +#include "rpp_hip_common.hpp" + +// -------------------- Set 0 - lens_correction device helpers -------------------- + +__device__ __forceinline__ void camera_coordinates_hip_compute(d_float8 *cameraCoords_f8, int id_y, d_float8 *locDst_f8x, float3 *inverseMatrix) +{ + float4 inverseCoord1_f4 = static_cast(id_y * inverseMatrix->y + inverseMatrix->z); + float4 inverseCoord2_f4 = static_cast(inverseMatrix->x); + cameraCoords_f8->f4[0] = inverseCoord1_f4 + locDst_f8x->f4[0] * inverseCoord2_f4; + cameraCoords_f8->f4[1] = inverseCoord1_f4 + locDst_f8x->f4[1] * inverseCoord2_f4; +} + +// -------------------- Set 1 - lens_correction kernels -------------------- + +// compute inverse of 3x3 camera matrix +__global__ void compute_inverse_matrix_hip_tensor(d_float9 *matTensor, d_float9 *invMatTensor) +{ + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + d_float9 *mat_f9 = &matTensor[id_z]; + d_float9 *invMat_f9 = &invMatTensor[id_z]; + + // initialize all values in invMat_f9 to zero + invMat_f9->f3[0] = static_cast(0.0f); + invMat_f9->f3[1] = invMat_f9->f3[0]; + invMat_f9->f3[2] = invMat_f9->f3[0]; + + // compute determinant mat_f9 + float det = (mat_f9->f1[0] * ((mat_f9->f1[4] * mat_f9->f1[8]) - (mat_f9->f1[7] * mat_f9->f1[5]))) + - (mat_f9->f1[1] * ((mat_f9->f1[3] * mat_f9->f1[8]) - (mat_f9->f1[5] * mat_f9->f1[6]))) + + (mat_f9->f1[2] * ((mat_f9->f1[3] * mat_f9->f1[7]) - (mat_f9->f1[4] * mat_f9->f1[6]))); + if(det != 0) + { + float invDet = 1 / det; + invMat_f9->f1[0] = (mat_f9->f1[4] * mat_f9->f1[8] - mat_f9->f1[7] * mat_f9->f1[5]) * invDet; + invMat_f9->f1[1] = (mat_f9->f1[2] * mat_f9->f1[7] - mat_f9->f1[1] * mat_f9->f1[8]) * invDet; + invMat_f9->f1[2] = (mat_f9->f1[1] * mat_f9->f1[5] - mat_f9->f1[2] * mat_f9->f1[4]) * invDet; + invMat_f9->f1[3] = (mat_f9->f1[5] * mat_f9->f1[6] - mat_f9->f1[3] * mat_f9->f1[8]) * invDet; + invMat_f9->f1[4] = (mat_f9->f1[0] * mat_f9->f1[8] - mat_f9->f1[2] * 
mat_f9->f1[6]) * invDet; + invMat_f9->f1[5] = (mat_f9->f1[3] * mat_f9->f1[2] - mat_f9->f1[0] * mat_f9->f1[5]) * invDet; + invMat_f9->f1[6] = (mat_f9->f1[3] * mat_f9->f1[7] - mat_f9->f1[6] * mat_f9->f1[4]) * invDet; + invMat_f9->f1[7] = (mat_f9->f1[6] * mat_f9->f1[1] - mat_f9->f1[0] * mat_f9->f1[7]) * invDet; + invMat_f9->f1[8] = (mat_f9->f1[0] * mat_f9->f1[4] - mat_f9->f1[3] * mat_f9->f1[1]) * invDet; + } +} + +// compute remap tables from the camera matrix and distortion coefficients +__global__ void compute_remap_tables_hip_tensor(float *rowRemapTable, + float *colRemapTable, + d_float9 *cameraMatrixTensor, + d_float9 *inverseMatrixTensor, + d_float8 *distortionCoeffsTensor, + uint2 remapTableStridesNH, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + d_float9 cameraMatrix_f9 = cameraMatrixTensor[id_z]; + d_float9 inverseMatrix_f9 = inverseMatrixTensor[id_z]; + d_float8 distortionCoeffs_f8 = distortionCoeffsTensor[id_z]; + + // Get radial and tangential distortion coefficients + float radialCoeff[6] = {distortionCoeffs_f8.f1[0], distortionCoeffs_f8.f1[1], distortionCoeffs_f8.f1[4], distortionCoeffs_f8.f1[5], distortionCoeffs_f8.f1[6], distortionCoeffs_f8.f1[7]}; + float tangentialCoeff[2] = {distortionCoeffs_f8.f1[2], distortionCoeffs_f8.f1[3]}; + + uint dstIdx = id_z * remapTableStridesNH.x + id_y * remapTableStridesNH.y + id_x; + d_float8 locDst_f8x; + locDst_f8x.f4[0] = static_cast(id_x) + make_float4(0, 1, 2, 3); + locDst_f8x.f4[1] = static_cast(id_x) + make_float4(4, 5, 6, 7); + + float4 one_f4 = static_cast(1.0f); + float4 two_f4 = static_cast(2.0f); + d_float8 z_f8, y_f8, x_f8; + camera_coordinates_hip_compute(&z_f8, id_y, &locDst_f8x, &inverseMatrix_f9.f3[2]); // float zCamera = id_y * inverseMatrix.f1[7] + inverseMatrix.f1[8] + id_x * inverseMatrix.f1[6] + camera_coordinates_hip_compute(&y_f8, id_y, &locDst_f8x, &inverseMatrix_f9.f3[1]); // float yCamera = id_y * inverseMatrix.f1[4] + inverseMatrix.f1[5] + id_x * inverseMatrix.f1[3] + camera_coordinates_hip_compute(&x_f8, id_y, &locDst_f8x, &inverseMatrix_f9.f3[0]); // float xCamera = id_y * inverseMatrix.f1[1] + inverseMatrix.f1[2] + id_x * inverseMatrix.f1[0] + rpp_hip_math_divide8_const(&z_f8, &z_f8, one_f4); // float z = 1./zCamera + rpp_hip_math_multiply8(&y_f8, &z_f8, &y_f8); // float y = yCamera * z; + rpp_hip_math_multiply8(&x_f8, &z_f8, &x_f8); // float x = xCamera * z; + + d_float8 ySquare_f8, xSquare_f8; + rpp_hip_math_multiply8(&y_f8, &y_f8, &ySquare_f8); // float ySquare = x * x + rpp_hip_math_multiply8(&x_f8, &x_f8, &xSquare_f8); // float xSquare = x * x + + d_float8 r2_f8, kr_f8, kr1_f8, kr2_f8; + rpp_hip_math_add8(&xSquare_f8, &ySquare_f8, &r2_f8); // float r2 = xSquare + ySquare + + d_float8 r2Cube_f8, r2Square_f8; + rpp_hip_math_multiply8(&r2_f8, &r2_f8, &r2Square_f8); // float r2Square = r2 * r2; + rpp_hip_math_multiply8(&r2Square_f8, &r2_f8, &r2Cube_f8); // float r2Cube = r2Square * r2; + + d_float24 radialCoeff_f24; + radialCoeff_f24.f4[0] = static_cast(radialCoeff[0]); + radialCoeff_f24.f4[1] = static_cast(radialCoeff[1]); + radialCoeff_f24.f4[2] = static_cast(radialCoeff[2]); + radialCoeff_f24.f4[3] = static_cast(radialCoeff[3]); + radialCoeff_f24.f4[4] = static_cast(radialCoeff[4]); + radialCoeff_f24.f4[5] = 
static_cast(radialCoeff[5]); + + // float kr = (1 + (radialCoeff[2] * r2Cube) + (radialCoeff[1] * r2Square) + (radialCoeff[0]) * r2)) / (1 + (radialCoeff[5] * r2Cube) + (radialCoeff[4] * r2Square) + (radialCoeff[3]) *r2)) + kr1_f8.f4[0] = (one_f4 + (radialCoeff_f24.f4[2] * r2Cube_f8.f4[0]) + (radialCoeff_f24.f4[1] * r2Square_f8.f4[0]) + (radialCoeff_f24.f4[0] * r2_f8.f4[0])); + kr1_f8.f4[1] = (one_f4 + (radialCoeff_f24.f4[2] * r2Cube_f8.f4[1]) + (radialCoeff_f24.f4[1] * r2Square_f8.f4[1]) + (radialCoeff_f24.f4[0] * r2_f8.f4[1])); + kr2_f8.f4[0] = (one_f4 + (radialCoeff_f24.f4[5] * r2Cube_f8.f4[0]) + (radialCoeff_f24.f4[4] * r2Square_f8.f4[0]) + (radialCoeff_f24.f4[3] * r2_f8.f4[0])); + kr2_f8.f4[1] = (one_f4 + (radialCoeff_f24.f4[5] * r2Cube_f8.f4[1]) + (radialCoeff_f24.f4[4] * r2Square_f8.f4[1]) + (radialCoeff_f24.f4[3] * r2_f8.f4[1])); + rpp_hip_math_divide8(&kr1_f8, &kr2_f8, &kr_f8); + + d_float8 xyMul2_f8; + rpp_hip_math_multiply8(&x_f8, &y_f8, &xyMul2_f8); + rpp_hip_math_multiply8_const(&xyMul2_f8, &xyMul2_f8, two_f4); // float xyMul2 = 2 * x * y + + d_float8 colLoc_f8, rowLoc_f8; + rpp_hip_math_multiply8_const(&xSquare_f8, &xSquare_f8, two_f4); // xSquare = xSquare * 2; + rpp_hip_math_multiply8_const(&ySquare_f8, &ySquare_f8, two_f4); // ySquare = ySquare * 2; + + d_float16 cameraMatrix_f16; + cameraMatrix_f16.f4[0] = static_cast(cameraMatrix_f9.f1[0]); + cameraMatrix_f16.f4[1] = static_cast(cameraMatrix_f9.f1[2]); + cameraMatrix_f16.f4[2] = static_cast(cameraMatrix_f9.f1[4]); + cameraMatrix_f16.f4[3] = static_cast(cameraMatrix_f9.f1[5]); + + d_float8 tangentialCoeff_f8; + tangentialCoeff_f8.f4[0] = static_cast(tangentialCoeff[0]); + tangentialCoeff_f8.f4[1] = static_cast(tangentialCoeff[1]); + + // float colLoc = cameraMatrix[0] * (x * kr + tangentialCoeff[0] * xyMul2 + tangentialCoeff[1] * (r2 + 2 * xSquare)) + cameraMatrix[2]; + colLoc_f8.f4[0] = cameraMatrix_f16.f4[0] * ((x_f8.f4[0] * kr_f8.f4[0]) + (tangentialCoeff_f8.f4[0] * xyMul2_f8.f4[0]) + (tangentialCoeff_f8.f4[1] * (r2_f8.f4[0] + xSquare_f8.f4[0]))) + cameraMatrix_f16.f4[1]; + colLoc_f8.f4[1] = cameraMatrix_f16.f4[0] * ((x_f8.f4[1] * kr_f8.f4[1]) + (tangentialCoeff_f8.f4[0] * xyMul2_f8.f4[1]) + (tangentialCoeff_f8.f4[1] * (r2_f8.f4[1] + xSquare_f8.f4[1]))) + cameraMatrix_f16.f4[1]; + + // float rowLoc = cameraMatrix[4] * (y * kr + tangentialCoeff[1] * xyMul2 + tangentialCoeff[0] * (r2 + 2 * ySquare)) + cameraMatrix[4]; + rowLoc_f8.f4[0] = cameraMatrix_f16.f4[2] * ((y_f8.f4[0] * kr_f8.f4[0]) + (tangentialCoeff_f8.f4[1] * xyMul2_f8.f4[0]) + (tangentialCoeff_f8.f4[0] * (r2_f8.f4[0] + ySquare_f8.f4[0]))) + cameraMatrix_f16.f4[3]; + rowLoc_f8.f4[1] = cameraMatrix_f16.f4[2] * ((y_f8.f4[1] * kr_f8.f4[1]) + (tangentialCoeff_f8.f4[1] * xyMul2_f8.f4[1]) + (tangentialCoeff_f8.f4[0] * (r2_f8.f4[1] + ySquare_f8.f4[1]))) + cameraMatrix_f16.f4[3]; + + rpp_hip_pack_float8_and_store8(colRemapTable + dstIdx, &colLoc_f8); + rpp_hip_pack_float8_and_store8(rowRemapTable + dstIdx, &rowLoc_f8); +} + +// -------------------- Set 2 - Kernel Executors -------------------- + +RppStatus hip_exec_lens_correction_tensor(RpptDescPtr dstDescPtr, + Rpp32f *rowRemapTable, + Rpp32f *colRemapTable, + RpptDescPtr remapTableDescPtr, + Rpp32f *cameraMatrix, + Rpp32f *distanceCoeffs, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle& handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + + int globalThreads_x = (dstDescPtr->w + 7) >> 3; + int globalThreads_y = dstDescPtr->h; + int 
globalThreads_z = dstDescPtr->n; + + float *inverseMatrix = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem; + hipLaunchKernelGGL(compute_inverse_matrix_hip_tensor, + dim3(1, 1, ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(1, 1, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + reinterpret_cast(cameraMatrix), + reinterpret_cast(inverseMatrix)); + hipLaunchKernelGGL(compute_remap_tables_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + rowRemapTable, + colRemapTable, + reinterpret_cast(cameraMatrix), + reinterpret_cast(inverseMatrix), + reinterpret_cast(distanceCoeffs), + make_uint2(remapTableDescPtr->strides.nStride, remapTableDescPtr->strides.hStride), + roiTensorPtrSrc); + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/kernel/log.hpp b/src/modules/hip/kernel/log.hpp new file mode 100644 index 000000000..a481a1e07 --- /dev/null +++ b/src/modules/hip/kernel/log.hpp @@ -0,0 +1,232 @@ +#include +#include "rpp_hip_common.hpp" + +// -------------------- Set 1 - helper kernels -------------------- +template +__device__ void log_hip_compute(T *srcPtr, d_float8 *src_f8, d_float8 *dst_f8) +{ + if constexpr (std::is_same::value) + rpp_hip_math_add8_const(src_f8, src_f8, (float4)128); + + rpp_hip_math_log(src_f8, dst_f8); +} + +// -------------------- Set 2 - log kernels -------------------- +template +__global__ void log_1d_hip_tensor(T *srcPtr, + uint srcStrides, + U *dstPtr, + uint dstStrides, + uint *roiTensor) +{ + uint id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // width + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // batchsize + + uint *roi = &roiTensor[id_z * 2]; + uint beginX = roi[0]; + uint width = roi[1]; + + if (id_x >= width) + return; + + uint srcIdx = (id_z * srcStrides) + id_x + beginX; + uint dstIdx = (id_z * dstStrides) + id_x; + + d_float8 src_f8, dst_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + log_hip_compute(srcPtr, &src_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); +} + +template +__global__ void log_2d_hip_tensor(T *srcPtr, + uint2 srcStridesNH, + U *dstPtr, + uint2 dstStridesNH, + uint *roiTensor) +{ + uint id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // width + uint id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // height + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // batchsize + + uint *roi = &roiTensor[id_z * 4]; + uint beginY = roi[0]; + uint beginX = roi[1]; + uint height = roi[2]; + uint width = roi[3]; + + if (id_x >= width || id_y >= height) + return; + + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + beginY) * srcStridesNH.y) + id_x + beginX; + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x; + + d_float8 src_f8, dst_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + log_hip_compute(srcPtr, &src_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); +} + +template +__global__ void log_3d_hip_tensor(T *srcPtr, + uint2 srcStridesDH, + U *dstPtr, + uint2 dstStridesDH, + uint *roiTensor) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; // lengthX + uint id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // lengthY + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // lengthZ + + uint *roi = roiTensor; + 
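    /* Per-sample ROI layout assumed here, consistent with the unpacking just below and with the
       executor, which passes &roiTensor[batchCount * 6] for the 3D case:
       roi[0..2] = beginZ, beginY, beginX and roi[3..5] = lengthZ, lengthY, lengthX.
       For example, a full-volume ROI on a D x H x W input would be {0, 0, 0, D, H, W}. */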
uint beginZ = roi[0]; + uint beginY = roi[1]; + uint beginX = roi[2]; + uint lengthZ = roi[3]; + uint lengthY = roi[4]; + uint lengthX = roi[5]; + + if (id_x >= lengthX || id_y >= lengthY || id_z >= lengthZ) + return; + + uint srcIdx = ((id_z + beginZ) * srcStridesDH.x) + ((id_y + beginY) * srcStridesDH.y) + id_x + beginX; + uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x; + + d_float8 src_f8, dst_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + log_hip_compute(srcPtr, &src_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); +} + +template +__global__ void log_nd_hip_tensor(T *srcPtr, + uint *srcStrides, + uint *srcDims, + uint numDims, + U *dstPtr, + uint *dstStrides, + Rpp32u *roiTensor) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // batchsize + + if(id_x >= srcStrides[0]) + return; + + uint *roi = roiTensor + id_z * numDims * 2; + uint *begin = roi; + uint *length = &roi[numDims]; + uint dstIdx = (id_z * *dstStrides++); + uint srcIdx = (id_z * *srcStrides++); + uint coords[RPPT_MAX_DIMS]; + + for (int i = 0; i < numDims; i++) + { + coords[i] = (id_x / srcStrides[i]) % srcDims[i]; + if(coords[i] >= length[i]) + return; + } + + for (int i = 0; i < numDims; i++) + { + dstIdx += (coords[i] * dstStrides[i]); + srcIdx += (begin[i] + (coords[i] * srcStrides[i])); + } + + d_float8 src_f8, dst_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + log_hip_compute(srcPtr, &src_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); +} + +// -------------------- Set 3 - executor kernels -------------------- +template +RppStatus hip_exec_log_generic_tensor(T *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + U *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + uint *roiTensor, + rpp::Handle& handle) +{ + Rpp32u numDims = srcGenericDescPtr->numDims - 1; // exclude batchsize from input dims + // based on number of dimensions call the corresponding kernel + if (numDims == 1) + { + // NW + int globalThreads_x = dstGenericDescPtr->dims[1]; + int globalThreads_y = 1; + int globalThreads_z = dstGenericDescPtr->dims[0]; + + hipLaunchKernelGGL(log_1d_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + srcGenericDescPtr->strides[0], + dstPtr, + dstGenericDescPtr->strides[0], + roiTensor); + } + else if (numDims == 2) + { + // NHW + int globalThreads_x = dstGenericDescPtr->dims[2]; + int globalThreads_y = dstGenericDescPtr->dims[1]; + int globalThreads_z = dstGenericDescPtr->dims[0]; + + hipLaunchKernelGGL(log_2d_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1]), + dstPtr, + make_uint2(dstGenericDescPtr->strides[0], dstGenericDescPtr->strides[1]), + roiTensor); + } + else if (numDims == 3) + { + // NDHW + int globalThreads_x = dstGenericDescPtr->dims[3]; + int globalThreads_y = dstGenericDescPtr->dims[2]; + int globalThreads_z = dstGenericDescPtr->dims[1]; + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + 
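            /* Sketch of the 3D dispatch as read from the launch below: one grid is launched per
               batch sample, with srcPtr/dstPtr advanced by strides[0] (the per-sample stride) and
               the kernel given only the depth and row strides plus that sample's 6-element ROI,
               so the kernel itself never needs the batch index. */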
hipLaunchKernelGGL(log_3d_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr + (batchCount * srcGenericDescPtr->strides[0]), + make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]), + &roiTensor[batchCount * 6]); + } + } + else + { + // interpret the input as 1D tensor + int globalThreads_x = (dstGenericDescPtr->strides[0] + 7) >> 3; + int globalThreads_y = 1; + int globalThreads_z = dstGenericDescPtr->dims[0]; + + hipLaunchKernelGGL(log_nd_hip_tensor, + dim3(ceil((float)globalThreads_x/1024), ceil((float)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((float)globalThreads_z/LOCAL_THREADS_Z_1DIM)), + dim3(1024, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), + 0, + handle.GetStream(), + srcPtr, + srcGenericDescPtr->strides, + srcGenericDescPtr->dims + 1, + srcGenericDescPtr->numDims - 1, + dstPtr, + dstGenericDescPtr->strides, + roiTensor); + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/kernel/non_silent_region_detection.hpp b/src/modules/hip/kernel/non_silent_region_detection.hpp new file mode 100644 index 000000000..80511464b --- /dev/null +++ b/src/modules/hip/kernel/non_silent_region_detection.hpp @@ -0,0 +1,426 @@ +#include +#include "rpp_hip_common.hpp" + +// -------------------- Set 0 - moving mean square kernel device helpers -------------------- + +// calculate the position in shared memory to avoid bank conflicts +__host__ __device__ __forceinline__ int compute_pos_in_smem(int pos) +{ + return pos + (pos >> 5); // since shared memory banks considered is 32 +} + +/* compute prefix sum on the input buffer passed + prefix sum of an array is an array where each element is the sum of all previous elements in the input array, inclusive of the current element */ +__device__ __forceinline__ void compute_prefix_sum(float *input, uint bufferLength) +{ + int offset = 1; + int2 offset_i2 = static_cast(offset); + int2 offsetAB_i2 = make_int2(offset - 1, 2 * offset - 1); + int threadIdxMul2 = 2 * hipThreadIdx_x; + int blockDimMul2 = 2 * hipBlockDim_x; + + /* compute intermediate prefix sums in a up sweep manner + (each level in the hierarchy doubles the distance between the pairs of elements being added) */ + for (int d = bufferLength >> 1; d > 0; d >>= 1) + { + // syncthreads before proceeding to next iteration + __syncthreads(); + int dMul2 = 2 * d; + for (int idxMul2 = threadIdxMul2; idxMul2 < dMul2; idxMul2 += blockDimMul2) + { + int2 pos_i2 = (offset_i2 * static_cast(idxMul2)) + offsetAB_i2; + input[compute_pos_in_smem(pos_i2.y)] += input[compute_pos_in_smem(pos_i2.x)]; + } + offset <<= 1; + offset_i2 = static_cast(offset); + offsetAB_i2 = make_int2(offset - 1, 2 * offset - 1); + } + + if (hipThreadIdx_x == 0) + { + int last = bufferLength - 1; + input[compute_pos_in_smem(last)] = 0; + } + + /* compute final prefix sums in a down sweep manner + (each level in the hierarchy halves the distance between the pairs of elements being added) */ + for (int d = 1; d < bufferLength; d <<= 1) + { + offset >>= 1; + offset_i2 = static_cast(offset); + offsetAB_i2 = make_int2(offset - 1, 2 * offset - 1); + __syncthreads(); + // syncthreads before proceeding to next iteration + + int dMul2 = 2 * d; + for (int idxMul2 = 
threadIdxMul2; idxMul2 < dMul2; idxMul2 += blockDimMul2) + { + int2 pos_i2 = offset_i2 * static_cast(idxMul2) + offsetAB_i2; + int posA = compute_pos_in_smem(pos_i2.x); + int posB = compute_pos_in_smem(pos_i2.y); + float t = input[posA]; + input[posA] = input[posB]; + input[posB] += t; + } + } + __syncthreads(); +} + +// -------------------- Set 1 - moving mean square compute kernel -------------------- + +__global__ void moving_mean_square_hip_tensor(float *srcPtr, + uint nStride, + float *mmsArr, + int *srcLengthTensor, + int outputTileLength, + int windowLength, + float windowFactor, + int inputTileLength) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + uint srcLength = srcLengthTensor[id_z]; + uint batchStride = id_z * nStride; + int blockStart = hipBlockIdx_x * outputTileLength; + + if (blockStart >= srcLength) + return; + + float *input = srcPtr + batchStride; + extern __shared__ float squaredPrefixSum_smem[]; + + float *inBlockPtr = srcPtr + batchStride + blockStart; + float *outBlockPtr = mmsArr + batchStride + blockStart; + + // find the valid output tile length values needed for given block + int validOutputTileLength = std::min(outputTileLength, srcLength - blockStart); + + // assign pointers that points to block begin and block end locations + float *extendedBlockStart = inBlockPtr - windowLength; + float *extendedBlockEnd = inBlockPtr + validOutputTileLength; + + // load input data to shared memory + for(int pos = hipThreadIdx_x; pos < inputTileLength; pos += hipBlockDim_x) + { + float val = 0.0f; + auto extendedBlockPtr = extendedBlockStart + pos; + + /* check if extendedBlockPtr is within the valid region of input + and load the value from extendedBlockPtr if it is within valid region */ + if (extendedBlockPtr >= input && extendedBlockPtr < extendedBlockEnd) + val = *extendedBlockPtr; + squaredPrefixSum_smem[compute_pos_in_smem(pos)] = val * val; + } + + // compute prefix sum + compute_prefix_sum(squaredPrefixSum_smem, inputTileLength); + + // compute the mms value here + for(int pos = hipThreadIdx_x; pos < validOutputTileLength; pos += hipBlockDim_x) + outBlockPtr[pos] = windowFactor * ((inBlockPtr[pos] * inBlockPtr[pos]) + squaredPrefixSum_smem[compute_pos_in_smem(windowLength + pos)] - squaredPrefixSum_smem[compute_pos_in_smem(pos + 1)]); +} + +// -------------------- Set 2 - kernels for finding cutoffmag value -------------------- + +__global__ void max_reduction_hip_tensor(float *srcPtr, + uint nStride, + float *maxArr, + int *srcLengthTensor) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + uint srcLength = srcLengthTensor[id_z]; + + uint srcIdx = id_z * nStride; + __shared__ float max_smem[256]; // 256 values of src in a 256 x 1 thread block + max_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 256 threads + + if (id_x >= srcLength) + return; + + if (id_x + 8 > srcLength) + id_x -= (id_x + 8 - srcLength); + + srcIdx += id_x; + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_max8(&src_f8, &max_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after max compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + max_smem[hipThreadIdx_x] = 
fmaxf(max_smem[hipThreadIdx_x], max_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + { + int dstIdx = id_z * hipGridDim_x + hipBlockIdx_x; + maxArr[dstIdx] = max_smem[0]; + } +} + +__global__ void cutoffmag_hip_tensor(float *srcPtr, + int maxLength, + float *cutOffMagPtr, + float cutOff, + float referencePower, + bool referenceMax) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + // if referenceMax is set to true, perform final max reduction on srcPtr and compute cutOffMag + if(referenceMax) + { + uint srcIdx = id_z * maxLength; + __shared__ float max_smem[256]; // 256 values of src in a 256 x 1 thread block + max_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 256 threads + + if (id_x >= maxLength) + return; + + srcIdx += id_x; + float maxVal = srcPtr[srcIdx]; + while (id_x < maxLength) + { + maxVal = fmaxf(maxVal, srcPtr[srcIdx]); + id_x += hipBlockDim_x; + srcIdx += hipBlockDim_x; + } + max_smem[hipThreadIdx_x] = maxVal; + __syncthreads(); // syncthreads after max compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + max_smem[hipThreadIdx_x] = max(max_smem[hipThreadIdx_x], max_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + cutOffMagPtr[id_z] = max_smem[0] * cutOff; + } + else + { + if (hipThreadIdx_x == 0) + cutOffMagPtr[id_z] = referencePower * cutOff; + } +} + +// -------------------- Set 3 - kernels for finding begin and length of NSR in inputs -------------------- + +__global__ void find_region_hip_tensor(float *srcPtr, + uint nStride, + int *beginTensor, + int *lengthTensor, + float *cutOffMagPtr, + int *srcLengthTensor, + float windowLength) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + uint srcLength = srcLengthTensor[id_z]; + float cutOffMag = cutOffMagPtr[id_z]; + + __shared__ int beginResult; + __shared__ int endResult; + beginResult = srcLength; + endResult = 0; + __syncthreads(); + + int beginIdx = srcLength; + int endIdx = 0; + uint stridePerSample = id_z * nStride; + + // Find the begin index in src whose value is >= cutOffMag + for (int i = id_x; i < srcLength; i += hipBlockDim_x) + { + uint srcIdx = stridePerSample + i; + if (srcPtr[srcIdx] >= cutOffMag) + { + beginIdx = i; + atomicMin(&beginResult, beginIdx); + if(beginResult != srcLength) + break; + } + } + + // Find the end index in src whose value is >= cutOffMag + for (int i = id_x; i < srcLength; i += hipBlockDim_x) + { + uint srcIdx = stridePerSample + srcLength - 1 - i; + if (srcPtr[srcIdx] >= cutOffMag) + { + endIdx = srcLength - 1 - i; + atomicMax(&endResult, endIdx); + if(endResult != 0) + break; + } + } + + // Final store to dst + if(hipThreadIdx_x == 0) + { + if(beginResult == srcLength || endResult == 0) + { + beginTensor[id_z] = 0; + lengthTensor[id_z] = 0; + } + else + { + int detectBegin = beginResult; + int detectEnd = endResult - beginResult + 1; + + // if both starting index and length of nonsilent region is not 0 + // adjust the values as per the windowLength + if(detectBegin != 0 && detectEnd != 0) + { + int newBegin = max(detectBegin - (windowLength - 1), 0); + detectEnd += detectBegin - newBegin; + detectBegin = newBegin; + } + beginTensor[id_z] = 
detectBegin; + lengthTensor[id_z] = detectEnd; + } + } +} + +// -------------------- Set 4 - host helpers for kernel executor -------------------- + +// return the nearest previous power of 2 for the given number +inline Rpp32s prev_pow2(Rpp32s n) +{ + Rpp32s pow2 = 1; + while (n - pow2 > pow2) + pow2 += pow2; + + return pow2; +} + +// return the nearest next power of 2 for the given number +inline Rpp32s next_pow2(Rpp32s n) +{ + Rpp32s pow2 = 1; + while (n > pow2) + pow2 += pow2; + + return pow2; +} + +// -------------------- Set 5 - non silent region kernels executor -------------------- + +RppStatus hip_exec_non_silent_region_detection_tensor(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32s *srcLengthTensor, + Rpp32s *detectedIndexTensor, + Rpp32s *detectionLengthTensor, + Rpp32f cutOffDB, + Rpp32s windowLength, + Rpp32f referencePower, + Rpp32s resetInterval, + rpp::Handle& handle) +{ + // check if scratch memory size required for moving mean square is within the limits + if ((srcDescPtr->n * srcDescPtr->strides.nStride) > MMS_MAX_SCRATCH_MEMORY) + return RPP_ERROR_OUT_OF_BOUND_SCRATCH_MEMORY_SIZE; + + Rpp32f *mmsArr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem; + Rpp32s maxSharedMemoryInBytes = handle.GetLocalMemorySize(); + Rpp32s maxSharedMemoryElements = maxSharedMemoryInBytes / sizeof(Rpp32f); + Rpp32s kSharedMemBanks = 32; + Rpp32s inputTileLength = prev_pow2(maxSharedMemoryElements * kSharedMemBanks / (kSharedMemBanks + 1)); + + if (resetInterval > 0 && resetInterval < inputTileLength) + { + Rpp32s p = prev_pow2(resetInterval); + Rpp32s n = next_pow2(resetInterval); + if (p > windowLength) + inputTileLength = p; + else if (n < inputTileLength) + inputTileLength = n; + } + + Rpp32s sharedMemorySizeInBytes = compute_pos_in_smem(inputTileLength) * sizeof(Rpp32f); + Rpp32s outputTileLength = inputTileLength - windowLength; + Rpp32f windowFactor = 1.0f / windowLength; + + if (outputTileLength <= 0) + return RPP_ERROR_INVALID_OUTPUT_TILE_LENGTH; + + if (sharedMemorySizeInBytes > maxSharedMemoryInBytes) + return RPP_ERROR_OUT_OF_BOUND_SHARED_MEMORY_SIZE; + + // launch kernel to compute the values needed for MMS Array + Rpp32s globalThreads_x = ceil(static_cast(srcDescPtr->strides.nStride) / outputTileLength); + Rpp32s globalThreads_y = 1; + Rpp32s globalThreads_z = srcDescPtr->n; + + hipLaunchKernelGGL(moving_mean_square_hip_tensor, + dim3(globalThreads_x, globalThreads_y, globalThreads_z), + dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), + sharedMemorySizeInBytes, + handle.GetStream(), + srcPtr, + srcDescPtr->strides.nStride, + mmsArr, + srcLengthTensor, + outputTileLength, + windowLength, + windowFactor, + inputTileLength); + + const Rpp32f cutOff = std::pow(10.0f, cutOffDB * 0.1f); + bool referenceMax = (!referencePower); + Rpp32f *partialMaxArr = mmsArr + srcDescPtr->n * srcDescPtr->strides.nStride; + + Rpp32s numBlocksPerSample = ceil(static_cast(srcDescPtr->strides.nStride) / (LOCAL_THREADS_X_1DIM * 8)); + Rpp32s cutOffMagKernelBlockSize = 1; + if (referenceMax) + { + // compute max value in MMS buffer + hipLaunchKernelGGL(max_reduction_hip_tensor, + dim3(numBlocksPerSample, 1, globalThreads_z), + dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), + 0, + handle.GetStream(), + mmsArr, + srcDescPtr->strides.nStride, + partialMaxArr, + srcLengthTensor); + cutOffMagKernelBlockSize = 256; + } + // find the cutoff value in magnitude + Rpp32f *cutOffMagPtr = partialMaxArr + globalThreads_z * numBlocksPerSample; + 
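    /* Hedged summary of the scratch-buffer layout implied by the pointer arithmetic above:
       mmsArr        : n * nStride floats        (moving mean square per sample)
       partialMaxArr : n * numBlocksPerSample floats (block-wise maxima of mmsArr)
       cutOffMagPtr  : n floats                  (one cutoff magnitude per sample)
       Worked example of the cutoff math: with cutOffDB = -60.0f, cutOff = powf(10.0f, -6.0f) = 1e-6f,
       so each sample's threshold becomes 1e-6f * max(mms) when referencePower is 0 (referenceMax true),
       or 1e-6f * referencePower otherwise. */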
hipLaunchKernelGGL(cutoffmag_hip_tensor, + dim3(1, 1, globalThreads_z), + dim3(cutOffMagKernelBlockSize, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), + 0, + handle.GetStream(), + partialMaxArr, + numBlocksPerSample, + cutOffMagPtr, + cutOff, + referencePower, + referenceMax); + + // find the begin and length values of NSR in inputs + hipLaunchKernelGGL(find_region_hip_tensor, + dim3(1, 1, globalThreads_z), + dim3(1024, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), + 0, + handle.GetStream(), + mmsArr, + srcDescPtr->strides.nStride, + detectedIndexTensor, + detectionLengthTensor, + cutOffMagPtr, + srcLengthTensor, + windowLength); + return RPP_SUCCESS; +} diff --git a/src/modules/hip/kernel/to_decibels.hpp b/src/modules/hip/kernel/to_decibels.hpp new file mode 100644 index 000000000..e1d45d098 --- /dev/null +++ b/src/modules/hip/kernel/to_decibels.hpp @@ -0,0 +1,312 @@ +#include +#include "rpp_hip_common.hpp" + +// -------------------- Set 0 - to_decibels device helpers -------------------- + +__device__ __forceinline__ void to_decibels_hip_compute(d_float8 *src_f8, d_float8 *dst_f8, double minRatio, float multiplier, float inverseMagnitude) +{ + dst_f8->f1[0] = multiplier * log2(max(minRatio, (static_cast(src_f8->f1[0]) * inverseMagnitude))); + dst_f8->f1[1] = multiplier * log2(max(minRatio, (static_cast(src_f8->f1[1]) * inverseMagnitude))); + dst_f8->f1[2] = multiplier * log2(max(minRatio, (static_cast(src_f8->f1[2]) * inverseMagnitude))); + dst_f8->f1[3] = multiplier * log2(max(minRatio, (static_cast(src_f8->f1[3]) * inverseMagnitude))); + dst_f8->f1[4] = multiplier * log2(max(minRatio, (static_cast(src_f8->f1[4]) * inverseMagnitude))); + dst_f8->f1[5] = multiplier * log2(max(minRatio, (static_cast(src_f8->f1[5]) * inverseMagnitude))); + dst_f8->f1[6] = multiplier * log2(max(minRatio, (static_cast(src_f8->f1[6]) * inverseMagnitude))); + dst_f8->f1[7] = multiplier * log2(max(minRatio, (static_cast(src_f8->f1[7]) * inverseMagnitude))); +} + +// -------------------- Set 1 - kernels for finding inverse magnitude value -------------------- + +__global__ void inverse_magnitude_hip_tensor(float *srcPtr, + int maxLength, + bool computeMax, + float *inverseMagnitudeTensor) + +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + // Do final reduction on block wise max + if (computeMax) + { + uint srcIdx = id_z * maxLength; + __shared__ float max_smem[256]; // 256 values of src in a 256 x 1 thread block + max_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 256 threads + + if (id_x >= maxLength) + return; + + srcIdx += id_x; + float maxVal = srcPtr[srcIdx]; + while (id_x < maxLength) + { + maxVal = fmaxf(maxVal, srcPtr[srcIdx]); + id_x += hipBlockDim_x; + srcIdx += hipBlockDim_x; + } + max_smem[hipThreadIdx_x] = maxVal; + __syncthreads(); // syncthreads after max compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + max_smem[hipThreadIdx_x] = max(max_smem[hipThreadIdx_x], max_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + inverseMagnitudeTensor[id_z] = 1.f / max_smem[0]; + } + else + { + inverseMagnitudeTensor[id_z] = 1.0f; + } +} + +__global__ void max_reduction_1d_hip_tensor(float *srcPtr, + uint2 srcStridesNH, + RpptImagePatchPtr srcDims, + float *maxArr) +{ + int id_y = hipBlockIdx_y * 
hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + + uint srcLength = srcDims[id_z].height; + uint srcIdx = id_z * srcStridesNH.x; + __shared__ float max_smem[256]; // 256 values of src in a 256 x 1 thread block + max_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 256 threads + + if (id_x >= srcLength) + return; + + srcIdx += id_x; + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_max8(&src_f8, &max_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after max compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + max_smem[hipThreadIdx_x] = fmaxf(max_smem[hipThreadIdx_x], max_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + maxArr[id_z * hipGridDim_x + hipBlockIdx_x] = max_smem[0]; +} + +__global__ void max_reduction_2d_hip_tensor(float *srcPtr, + uint2 srcStridesNH, + RpptImagePatchPtr srcDims, + float *maxArr) +{ + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + + __shared__ float partialMax_smem[16][16]; // 16 rows of src, 16 reduced cols of src in a 16 x 16 thread block + uint srcIdx = (id_z * srcStridesNH.x); + float *partialMaxRowPtr_smem = &partialMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + partialMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 16 x 16 threads + + if ((id_y >= srcDims[id_z].height) || (id_x >= srcDims[id_z].width)) + return; + + srcIdx += ((id_y * srcStridesNH.y) + id_x); + partialMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; + __syncthreads(); // syncthreads + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialMaxRowPtr_smem[hipThreadIdx_x], partialMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialMaxRowPtr_smem[0] = fmaxf(partialMaxRowPtr_smem[0], partialMaxRowPtr_smem[increment]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + maxArr[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialMaxRowPtr_smem[0]; + } +} + +// -------------------- Set 2 - to decibels kernels -------------------- + +__global__ void to_decibels_1d_hip_tensor(float *srcPtr, + uint srcStride, + float *dstPtr, + uint dstStride, + RpptImagePatchPtr srcDims, + double minRatio, + float multiplier, + float *inverseMagnitudeTensor) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if (id_x >= srcDims[id_z].height) + return; + + uint srcIdx = (id_z * srcStride) + id_x; + float inverseMagnitude = inverseMagnitudeTensor[id_z]; + + d_float8 src_f8, dst_f8; + 
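    // [annotation, not part of this patch] The next steps load 8 contiguous floats,
    // convert each one with to_decibels_hip_compute as
    // multiplier * log2(max(minRatio, x * inverseMagnitude)), and store the 8 results
    // with a single vectorized write. The host executor pre-scales multiplier by
    // log10(2) (the 0.30103 constant), so the log2 here effectively yields the
    // requested multiplier times a base-10 logarithm.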
rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + to_decibels_hip_compute(&src_f8, &dst_f8, minRatio, multiplier, inverseMagnitude); + + uint dstIdx = (id_z * dstStride) + id_x; + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); +} + +__global__ void to_decibels_2d_hip_tensor(float *srcPtr, + uint2 srcStridesNH, + float *dstPtr, + uint2 dstStridesNH, + RpptImagePatchPtr srcDims, + double minRatio, + float multiplier, + float *inverseMagnitudeTensor) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if (id_x >= srcDims[id_z].width || id_y >= srcDims[id_z].height) + return; + + uint srcIdx = (id_z * srcStridesNH.x) + (id_y * srcStridesNH.y) + id_x; + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x; + float inverseMagnitude = inverseMagnitudeTensor[id_z]; + dstPtr[dstIdx] = multiplier * log2(max(minRatio, (static_cast(srcPtr[srcIdx]) * inverseMagnitude))); +} + +// -------------------- Set 3 - to decibels kernels executor -------------------- + +RppStatus hip_exec_to_decibels_tensor(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *dstPtr, + RpptDescPtr dstDescPtr, + RpptImagePatchPtr srcDims, + Rpp32f cutOffDB, + Rpp32f multiplier, + Rpp32f referenceMagnitude, + rpp::Handle& handle) +{ + Rpp32u numDims = srcDescPtr->numDims - 1; // exclude batchSize from input dims + + // Calculate the intermediate values needed for DB conversion + Rpp32f minRatio = std::pow(10, cutOffDB / multiplier); + if(!minRatio) + minRatio = std::nextafter(0.0f, 1.0f); + const Rpp32f log10Factor = 0.3010299956639812; //1 / std::log(10); + multiplier *= log10Factor; + + // calculate max in input if referenceMagnitude = 0 + Rpp32f *partialMaxArr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem; + Rpp32s numBlocksPerSample = 0; + Rpp32s globalThreads_z = dstDescPtr->n; + + // find the invReferenceMagnitude value + bool computeMax = (!referenceMagnitude); + if(computeMax) + { + if (numDims == 1) + { + numBlocksPerSample = ceil(static_cast((srcDescPtr->strides.nStride + 7) >> 3) / LOCAL_THREADS_X_1DIM); + hipLaunchKernelGGL(max_reduction_1d_hip_tensor, + dim3(numBlocksPerSample, 1, globalThreads_z), + dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, 1), + srcDims, + partialMaxArr); + } + else if (numDims == 2) + { + Rpp32s gridDim_x = ceil(static_cast((srcDescPtr->strides.hStride)/LOCAL_THREADS_X)); + Rpp32s gridDim_y = ceil(static_cast(srcDescPtr->h)/LOCAL_THREADS_Y); + Rpp32s gridDim_z = ceil(static_cast(globalThreads_z)/LOCAL_THREADS_Z); + numBlocksPerSample = gridDim_x * gridDim_y * gridDim_z; + hipLaunchKernelGGL(max_reduction_2d_hip_tensor, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + srcDims, + partialMaxArr); + } + hipStreamSynchronize(handle.GetStream()); + } + Rpp32u blockSize = (computeMax) ? 
256: 1; + Rpp32f *inverseMagnitudeTensor = partialMaxArr + globalThreads_z * numBlocksPerSample; + hipLaunchKernelGGL(inverse_magnitude_hip_tensor, + dim3(1, 1, globalThreads_z), + dim3(blockSize, 1, 1), + 0, + handle.GetStream(), + partialMaxArr, + numBlocksPerSample, + computeMax, + inverseMagnitudeTensor); + hipStreamSynchronize(handle.GetStream()); + + // launch kernel for todecibels + if (numDims == 1) + { + Rpp32s globalThreads_x = (srcDescPtr->strides.nStride + 7) >> 3; + Rpp32s globalThreads_y = 1; + hipLaunchKernelGGL(to_decibels_1d_hip_tensor, + dim3(ceil((Rpp32f)globalThreads_x/LOCAL_THREADS_X_1DIM), ceil((Rpp32f)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((Rpp32f)globalThreads_z/LOCAL_THREADS_Z_1DIM)), + dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), + 0, + handle.GetStream(), + srcPtr, + srcDescPtr->strides.nStride, + dstPtr, + dstDescPtr->strides.nStride, + srcDims, + static_cast(minRatio), + multiplier, + inverseMagnitudeTensor); + } + else if (numDims == 2) + { + Rpp32s globalThreads_x = srcDescPtr->strides.hStride; + Rpp32s globalThreads_y = srcDescPtr->h; + hipLaunchKernelGGL(to_decibels_2d_hip_tensor, + dim3(ceil((Rpp32f)globalThreads_x/LOCAL_THREADS_X), ceil((Rpp32f)globalThreads_y/LOCAL_THREADS_Y), ceil((Rpp32f)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + srcDims, + static_cast(minRatio), + multiplier, + inverseMagnitudeTensor); + } + + return RPP_SUCCESS; +} diff --git a/src/modules/hip/kernel/transpose.hpp b/src/modules/hip/kernel/transpose.hpp new file mode 100644 index 000000000..83f2ba700 --- /dev/null +++ b/src/modules/hip/kernel/transpose.hpp @@ -0,0 +1,105 @@ +#include +#include "rpp_hip_common.hpp" + +// Vectorized dst->src mapping +template +__global__ void transpose_generic_hip_tensor(T *srcPtr, + uint *srcStrides, + T *dstPtr, + uint *dstStrides, + uint *dstDims, + uint tensorDims, + uint *permTensor) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + + if(id_x >= dstStrides[0]) + return; + + int maxLength = dstStrides[0]; + int xDiff = maxLength - (maxLength & ~7); // difference between maxLength and alignedLength. 
(alignedLength = maxLength & ~7) + + // Point dstIdx and srcIdx to be at the start of given input tensor in batch + uint dstIdx = (id_y * *dstStrides++); // post-increment dstStrides pointer by 1 to exclude outermost batch-dimension stride (for example exclude nStride in an NCDHW tensor) + uint srcIdx = (id_y * *srcStrides++); // post-increment srcStrides pointer by 1 to exclude outermost batch-dimension stride (for example exclude nStride in an NCDHW tensor) + + d_uint8 dstCoords[RPPT_MAX_DIMS], srcIdxs; + uint4 idx0123 = make_uint4(id_x, id_x + 1, id_x + 2, id_x + 3); // get idx for elements 0, 1, 2, 3 in the 8-element vectorized kernel + uint4 idx4567 = make_uint4(id_x + 4, id_x + 5, id_x + 6, id_x + 7); // get idx for elements 4, 5, 6, 7 in the 8-element vectorized kernel + srcIdxs.ui4[0] = srcIdxs.ui4[1] = make_uint4(srcIdx, srcIdx, srcIdx, srcIdx); // create 8-element vectorized srcIdxs + + // Compute 8 dstCoords given idx0123 and idx4567, corresponding to the 8 srcCoords processed in a thread + for (int i = 0; i < tensorDims; i++) + { + dstCoords[i].ui4[0] = (idx0123 / dstStrides[i]) % dstDims[i]; // transpose 4 srcCoords using idx0123 to 4 dstCoords in dstCoords[i].ui4[0] for the ith tensor dimension + dstCoords[i].ui4[1] = (idx4567 / dstStrides[i]) % dstDims[i]; // transpose 4 srcCoords using idx4567 to 4 dstCoords in dstCoords[i].ui4[1] for the ith tensor dimension + } + + // Compute corresponding 8 srcIdxs given id_x + for (int i = 0; i < tensorDims; i++) + { + uint4 srcStrides_ui4 = static_cast(srcStrides[permTensor[permTensor[i]]]); + srcIdxs.ui4[0] += (dstCoords[permTensor[i]].ui4[0] * srcStrides_ui4); // incrementally adding respective (coordinate value * stride) to get srcIdxs for 0, 1, 2, 3 elements + srcIdxs.ui4[1] += (dstCoords[permTensor[i]].ui4[1] * srcStrides_ui4); // incrementally adding respective (coordinate value * stride) to get srcIdxs for 4, 5, 6, 7 elements + dstIdx += (dstCoords[i].ui1[0] * dstStrides[i]); + } + + // Move srcIdx to access next input tensor once id_x goes beyond present tensor + if((id_x + 8) > maxLength) + for(int i = xDiff; i < 8; i++) + srcIdxs.ui1[i] += maxLength; + + // Load corresponding 8 src pixels from computed src idx values + d_float8 dst_f8; + dst_f8.f1[0] = static_cast(srcPtr[srcIdxs.ui1[0]]); + dst_f8.f1[1] = static_cast(srcPtr[srcIdxs.ui1[1]]); + dst_f8.f1[2] = static_cast(srcPtr[srcIdxs.ui1[2]]); + dst_f8.f1[3] = static_cast(srcPtr[srcIdxs.ui1[3]]); + dst_f8.f1[4] = static_cast(srcPtr[srcIdxs.ui1[4]]); + dst_f8.f1[5] = static_cast(srcPtr[srcIdxs.ui1[5]]); + dst_f8.f1[6] = static_cast(srcPtr[srcIdxs.ui1[6]]); + dst_f8.f1[7] = static_cast(srcPtr[srcIdxs.ui1[7]]); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); +} + +template +RppStatus hip_exec_transpose_tensor(T *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + T *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *permTensor, + Rpp32u *roiTensor, + rpp::Handle& handle) +{ + // Check for feasibility of direct copy from input to output if no permutation detected + bool copyInput = true; + for(int i = 0; i < dstGenericDescPtr->numDims - 1; i++) + copyInput *= (permTensor[i] == i); + + if (copyInput) + { + CHECK_RETURN_STATUS(hipMemcpyAsync(dstPtr, srcPtr, dstGenericDescPtr->dims[0] * dstGenericDescPtr->strides[0] * sizeof(T), hipMemcpyDeviceToDevice, handle.GetStream())); + } + else + { + int globalThreads_x = (dstGenericDescPtr->strides[0] + 7) >> 3; + int globalThreads_y = dstGenericDescPtr->dims[0]; + int globalThreads_z = 1; + + 
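        // [annotation, not part of this patch] Scalar sketch of the dst->src index
        // mapping that the kernel above applies 8 elements at a time: each flat
        // destination index is decomposed into per-dimension coordinates using the
        // destination strides and dims, and the source index is rebuilt by placing
        // each coordinate on its permuted source axis. Illustrative only, assuming
        // the same stride/dim/permTensor conventions as transpose_generic_hip_tensor.
        //
        //     uint src_index_of(uint dstIdxFlat, const uint *dstStrides, const uint *dstDims,
        //                       const uint *srcStrides, const uint *perm, uint nDims)
        //     {
        //         uint srcIdx = 0;
        //         for (uint i = 0; i < nDims; i++)
        //         {
        //             uint coord = (dstIdxFlat / dstStrides[i]) % dstDims[i];  // coordinate along dst dim i
        //             srcIdx += coord * srcStrides[perm[i]];                   // same coordinate on src axis perm[i]
        //         }
        //         return srcIdx;
        //     }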
hipLaunchKernelGGL(transpose_generic_hip_tensor, + dim3(ceil((float)globalThreads_x/1024), ceil((float)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((float)globalThreads_z/LOCAL_THREADS_Z_1DIM)), + dim3(1024, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), + 0, + handle.GetStream(), + srcPtr, + srcGenericDescPtr->strides, + dstPtr, + dstGenericDescPtr->strides, + dstGenericDescPtr->dims + 1, + dstGenericDescPtr->numDims - 1, + permTensor); + } + + return RPP_SUCCESS; +} diff --git a/src/modules/rppt_tensor_arithmetic_operations.cpp b/src/modules/rppt_tensor_arithmetic_operations.cpp index 8f88ba90f..bac68a4a1 100644 --- a/src/modules/rppt_tensor_arithmetic_operations.cpp +++ b/src/modules/rppt_tensor_arithmetic_operations.cpp @@ -255,6 +255,57 @@ RppStatus rppt_magnitude_host(RppPtr_t srcPtr1, return RPP_SUCCESS; } +/******************** log ********************/ + +RppStatus rppt_log_host(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *roiTensor, + rppHandle_t rppHandle) +{ + if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE; + else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8)) return RPP_ERROR_INVALID_DST_DATATYPE; + else if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + log_generic_host_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16)) + { + log_generic_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + log_generic_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + log_generic_host_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiTensor, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + /********************************************************************************************************************/ /*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/ /********************************************************************************************************************/ @@ -454,4 +505,59 @@ RppStatus rppt_magnitude_gpu(RppPtr_t srcPtr1, #endif // backend } +/******************** log ********************/ + +RppStatus rppt_log_gpu(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *roiTensor, + 
rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE; + else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8)) return RPP_ERROR_INVALID_DST_DATATYPE; + else if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_log_generic_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16)) + { + hip_exec_log_generic_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_log_generic_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_log_generic_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiTensor, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + #endif // GPU_SUPPORT diff --git a/src/modules/rppt_tensor_audio_augmentations.cpp b/src/modules/rppt_tensor_audio_augmentations.cpp index 0267985e5..c98832f3c 100644 --- a/src/modules/rppt_tensor_audio_augmentations.cpp +++ b/src/modules/rppt_tensor_audio_augmentations.cpp @@ -22,11 +22,17 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#ifdef AUDIO_SUPPORT + #include "rppdefs.h" #include "rppi_validate.hpp" #include "rppt_tensor_audio_augmentations.h" #include "cpu/host_tensor_audio_augmentations.hpp" +#ifdef HIP_COMPILE + #include "hip/hip_tensor_audio_augmentations.hpp" +#endif // HIP_COMPILE + /******************** non_silent_region_detection ********************/ RppStatus rppt_non_silent_region_detection_host(RppPtr_t srcPtr, @@ -268,3 +274,129 @@ RppStatus rppt_resample_host(RppPtr_t srcPtr, return RPP_ERROR_NOT_IMPLEMENTED; } } + +/********************************************************************************************************************/ +/*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/ +/********************************************************************************************************************/ + +#ifdef GPU_SUPPORT + +/******************** non_silent_region_detection ********************/ + +RppStatus rppt_non_silent_region_detection_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + Rpp32s *srcLengthTensor, + Rpp32s *detectedIndexTensor, + Rpp32s *detectionLengthTensor, + Rpp32f cutOffDB, + Rpp32s windowLength, + Rpp32f referencePower, + Rpp32s resetInterval, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if (srcDescPtr->dataType == RpptDataType::F32) + { + + return hip_exec_non_silent_region_detection_tensor(static_cast(srcPtr), + srcDescPtr, + srcLengthTensor, + detectedIndexTensor, + detectionLengthTensor, + cutOffDB, + windowLength, + referencePower, + resetInterval, + rpp::deref(rppHandle)); + } + else + { + return RPP_ERROR_NOT_IMPLEMENTED; + } + +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + +/******************** to_decibels ********************/ + +RppStatus rppt_to_decibels_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + RpptImagePatchPtr srcDims, + Rpp32f cutOffDB, + Rpp32f multiplier, + Rpp32f referenceMagnitude, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + Rpp32u tensorDims = srcDescPtr->numDims - 1; // exclude batchsize from input dims + if (tensorDims != 1 && tensorDims != 2) + return RPP_ERROR_INVALID_SRC_DIMS; + + if (!multiplier) + return RPP_ERROR_ZERO_DIVISION; + + if (srcDescPtr->dataType == RpptDataType::F32) + { + hip_exec_to_decibels_tensor(static_cast(srcPtr), + srcDescPtr, + static_cast(dstPtr), + dstDescPtr, + srcDims, + cutOffDB, + multiplier, + referenceMagnitude, + rpp::deref(rppHandle)); + } + else + { + return RPP_ERROR_NOT_IMPLEMENTED; + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + +/******************** down_mixing ********************/ + +RppStatus rppt_down_mixing_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp32s *srcDimsTensor, + bool normalizeWeights, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + Rpp32u tensorDims = srcDescPtr->numDims - 1; // exclude batchsize from input dims + if (tensorDims != 1 && tensorDims != 2) + return RPP_ERROR_INVALID_SRC_DIMS; + + if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_down_mixing_tensor(static_cast(srcPtr), + srcDescPtr, + static_cast(dstPtr), + dstDescPtr, + srcDimsTensor, + normalizeWeights, + rpp::deref(rppHandle)); + } + else + { + return RPP_ERROR_NOT_IMPLEMENTED; + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // 
backend +} + +#endif // GPU_SUPPORT +#endif // AUDIO_SUPPORT \ No newline at end of file diff --git a/src/modules/rppt_tensor_color_augmentations.cpp b/src/modules/rppt_tensor_color_augmentations.cpp index 3023973fc..e866fe949 100644 --- a/src/modules/rppt_tensor_color_augmentations.cpp +++ b/src/modules/rppt_tensor_color_augmentations.cpp @@ -677,7 +677,7 @@ RppStatus rppt_color_temperature_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, - Rpp8s *adjustmentValueTensor, + Rpp32s *adjustmentValueTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle) diff --git a/src/modules/rppt_tensor_effects_augmentations.cpp b/src/modules/rppt_tensor_effects_augmentations.cpp index 8829a4ee0..8fc2d00ee 100644 --- a/src/modules/rppt_tensor_effects_augmentations.cpp +++ b/src/modules/rppt_tensor_effects_augmentations.cpp @@ -868,6 +868,142 @@ RppStatus rppt_ricap_host(RppPtr_t srcPtr, return RPP_SUCCESS; } +/******************** glitch ********************/ + +RppStatus rppt_glitch_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + RpptChannelOffsets *rgbOffsets, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + glitch_u8_u8_host_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + rgbOffsets, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + glitch_f16_f16_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + rgbOffsets, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + glitch_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + rgbOffsets, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + glitch_i8_i8_host_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + rgbOffsets, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + +/******************** jitter ********************/ + +RppStatus rppt_jitter_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp32u *kernelSizeTensor, + Rpp32u seed, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); + RpptXorwowStateBoxMuller xorwowInitialState[SIMD_FLOAT_VECTOR_LENGTH]; + rpp_host_rng_xorwow_f32_initialize_multiseed_stream_boxmuller(xorwowInitialState, seed); + + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + jitter_u8_u8_host_tensor(static_cast(srcPtr) + 
srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + kernelSizeTensor, + xorwowInitialState, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + jitter_f16_f16_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + kernelSizeTensor, + xorwowInitialState, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + jitter_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + kernelSizeTensor, + xorwowInitialState, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + jitter_i8_i8_host_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + kernelSizeTensor, + xorwowInitialState, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + /********************************************************************************************************************/ /*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/ /********************************************************************************************************************/ @@ -1441,6 +1577,8 @@ RppStatus rppt_non_linear_blend_gpu(RppPtr_t srcPtr1, #endif // backend } +/******************** water ********************/ + RppStatus rppt_water_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, @@ -1511,80 +1649,6 @@ RppStatus rppt_water_gpu(RppPtr_t srcPtr, #endif // backend } -/******************** ricap ********************/ - -RppStatus rppt_ricap_gpu(RppPtr_t srcPtr, - RpptDescPtr srcDescPtr, - RppPtr_t dstPtr, - RpptDescPtr dstDescPtr, - Rpp32u *permutationTensor, - RpptROIPtr roiPtrInputCropRegion, - RpptRoiType roiType, - rppHandle_t rppHandle) -{ -#ifdef HIP_COMPILE - if(srcDescPtr->n == 1) // BatchSize should always be greater than 1 - return RPP_ERROR; - Rpp32u *permutationHipTensor = reinterpret_cast(rpp::deref(rppHandle).GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem); - CHECK_RETURN_STATUS(hipMemcpy(permutationHipTensor, permutationTensor, sizeof(Rpp32u)* 4 * dstDescPtr->n, hipMemcpyHostToDevice)); - - if ((check_roi_out_of_bounds(&roiPtrInputCropRegion[0],srcDescPtr,roiType) == -1) - || (check_roi_out_of_bounds(&roiPtrInputCropRegion[1],srcDescPtr,roiType) == -1) - || (check_roi_out_of_bounds(&roiPtrInputCropRegion[2],srcDescPtr,roiType) == -1) - || (check_roi_out_of_bounds(&roiPtrInputCropRegion[3],srcDescPtr,roiType) == -1)) - return RPP_ERROR_OUT_OF_BOUND_SRC_ROI; - - if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) - { - hip_exec_ricap_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, - srcDescPtr, - static_cast(dstPtr) + dstDescPtr->offsetInBytes, - dstDescPtr, - permutationHipTensor, - roiPtrInputCropRegion, - roiType, - rpp::deref(rppHandle)); - } - else if ((srcDescPtr->dataType == 
RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) - { - hip_exec_ricap_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), - srcDescPtr, - (half*) (static_cast(dstPtr) + dstDescPtr->offsetInBytes), - dstDescPtr, - permutationHipTensor, - roiPtrInputCropRegion, - roiType, - rpp::deref(rppHandle)); - } - else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) - { - hip_exec_ricap_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), - srcDescPtr, - (Rpp32f*) (static_cast(dstPtr) + dstDescPtr->offsetInBytes), - dstDescPtr, - permutationHipTensor, - roiPtrInputCropRegion, - roiType, - rpp::deref(rppHandle)); - } - else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) - { - hip_exec_ricap_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, - srcDescPtr, - static_cast(dstPtr) + dstDescPtr->offsetInBytes, - dstDescPtr, - permutationHipTensor, - roiPtrInputCropRegion, - roiType, - rpp::deref(rppHandle)); - } - - return RPP_SUCCESS; -#elif defined(OCL_COMPILE) - return RPP_ERROR_NOT_IMPLEMENTED; -#endif // backend -} - /******************** vignette ********************/ RppStatus rppt_vignette_gpu(RppPtr_t srcPtr, @@ -1649,6 +1713,8 @@ RppStatus rppt_vignette_gpu(RppPtr_t srcPtr, #endif // backend } +/******************** erase ********************/ + RppStatus rppt_erase_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, @@ -1721,4 +1787,224 @@ RppStatus rppt_erase_gpu(RppPtr_t srcPtr, #endif // backend } +/******************** ricap ********************/ + +RppStatus rppt_ricap_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp32u *permutationTensor, + RpptROIPtr roiPtrInputCropRegion, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if(srcDescPtr->n == 1) // BatchSize should always be greater than 1 + return RPP_ERROR; + Rpp32u *permutationHipTensor = reinterpret_cast(rpp::deref(rppHandle).GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem); + CHECK_RETURN_STATUS(hipMemcpy(permutationHipTensor, permutationTensor, sizeof(Rpp32u)* 4 * dstDescPtr->n, hipMemcpyHostToDevice)); + + if ((check_roi_out_of_bounds(&roiPtrInputCropRegion[0],srcDescPtr,roiType) == -1) + || (check_roi_out_of_bounds(&roiPtrInputCropRegion[1],srcDescPtr,roiType) == -1) + || (check_roi_out_of_bounds(&roiPtrInputCropRegion[2],srcDescPtr,roiType) == -1) + || (check_roi_out_of_bounds(&roiPtrInputCropRegion[3],srcDescPtr,roiType) == -1)) + return RPP_ERROR_OUT_OF_BOUND_SRC_ROI; + + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + hip_exec_ricap_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + permutationHipTensor, + roiPtrInputCropRegion, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + hip_exec_ricap_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + (half*) (static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + permutationHipTensor, + roiPtrInputCropRegion, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_ricap_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + 
(Rpp32f*) (static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + permutationHipTensor, + roiPtrInputCropRegion, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + hip_exec_ricap_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + permutationHipTensor, + roiPtrInputCropRegion, + roiType, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + +/******************** glitch ********************/ + +RppStatus rppt_glitch_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + RpptChannelOffsets *rgbOffsets, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + hip_exec_glitch_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + rgbOffsets, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + hip_exec_glitch_tensor(reinterpret_cast((static_cast(srcPtr) + srcDescPtr->offsetInBytes)), + srcDescPtr, + reinterpret_cast((static_cast(dstPtr) + dstDescPtr->offsetInBytes)), + dstDescPtr, + rgbOffsets, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_glitch_tensor(reinterpret_cast((static_cast(srcPtr) + srcDescPtr->offsetInBytes)), + srcDescPtr, + reinterpret_cast((static_cast(dstPtr) + dstDescPtr->offsetInBytes)), + dstDescPtr, + rgbOffsets, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + hip_exec_glitch_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + rgbOffsets, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + +/******************** jitter ********************/ + +RppStatus rppt_jitter_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp32u *kernelSizeTensor, + Rpp32u seed, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + + RpptXorwowStateBoxMuller xorwowInitialState; + xorwowInitialState.x[0] = 0x75BCD15 + seed; + xorwowInitialState.x[1] = 0x159A55E5 + seed; + xorwowInitialState.x[2] = 0x1F123BB5 + seed; + xorwowInitialState.x[3] = 0x5491333 + seed; + xorwowInitialState.x[4] = 0x583F19 + seed; + xorwowInitialState.counter = 0x64F0C9 + seed; + xorwowInitialState.boxMullerFlag = 0; + xorwowInitialState.boxMullerExtra = 0.0f; + + RpptXorwowStateBoxMuller *d_xorwowInitialStatePtr; + d_xorwowInitialStatePtr = reinterpret_cast(rpp::deref(rppHandle).GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem); + CHECK_RETURN_STATUS(hipMemcpy(d_xorwowInitialStatePtr, &xorwowInitialState, sizeof(RpptXorwowStateBoxMuller), hipMemcpyHostToDevice)); + + if ((srcDescPtr->dataType == RpptDataType::U8) && 
(dstDescPtr->dataType == RpptDataType::U8)) + { + hip_exec_jitter_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + kernelSizeTensor, + d_xorwowInitialStatePtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + hip_exec_jitter_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + (half*) (static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + kernelSizeTensor, + d_xorwowInitialStatePtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_jitter_tensor((Rpp32f*) (static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + (Rpp32f*) (static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + kernelSizeTensor, + d_xorwowInitialStatePtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + hip_exec_jitter_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + kernelSizeTensor, + d_xorwowInitialStatePtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + #endif // GPU_SUPPORT diff --git a/src/modules/rppt_tensor_geometric_augmentations.cpp b/src/modules/rppt_tensor_geometric_augmentations.cpp index 6d573ffcc..325881c54 100644 --- a/src/modules/rppt_tensor_geometric_augmentations.cpp +++ b/src/modules/rppt_tensor_geometric_augmentations.cpp @@ -1099,7 +1099,6 @@ RppStatus rppt_slice_host(RppPtr_t srcPtr, layoutParams, rpp::deref(rppHandle)); } - return RPP_SUCCESS; } @@ -1301,6 +1300,145 @@ RppStatus rppt_remap_host(RppPtr_t srcPtr, return RPP_SUCCESS; } +/******************** lens_correction ********************/ + +RppStatus rppt_lens_correction_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp32f *rowRemapTable, + Rpp32f *colRemapTable, + RpptDescPtr tableDescPtr, + Rpp32f *cameraMatrixTensor, + Rpp32f *distortionCoeffsTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); + compute_lens_correction_remap_tables_host_tensor(srcDescPtr, + rowRemapTable, + colRemapTable, + tableDescPtr, + cameraMatrixTensor, + distortionCoeffsTensor, + roiTensorPtrSrc, + rpp::deref(rppHandle)); + + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + remap_bilinear_u8_u8_host_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + rowRemapTable, + colRemapTable, + tableDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + remap_bilinear_f16_f16_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + rowRemapTable, + colRemapTable, + tableDescPtr, + roiTensorPtrSrc, + 
roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + remap_bilinear_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + rowRemapTable, + colRemapTable, + tableDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + remap_bilinear_i8_i8_host_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + rowRemapTable, + colRemapTable, + tableDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + +/******************** transpose ********************/ + +RppStatus rppt_transpose_host(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *permTensor, + Rpp32u *roiTensor, + rppHandle_t rppHandle) +{ + if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8)) + { + transpose_generic_host_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + permTensor, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16)) + { + transpose_generic_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + permTensor, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + transpose_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + permTensor, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8)) + { + transpose_generic_host_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + permTensor, + roiTensor, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + /********************************************************************************************************************/ /*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/ /********************************************************************************************************************/ @@ -2170,6 +2308,7 @@ RppStatus rppt_flip_voxel_gpu(RppPtr_t srcPtr, } /******************** remap ********************/ + RppStatus rppt_remap_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, @@ -2249,4 +2388,150 @@ RppStatus rppt_remap_gpu(RppPtr_t srcPtr, #endif // backend } +/******************** lens_correction ********************/ + +RppStatus rppt_lens_correction_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + 
RpptDescPtr dstDescPtr, + Rpp32f *rowRemapTable, + Rpp32f *colRemapTable, + RpptDescPtr tableDescPtr, + Rpp32f *cameraMatrixTensor, + Rpp32f *distortionCoeffsTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + hip_exec_lens_correction_tensor(dstDescPtr, + rowRemapTable, + colRemapTable, + tableDescPtr, + cameraMatrixTensor, + distortionCoeffsTensor, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + hip_exec_remap_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + rowRemapTable, + colRemapTable, + tableDescPtr, + RpptInterpolationType::BILINEAR, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + hip_exec_remap_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + rowRemapTable, + colRemapTable, + tableDescPtr, + RpptInterpolationType::BILINEAR, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_remap_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + rowRemapTable, + colRemapTable, + tableDescPtr, + RpptInterpolationType::BILINEAR, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + hip_exec_remap_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + rowRemapTable, + colRemapTable, + tableDescPtr, + RpptInterpolationType::BILINEAR, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + +/******************** transpose ********************/ + +RppStatus rppt_transpose_gpu(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *permTensor, + Rpp32u *roiTensor, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8)) + { + hip_exec_transpose_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + permTensor, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16)) + { + hip_exec_transpose_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + permTensor, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_transpose_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + 
reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + permTensor, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8)) + { + hip_exec_transpose_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + permTensor, + roiTensor, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + #endif // GPU_SUPPORT \ No newline at end of file diff --git a/utilities/test_suite/CMakeLists.txt b/utilities/test_suite/CMakeLists.txt index 77052cabe..23515798b 100644 --- a/utilities/test_suite/CMakeLists.txt +++ b/utilities/test_suite/CMakeLists.txt @@ -51,6 +51,7 @@ endif() if(NOT RPP_FOUND) message("-- ${Yellow}${PROJECT_NAME} requires RPP. Install RPP before running CTests") else() + # RPP installation - Backend check set(RPP_BACKEND_HIP_FOUND 0) if(EXISTS ${RPP_INCLUDE_DIR}/rpp_backend.h) file(READ ${RPP_INCLUDE_DIR}/rpp_backend.h RPP_BACKEND_FILE) @@ -62,6 +63,14 @@ else() elseif(NOT DEFINED BACKEND) set(BACKEND "CPU") endif() + + # RPP installation - Audio support check + set(RPP_AUDIO_AUGMENTATIONS_SUPPORT_FOUND 0) + if(EXISTS ${RPP_INCLUDE_DIR}/rpp_audio_augmentations_support.h) + file(READ ${RPP_INCLUDE_DIR}/rpp_audio_augmentations_support.h RPP_AUDIO_AUGMENTATIONS_SUPPORT_FILE) + string(REGEX MATCH "RPP_AUDIO_AUGMENTATIONS_SUPPORT ([0-9]*)" _ ${RPP_AUDIO_AUGMENTATIONS_SUPPORT_FILE}) + set(RPP_AUDIO_AUGMENTATIONS_SUPPORT_FOUND ${CMAKE_MATCH_1}) + endif() endif(NOT RPP_FOUND) # find required libraries @@ -87,13 +96,15 @@ if(Python3_FOUND) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) endif(NIFTI_FOUND) - if(libsnd_LIBS) - add_test( - NAME rpp_qa_tests_tensor_audio_host_all - COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HOST/runAudioTests.py --qa_mode 1 --batch_size 3 - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - ) - endif(libsnd_LIBS) + if(RPP_AUDIO_AUGMENTATIONS_SUPPORT_FOUND) + if(libsnd_LIBS) + add_test( + NAME rpp_qa_tests_tensor_audio_host_all + COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HOST/runAudioTests.py --qa_mode 1 --batch_size 3 + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) + endif(libsnd_LIBS) + endif(RPP_AUDIO_AUGMENTATIONS_SUPPORT_FOUND) if( "${BACKEND}" STREQUAL "HIP") # Running all HIP tests @@ -109,6 +120,15 @@ if(Python3_FOUND) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) endif(NIFTI_FOUND) + if(RPP_AUDIO_AUGMENTATIONS_SUPPORT_FOUND) + if(libsnd_LIBS) + add_test( + NAME rpp_qa_tests_tensor_audio_hip_all + COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HIP/runAudioTests.py --qa_mode 1 --batch_size 3 + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) + endif(libsnd_LIBS) + endif(RPP_AUDIO_AUGMENTATIONS_SUPPORT_FOUND) elseif( "${BACKEND}" STREQUAL "OCL") # TBD: Add OCL Tests diff --git a/utilities/test_suite/HIP/CMakeLists.txt b/utilities/test_suite/HIP/CMakeLists.txt index a0bd42fa0..814b006fb 100644 --- a/utilities/test_suite/HIP/CMakeLists.txt +++ b/utilities/test_suite/HIP/CMakeLists.txt @@ -58,12 +58,25 @@ find_package(hip QUIET) find_package(OpenCV QUIET) find_package(TurboJpeg QUIET) find_package(NIFTI QUIET) +find_library(libsnd_LIBS + NAMES sndfile libsndfile + PATHS ${CMAKE_SYSTEM_PREFIX_PATH} ${LIBSND_ROOT_DIR} "/usr/local" + PATH_SUFFIXES lib lib64) # OpenMP find_package(OpenMP 
REQUIRED) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +# Audio Support +option(RPP_AUDIO_SUPPORT "Build RPP test suite with audio support" ON) +if(RPP_AUDIO_SUPPORT) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DAUDIO_SUPPORT=1") + message("-- ${Green}RPP_AUDIO_SUPPORT set to ON ${ColourReset}") +else() + message("-- ${Yellow}RPP_AUDIO_SUPPORT (default=ON) was overrided. Skipping audio tests.${ColourReset}") +endif() + if(TurboJpeg_FOUND) message("-- ${Green}${PROJECT_NAME} set to build with rpp and TurboJpeg${ColourReset}") include_directories(${TurboJpeg_INCLUDE_DIRS}) @@ -102,4 +115,25 @@ if(NIFTI_FOUND AND OpenCV_FOUND) target_link_libraries(Tensor_voxel_hip ${OpenCV_LIBS} -lturbojpeg -lrpp ${hip_LIBRARIES} pthread ${LINK_LIBRARY_LIST} hip::device ${NIFTI_PACKAGE_PREFIX}NIFTI::${NIFTI_PACKAGE_PREFIX}niftiio) else() message("-- ${Yellow}Warning: libniftiio must be installed to install ${PROJECT_NAME}/Tensor_voxel_hip successfully!${ColourReset}") -endif() \ No newline at end of file +endif() + +if(RPP_AUDIO_SUPPORT) + if(NOT libsnd_LIBS) + message("-- ${Yellow}Warning: libsndfile must be installed to install ${PROJECT_NAME}/Tensor_audio_hip successfully!${ColourReset}") + else() + message("-- ${Green}${PROJECT_NAME} set to build with rpp and libsndfile ${ColourReset}") + set(COMPILER_FOR_HIP ${ROCM_PATH}/bin/hipcc) + set(CMAKE_CXX_COMPILER ${COMPILER_FOR_HIP}) + include_directories(${ROCM_PATH}/include ${ROCM_PATH}/include/rpp /usr/local/include) + link_directories(${ROCM_PATH}/lib /usr/local/lib) + include_directories(${SndFile_INCLUDE_DIRS}) + link_directories(${SndFile_LIBRARIES_DIR} /usr/local/lib/) + + add_executable(Tensor_audio_hip Tensor_audio_hip.cpp) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++17") + if(NOT APPLE) + set(LINK_LIBRARY_LIST ${LINK_LIBRARY_LIST} stdc++fs) + endif() + target_link_libraries(Tensor_audio_hip ${libsnd_LIBS} -lsndfile -lrpp pthread ${LINK_LIBRARY_LIST}) + endif() +endif() diff --git a/utilities/test_suite/HIP/Tensor_audio_hip.cpp b/utilities/test_suite/HIP/Tensor_audio_hip.cpp new file mode 100644 index 000000000..9d47d8a2c --- /dev/null +++ b/utilities/test_suite/HIP/Tensor_audio_hip.cpp @@ -0,0 +1,299 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/utilities/test_suite/HIP/Tensor_audio_hip.cpp b/utilities/test_suite/HIP/Tensor_audio_hip.cpp
new file mode 100644
index 000000000..9d47d8a2c
--- /dev/null
+++ b/utilities/test_suite/HIP/Tensor_audio_hip.cpp
@@ -0,0 +1,299 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "../rpp_test_suite_audio.h"
+
+int main(int argc, char **argv)
+{
+    // handle inputs
+    const int MIN_ARG_COUNT = 8;
+    if (argc < MIN_ARG_COUNT)
+    {
+        printf("\nImproper Usage! Needs all arguments!\n");
+        printf("\nUsage: ./Tensor_audio_hip <src folder> <case number> <test type 0/1> <number of runs> <batch size> <dst folder> <script path>\n");
+        return -1;
+    }
+
+    char *src = argv[1];
+    int testCase = atoi(argv[2]);
+    int testType = atoi(argv[3]);
+    int numRuns = atoi(argv[4]);
+    int batchSize = atoi(argv[5]);
+    char *dst = argv[6];
+    string scriptPath = argv[7];
+
+    // validation checks
+    if (testType == 0 && batchSize != 3)
+    {
+        cout << "Error! QA Mode only runs with batchsize 3" << endl;
+        return -1;
+    }
+
+    // set case names
+    string funcName = audioAugmentationMap[testCase];
+    if (funcName.empty())
+    {
+        if (testType == 0)
+            printf("\ncase %d is not supported\n", testCase);
+
+        return -1;
+    }
+
+    // initialize tensor descriptors
+    RpptDesc srcDesc, dstDesc;
+    RpptDescPtr srcDescPtr, dstDescPtr;
+    srcDescPtr = &srcDesc;
+    dstDescPtr = &dstDesc;
+
+    // set src/dst data types in tensor descriptors
+    srcDescPtr->dataType = RpptDataType::F32;
+    dstDescPtr->dataType = RpptDataType::F32;
+
+    // other initializations
+    int missingFuncFlag = 0;
+    int maxSrcChannels = 0;
+    int maxSrcWidth = 0, maxSrcHeight = 0;
+    int maxDstWidth = 0, maxDstHeight = 0;
+    Rpp64u iBufferSize = 0;
+    Rpp64u oBufferSize = 0;
+    static int noOfAudioFiles = 0;
+
+    // string ops on function name
+    char src1[1000];
+    strcpy(src1, src);
+    strcat(src1, "/");
+    string func = funcName;
+
+    // get number of audio files
+    vector<string> audioNames, audioFilesPath;
+    search_files_recursive(src, audioNames, audioFilesPath, ".wav");
+    noOfAudioFiles = audioNames.size();
+    if (noOfAudioFiles < batchSize || ((noOfAudioFiles % batchSize) != 0))
+    {
+        replicate_last_file_to_fill_batch(audioFilesPath[noOfAudioFiles - 1], audioFilesPath, audioNames, audioNames[noOfAudioFiles - 1], noOfAudioFiles, batchSize);
+        noOfAudioFiles = audioNames.size();
+    }
+
+    // find max audio dimensions in the input dataset
+    maxSrcHeight = 1;
+    maxDstHeight = 1;
+    set_audio_max_dimensions(audioFilesPath, maxSrcWidth, maxSrcChannels);
+    maxDstWidth = maxSrcWidth;
+
+    // set numDims, offset, n/c/h/w values for src/dst
+    Rpp32u offsetInBytes = 0;
+    set_audio_descriptor_dims_and_strides(srcDescPtr, batchSize, maxSrcHeight, maxSrcWidth, maxSrcChannels, offsetInBytes);
+    int maxDstChannels = maxSrcChannels;
+    if(testCase == 3)
+    {
+        srcDescPtr->numDims = 3;
+        maxDstChannels = 1;
+    }
+    set_audio_descriptor_dims_and_strides(dstDescPtr, batchSize, maxDstHeight, maxDstWidth, maxDstChannels, offsetInBytes);
+
+    // set buffer sizes for src/dst
+    iBufferSize = (Rpp64u)srcDescPtr->h * (Rpp64u)srcDescPtr->w * (Rpp64u)srcDescPtr->c * (Rpp64u)srcDescPtr->n;
+    oBufferSize = (Rpp64u)dstDescPtr->h * (Rpp64u)dstDescPtr->w * (Rpp64u)dstDescPtr->c * (Rpp64u)dstDescPtr->n;
+
+    // allocate hip buffers for input & output
+    Rpp32f *inputf32 = (Rpp32f *)calloc(iBufferSize, sizeof(Rpp32f));
+    Rpp32f *outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f));
+
+    void *d_inputf32, *d_outputf32;
+    CHECK_RETURN_STATUS(hipMalloc(&d_inputf32, iBufferSize * sizeof(Rpp32f)));
+    CHECK_RETURN_STATUS(hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f)));
+
+    // allocate the buffers for audio length and channels
+    Rpp32s *srcLengthTensor, *channelsTensor;
+    CHECK_RETURN_STATUS(hipHostMalloc(&srcLengthTensor, batchSize * sizeof(Rpp32s)));
+    CHECK_RETURN_STATUS(hipHostMalloc(&channelsTensor, batchSize * sizeof(Rpp32s)));
+
+    // allocate the buffers for src/dst dimensions for each
element in batch + RpptImagePatch *srcDims, *dstDims; + CHECK_RETURN_STATUS(hipHostMalloc(&srcDims, batchSize * sizeof(RpptImagePatch))); + CHECK_RETURN_STATUS(hipHostMalloc(&dstDims, batchSize * sizeof(RpptImagePatch))); + + // allocate the buffer for srcDimsTensor + Rpp32s *srcDimsTensor; + if(testCase == 3) + CHECK_RETURN_STATUS(hipHostMalloc(&srcDimsTensor, batchSize * 2 * sizeof(Rpp32s))); + + Rpp32s *detectedIndex = nullptr, *detectionLength = nullptr; + if(testCase == 0) + { + CHECK_RETURN_STATUS(hipHostMalloc(&detectedIndex, batchSize * sizeof(Rpp32f))); + CHECK_RETURN_STATUS(hipHostMalloc(&detectionLength, batchSize * sizeof(Rpp32f))); + } + + // run case-wise RPP API and measure time + rppHandle_t handle; + hipStream_t stream; + CHECK_RETURN_STATUS(hipStreamCreate(&stream)); + rppCreateWithStreamAndBatchSize(&handle, stream, batchSize); + + int noOfIterations = (int)audioNames.size() / batchSize; + double maxWallTime = 0, minWallTime = 500, avgWallTime = 0; + string testCaseName; + printf("\nRunning %s %d times (each time with a batch size of %d images) and computing mean statistics...", func.c_str(), numRuns, batchSize); + for (int iterCount = 0; iterCount < noOfIterations; iterCount++) + { + // read and decode audio and fill the audio dim values + read_audio_batch_and_fill_dims(srcDescPtr, inputf32, audioFilesPath, iterCount, srcLengthTensor, channelsTensor); + CHECK_RETURN_STATUS(hipMemcpy(d_inputf32, inputf32, iBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice)); + for (int perfRunCount = 0; perfRunCount < numRuns; perfRunCount++) + { + double startWallTime, endWallTime; + double wallTime; + switch (testCase) + { + case 0: + { + testCaseName = "non_silent_region_detection"; + Rpp32f cutOffDB = -60.0; + Rpp32s windowLength = 2048; + Rpp32f referencePower = 0.0f; + Rpp32s resetInterval = 8192; + + startWallTime = omp_get_wtime(); + rppt_non_silent_region_detection_gpu(d_inputf32, srcDescPtr, srcLengthTensor, detectedIndex, detectionLength, cutOffDB, windowLength, referencePower, resetInterval, handle); + + break; + } + case 1: + { + testCaseName = "to_decibels"; + Rpp32f cutOffDB = std::log(1e-20); + Rpp32f multiplier = std::log(10); + Rpp32f referenceMagnitude = 1.0f; + + for (int i = 0; i < batchSize; i++) + { + srcDims[i].height = dstDims[i].height = srcLengthTensor[i]; + srcDims[i].width = dstDims[i].width = 1; + } + + startWallTime = omp_get_wtime(); + rppt_to_decibels_gpu(d_inputf32, srcDescPtr, d_outputf32, dstDescPtr, srcDims, cutOffDB, multiplier, referenceMagnitude, handle); + + break; + } + case 3: + { + testCaseName = "down_mixing"; + bool normalizeWeights = false; + + for (int i = 0, j = 0; i < batchSize; i++, j += 2) + { + srcDimsTensor[j] = srcLengthTensor[i]; + srcDimsTensor[j + 1] = channelsTensor[i]; + dstDims[i].height = srcLengthTensor[i]; + dstDims[i].width = 1; + } + + startWallTime = omp_get_wtime(); + rppt_down_mixing_gpu(d_inputf32, srcDescPtr, d_outputf32, dstDescPtr, srcDimsTensor, normalizeWeights, handle); + + break; + } + default: + { + missingFuncFlag = 1; + break; + } + } + CHECK_RETURN_STATUS(hipDeviceSynchronize()); + + endWallTime = omp_get_wtime(); + if (missingFuncFlag == 1) + { + printf("\nThe functionality %s doesn't yet exist in RPP\n", func.c_str()); + return -1; + } + + wallTime = endWallTime - startWallTime; + maxWallTime = std::max(maxWallTime, wallTime); + minWallTime = std::min(minWallTime, wallTime); + avgWallTime += wallTime; + } + + // QA mode - verify outputs with golden outputs. 
Below code doesn’t run for performance tests + if (testType == 0) + { + CHECK_RETURN_STATUS(hipMemcpy(outputf32, d_outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyDeviceToHost)); + CHECK_RETURN_STATUS(hipDeviceSynchronize()); + + /* Run only if testCase is not 0 + For testCase 0 verify_non_silent_region_detection function is used for QA testing */ + if (testCase != 0) + verify_output(outputf32, dstDescPtr, dstDims, testCaseName, dst, scriptPath, "HIP"); + else + verify_non_silent_region_detection(detectedIndex, detectionLength, testCaseName, batchSize, audioNames, dst); + + /* Dump the outputs to csv files for debugging + Runs only if + 1. DEBUG_MODE is enabled + 2. Current iteration is 1st iteration + 3. Test case is not 0 */ + if (DEBUG_MODE && iterCount == 0 && testCase != 0) + { + std::ofstream refFile; + refFile.open(func + ".csv"); + for (int i = 0; i < oBufferSize; i++) + refFile << *(outputf32 + i) << "\n"; + refFile.close(); + } + } + } + rppDestroyGPU(handle); + + // performance test mode + if (testType == 1) + { + // display measured times + maxWallTime *= 1000; + minWallTime *= 1000; + avgWallTime *= 1000; + avgWallTime /= (numRuns * noOfIterations); + cout << fixed << "\nmax,min,avg wall times in ms/batch = " << maxWallTime << "," << minWallTime << "," << avgWallTime; + } + + cout << endl; + + // free memory + free(inputf32); + free(outputf32); + CHECK_RETURN_STATUS(hipFree(d_inputf32)); + CHECK_RETURN_STATUS(hipFree(d_outputf32)); + CHECK_RETURN_STATUS(hipHostFree(srcLengthTensor)); + CHECK_RETURN_STATUS(hipHostFree(channelsTensor)); + CHECK_RETURN_STATUS(hipHostFree(srcDims)); + CHECK_RETURN_STATUS(hipHostFree(dstDims)); + if(testCase == 3) + CHECK_RETURN_STATUS(hipHostFree(srcDimsTensor)); + if (detectedIndex != nullptr) + CHECK_RETURN_STATUS(hipHostFree(detectedIndex)); + if (detectionLength != nullptr) + CHECK_RETURN_STATUS(hipHostFree(detectionLength)); + return 0; +} diff --git a/utilities/test_suite/HIP/Tensor_hip.cpp b/utilities/test_suite/HIP/Tensor_hip.cpp index 0d8b7fd7c..ec1b47d9b 100644 --- a/utilities/test_suite/HIP/Tensor_hip.cpp +++ b/utilities/test_suite/HIP/Tensor_hip.cpp @@ -66,7 +66,8 @@ int main(int argc, char **argv) bool additionalParamCase = (testCase == 8 || testCase == 21 || testCase == 23|| testCase == 24 || testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54 || testCase == 79); bool kernelSizeCase = (testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54); bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 68); - bool randomOutputCase = (testCase == 8 || testCase == 84 || testCase == 49 || testCase == 54); + bool randomOutputCase = (testCase == 6 || testCase == 8 || testCase == 84 || testCase == 49 || testCase == 54); + bool nonQACase = (testCase == 24); bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24 || testCase == 79); bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89 || testCase == 90 || testCase == 91); bool noiseTypeCase = (testCase == 8); @@ -104,7 +105,7 @@ int main(int argc, char **argv) if (layoutType == 2) { - if(testCase == 36 || testCase == 31 || testCase == 45 || testCase == 86) + if(testCase == 36 || testCase == 31 || testCase == 35 || testCase == 45 || testCase == 86) { printf("\ncase %d does not exist for PLN1 layout\n", testCase); return -1; @@ -366,10 +367,19 @@ int main(int argc, char **argv) CHECK_RETURN_STATUS(hipHostMalloc(&roiPtrInputCropRegion, 
4 * sizeof(RpptROI))); void *d_rowRemapTable, *d_colRemapTable; - if(testCase == 79) + if(testCase == 26 || testCase == 79) { CHECK_RETURN_STATUS(hipMalloc(&d_rowRemapTable, ioBufferSize * sizeof(Rpp32u))); CHECK_RETURN_STATUS(hipMalloc(&d_colRemapTable, ioBufferSize * sizeof(Rpp32u))); + CHECK_RETURN_STATUS(hipMemset(d_rowRemapTable, 0, ioBufferSize * sizeof(Rpp32u))); + CHECK_RETURN_STATUS(hipMemset(d_colRemapTable, 0, ioBufferSize * sizeof(Rpp32u))); + } + + Rpp32f *cameraMatrix, *distortionCoeffs; + if(testCase == 26) + { + CHECK_RETURN_STATUS(hipHostMalloc(&cameraMatrix, batchSize * 9 * sizeof(Rpp32f))); + CHECK_RETURN_STATUS(hipHostMalloc(&distortionCoeffs, batchSize * 8 * sizeof(Rpp32f))); } Rpp32u boxesInEachImage = 3; @@ -397,6 +407,14 @@ int main(int argc, char **argv) if(testCase == 46) CHECK_RETURN_STATUS(hipHostMalloc(&intensity, batchSize * sizeof(Rpp32f))); + Rpp32u *kernelSizeTensor; + if(testCase == 6) + CHECK_RETURN_STATUS(hipHostMalloc(&kernelSizeTensor, batchSize * sizeof(Rpp32u))); + + RpptChannelOffsets *rgbOffsets; + if(testCase == 35) + CHECK_RETURN_STATUS(hipHostMalloc(&rgbOffsets, batchSize * sizeof(RpptChannelOffsets))); + // case-wise RPP API and measure time script for Unit and Performance test printf("\nRunning %s %d times (each time with a batch size of %d images) and computing mean statistics...", func.c_str(), numRuns, batchSize); for(int iterCount = 0; iterCount < noOfIterations; iterCount++) @@ -548,6 +566,22 @@ int main(int argc, char **argv) break; } + case 6: + { + testCaseName = "jitter"; + + Rpp32u seed = 1255459; + for (i = 0; i < batchSize; i++) + kernelSizeTensor[i] = 5; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_jitter_gpu(d_input, srcDescPtr, d_output, dstDescPtr, kernelSizeTensor, seed, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 8: { testCaseName = "noise"; @@ -696,6 +730,52 @@ int main(int argc, char **argv) break; } + case 24: + { + testCaseName = "warp_affine"; + + if ((interpolationType != RpptInterpolationType::BILINEAR) && (interpolationType != RpptInterpolationType::NEAREST_NEIGHBOR)) + { + missingFuncFlag = 1; + break; + } + + Rpp32f6 affineTensor_f6[batchSize]; + Rpp32f *affineTensor = (Rpp32f *)affineTensor_f6; + for (i = 0; i < batchSize; i++) + { + affineTensor_f6[i].data[0] = 1.23; + affineTensor_f6[i].data[1] = 0.5; + affineTensor_f6[i].data[2] = 0; + affineTensor_f6[i].data[3] = -0.8; + affineTensor_f6[i].data[4] = 0.83; + affineTensor_f6[i].data[5] = 0; + } + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_warp_affine_gpu(d_input, srcDescPtr, d_output, dstDescPtr, affineTensor, interpolationType, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } + case 26: + { + testCaseName = "lens_correction"; + + RpptDesc tableDesc = srcDesc; + RpptDescPtr tableDescPtr = &tableDesc; + init_lens_correction(batchSize, srcDescPtr, cameraMatrix, distortionCoeffs, tableDescPtr); + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_lens_correction_gpu(d_input, srcDescPtr, d_output, dstDescPtr, static_cast(d_rowRemapTable), static_cast(d_colRemapTable), tableDescPtr, cameraMatrix, distortionCoeffs, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 29: { testCaseName = "water"; 
@@ -836,6 +916,28 @@ int main(int argc, char **argv) CHECK_RETURN_STATUS(hipHostFree(lutBuffer)); } + case 35: + { + testCaseName = "glitch"; + + for (i = 0; i < batchSize; i++) + { + rgbOffsets[i].r.x = 10; + rgbOffsets[i].r.y = 10; + rgbOffsets[i].g.x = 0; + rgbOffsets[i].g.y = 0; + rgbOffsets[i].b.x = 5; + rgbOffsets[i].b.y = 5; + } + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_glitch_gpu(d_input, srcDescPtr, d_output, dstDescPtr, rgbOffsets, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 36: { testCaseName = "color_twist"; @@ -1397,7 +1499,7 @@ int main(int argc, char **argv) 1.QA Flag is set 2.input bit depth 0 (U8) 3.source and destination layout are the same*/ - if(qaFlag && inputBitDepth == 0 && (srcDescPtr->layout == dstDescPtr->layout) && !(randomOutputCase)) + if(qaFlag && inputBitDepth == 0 && (srcDescPtr->layout == dstDescPtr->layout) && !(randomOutputCase) && !(nonQACase)) { if (testCase == 87) compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); @@ -1465,7 +1567,7 @@ int main(int argc, char **argv) 2.input bit depth 0 (Input U8 && Output U8) 3.source and destination layout are the same 4.augmentation case does not generate random output*/ - if(qaFlag && inputBitDepth == 0 && ((srcDescPtr->layout == dstDescPtr->layout) || pln1OutTypeCase) && !(randomOutputCase)) + if(qaFlag && inputBitDepth == 0 && ((srcDescPtr->layout == dstDescPtr->layout) || pln1OutTypeCase) && !(randomOutputCase) && !(nonQACase)) compare_output(outputu8, testCaseName, srcDescPtr, dstDescPtr, dstImgSizes, batchSize, interpolationTypeName, noiseTypeName, testCase, dst, scriptPath); // Calculate exact dstROI in XYWH format for OpenCV dump @@ -1520,6 +1622,20 @@ int main(int argc, char **argv) CHECK_RETURN_STATUS(hipHostFree(cropRoi)); CHECK_RETURN_STATUS(hipHostFree(patchRoi)); } + if(testCase == 26) + { + CHECK_RETURN_STATUS(hipHostFree(cameraMatrix)); + CHECK_RETURN_STATUS(hipHostFree(distortionCoeffs)); + } + if(testCase == 79) + { + free(rowRemapTable); + free(colRemapTable); + CHECK_RETURN_STATUS(hipFree(d_rowRemapTable)); + CHECK_RETURN_STATUS(hipFree(d_colRemapTable)); + } + if(testCase == 35) + CHECK_RETURN_STATUS(hipHostFree(rgbOffsets)); if (reductionTypeCase) { CHECK_RETURN_STATUS(hipHostFree(reductionFuncResultArr)); @@ -1538,19 +1654,14 @@ int main(int argc, char **argv) CHECK_RETURN_STATUS(hipHostFree(shapeTensor)); if(roiTensor != NULL) CHECK_RETURN_STATUS(hipHostFree(roiTensor)); + if(testCase == 6) + CHECK_RETURN_STATUS(hipHostFree(kernelSizeTensor)); free(input); free(input_second); free(output); free(inputu8); free(inputu8Second); free(outputu8); - if(testCase == 79) - { - free(rowRemapTable); - free(colRemapTable); - CHECK_RETURN_STATUS(hipFree(d_rowRemapTable)); - CHECK_RETURN_STATUS(hipFree(d_colRemapTable)); - } CHECK_RETURN_STATUS(hipFree(d_input)); if(dualInputCase) CHECK_RETURN_STATUS(hipFree(d_input_second)); diff --git a/utilities/test_suite/HIP/Tensor_misc_hip.cpp b/utilities/test_suite/HIP/Tensor_misc_hip.cpp index 96197f432..cb0d53b34 100644 --- a/utilities/test_suite/HIP/Tensor_misc_hip.cpp +++ b/utilities/test_suite/HIP/Tensor_misc_hip.cpp @@ -31,7 +31,7 @@ int main(int argc, char **argv) if (argc < MIN_ARG_COUNT) { printf("\nImproper Usage! Needs all arguments!\n"); - printf("\nUsage: ./Tensor_misc_hip