diff --git a/.Doxyfile b/.Doxyfile
index 066a53c02..dac8a3acc 100644
--- a/.Doxyfile
+++ b/.Doxyfile
@@ -960,16 +960,16 @@ INPUT = README.md \
include/rppi_logical_operations.h \
include/rppi_morphological_transforms.h \
include/rppi_statistical_operations.h \
+ include/rppt_tensor_arithmetic_operations.h \
+ include/rppt_tensor_audio_augmentations.h \
include/rppt_tensor_color_augmentations.h \
include/rppt_tensor_data_exchange_operations.h \
include/rppt_tensor_effects_augmentations.h \
include/rppt_tensor_filter_augmentations.h \
include/rppt_tensor_geometric_augmentations.h \
+ include/rppt_tensor_logical_operations.h \
include/rppt_tensor_morphological_operations.h \
- include/rppt_tensor_statistical_operations.h \
- include/rppt_tensor_arithmetic_operations.h \
- include/rppt_tensor_audio_augmentations.h \
- include/rppt_tensor_logical_operations.h
+ include/rppt_tensor_statistical_operations.h
# This tag can be used to specify the character encoding of the source files
@@ -2381,7 +2381,7 @@ INCLUDE_FILE_PATTERNS =
# recursively expanded use the := operator instead of the = operator.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-PREDEFINED = GPU_SUPPORT RPP_BACKEND_HIP HIP_COMPILE
+PREDEFINED = GPU_SUPPORT RPP_BACKEND_HIP HIP_COMPILE AUDIO_SUPPORT
# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
# tag can be used to specify a list of macro names that should be expanded. The
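Note: adding `AUDIO_SUPPORT` to `PREDEFINED` is what keeps the `#ifdef AUDIO_SUPPORT`-guarded audio declarations (introduced in `rppt_tensor_audio_augmentations.h` later in this diff) visible to Doxygen's preprocessor. A minimal sketch of the guard pattern this entry unlocks (the function shown is a hypothetical placeholder, not an RPP API):

```cpp
// Without AUDIO_SUPPORT in the Doxyfile's PREDEFINED list, Doxygen's
// preprocessor would drop this whole block, and the audio APIs would be
// missing from the generated reference.
#ifdef AUDIO_SUPPORT

/*! \brief Hypothetical audio API used only to illustrate the guard */
void example_audio_api(void);

#endif // AUDIO_SUPPORT
```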
diff --git a/.azuredevops/rocm-ci.yml b/.azuredevops/rocm-ci.yml
index 3a33cebd7..1e11589cd 100644
--- a/.azuredevops/rocm-ci.yml
+++ b/.azuredevops/rocm-ci.yml
@@ -13,6 +13,8 @@ trigger:
batch: true
branches:
include:
+ - develop
+ - mainline
- master
paths:
exclude:
@@ -27,8 +29,9 @@ pr:
autoCancel: true
branches:
include:
- - master
- develop
+ - mainline
+ - master
paths:
exclude:
- .github
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 16c4251f4..ca19c7eb0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,8 @@
# Changelog for RPP
-Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/rpp/en/latest/).
+Full documentation for RPP is available at [https://rocm.docs.amd.com/projects/rpp/en/latest](https://rocm.docs.amd.com/projects/rpp/en/latest).
-### RPP 1.8.0 (unreleased)
+## (Unreleased) RPP 1.8.0
### Changes
@@ -25,7 +25,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r
* CMake - Version `3.22.3`
* IEEE 754-based half-precision floating-point library - Version `1.12.0`
-### RPP 1.5.0
+## RPP 1.5.0 for ROCm 6.1.1
### Changes
@@ -42,7 +42,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r
* CMake - Version `3.22.3`
* IEEE 754-based half-precision floating-point library - Version `1.12.0`
-## RPP 1.4.0
+## RPP 1.4.0 for ROCm 6.0.0
### Additions
@@ -76,7 +76,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r
* CMake - Version `3.22.3`
* IEEE 754-based half-precision floating-point library - Version `1.12.0`
-## RPP 1.3.0
+## RPP 1.3.0 for ROCm 5.7.1
### Additions
@@ -106,7 +106,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r
* Boost - Version `1.72`
* IEEE 754-based half-precision floating-point library - Version `1.12.0`
-## RPP 1.2.0
+## RPP 1.2.0 for ROCm 5.7.1
### Additions
@@ -137,7 +137,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r
* Boost - Version `1.72`
* IEEE 754-based half-precision floating-point library - Version `1.12.0`
-## RPP 1.1.0
+## RPP 1.1.0 for ROCm 5.7.0
### Additions
@@ -172,7 +172,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r
* SLES - the Clang package is missing in the latest updates, which means Clang must be manually
installed.
-## RPP 1.0.0
+## RPP 1.0.0 for ROCm 5.7.0
### Additions
@@ -212,7 +212,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r
* SLES - the Clang package is missing in the latest updates, which means Clang must be manually
installed.
-## RPP 0.99
+## RPP 0.99 for ROCm 5.7.0
### Additions
@@ -241,7 +241,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r
* Boost - Version `1.72`
* IEEE 754-based half-precision floating-point library - Version `1.12.0`
-## RPP 0.98
+## RPP 0.98 for ROCm 5.7.0
### Additions
* Dockers
@@ -251,11 +251,11 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r
* Readme updates
-#### Changes
+### Changes
* CMakeList
-#### Fixes
+### Fixes
* Minor bugs and warnings
@@ -270,7 +270,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r
* Boost - Version `1.72`
* IEEE 754-based half-precision floating-point library - Version `1.12.0`
-## RPP 0.97
+## RPP 0.97 for ROCm 5.7.0
### Additions
@@ -301,7 +301,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r
* Boost - Version `1.72`
* IEEE 754-based half-precision floating-point library - Version `1.12.0`
-## RPP 0.96
+## RPP 0.96 for ROCm 5.7.0
### Additions
@@ -334,7 +334,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r
* RPP is not supported on CentOS 7 and SLES SP2
-## RPP 0.95
+## RPP 0.95 for ROCm 5.7.0
### Additions
@@ -368,7 +368,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r
* ROCm reorganization: install updates no longer match ROCm specifications
-## RPP 0.93
+## RPP 0.93 for ROCm 5.7.0
### Additions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7963ff864..df233e5dc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,6 +58,9 @@ endif(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
# RPP Default Options
set(DEFAULT_BUILD_TYPE "Release")
+### RPP_AUDIO_SUPPORT - default = ON. NOTE: Audio support is currently available only on Ubuntu; set this to OFF on other platforms.
+option(RPP_AUDIO_SUPPORT "Build RPP with Audio Support" ON)
+option(BUILD_WITH_AMD_ADVANCE "Build RPP for advanced AMD GPU architectures" OFF)
# Set message options
if(NOT WIN32)
@@ -77,6 +80,7 @@ endif()
if(APPLE)
set(CMAKE_MACOSX_RPATH 1)
set(BACKEND "CPU")
+ set(RPP_AUDIO_SUPPORT OFF)
message("-- ${Magenta}Apple macOS Detected -- GPU Support turned OFF${ColourReset}")
endif()
@@ -134,9 +138,16 @@ list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
find_package(HALF REQUIRED)
include_directories(${HALF_INCLUDE_DIRS})
+if (RPP_AUDIO_SUPPORT)
+ add_definitions(-DAUDIO_SUPPORT) # For compile flags in RPP
+ set(RPP_AUDIO_AUGMENTATIONS_SUPPORT 1) # For cmakedefine01 in rpp_audio_augmentations_support.h.in
+endif()
+
message("-- ${Cyan}RPP Developer Options${ColourReset}")
message("-- ${Cyan} -D BACKEND=${BACKEND} [Select RPP Backend [options:CPU/OPENCL/HIP](default:HIP)]${ColourReset}")
message("-- ${Cyan} -D CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} [Select RPP build type [options:Debug/Release](default:Release)]${ColourReset}")
+message("-- ${Cyan} -D RPP_AUDIO_SUPPORT=${RPP_AUDIO_SUPPORT} [Select RPP audio support [options:ON/OFF](default:ON)]${ColourReset}")
+message("-- ${Cyan} -D BUILD_WITH_AMD_ADVANCE=${BUILD_WITH_AMD_ADVANCE} [Turn ON/OFF Build for AMD advanced GPUs(default:OFF)]${ColourReset}")
# OpenMP
find_package(OpenMP REQUIRED)
@@ -207,8 +218,23 @@ if("${BACKEND}" STREQUAL "HIP")
endif()
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/hip)
+
+ # Set supported GPU Targets
set(DEFAULT_AMDGPU_TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1031;gfx1032;gfx1100;gfx1101;gfx1102")
- set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for library to target")
+ if (BUILD_WITH_AMD_ADVANCE)
+ set(DEFAULT_AMDGPU_TARGETS ${DEFAULT_AMDGPU_TARGETS} "gfx1200;gfx1201")
+ endif()
+
+ # Set AMDGPU_TARGETS
+ if(DEFINED ENV{AMDGPU_TARGETS})
+ set(AMDGPU_TARGETS $ENV{AMDGPU_TARGETS} CACHE STRING "List of specific machine types for library to target")
+ elseif(AMDGPU_TARGETS)
+ message("-- ${White}${PROJECT_NAME} -- AMDGPU_TARGETS set with -D option${ColourReset}")
+ else()
+ set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for library to target")
+ endif()
+ message("-- ${White}${PROJECT_NAME} -- AMDGPU_TARGETS: ${AMDGPU_TARGETS}${ColourReset}")
+
find_package(HIP QUIET)
if(HIP_FOUND)
message("-- ${White}${PROJECT_NAME} -- Using HIP - Path:" ${HIP_PATH} "\tVersion:" ${HIP_VERSION} "\tCompiler:" ${HIP_COMPILER}${ColourReset})
@@ -273,6 +299,7 @@ if("${BACKEND}" STREQUAL "CPU")
endif()
configure_file("${PROJECT_SOURCE_DIR}/include/rpp_backend.h.in" "${PROJECT_BINARY_DIR}/include/rpp_backend.h")
+configure_file("${PROJECT_SOURCE_DIR}/include/rpp_audio_augmentations_support.h.in" "${PROJECT_BINARY_DIR}/include/rpp_audio_augmentations_support.h")
# Enable SIMD for HOST code (in both OpenCL and HIP backends)
if(NOT DEFINED SIMD_ENABLE)
@@ -294,7 +321,12 @@ message("-- ${White}${PROJECT_NAME} -- Link Libraries: ${LINK_LIBRARY_LIST}${Col
target_link_libraries(${PROJECT_NAME} ${LINK_LIBRARY_LIST})
set_target_properties(${PROJECT_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(${PROJECT_NAME} PROPERTIES LINKER_LANGUAGE CXX)
-target_link_libraries(${PROJECT_NAME} ${PROJECT_SOURCE_DIR}/libs/third_party/ffts/libffts.a)
+if(RPP_AUDIO_SUPPORT)
+ target_link_libraries(${PROJECT_NAME} ${PROJECT_SOURCE_DIR}/libs/third_party/ffts/libffts.a)
+ message("-- ${Green}${PROJECT_NAME} set to build with RPP_AUDIO_SUPPORT${ColourReset}")
+else()
+ message("-- ${Yellow}${PROJECT_NAME} set to build without RPP_AUDIO_SUPPORT${ColourReset}")
+endif()
set_target_properties(${PROJECT_NAME} PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION ${PROJECT_VERSION_MAJOR})
target_include_directories(${PROJECT_NAME}
@@ -335,6 +367,9 @@ install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION ${CMAKE_INSTA
install(FILES ${PROJECT_BINARY_DIR}/include/rpp_backend.h
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rpp
COMPONENT dev)
+install(FILES ${PROJECT_BINARY_DIR}/include/rpp_audio_augmentations_support.h
+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rpp
+ COMPONENT dev)
# install Test
install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cmake DESTINATION ${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}/test COMPONENT test)
diff --git a/docs/data/doxygenInputs/lens_img640x480.png b/docs/data/doxygenInputs/lens_img640x480.png
new file mode 100644
index 000000000..897955d77
Binary files /dev/null and b/docs/data/doxygenInputs/lens_img640x480.png differ
diff --git a/docs/data/doxygenOutputs/effects_augmentations_glitch_img150x150.png b/docs/data/doxygenOutputs/effects_augmentations_glitch_img150x150.png
new file mode 100644
index 000000000..d4d5b749b
Binary files /dev/null and b/docs/data/doxygenOutputs/effects_augmentations_glitch_img150x150.png differ
diff --git a/docs/data/doxygenOutputs/effects_augmentations_jitter_150x150.png b/docs/data/doxygenOutputs/effects_augmentations_jitter_150x150.png
new file mode 100644
index 000000000..8aef1cbe6
Binary files /dev/null and b/docs/data/doxygenOutputs/effects_augmentations_jitter_150x150.png differ
diff --git a/docs/data/doxygenOutputs/geometric_augmentations_lens_correction_img_640x480.png b/docs/data/doxygenOutputs/geometric_augmentations_lens_correction_img_640x480.png
new file mode 100644
index 000000000..63a52819d
Binary files /dev/null and b/docs/data/doxygenOutputs/geometric_augmentations_lens_correction_img_640x480.png differ
diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile
index 18d9a73bc..9773637df 100644
--- a/docs/doxygen/Doxyfile
+++ b/docs/doxygen/Doxyfile
@@ -962,14 +962,16 @@ INPUT = ../../README.md \
../../include/rppi_logical_operations.h \
../../include/rppi_morphological_transforms.h \
../../include/rppi_statistical_operations.h \
+ ../../include/rppt_tensor_arithmetic_operations.h \
+ ../../include/rppt_tensor_audio_augmentations.h \
../../include/rppt_tensor_color_augmentations.h \
../../include/rppt_tensor_data_exchange_operations.h \
../../include/rppt_tensor_effects_augmentations.h \
../../include/rppt_tensor_filter_augmentations.h \
../../include/rppt_tensor_geometric_augmentations.h \
+ ../../include/rppt_tensor_logical_operations.h \
../../include/rppt_tensor_morphological_operations.h \
- ../../include/rppt_tensor_statistical_operations.h \
- ../../include/rppt_tensor_logical_operations.h
+ ../../include/rppt_tensor_statistical_operations.h
# This tag can be used to specify the character encoding of the source files
@@ -2381,7 +2383,7 @@ INCLUDE_FILE_PATTERNS =
# recursively expanded use the := operator instead of the = operator.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-PREDEFINED = GPU_SUPPORT RPP_BACKEND_HIP HIP_COMPILE
+PREDEFINED = GPU_SUPPORT RPP_BACKEND_HIP HIP_COMPILE AUDIO_SUPPORT
# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
# tag can be used to specify a list of macro names that should be expanded. The
diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in
index 221c93045..c316de276 100644
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1 +1 @@
-rocm-docs-core[api_reference]==1.4.0
+rocm-docs-core[api_reference]==1.5.1
diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt
index 8d0f37727..2c9286b18 100644
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -110,7 +110,7 @@ requests==2.28.2
# via
# pygithub
# sphinx
-rocm-docs-core[api-reference]==1.4.0
+rocm-docs-core[api-reference]==1.5.1
# via -r requirements.in
smmap==5.0.0
# via gitdb
diff --git a/include/rpp_audio_augmentations_support.h.in b/include/rpp_audio_augmentations_support.h.in
new file mode 100644
index 000000000..6e8e8c66f
--- /dev/null
+++ b/include/rpp_audio_augmentations_support.h.in
@@ -0,0 +1,6 @@
+#ifndef GUARD_RPP_AUDIO_AUGMENTATIONS_SUPPORT_H_IN
+#define GUARD_RPP_AUDIO_AUGMENTATIONS_SUPPORT_H_IN
+
+#cmakedefine01 RPP_AUDIO_AUGMENTATIONS_SUPPORT
+
+#endif
\ No newline at end of file
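`#cmakedefine01` expands to `#define RPP_AUDIO_AUGMENTATIONS_SUPPORT 1` when the CMake option is enabled and to `0` otherwise, so the generated header is meant to be tested with `#if`, not `#ifdef`. A minimal consumer sketch, assuming the header is installed to `<prefix>/include/rpp` per the install() rule added above:

```cpp
#include <cstdio>
#include "rpp_audio_augmentations_support.h"  // generated from the .h.in above

int main()
{
#if RPP_AUDIO_AUGMENTATIONS_SUPPORT
    std::printf("RPP built with audio augmentation support\n");
#else
    std::printf("RPP built without audio augmentation support\n");
#endif
    return 0;
}
```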
diff --git a/include/rppdefs.h b/include/rppdefs.h
index 28876d7f5..6eb025665 100644
--- a/include/rppdefs.h
+++ b/include/rppdefs.h
@@ -64,6 +64,7 @@ SOFTWARE.
const float ONE_OVER_6 = 1.0f / 6;
const float ONE_OVER_3 = 1.0f / 3;
const float ONE_OVER_255 = 1.0f / 255;
+const uint MMS_MAX_SCRATCH_MEMORY = 76800000; // maximum scratch memory size (number of floats) needed for MMS buffer in RNNT training
/******************** RPP typedefs ********************/
@@ -136,7 +137,15 @@ typedef enum
/*! \brief src and dst layout mismatch \ingroup group_rppdefs */
RPP_ERROR_LAYOUT_MISMATCH = -18,
/*! \brief Number of channels is invalid. (Needs to adhere to function specification.) \ingroup group_rppdefs */
- RPP_ERROR_INVALID_CHANNELS = -19
+ RPP_ERROR_INVALID_CHANNELS = -19,
+    /*! \brief Invalid output tile length. (Needs to adhere to function specification.) \ingroup group_rppdefs */
+ RPP_ERROR_INVALID_OUTPUT_TILE_LENGTH = -20,
+    /*! \brief Shared memory size needed is beyond the bounds. (Needs to adhere to function specification.) \ingroup group_rppdefs */
+ RPP_ERROR_OUT_OF_BOUND_SHARED_MEMORY_SIZE = -21,
+    /*! \brief Scratch memory size needed is beyond the bounds. (Needs to adhere to function specification.) \ingroup group_rppdefs */
+ RPP_ERROR_OUT_OF_BOUND_SCRATCH_MEMORY_SIZE = -22,
+ /*! \brief Number of src dims is invalid. (Needs to adhere to function specification.) \ingroup group_rppdefs */
+ RPP_ERROR_INVALID_SRC_DIMS = -23
} RppStatus;
/*! \brief RPP rppStatus_t type enums
@@ -446,6 +455,16 @@ typedef struct
} RpptRoiLtrb;
+/*! \brief RPPT Tensor Channel Offsets struct
+ * \ingroup group_rppdefs
+ */
+typedef struct
+{
+ RppiPoint r;
+ RppiPoint g;
+ RppiPoint b;
+} RpptChannelOffsets;
+
/*! \brief RPPT Tensor 3D ROI LTFRBB struct
* \ingroup group_rppdefs
*/
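The four new status codes follow the existing negative-valued `RppStatus` convention, so callers can fold them into whatever error reporting they already have. A small sketch (the helper name is hypothetical, not part of RPP):

```cpp
#include "rppdefs.h"

// Hypothetical helper mapping the status codes added above to messages;
// anything else falls through to a generic string.
const char* rppStatusToString(RppStatus status)
{
    switch (status)
    {
        case RPP_SUCCESS:                                return "success";
        case RPP_ERROR_INVALID_OUTPUT_TILE_LENGTH:       return "invalid output tile length";
        case RPP_ERROR_OUT_OF_BOUND_SHARED_MEMORY_SIZE:  return "required shared memory size out of bounds";
        case RPP_ERROR_OUT_OF_BOUND_SCRATCH_MEMORY_SIZE: return "required scratch memory size out of bounds";
        case RPP_ERROR_INVALID_SRC_DIMS:                 return "invalid number of source dimensions";
        default:                                         return "other RPP error";
    }
}
```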
diff --git a/include/rppt_tensor_arithmetic_operations.h b/include/rppt_tensor_arithmetic_operations.h
index d34bdd1dd..d091f50ba 100644
--- a/include/rppt_tensor_arithmetic_operations.h
+++ b/include/rppt_tensor_arithmetic_operations.h
@@ -47,8 +47,8 @@ extern "C" {
* It multiplies each element of the source tensor by a corresponding element in the 'mulTensor',
* adds a corresponding element from the 'addTensor', and stores the result in the destination tensor.
 * Support added for f32 -> f32 datatype.
- * \image html input150x150x4.gif Sample Input
- * \image html arithmetic_operations_fused_multiply_add_scalar_150x150x4.gif Sample Output
+ * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input
+ * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/arithmetic_operations_fused_multiply_add_scalar_150x150x4.gif Sample Output
* \param [in] srcPtr source tensor in HOST memory
* \param[in] srcGenericDescPtr source tensor descriptor
* \param[out] dstPtr destination tensor in HOST memory
@@ -70,8 +70,8 @@ RppStatus rppt_fused_multiply_add_scalar_host(RppPtr_t srcPtr, RpptGenericDescPt
* It multiplies each element of the source tensor by a corresponding element in the 'mulTensor',
* adds a corresponding element from the 'addTensor', and stores the result in the destination tensor.
 * Support added for f32 -> f32 datatype.
- * \image html input150x150x4.gif Sample Input
- * \image html arithmetic_operations_fused_multiply_add_scalar_150x150x4.gif Sample Output
+ * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input
+ * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/arithmetic_operations_fused_multiply_add_scalar_150x150x4.gif Sample Output
* \param [in] srcPtr source tensor in HIP memory
* \param[in] srcGenericDescPtr source tensor descriptor
* \param[out] dstPtr destination tensor in HIP memory
@@ -92,8 +92,8 @@ RppStatus rppt_fused_multiply_add_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr
* \details This function performs the addition operation on a batch of 4D tensors.
* It adds a corresponding element from the 'addTensor' to source tensor, and stores the result in the destination tensor.
 * Support added for f32 -> f32 datatype.
- * \image html input150x150x4.gif Sample Input
- * \image html arithmetic_operations_add_scalar_150x150x4.gif Sample Output
+ * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input
+ * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/arithmetic_operations_add_scalar_150x150x4.gif Sample Output
* \param [in] srcPtr source tensor in HOST memory
* \param[in] srcGenericDescPtr source tensor descriptor
* \param[out] dstPtr destination tensor in HOST memory
@@ -113,8 +113,8 @@ RppStatus rppt_add_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDes
* \details This function performs the addition operation on a batch of 4D tensors.
* It adds a corresponding element from the 'addTensor' to source tensor, and stores the result in the destination tensor.
 * Support added for f32 -> f32 datatype.
- * \image html input150x150x4.gif Sample Input
- * \image html arithmetic_operations_add_scalar_150x150x4.gif Sample Output
+ * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input
+ * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/arithmetic_operations_add_scalar_150x150x4.gif Sample Output
* \param [in] srcPtr source tensor in HIP memory
* \param[in] srcGenericDescPtr source tensor descriptor
* \param[out] dstPtr destination tensor in HIP memory
@@ -134,8 +134,8 @@ RppStatus rppt_add_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDesc
* \details This function performs the subtraction operation on a batch of 4D tensors.
* It takes a corresponding element from 'subtractTensor' and subtracts it from source tensor. Result is stored in the destination tensor.
 * Support added for f32 -> f32 datatype.
- * \image html input150x150x4.gif Sample Input
- * \image html arithmetic_operations_subtract_scalar_150x150x4.gif Sample Output
+ * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input
+ * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/arithmetic_operations_subtract_scalar_150x150x4.gif Sample Output
* \param [in] srcPtr source tensor in HOST memory
* \param[in] srcGenericDescPtr source tensor descriptor
* \param[out] dstPtr destination tensor in HOST memory
@@ -155,8 +155,8 @@ RppStatus rppt_subtract_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGener
* \details This function performs the subtraction operation on a batch of 4D tensors.
* It takes a corresponding element from 'subtractTensor' and subtracts it from source tensor. Result is stored in the destination tensor.
 * Support added for f32 -> f32 datatype.
- * \image html input150x150x4.gif Sample Input
- * \image html arithmetic_operations_subtract_scalar_150x150x4.gif Sample Output
+ * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input
+ * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/arithmetic_operations_subtract_scalar_150x150x4.gif Sample Output
* \param [in] srcPtr source tensor in HIP memory
* \param[in] srcGenericDescPtr source tensor descriptor
* \param[out] dstPtr destination tensor in HIP memory
@@ -176,8 +176,8 @@ RppStatus rppt_subtract_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGeneri
* \details This function performs the multiplication operation on a batch of 4D tensors.
* It takes a corresponding element from 'multiplyTensor' and multiplies it with source tensor. Result is stored in the destination tensor.
 * Support added for f32 -> f32 datatype.
- * \image html input150x150x4.gif Sample Input
- * \image html arithmetic_operations_multiply_scalar_150x150x4.gif Sample Output
+ * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input
+ * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/arithmetic_operations_multiply_scalar_150x150x4.gif Sample Output
* \param [in] srcPtr source tensor in HOST memory
* \param[in] srcGenericDescPtr source tensor descriptor
* \param[out] dstPtr destination tensor in HOST memory
@@ -190,15 +190,15 @@ RppStatus rppt_subtract_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGeneri
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
*/
-RppStatus rppt_multiply_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *subtractTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
+RppStatus rppt_multiply_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
#ifdef GPU_SUPPORT
/*! \brief Multiply scalar augmentation on HIP backend
* \details This function performs the multiplication operation on a batch of 4D tensors.
* It takes a corresponding element from 'multiplyTensor' and multiplies it with source tensor. Result is stored in the destination tensor.
 * Support added for f32 -> f32 datatype.
- * \image html input150x150x4.gif Sample Input
- * \image html arithmetic_operations_multiply_scalar_150x150x4.gif Sample Output
+ * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input
+ * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/arithmetic_operations_multiply_scalar_150x150x4.gif Sample Output
* \param [in] srcPtr source tensor in HIP memory
* \param[in] srcGenericDescPtr source tensor descriptor
* \param[out] dstPtr destination tensor in HIP memory
@@ -226,7 +226,7 @@ RppStatus rppt_multiply_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGeneri
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -248,7 +248,7 @@ RppStatus rppt_magnitude_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr sr
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -258,6 +258,40 @@ RppStatus rppt_magnitude_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr sr
RppStatus rppt_magnitude_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT
+/*! \brief Logarithm operation on HOST backend
+ * \details Computes log to base e (natural log) of the input for a given ND tensor.
+ * Supports u8->f32, i8->f32, f16->f16 and f32->f32 datatypes.
+ * Uses the absolute value of the input for the log computation, and uses nextafter() if the input is 0 to avoid an undefined result.
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param [in] srcGenericDescPtr source tensor descriptor
+ * \param [out] dstPtr destination tensor in HOST memory
+ * \param [in] dstGenericDescPtr destination tensor descriptor
+ * \param [in] roiTensor values to represent dimensions of input tensor
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_log_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u *roiTensor, rppHandle_t rppHandle);
+
+#ifdef GPU_SUPPORT
+/*! \brief Logarithm operation on HIP backend
+ * \details Computes log to base e (natural log) of the input for a given ND tensor.
+ * Supports u8->f32, i8->f32, f16->f16 and f32->f32 datatypes.
+ * Uses the absolute value of the input for the log computation, and uses nextafter() if the input is 0 to avoid an undefined result.
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param [in] srcGenericDescPtr source tensor descriptor
+ * \param [out] dstPtr destination tensor in HIP memory
+ * \param [in] dstGenericDescPtr destination tensor descriptor
+ * \param [in] roiTensor values to represent dimensions of input tensor
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_log_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u *roiTensor, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
+
/*! @}
*/
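The doc comments above fully specify the element-wise rule for `rppt_log_host`/`rppt_log_gpu`: take the absolute value, nudge an exact zero with `nextafter()`, then take the natural log. A scalar reference of that documented rule, as a sketch rather than RPP's actual implementation:

```cpp
#include <cmath>

// Scalar reference for the documented log rule: ln(|x|), with x == 0
// nudged to the next representable float above 0 so std::log never
// sees an exact zero.
float logRule(float x)
{
    x = std::fabs(x);
    if (x == 0.0f)
        x = std::nextafter(x, 1.0f);  // smallest positive subnormal float
    return std::log(x);
}
```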
diff --git a/include/rppt_tensor_audio_augmentations.h b/include/rppt_tensor_audio_augmentations.h
index f6349ae95..db52b073f 100644
--- a/include/rppt_tensor_audio_augmentations.h
+++ b/include/rppt_tensor_audio_augmentations.h
@@ -25,6 +25,8 @@ SOFTWARE.
#ifndef RPPT_TENSOR_AUDIO_AUGMENTATIONS_H
#define RPPT_TENSOR_AUDIO_AUGMENTATIONS_H
+#ifdef AUDIO_SUPPORT
+
#include "rpp.h"
#include "rppdefs.h"
#ifdef __cplusplus
@@ -46,49 +48,90 @@ extern "C" {
* \details Non Silent Region Detection augmentation for 1D audio buffer
 \n Finds the starting index and length of the non-silent region in the audio buffer by comparing the
 calculated short-term power with the cutoff value passed
- * \param[in] srcPtr source tensor in HOST memory
- * \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
- * \param[in] srcLengthTensor source audio buffer length (1D tensor in HOST memory, of size batchSize)
- * \param[out] detectedIndexTensor beginning index of non silent region (1D tensor in HOST memory, of size batchSize)
- * \param[out] detectionLengthTensor length of non silent region (1D tensor in HOST memory, of size batchSize)
- * \param[in] cutOffDB cutOff in dB below which the signal is considered silent
- * \param[in] windowLength window length used for computing short-term power of the signal
- * \param[in] referencePower reference power that is used to convert the signal to dB
- * \param[in] resetInterval number of samples after which the moving mean average is recalculated to avoid precision loss
- * \param[in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
+ * \param [in] srcLengthTensor source audio buffer length (1D tensor in HOST memory, of size batchSize)
+ * \param [out] detectedIndexTensor beginning index of non silent region (1D tensor in HOST memory, of size batchSize)
+ * \param [out] detectionLengthTensor length of non silent region (1D tensor in HOST memory, of size batchSize)
+ * \param [in] cutOffDB cutOff in dB below which the signal is considered silent
+ * \param [in] windowLength window length used for computing short-term power of the signal
+ * \param [in] referencePower reference power that is used to convert the signal to dB
+ * \param [in] resetInterval number of samples after which the moving mean average is recalculated to avoid precision loss
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
*/
RppStatus rppt_non_silent_region_detection_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcLengthTensor, Rpp32s *detectedIndexTensor, Rpp32s *detectionLengthTensor, Rpp32f cutOffDB, Rpp32s windowLength, Rpp32f referencePower, Rpp32s resetInterval, rppHandle_t rppHandle);
+#ifdef GPU_SUPPORT
+/*! \brief Non Silent Region Detection augmentation on HIP backend
+ * \details Non Silent Region Detection augmentation for 1D audio buffer
+ \n Finds the starting index and length of the non-silent region in the audio buffer by comparing the
+ calculated short-term power with the cutoff value passed
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
+ * \param [in] srcLengthTensor source audio buffer length (1D tensor in Pinned/HIP memory, of size batchSize)
+ * \param [out] detectedIndexTensor beginning index of non silent region (1D tensor in Pinned/HIP memory, of size batchSize)
+ * \param [out] detectionLengthTensor length of non silent region (1D tensor in Pinned/HIP memory, of size batchSize)
+ * \param [in] cutOffDB cutOff in dB below which the signal is considered silent
+ * \param [in] windowLength window length used for computing short-term power of the signal
+ * \param [in] referencePower reference power that is used to convert the signal to dB
+ * \param [in] resetInterval number of samples after which the moving mean average is recalculated to avoid precision loss
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_non_silent_region_detection_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcLengthTensor, Rpp32s *detectedIndexTensor, Rpp32s *detectionLengthTensor, Rpp32f cutOffDB, Rpp32s windowLength, Rpp32f referencePower, Rpp32s resetInterval, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
+
/*! \brief To Decibels augmentation on HOST backend
- * \details To Decibels augmentation for 1D audio buffer converts magnitude values to decibel values
- * \param[in] srcPtr source tensor in HOST memory
- * \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
- * \param[out] dstPtr destination tensor in HOST memory
- * \param[in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
- * \param[in] srcDims source tensor sizes for each element in batch (2D tensor in HOST memory, of size batchSize * 2)
- * \param[in] cutOffDB minimum or cut-off ratio in dB
- * \param[in] multiplier factor by which the logarithm is multiplied
- * \param[in] referenceMagnitude Reference magnitude if not provided maximum value of input used as reference
- * \param[in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \details To Decibels augmentation for 1D/2D audio buffer converts magnitude values to decibel values
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel/2D audio tensor with 1 channel), offsetInBytes >= 0, dataType = F32)
+ * \param [out] dstPtr destination tensor in HOST memory
+ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel/2D audio tensor with 1 channel), offsetInBytes >= 0, dataType = F32)
+ * \param [in] srcDims source tensor sizes for each element in batch (2D tensor in HOST memory, of size batchSize * 2)
+ * \param [in] cutOffDB minimum or cut-off ratio in dB
+ * \param [in] multiplier factor by which the logarithm is multiplied
+ * \param [in] referenceMagnitude Reference magnitude; if not provided, the maximum value of the input is used as reference
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
*/
RppStatus rppt_to_decibels_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptImagePatchPtr srcDims, Rpp32f cutOffDB, Rpp32f multiplier, Rpp32f referenceMagnitude, rppHandle_t rppHandle);
+#ifdef GPU_SUPPORT
+/*! \brief To Decibels augmentation on HIP backend
+ * \details To Decibels augmentation for 1D/2D audio buffer converts magnitude values to decibel values
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel/2D audio tensor with 1 channel), offsetInBytes >= 0, dataType = F32)
+ * \param [out] dstPtr destination tensor in HIP memory
+ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel/2D audio tensor with 1 channel), offsetInBytes >= 0, dataType = F32)
+ * \param [in] srcDims source tensor sizes for each element in batch (2D tensor in Pinned/HIP memory, of size batchSize * 2)
+ * \param [in] cutOffDB minimum or cut-off ratio in dB
+ * \param [in] multiplier factor by which the logarithm is multiplied
+ * \param [in] referenceMagnitude Reference magnitude; if not provided, the maximum value of the input is used as reference
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_to_decibels_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptImagePatchPtr srcDims, Rpp32f cutOffDB, Rpp32f multiplier, Rpp32f referenceMagnitude, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
+
/*! \brief Pre Emphasis Filter augmentation on HOST backend
* \details Pre Emphasis Filter augmentation for audio data
- * \param[in] srcPtr source tensor in HOST memory
- * \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
- * \param[out] dstPtr destination tensor in HOST memory
- * \param[in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
- * \param[in] srcLengthTensor source audio buffer length (1D tensor in HOST memory, of size batchSize)
- * \param[in] coeffTensor preemphasis coefficient (1D tensor in HOST memory, of size batchSize)
- * \param[in] borderType border value policy
- * \param[in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
+ * \param [out] dstPtr destination tensor in HOST memory
+ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
+ * \param [in] srcLengthTensor source audio buffer length (1D tensor in HOST memory, of size batchSize)
+ * \param [in] coeffTensor preemphasis coefficient (1D tensor in HOST memory, of size batchSize)
+ * \param [in] borderType border value policy
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
@@ -97,19 +140,36 @@ RppStatus rppt_pre_emphasis_filter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr,
/*! \brief Down Mixing augmentation on HOST backend
* \details Down Mixing augmentation for audio data
-* \param[in] srcPtr source tensor in HOST memory
-* \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
-* \param[out] dstPtr destination tensor in HOST memory
-* \param[in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
-* \param[in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HOST memory, of size batchSize * 2)
-* \param[in] normalizeWeights bool flag to specify if normalization of weights is needed
-* \param[in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+* \param [in] srcPtr source tensor in HOST memory
+* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel audio tensor), offsetInBytes >= 0, dataType = F32)
+* \param [out] dstPtr destination tensor in HOST memory
+* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 2, offsetInBytes >= 0, dataType = F32)
+* \param [in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HOST memory, of size batchSize * 2)
+* \param [in] normalizeWeights bool flag to specify if normalization of weights is needed
+* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
*/
RppStatus rppt_down_mixing_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcDimsTensor, bool normalizeWeights, rppHandle_t rppHandle);
+#ifdef GPU_SUPPORT
+/*! \brief Down Mixing augmentation on HIP backend
+* \details Down Mixing augmentation for audio data
+* \param [in] srcPtr source tensor in HIP memory
+* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel audio tensor), offsetInBytes >= 0, dataType = F32)
+* \param [out] dstPtr destination tensor in HIP memory
+* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 2, offsetInBytes >= 0, dataType = F32)
+* \param [in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HIP/Pinned memory, of size batchSize * 2)
+* \param [in] normalizeWeights bool flag to specify if normalization of weights is needed
+* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+* \return A \ref RppStatus enumeration.
+* \retval RPP_SUCCESS Successful completion.
+* \retval RPP_ERROR* Unsuccessful completion.
+*/
+RppStatus rppt_down_mixing_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcDimsTensor, bool normalizeWeights, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
+
/*! \brief Produces a spectrogram from a 1D audio buffer on HOST backend
* \details Spectrogram for 1D audio buffer
* \param [in] srcPtr source tensor in HOST memory
@@ -153,15 +213,15 @@ RppStatus rppt_mel_filter_bank_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp
/*! \brief Resample augmentation on HOST backend
* \details Resample augmentation for audio data
-* \param[in] srcPtr source tensor in HOST memory
-* \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
-* \param[out] dstPtr destination tensor in HOST memory
-* \param[in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
-* \param[in] inRate Input sampling rate (1D tensor in HOST memory, of size batchSize)
-* \param[in] outRate Output sampling rate (1D tensor in HOST memory, of size batchSize)
-* \param[in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HOST memory, of size batchSize * 2)
-* \param[in] window Resampling window (struct of type RpptRpptResamplingWindow)
-* \param[in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+* \param [in] srcPtr source tensor in HOST memory
+* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
+* \param [out] dstPtr destination tensor in HOST memory
+* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
+* \param [in] inRate Input sampling rate (1D tensor in HOST memory, of size batchSize)
+* \param [in] outRate Output sampling rate (1D tensor in HOST memory, of size batchSize)
+* \param [in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HOST memory, of size batchSize * 2)
+* \param [in] window Resampling window (struct of type RpptResamplingWindow)
+* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
@@ -174,4 +234,7 @@ RppStatus rppt_resample_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d
#ifdef __cplusplus
}
#endif
+
+#endif // AUDIO_SUPPORT
+
#endif // RPPT_TENSOR_AUDIO_AUGMENTATIONS_H
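For intuition on the pre-emphasis entry point documented above: the standard pre-emphasis filter computes y[i] = x[i] - coeff * x[i-1], with `borderType` choosing what stands in for x[-1]. A scalar sketch of that textbook formula, assuming a zero border for the first sample (RPP's actual behavior depends on the `borderType` passed):

```cpp
#include <cstddef>
#include <vector>

// Textbook pre-emphasis: y[i] = x[i] - coeff * x[i-1].
// Assumes a zero border (x[-1] == 0); RPP's borderType parameter
// selects the real border policy.
std::vector<float> preEmphasis(const std::vector<float>& x, float coeff)
{
    std::vector<float> y(x.size());
    float prev = 0.0f;  // assumed zero-border value for x[-1]
    for (std::size_t i = 0; i < x.size(); ++i)
    {
        y[i] = x[i] - coeff * prev;
        prev = x[i];
    }
    return y;
}
```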
diff --git a/include/rppt_tensor_color_augmentations.h b/include/rppt_tensor_color_augmentations.h
index b01a12dca..62ef13715 100644
--- a/include/rppt_tensor_color_augmentations.h
+++ b/include/rppt_tensor_color_augmentations.h
@@ -54,7 +54,7 @@ extern "C" {
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] alphaTensor alpha values for brightness calculation (1D tensor in HOST memory, of size batchSize, with 0 <= alpha <= 20 for each image in batch)
* \param [in] betaTensor beta values for brightness calculation (1D tensor in HOST memory, of size batchSize, with 0 <= beta <= 255 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -76,7 +76,7 @@ RppStatus rppt_brightness_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] alphaTensor alpha values for brightness calculation (1D tensor in pinned/HOST memory, of size batchSize, with 0 <= alpha <= 20 for each image in batch)
* \param [in] betaTensor beta values for brightness calculation (1D tensor in pinned/HOST memory, of size batchSize, with 0 <= beta <= 255 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -97,7 +97,7 @@ RppStatus rppt_brightness_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] gammaTensor gamma values for gamma correction calculation (1D tensor in HOST memory, of size batchSize with gamma >= 0 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -118,7 +118,7 @@ RppStatus rppt_gamma_correction_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rp
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] gammaTensor gamma values for gamma correction calculation (1D tensor in pinned/HOST memory, of size batchSize with gamma >= 0 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -141,7 +141,7 @@ RppStatus rppt_gamma_correction_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
 * \param [in] alphaTensor alpha values for alpha-blending (1D tensor in HOST memory, of size batchSize with the transparency factor 0 <= alpha <= 1 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -164,7 +164,7 @@ RppStatus rppt_blend_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDes
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
 * \param [in] alphaTensor alpha values for alpha-blending (1D tensor in pinned/HOST memory, of size batchSize with the transparency factor 0 <= alpha <= 1 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -188,7 +188,7 @@ RppStatus rppt_blend_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDesc
* \param [in] contrastTensor contrast modification parameter for color_twist calculation (1D tensor in HOST memory, of size batchSize with 0 < contrastTensor[i] <= 255 for each image in batch)
* \param [in] hueTensor hue modification parameter for color_twist calculation (1D tensor in HOST memory, of size batchSize with 0 <= hueTensor[i] <= 359 for each image in batch)
* \param [in] saturationTensor saturation modification parameter for color_twist calculation (1D tensor in HOST memory, of size batchSize with saturationTensor[i] >= 0 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -212,7 +212,7 @@ RppStatus rppt_color_twist_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_
* \param [in] contrastTensor contrast modification parameter for color_twist calculation (1D tensor in pinned/HOST memory, of size batchSize with 0 < contrastTensor[i] <= 255 for each image in batch)
* \param [in] hueTensor hue modification parameter for color_twist calculation (1D tensor in pinned/HOST memory, of size batchSize with 0 <= hueTensor[i] <= 359 for each image in batch)
* \param [in] saturationTensor saturation modification parameter for color_twist calculation (1D tensor in pinned/HOST memory, of size batchSize with saturationTensor[i] >= 0 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -236,7 +236,7 @@ RppStatus rppt_color_twist_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] contrastTensor contrast modification parameter for color_jitter calculation (1D tensor in HOST memory, of size batchSize with 0 < contrastTensor[i] <= 255 for each image in batch)
* \param [in] hueTensor hue modification parameter for color_jitter calculation (1D tensor in HOST memory, of size batchSize with 0 <= hueTensor[i] <= 359 for each image in batch)
* \param [in] saturationTensor saturation modification parameter for color_jitter calculation (1D tensor in HOST memory, of size batchSize with saturationTensor[i] >= 0 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -257,7 +257,7 @@ RppStatus rppt_color_jitter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
 * \param [in] rgbTensor R/G/B values for color casting calculation (2D tensor in HOST memory, of size sizeof(RpptRGB) * batchSize with 0 <= rgbTensor[n].R/G/B <= 255 for each image in batch)
* \param [in] alphaTensor alpha values for color casting calculation (1D tensor in HOST memory, of size sizeof(Rpp32f) * batchSize with alphaTensor[i] >= 0 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -279,7 +279,7 @@ RppStatus rppt_color_cast_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] rgbTensor R/G/B values for color casting calculation (2D tensor in pinned/HOST memory, of size sizeof(RpptRGB) * batchSize with 0 <= rgbTensor[n] <= 255 for each image in batch)
* \param [in] alphaTensor alpha values for color casting calculation (1D tensor in pinned/HOST memory, of size sizeof(Rpp32f) * batchSize with alphaTensor[i] >= 0 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -300,7 +300,7 @@ RppStatus rppt_color_cast_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] exposureFactorTensor exposure factor values for exposure adjustment (1D tensor in HOST memory, of size batchSize, with exposureFactorTensor[n] >= 0 for each image in the batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -321,7 +321,7 @@ RppStatus rppt_exposure_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] exposureFactorTensor exposure factor values for exposure adjustment (1D tensor in pinned/HOST memory, of size batchSize, with exposureFactorTensor[n] >= 0 for each image in the batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -343,7 +343,7 @@ RppStatus rppt_exposure_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t ds
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] contrastFactorTensor contrast factor values for contrast calculation (1D tensor in HOST memory, of size batchSize with contrastFactorTensor[n] > 0 for each image in batch)
* \param [in] contrastCenterTensor contrast center values for contrast calculation (1D tensor in HOST memory, of size batchSize)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -365,7 +365,7 @@ RppStatus rppt_contrast_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] contrastFactorTensor contrast factor values for contrast calculation (1D tensor in pinned/HOST memory, of size batchSize with contrastFactorTensor[n] > 0 for each image in batch)
* \param [in] contrastCenterTensor contrast center values for contrast calculation (1D tensor in pinned/HOST memory, of size batchSize)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -386,7 +386,7 @@ RppStatus rppt_contrast_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t ds
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] lutPtr lut Array in HOST memory, containing a single integer look up table of length 65536, to be used for all images in the batch
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -407,7 +407,7 @@ RppStatus rppt_lut_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] lutPtr lut Array in pinned/HOST memory, containing a single integer look up table of length 65536, to be used for all images in the batch
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -427,15 +427,15 @@ RppStatus rppt_lut_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr,
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] adjustmentValueTensor adjustment values for color temperature calculation (1D tensor of size sizeof(Rpp8s) * batchSize with -100 <= adjustmentValueTensor[i] >= 100 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] adjustmentValueTensor adjustment values for color temperature calculation (1D tensor of size batchSize with -100 <= adjustmentValueTensor[i] <= 100 for each image in batch)
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
*/
-RppStatus rppt_color_temperature_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp8s *adjustmentValueTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+RppStatus rppt_color_temperature_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *adjustmentValueTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
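
Since this hunk widens `adjustmentValueTensor` from `Rpp8s` to `Rpp32s`, existing callers need a matching buffer change. Below is a minimal, hedged host-side sketch of the updated call; the descriptor and image-buffer setup are elided, and the 150x150 full-frame ROI and +50 adjustment are illustrative values, not part of the API:

```cpp
#include <rpp.h>
#include <vector>

// Hedged sketch: applies a warm shift of +50 to every image in the batch.
void colorTemperatureExample(RppPtr_t srcPtr, RpptDescPtr srcDescPtr,
                             RppPtr_t dstPtr, RpptDescPtr dstDescPtr,
                             Rpp32u batchSize, rppHandle_t handle)
{
    // Per-image adjustment values, now Rpp32s (was Rpp8s); valid range is -100..100.
    std::vector<Rpp32s> adjustment(batchSize, 50);

    // One full-frame XYWH ROI per image (illustrative 150x150 dimensions).
    std::vector<RpptROI> rois(batchSize);
    for (auto &roi : rois)
        roi.xywhROI = {{0, 0}, 150, 150};

    rppt_color_temperature_host(srcPtr, srcDescPtr, dstPtr, dstDescPtr,
                                adjustment.data(), rois.data(),
                                RpptRoiType::XYWH, handle);
}
```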
#ifdef GPU_SUPPORT
/*! \brief Color Temperature augmentation on HIP backend for a NCHW/NHWC layout tensor
@@ -448,8 +448,8 @@ RppStatus rppt_color_temperature_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, R
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] adjustmentValueTensor adjustment values for color temperature calculation (1D tensor of size sizeof(Rpp8s) * batchSize with -100 <= adjustmentValueTensor[i] >= 100 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] adjustmentValueTensor adjustment values for color temperature calculation (1D tensor of size batchSize with -100 <= adjustmentValueTensor[i] <= 100 for each image in batch)
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
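
Every `roiTensorPtrSrc` parameter renamed in the hunks above takes the same per-image `RpptROI` entries, interpreted according to `roiType`. As a hedged illustration of the two formats (union member names follow the doc comments; whether LTRB corners are inclusive should be verified against `rppdefs.h`):

```cpp
// Illustrative only: the same 100x80 region anchored at (10, 20), expressed both ways.
RpptROI roiXywh, roiLtrb;

roiXywh.xywhROI = {{10, 20}, 100, 80};     // top-left corner + width/height
roiLtrb.ltrbROI = {{10, 20}, {109, 99}};   // top-left + bottom-right corners

// The roiType argument tells the library which union member to read, e.g.:
//   rppt_color_twist_host(..., &roiXywh, RpptRoiType::XYWH, handle);
//   rppt_color_twist_host(..., &roiLtrb, RpptRoiType::LTRB, handle);
```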
diff --git a/include/rppt_tensor_effects_augmentations.h b/include/rppt_tensor_effects_augmentations.h
index 708f318bf..a4c2b41ba 100644
--- a/include/rppt_tensor_effects_augmentations.h
+++ b/include/rppt_tensor_effects_augmentations.h
@@ -56,7 +56,7 @@ extern "C" {
* \param [in] gridRatio gridRatio value for gridmask calculation = black square width / tileWidth (a single Rpp32f number with 0 <= gridRatio <= 1 that applies to all images in the batch)
* \param [in] gridAngle gridAngle value for gridmask calculation = grid rotation angle in radians (a single Rpp32f number that applies to all images in the batch)
* \param [in] translateVector translateVector for gridmask calculation = grid X and Y translation lengths in pixels (a single RpptUintVector2D x,y value pair that applies to all images in the batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -80,7 +80,7 @@ RppStatus rppt_gridmask_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d
* \param [in] gridRatio gridRatio value for gridmask calculation = black square width / tileWidth (a single Rpp32f number with 0 <= gridRatio <= 1 that applies to all images in the batch)
* \param [in] gridAngle gridAngle value for gridmask calculation = grid rotation angle in radians (a single Rpp32f number that applies to all images in the batch)
* \param [in] translateVector translateVector for gridmask calculation = grid X and Y translation lengths in pixels (a single RpptUintVector2D x,y value pair that applies to all images in the batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -103,7 +103,7 @@ RppStatus rppt_gridmask_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t ds
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] spatterColor RGB values to use for the spatter augmentation (A single set of 3 Rpp8u values as RpptRGB that applies to all images in the batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 1920 and roiTensorSrc[i].xywhROI.roiHeight <= 1080)
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 1920 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 1080)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -126,7 +126,7 @@ RppStatus rppt_spatter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t ds
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] spatterColor RGB values to use for the spatter augmentation (A single set of 3 Rpp8u values as RpptRGB that applies to all images in the batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 1920 and roiTensorSrc[i].xywhROI.roiHeight <= 1080)
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 1920 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 1080)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -151,7 +151,7 @@ RppStatus rppt_spatter_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dst
* \param [in] saltValueTensor A user-defined salt noise value (1D tensor in HOST memory, of size batchSize with 0 <= saltValueTensor[i] <= 1 for each image in batch)
* \param [in] pepperValueTensor A user-defined pepper noise value (1D tensor in HOST memory, of size batchSize with 0 <= pepperValueTensor[i] <= 1 for each image in batch)
* \param [in] seed A user-defined seed value (single Rpp32u value)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -176,7 +176,7 @@ RppStatus rppt_salt_and_pepper_noise_host(RppPtr_t srcPtr, RpptDescPtr srcDescPt
* \param [in] saltValueTensor A user-defined salt noise value (1D tensor in pinned/HOST memory, of size batchSize with 0 <= saltValueTensor[i] <= 1 for each image in batch)
* \param [in] pepperValueTensor A user-defined pepper noise value (1D tensor in pinned/HOST memory, of size batchSize with 0 <= pepperValueTensor[i] <= 1 for each image in batch)
* \param [in] seed A user-defined seed value (single Rpp32u value)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -198,7 +198,7 @@ RppStatus rppt_salt_and_pepper_noise_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] shotNoiseFactorTensor shotNoiseFactor values for each image, which are used to compute the lambda values in a poisson distribution (1D tensor in HOST memory, of size batchSize with shotNoiseFactorTensor[i] >= 0 for each image in batch)
* \param [in] seed A user-defined seed value (single Rpp32u value)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -220,7 +220,7 @@ RppStatus rppt_shot_noise_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] shotNoiseFactorTensor shotNoiseFactor values for each image, which are used to compute the lambda values in a poisson distribution (1D tensor in pinned/HOST memory, of size batchSize with shotNoiseFactorTensor[i] >= 0 for each image in batch)
* \param [in] seed A user-defined seed value (single Rpp32u value)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -243,7 +243,7 @@ RppStatus rppt_shot_noise_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] meanTensor mean values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in HOST memory, of size batchSize with meanTensor[i] >= 0 for each image in batch)
* \param [in] stdDevTensor stdDev values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in HOST memory, of size batchSize with stdDevTensor[i] >= 0 for each image in batch)
* \param [in] seed A user-defined seed value (single Rpp32u value)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -266,7 +266,7 @@ RppStatus rppt_gaussian_noise_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppP
* \param [in] meanTensor mean values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in pinned/HOST memory, of size batchSize with meanTensor[i] >= 0 for each image in batch)
* \param [in] stdDevTensor stdDev values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in pinned/HOST memory, of size batchSize with stdDevTensor[i] >= 0 for each image in batch)
* \param [in] seed A user-defined seed value (single Rpp32u value)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -289,7 +289,7 @@ RppStatus rppt_gaussian_noise_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPt
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] stdDevTensor stdDev values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in HOST memory, of size batchSize with stdDevTensor[i] >= 0 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -312,7 +312,7 @@ RppStatus rppt_non_linear_blend_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDes
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] stdDevTensor stdDev values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in pinned/HOST memory, of size batchSize with stdDevTensor[i] >= 0 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -338,7 +338,7 @@ RppStatus rppt_non_linear_blend_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDesc
* \param[in] freqYTensor freqY values for water effect (1D tensor in HOST memory, of size batchSize)
* \param[in] phaseXTensor phaseX values for water effect (1D tensor in HOST memory, of size batchSize)
* \param[in] phaseYTensor phaseY values for water effect (1D tensor in HOST memory, of size batchSize)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -364,7 +364,7 @@ RppStatus rppt_water_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP
* \param[in] freqYTensor freqY values for water effect (1D tensor in pinned/HOST memory, of size batchSize)
* \param[in] phaseXTensor phaseX values for water effect (1D tensor in pinned/HOST memory, of size batchSize)
* \param[in] phaseYTensor phaseY values for water effect (1D tensor in pinned/HOST memory, of size batchSize)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -433,7 +433,7 @@ RppStatus rppt_ricap_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param[in] vignetteIntensityTensor intensity values to quantify vignette effect (1D tensor of size batchSize with 0 < vignetteIntensityTensor[n] for each image in batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -455,7 +455,7 @@ RppStatus rppt_vignette_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param[in] vignetteIntensityTensor intensity values to quantify vignette effect (1D tensor of size batchSize with 0 < vignetteIntensityTensor[n] for each image in batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -465,11 +465,55 @@ RppStatus rppt_vignette_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d
RppStatus rppt_vignette_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *vignetteIntensityTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT
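
For the HIP entry points above, scalar parameter tensors live in pinned/HOST memory while ROI data lives in HIP memory. A hedged sketch for `rppt_vignette_gpu` follows (error checking elided; the 0.6f intensity and 150x150 ROI are illustrative):

```cpp
#include <hip/hip_runtime.h>
#include <rpp.h>
#include <vector>

void vignetteGpuExample(RppPtr_t srcPtr, RpptDescPtr srcDescPtr,
                        RppPtr_t dstPtr, RpptDescPtr dstDescPtr,
                        Rpp32u batchSize, rppHandle_t handle)
{
    // Intensity values in pinned host memory (must be > 0 per the restriction).
    Rpp32f *intensity;
    hipHostMalloc(&intensity, batchSize * sizeof(Rpp32f));
    for (Rpp32u i = 0; i < batchSize; i++)
        intensity[i] = 0.6f;

    // ROI data staged on the host, then copied into HIP device memory.
    std::vector<RpptROI> roisHost(batchSize);
    for (auto &roi : roisHost)
        roi.xywhROI = {{0, 0}, 150, 150};
    RpptROI *roisHip;
    hipMalloc(&roisHip, batchSize * sizeof(RpptROI));
    hipMemcpy(roisHip, roisHost.data(), batchSize * sizeof(RpptROI),
              hipMemcpyHostToDevice);

    rppt_vignette_gpu(srcPtr, srcDescPtr, dstPtr, dstDescPtr,
                      intensity, roisHip, RpptRoiType::XYWH, handle);

    hipFree(roisHip);
    hipHostFree(intensity);
}
```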
+/******************** jitter ********************/
+
+/*! \brief Jitter augmentation on HOST backend for a NCHW/NHWC layout tensor
+ * \details The jitter augmentation adds a jitter effect for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * - dstPtr depth ranges - Will be same depth as srcPtr.
+ * \image html img150x150.png Sample Input
+ * \image html effects_augmentations_jitter_img150x150.png Sample Output
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HOST memory
+ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
+ * \param [in] kernelSizeTensor kernelSize values for jitter calculation (1D tensor in HOST memory, of size batchSize, with kernelSizeTensor[i] = 3/5/7 for optimal use)
+ * \param [in] seed A user-defined seed value (single Rpp32u value)
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_jitter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32u *kernelSizeTensor, Rpp32u seed, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
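
A hedged usage sketch for the new jitter HOST entry point, assuming `kernelSizeTensor` carries one kernel size per image; the seed value and 150x150 ROI are illustrative:

```cpp
#include <rpp.h>
#include <vector>

void jitterHostExample(RppPtr_t srcPtr, RpptDescPtr srcDescPtr,
                       RppPtr_t dstPtr, RpptDescPtr dstDescPtr,
                       Rpp32u batchSize, rppHandle_t handle)
{
    // One kernel size per image; 3/5/7 are the recommended values above.
    std::vector<Rpp32u> kernelSizes(batchSize, 5);
    Rpp32u seed = 1255459;  // illustrative seed for the jitter displacement

    std::vector<RpptROI> rois(batchSize);
    for (auto &roi : rois)
        roi.xywhROI = {{0, 0}, 150, 150};

    rppt_jitter_host(srcPtr, srcDescPtr, dstPtr, dstDescPtr,
                     kernelSizes.data(), seed, rois.data(),
                     RpptRoiType::XYWH, handle);
}
```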
+
+#ifdef GPU_SUPPORT
+/*! \brief Jitter augmentation on HIP backend for a NCHW/NHWC layout tensor
+ * \details The jitter augmentation adds a jitter effect for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * - dstPtr depth ranges - Will be same depth as srcPtr.
+ * \image html img150x150.png Sample Input
+ * \image html effects_augmentations_jitter_img150x150.png Sample Output
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HIP memory
+ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
+ * \param [in] kernelSizeTensor kernelSize values for jitter calculation (1D tensor in pinned/HOST memory, of size batchSize, with kernelSizeTensor[i] = 3/5/7 for optimal use)
+ * \param [in] seed A user-defined seed value (single Rpp32u value)
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_jitter_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32u *kernelSizeTensor, Rpp32u seed, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
+
/*! \brief Gaussian noise augmentation on HOST backend
* \details This function adds gaussian noise to a batch of 4D tensors.
* Support added for u8 -> u8, f32 -> f32 datatypes.
- * \image html input150x150x4.gif Sample Input
- * \image html effects_augmentations_gaussian_noise_150x150x4.gif Sample Output
+ * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input
+ * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/effects_augmentations_gaussian_noise_150x150x4.gif Sample Output
* \param [in] srcPtr source tensor in HOST memory
* \param [in] srcGenericDescPtr source tensor descriptor
* \param [out] dstPtr destination tensor in HOST memory
@@ -490,8 +534,8 @@ RppStatus rppt_gaussian_noise_voxel_host(RppPtr_t srcPtr, RpptGenericDescPtr src
/*! \brief Gaussian noise augmentation on HIP backend
* \details This function adds gaussian noise to a batch of 4D tensors.
* Support added for u8 -> u8, f32 -> f32 datatypes.
- * \image html input150x150x4.gif Sample Input
- * \image html effects_augmentations_gaussian_noise_150x150x4.gif Sample Output
+ * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input
+ * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/effects_augmentations_gaussian_noise_150x150x4.gif Sample Output
* \param [in] srcPtr source tensor in HIP memory
* \param [in] srcGenericDescPtr source tensor descriptor
* \param [out] dstPtr destination tensor in HIP memory
@@ -524,7 +568,7 @@ RppStatus rppt_gaussian_noise_voxel_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcD
- Erase-region anchor boxes on each image given by the user must not overlap
* \param [in] colorsTensor RGB values to use for each erase-region inside each image in the batch. (colors[i] will have a range equivalent to that of srcPtr)
* \param [in] numBoxesTensor number of erase-regions per image, for each image in the batch. (numBoxesTensor[n] >= 0)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -549,7 +593,7 @@ RppStatus rppt_erase_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP
- Erase-region anchor boxes on each image given by the user must not overlap
* \param [in] colorsTensor RGB values to use for each erase-region inside each image in the batch. (colors[i] will have a range equivalent to that of srcPtr)
* \param [in] numBoxesTensor number of erase-regions per image, for each image in the batch. (numBoxesTensor[n] >= 0)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -559,6 +603,50 @@ RppStatus rppt_erase_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP
RppStatus rppt_erase_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptRoiLtrb *anchorBoxInfoTensor, RppPtr_t colorsTensor, Rpp32u *numBoxesTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT
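
A hedged sketch for `rppt_erase_gpu`; the placement of the auxiliary tensors (pinned/HOST vs HIP) follows the pattern of the other GPU variants in this header and should be checked against the full docs. The box coordinates and mid-grey U8 fill are illustrative:

```cpp
#include <rpp.h>
#include <vector>

void eraseGpuExample(RppPtr_t srcPtr, RpptDescPtr srcDescPtr,
                     RppPtr_t dstPtr, RpptDescPtr dstDescPtr,
                     Rpp32u batchSize, RpptROIPtr roiTensorPtrSrc,
                     rppHandle_t handle)
{
    // One non-overlapping LTRB erase box per image (illustrative coordinates).
    std::vector<RpptRoiLtrb> boxes(batchSize, RpptRoiLtrb{{10, 10}, {60, 60}});
    std::vector<Rpp32u> numBoxes(batchSize, 1);    // numBoxesTensor[n] >= 0

    // One U8 RGB fill colour per box, matching the srcPtr value range.
    std::vector<Rpp8u> colors(batchSize * 3, 128);

    rppt_erase_gpu(srcPtr, srcDescPtr, dstPtr, dstDescPtr,
                   boxes.data(), colors.data(), numBoxes.data(),
                   roiTensorPtrSrc, RpptRoiType::XYWH, handle);
}
```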
+/*! \brief Glitch augmentation on HOST backend for a NCHW/NHWC layout tensor
+ * \details The glitch augmentation adds a glitch effect for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * - dstPtr depth ranges - Will be same depth as srcPtr.
+ * \image html img150x150.jpg Sample Input
+ * \image html effects_augmentations_glitch_img150x150.jpg Sample Output
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HOST memory
+ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
+ * \param [in] rgbOffsets RGB offset values to use for the glitch augmentation (A single set of 3 Rppi point values that applies to all images in the batch.
+ * For each point and for each image in the batch: 0 < point.x < width, 0 < point.y < height)
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_glitch_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptChannelOffsets *rgbOffsets, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
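
A hedged sketch for the new glitch HOST entry point. It assumes `RpptChannelOffsets` holds one x,y offset pair per R/G/B channel (check `rppdefs.h` for the actual layout); the offset values and ROI dimensions are illustrative and must stay inside the image bounds as noted above:

```cpp
#include <rpp.h>
#include <vector>

void glitchHostExample(RppPtr_t srcPtr, RpptDescPtr srcDescPtr,
                       RppPtr_t dstPtr, RpptDescPtr dstDescPtr,
                       Rpp32u batchSize, rppHandle_t handle)
{
    // One set of channel offsets shared by every image in the batch;
    // each offset must satisfy 0 < x < width and 0 < y < height.
    RpptChannelOffsets offsets;
    offsets.r = {10, 10};
    offsets.g = {2, 2};
    offsets.b = {5, 5};

    std::vector<RpptROI> rois(batchSize);
    for (auto &roi : rois)
        roi.xywhROI = {{0, 0}, 150, 150};

    rppt_glitch_host(srcPtr, srcDescPtr, dstPtr, dstDescPtr,
                     &offsets, rois.data(), RpptRoiType::XYWH, handle);
}
```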
+
+#ifdef GPU_SUPPORT
+/*! \brief Glitch augmentation on HIP backend for a NCHW/NHWC layout tensor
+ * \details The glitch augmentation adds a glitch effect for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * - dstPtr depth ranges - Will be same depth as srcPtr.
+ * \image html img150x150.jpg Sample Input
+ * \image html effects_augmentations_glitch_img150x150.jpg Sample Output
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HIP memory
+ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
+ * \param [in] rgbOffsets RGB offset values to use for the glitch augmentation (a 1D tensor in pinned/HOST memory, containing a single set of 3 Rppi point values that applies to all images in the batch.
+ * For each point and for each image in the batch: 0 < point.x < width, 0 < point.y < height)
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_glitch_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptChannelOffsets *rgbOffsets, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
+
/*! @}
*/
diff --git a/include/rppt_tensor_filter_augmentations.h b/include/rppt_tensor_filter_augmentations.h
index 7ea8d00c6..992631c49 100644
--- a/include/rppt_tensor_filter_augmentations.h
+++ b/include/rppt_tensor_filter_augmentations.h
@@ -57,7 +57,7 @@ extern "C" {
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] kernelSize kernel size for box filter (a single Rpp32u odd number with kernelSize = 3/5/7/9 that applies to all images in the batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -83,7 +83,7 @@ RppStatus rppt_box_filter_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] stdDevTensor stdDev values for gaussian calculation (1D tensor in pinned/HOST memory, of size batchSize, for each image in batch)
* \param [in] kernelSize kernel size for gaussian filter (a single Rpp32u odd number with kernelSize = 3/5/7/9 that applies to all images in the batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
diff --git a/include/rppt_tensor_geometric_augmentations.h b/include/rppt_tensor_geometric_augmentations.h
index a3e6d2d7f..28dd516e6 100644
--- a/include/rppt_tensor_geometric_augmentations.h
+++ b/include/rppt_tensor_geometric_augmentations.h
@@ -52,7 +52,7 @@ extern "C" {
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -72,7 +72,7 @@ RppStatus rppt_crop_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -95,7 +95,7 @@ RppStatus rppt_crop_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr
* \param [in] offsetTensor offset values for normalization (1D tensor in HOST memory, of size batchSize, with offsetTensor[n] <= 0)
* \param [in] multiplierTensor multiplier values for normalization (1D tensor in HOST memory, of size batchSize, with multiplierTensor[n] > 0)
* \param [in] mirrorTensor mirror flag values to set mirroring on/off (1D tensor in HOST memory, of size batchSize, with mirrorTensor[n] = 0/1)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
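
A hedged sketch of filling the three per-image parameter tensors documented above for crop_mirror_normalize on HOST. The values are illustrative, and `mirrorTensor`'s `Rpp32u` element type is assumed by analogy with the other flag tensors:

```cpp
#include <rpp.h>
#include <vector>

void cmnHostExample(RppPtr_t srcPtr, RpptDescPtr srcDescPtr,
                    RppPtr_t dstPtr, RpptDescPtr dstDescPtr,
                    Rpp32u batchSize, RpptROIPtr roiTensorPtrSrc,
                    rppHandle_t handle)
{
    std::vector<Rpp32f> offsets(batchSize, 0.0f);              // offsetTensor[n] <= 0
    std::vector<Rpp32f> multipliers(batchSize, 1.0f / 255.0f); // multiplierTensor[n] > 0
    std::vector<Rpp32u> mirror(batchSize, 1);                  // 1 = mirroring on

    rppt_crop_mirror_normalize_host(srcPtr, srcDescPtr, dstPtr, dstDescPtr,
                                    offsets.data(), multipliers.data(),
                                    mirror.data(), roiTensorPtrSrc,
                                    RpptRoiType::XYWH, handle);
}
```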
@@ -118,7 +118,7 @@ RppStatus rppt_crop_mirror_normalize_host(RppPtr_t srcPtr, RpptDescPtr srcDescPt
* \param [in] offsetTensor offset values for normalization (1D tensor in pinned/HOST memory, of size batchSize, with offsetTensor[n] <= 0)
* \param [in] multiplierTensor multiplier values for normalization (1D tensor in pinned/HOST memory, of size batchSize, with multiplierTensor[n] > 0)
* \param [in] mirrorTensor mirror flag values to set mirroring on/off (1D tensor in pinned/HOST memory, of size batchSize, with mirrorTensor[n] = 0/1)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -140,7 +140,7 @@ RppStatus rppt_crop_mirror_normalize_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] affineTensor affine matrix values for transformation calculation (2D tensor in HOST memory, of size batchSize * 6 for each image in batch)
* \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -162,7 +162,7 @@ RppStatus rppt_warp_affine_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] affineTensor affine matrix values for transformation calculation (2D tensor in pinned/HOST memory, of size batchSize * 6 for each image in batch)
* \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -184,7 +184,7 @@ RppStatus rppt_warp_affine_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] horizontalTensor horizontal flag values to set horizontal flip on/off (1D tensor in HOST memory, of size batchSize, with horizontalTensor[i] = 0/1)
* \param [in] verticalTensor vertical flag values to set vertical flip on/off (1D tensor in HOST memory, of size batchSize, with verticalTensor[i] = 0/1)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -206,7 +206,7 @@ RppStatus rppt_flip_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] horizontalTensor horizontal flag values to set horizontal flip on/off (1D tensor in pinned/HOST memory, of size batchSize, with horizontalTensor[i] = 0/1)
* \param [in] verticalTensor vertical flag values to set vertical flip on/off (1D tensor in pinned/HOST memory, of size batchSize, with verticalTensor[i] = 0/1)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -228,7 +228,7 @@ RppStatus rppt_flip_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] dstImgSizes destination image sizes ( \ref RpptImagePatchPtr type pointer to array, in HOST memory, of size batchSize)
* \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
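
A hedged sketch for `rppt_resize_host`, resizing every image in the batch to 100x100 with bilinear interpolation; `RpptImagePatch` is assumed to carry width/height members as used by `dstImgSizes`:

```cpp
#include <rpp.h>
#include <vector>

void resizeHostExample(RppPtr_t srcPtr, RpptDescPtr srcDescPtr,
                       RppPtr_t dstPtr, RpptDescPtr dstDescPtr,
                       Rpp32u batchSize, RpptROIPtr roiTensorPtrSrc,
                       rppHandle_t handle)
{
    // Per-image destination sizes; all images resized to 100x100 here.
    std::vector<RpptImagePatch> dstSizes(batchSize);
    for (auto &sz : dstSizes)
    {
        sz.width = 100;
        sz.height = 100;
    }

    rppt_resize_host(srcPtr, srcDescPtr, dstPtr, dstDescPtr,
                     dstSizes.data(), RpptInterpolationType::BILINEAR,
                     roiTensorPtrSrc, RpptRoiType::XYWH, handle);
}
```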
@@ -250,7 +250,7 @@ RppStatus rppt_resize_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dst
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] dstImgSizes destination image sizes ( \ref RpptImagePatchPtr type pointer to array, in pinned/HOST memory, of size batchSize)
* \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -275,7 +275,7 @@ RppStatus rppt_resize_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP
* \param [in] meanTensor mean value for each image in the batch (meanTensor[n] >= 0, 1D tensor in HOST memory, of size = batchSize for greyscale images, size = batchSize * 3 for RGB images)
* \param [in] stdDevTensor standard deviation value for each image in the batch (stdDevTensor[n] >= 0, 1D tensor in HOST memory, of size = batchSize for greyscale images, size = batchSize * 3 for RGB images)
* \param [in] mirrorTensor mirror flag value to set mirroring on/off (1D tensor in HOST memory, of size batchSize, with mirrorTensor[n] = 0/1)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -300,7 +300,7 @@ RppStatus rppt_resize_mirror_normalize_host(RppPtr_t srcPtr, RpptDescPtr srcDesc
* \param [in] meanTensor mean value for each image in the batch (meanTensor[n] >= 0, 1D tensor in pinned/HOST memory, of size = batchSize for greyscale images, size = batchSize * 3 for RGB images)
* \param [in] stdDevTensor standard deviation value for each image in the batch (stdDevTensor[n] >= 0, 1D tensor in pinned/HOST memory, of size = batchSize for greyscale images, size = batchSize * 3 for RGB images)
* \param [in] mirrorTensor mirror flag value to set mirroring on/off (1D tensor in pinned/HOST memory, of size batchSize, with mirrorTensor[n] = 0/1)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -323,7 +323,7 @@ RppStatus rppt_resize_mirror_normalize_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescP
* \param [in] dstImgSizes destination image sizes ( \ref RpptImagePatchPtr type pointer to array, in HOST memory, of size batchSize)
* \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType
* \param [in] mirrorTensor mirror flag value to set mirroring on/off (1D tensor in HOST memory, of size batchSize, with mirrorTensor[n] = 0/1)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -346,7 +346,7 @@ RppStatus rppt_resize_crop_mirror_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr,
* \param [in] dstImgSizes destination image sizes ( \ref RpptImagePatchPtr type pointer to array, in pinned/HOST memory, of size batchSize)
* \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType
* \param [in] mirrorTensor mirror flag value to set mirroring on/off (1D tensor in pinned/HOST memory, of size batchSize, with mirrorTensor[n] = 0/1)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -368,7 +368,7 @@ RppStatus rppt_resize_crop_mirror_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, R
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] angle image rotation angle in degrees - positive deg-anticlockwise/negative deg-clockwise (1D tensor in HOST memory, of size batchSize)
* \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -390,7 +390,7 @@ RppStatus rppt_rotate_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dst
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] angle image rotation angle in degrees - positive deg-anticlockwise/negative deg-clockwise (1D tensor in pinned/HOST memory, of size batchSize)
* \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -412,7 +412,7 @@ RppStatus rppt_rotate_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -434,7 +434,7 @@ RppStatus rppt_phase_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDes
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -500,7 +500,7 @@ RppStatus rppt_slice_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr,
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] cropRoiTensor crop co-ordinates in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] patchRoiTensor patch co-ordinates in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
@@ -526,7 +526,7 @@ RppStatus rppt_crop_and_patch_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescP
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] cropRoiTensor crop co-ordinates in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] patchRoiTensor patch co-ordinates in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
@@ -541,8 +541,8 @@ RppStatus rppt_crop_and_patch_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPt
/*! \brief Flip voxel augmentation HOST
* \details The flip voxel augmentation performs a mask-controlled horizontal/vertical/depth flip on a generic 4D tensor.
Support added for f32 -> f32 and u8 -> u8 datatypes.
- * \image html input150x150x4.gif Sample Input
- * \image html geometric_augmentations_flip_150x150x4.gif Sample Output
+ * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input
+ * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/geometric_augmentations_flip_150x150x4.gif Sample Output
* \param [in] srcPtr source tensor in HOST memory
* \param [in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HOST memory
@@ -564,8 +564,8 @@ RppStatus rppt_flip_voxel_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDes
/*! \brief Flip voxel augmentation GPU
* \details The flip voxel augmentation performs a mask-controlled horizontal/vertical/depth flip on a generic 4D tensor.
Support added for f32 -> f32 and u8 -> u8 datatypes.
- * \image html input150x150x4.gif Sample Input
- * \image html geometric_augmentations_flip_150x150x4.gif Sample Output
+ * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input
+ * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/geometric_augmentations_flip_150x150x4.gif Sample Output
* \param [in] srcPtr source tensor in HIP memory
* \param [in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HIP memory
@@ -598,7 +598,7 @@ RppStatus rppt_flip_voxel_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDesc
* \param [in] colRemapTable Rpp32f column numbers in HOST memory for every pixel in the input batch of images (Restrictions - rois in the colRemapTable data for each image in batch must match roiTensorPtrSrc)
* \param [in] tableDescPtr rowRemapTable and colRemapTable common tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = F32, layout = NHWC, c = 1)
* \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType (Restrictions - Supports only NEAREST_NEIGHBOR and BILINEAR)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -623,7 +623,7 @@ RppStatus rppt_remap_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP
* \param [in] colRemapTable Rpp32f column numbers in HIP memory for every pixel in the input batch of images (Restrictions - rois in the colRemapTable data for each image in batch must match roiTensorPtrSrc)
* \param [in] tableDescPtr rowRemapTable and colRemapTable common tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = F32, layout = NHWC, c = 1)
* \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType (Restrictions - Supports only NEAREST_NEIGHBOR and BILINEAR)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -634,6 +634,94 @@ RppStatus rppt_remap_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP
RppStatus rppt_remap_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *rowRemapTable, Rpp32f *colRemapTable, RpptDescPtr tableDescPtr, RpptInterpolationType interpolationType, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT
+/*! \brief Lens correction transformation on HOST backend for a NCHW/NHWC layout tensor
+ * \details Performs a lens correction transform on an image to compensate for barrel lens distortion in RGB (3 channel) / greyscale (1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * - dstPtr depth ranges - Will be the same depth as srcPtr.
+ * Note: Returns a black image if the passed camera matrix has a zero determinant.
+ * \image html lens_img640x480.png Sample Input
+ * \image html geometric_augmentations_lens_correction_img_640x480.png Sample Output
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HOST memory
+ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
+ * \param [in] rowRemapTable Rpp32f row numbers in HOST memory for every pixel in the input batch of images (1D tensor of size width * height * batchSize)
+ * \param [in] colRemapTable Rpp32f column numbers in HOST memory for every pixel in the input batch of images (1D tensor of size width * height * batchSize)
+ * \param [in] tableDescPtr table tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = F32, layout = NHWC, c = 1)
+ * \param [in] cameraMatrixTensor camera intrinsic parameters required to compute the lens-corrected image (1D tensor of size 9 * batchSize)
+ * \param [in] distortionCoeffsTensor distortion coefficients required to compute the lens-corrected image (1D tensor of size 8 * batchSize)
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ * \ingroup group_tensor_geometric
+ */
+RppStatus rppt_lens_correction_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *rowRemapTable, Rpp32f *colRemapTable, RpptDescPtr tableDescPtr, Rpp32f *cameraMatrixTensor, Rpp32f *distortionCoeffsTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+
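+/*! \par Example (editor's sketch)
+ * A minimal, hypothetical call for a single 640x480 RGB image; descriptor,
+ * remap-table and handle setup are elided, and all numeric values below are
+ * illustrative placeholders, not values taken from this library.
+ * \code
+ * Rpp32f cameraMatrix[9] = {534.07f,   0.0f, 320.0f,    // fx,  0, cx
+ *                             0.0f, 534.12f, 240.0f,    //  0, fy, cy
+ *                             0.0f,   0.0f,   1.0f};
+ * Rpp32f distortionCoeffs[8] = {-0.29f, 0.11f, 0.001f, -0.0001f, 0.0f, 0.0f, 0.0f, 0.0f};
+ * rppt_lens_correction_host(srcPtr, srcDescPtr, dstPtr, dstDescPtr,
+ *                           rowRemapTable, colRemapTable, tableDescPtr,
+ *                           cameraMatrix, distortionCoeffs,
+ *                           roiTensorPtrSrc, RpptRoiType::XYWH, handle);
+ * \endcode
+ */
+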
+#ifdef GPU_SUPPORT
+/*! \brief Lens correction transformation on HIP backend for a NCHW/NHWC layout tensor
+ * \details Performs a lens correction transform on an image to compensate for barrel lens distortion in RGB (3 channel) / greyscale (1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * - dstPtr depth ranges - Will be the same depth as srcPtr.
+ * Note: Returns a black image if the passed camera matrix has a zero determinant.
+ * \image html lens_img640x480.png Sample Input
+ * \image html geometric_augmentations_lens_correction_img_640x480.png Sample Output
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HIP memory
+ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
+ * \param [in] rowRemapTable Rpp32f row numbers in HIP memory for every pixel in the input batch of images (1D tensor of size width * height * batchSize)
+ * \param [in] colRemapTable Rpp32f column numbers in HIP memory for every pixel in the input batch of images (1D tensor of size width * height * batchSize)
+ * \param [in] tableDescPtr table tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = F32, layout = NHWC, c = 1)
+ * \param [in] cameraMatrixTensor camera intrinsic parameters required to compute the lens-corrected image (1D tensor of size 9 * batchSize)
+ * \param [in] distortionCoeffsTensor distortion coefficients required to compute the lens-corrected image (1D tensor of size 8 * batchSize)
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ * \ingroup group_tensor_geometric
+ */
+RppStatus rppt_lens_correction_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *rowRemapTable, Rpp32f *colRemapTable, RpptDescPtr tableDescPtr, Rpp32f *cameraMatrixTensor, Rpp32f *distortionCoeffsTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
+
+/*! \brief Transpose Generic augmentation on HOST backend
+ * \details The transpose augmentation performs an input-permutation based transpose on a generic ND Tensor.
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param [in] srcGenericDescPtr source tensor descriptor
+ * \param [out] dstPtr destination tensor in HOST memory
+ * \param [in] dstGenericDescPtr destination tensor descriptor
+ * \param [in] permTensor permutation tensor for transpose operation
+ * \param [in] roiTensor ROI data for each element in source tensor (tensor of batchSize * number of dimensions * 2 values)
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ * \ingroup group_tensor_geometric
+ */
+RppStatus rppt_transpose_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u *permTensor, Rpp32u *roiTensor, rppHandle_t rppHandle);
+
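+/*! \par Example (editor's sketch)
+ * A hypothetical call for a batch of one 3D tensor of shape 4 x 5 x 6,
+ * permuted to 4 x 6 x 5. The (begin, length)-pair reading of roiTensor is an
+ * assumption for illustration only; descriptor and handle setup are elided.
+ * \code
+ * Rpp32u permTensor[3] = {0, 2, 1};                 // swap the last two dims
+ * Rpp32u roiTensor[1 * 3 * 2] = {0, 4, 0, 5, 0, 6}; // full-extent ROI per dim
+ * rppt_transpose_host(srcPtr, srcGenericDescPtr, dstPtr, dstGenericDescPtr,
+ *                     permTensor, roiTensor, handle);
+ * \endcode
+ */
+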
+#ifdef GPU_SUPPORT
+/*! \brief Transpose Generic augmentation on HIP backend
+ * \details The transpose augmentation performs an input-permutation based transpose on a generic ND Tensor.
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param [in] srcGenericDescPtr source tensor descriptor
+ * \param [out] dstPtr destination tensor in HIP memory
+ * \param [in] dstGenericDescPtr destination tensor descriptor
+ * \param [in] permTensor permutation tensor for transpose operation in pinned memory
+ * \param [in] roiTensor ROI data for each element in source tensor (tensor of batchSize * number of dimensions * 2 values)
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ * \ingroup group_tensor_geometric
+ */
+RppStatus rppt_transpose_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u *permTensor, Rpp32u *roiTensor, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
+
/*! @}
*/
diff --git a/include/rppt_tensor_logical_operations.h b/include/rppt_tensor_logical_operations.h
index 3a4685167..28dff69ce 100644
--- a/include/rppt_tensor_logical_operations.h
+++ b/include/rppt_tensor_logical_operations.h
@@ -54,7 +54,7 @@ extern "C" {
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -76,7 +76,7 @@ RppStatus rppt_bitwise_and_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -98,7 +98,7 @@ RppStatus rppt_bitwise_and_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr s
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -120,7 +120,7 @@ RppStatus rppt_bitwise_or_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr s
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -136,4 +136,4 @@ RppStatus rppt_bitwise_or_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr sr
#ifdef __cplusplus
}
#endif
-#endif // RPPT_TENSOR_LOGICAL_OPERATIONS_H
\ No newline at end of file
+#endif // RPPT_TENSOR_LOGICAL_OPERATIONS_H
diff --git a/include/rppt_tensor_morphological_operations.h b/include/rppt_tensor_morphological_operations.h
index eb879af5c..126c4757a 100644
--- a/include/rppt_tensor_morphological_operations.h
+++ b/include/rppt_tensor_morphological_operations.h
@@ -57,7 +57,7 @@ extern "C" {
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] kernelSize kernel size for erode (a single Rpp32u odd number with kernelSize = 3/5/7/9 that applies to all images in the batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -82,7 +82,7 @@ RppStatus rppt_erode_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] kernelSize kernel size for dilate (a single Rpp32u odd number with kernelSize = 3/5/7/9 that applies to all images in the batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -98,4 +98,4 @@ RppStatus rppt_dilate_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP
#ifdef __cplusplus
}
#endif
-#endif // RPPT_TENSOR_MORPHOLOGICAL_OPERATIONS_H
\ No newline at end of file
+#endif // RPPT_TENSOR_MORPHOLOGICAL_OPERATIONS_H
diff --git a/include/rppt_tensor_statistical_operations.h b/include/rppt_tensor_statistical_operations.h
index 441816ea3..ca464340b 100644
--- a/include/rppt_tensor_statistical_operations.h
+++ b/include/rppt_tensor_statistical_operations.h
@@ -50,7 +50,7 @@ extern "C" {
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] tensorSumArr destination array in HOST memory
* \param [in] tensorSumArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -68,7 +68,7 @@ RppStatus rppt_tensor_sum_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] tensorSumArr destination array in HIP memory
* \param [in] tensorSumArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -86,7 +86,7 @@ RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] minArr destination array in HOST memory
* \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then minArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then minArrLength >= srcDescPtr->n * 4)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -104,7 +104,7 @@ RppStatus rppt_tensor_min_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] minArr destination array in HIP memory
* \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then minArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then minArrLength >= srcDescPtr->n * 4)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -122,7 +122,7 @@ RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] maxArr destination array in HOST memory
* \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then maxArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then maxArrLength >= srcDescPtr->n * 4)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -140,7 +140,7 @@ RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] maxArr destination array in HIP memory
* \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then maxArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then maxArrLength >= srcDescPtr->n * 4)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -201,7 +201,7 @@ RppStatus rppt_normalize_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescP
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] tensorMeanArr destination array in HOST memory
* \param [in] tensorMeanArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorMeanArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorMeanArrLength = srcDescPtr->n * 4)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -219,7 +219,7 @@ RppStatus rppt_tensor_mean_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] tensorMeanArr destination array in HIP memory
* \param [in] tensorMeanArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorMeanArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorMeanArrLength = srcDescPtr->n * 4)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -238,7 +238,7 @@ RppStatus rppt_tensor_mean_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [out] tensorStddevArr destination array in HOST memory
* \param [in] tensorStddevArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorStddevArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorStddevArrLength = srcDescPtr->n * 4)
* \param [in] meanTensor mean values for stddev calculation (1D tensor of size batchSize * 4 in format (MeanR, MeanG, MeanB, MeanImage) for each image in batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -257,7 +257,7 @@ RppStatus rppt_tensor_stddev_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPt
* \param [out] tensorStddevArr destination array in HIP memory
* \param [in] tensorStddevArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorStddevArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorStddevArrLength = srcDescPtr->n * 4)
* \param [in] meanTensor mean values for stddev calculation (1D tensor of size batchSize * 4 in format (MeanR, MeanG, MeanB, MeanImage) for each image in batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -273,4 +273,4 @@ RppStatus rppt_tensor_stddev_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr
#ifdef __cplusplus
}
#endif
-#endif // RPPT_TENSOR_STATISTICAL_OPERATIONS_H
\ No newline at end of file
+#endif // RPPT_TENSOR_STATISTICAL_OPERATIONS_H
diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp
index bb06713b9..be8eaeeaa 100644
--- a/src/include/cpu/rpp_cpu_common.hpp
+++ b/src/include/cpu/rpp_cpu_common.hpp
@@ -177,6 +177,21 @@ struct RPPTensorFunctionMetaData
};
#endif // GPU_SUPPORT
+// Computes strides for ND Tensor
+inline void compute_strides(Rpp32u *strides, Rpp32u *shape, Rpp32u tensorDim)
+{
+ if (tensorDim > 0)
+ {
+ Rpp32u v = 1;
+ for (Rpp32u i = tensorDim - 1; i > 0; i--)
+ {
+ strides[i] = v;
+ v *= shape[i];
+ }
+ strides[0] = v;
+ }
+}
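+
+// Worked example (editor's addition): for a 2x3x4 tensor,
+//   Rpp32u shape[3] = {2, 3, 4}, strides[3];
+//   compute_strides(strides, shape, 3);   // strides -> {12, 4, 1}
+// i.e. row-major strides, where strides[i] counts elements per step along dim i.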
+
// Uses fast inverse square root algorithm from Lomont, C., 2003. FAST INVERSE SQUARE ROOT. [online] lomont.org. Available at:
inline float rpp_host_math_inverse_sqrt_1(float x)
{
@@ -6096,6 +6111,25 @@ inline void compute_separable_horizontal_resample(Rpp32f *inputPtr, T *outputPtr
}
}
+inline void compute_jitter_src_loc_avx(__m256i *pxXorwowStateX, __m256i *pxXorwowStateCounter, __m256 &pRow, __m256 &pCol, __m256 &pKernelSize, __m256 &pBound, __m256 &pHeightLimit, __m256 &pWidthLimit, __m256 &pStride, __m256 &pChannel, Rpp32s *srcLoc)
+{
+ __m256 pRngX = rpp_host_rng_xorwow_8_f32_avx(pxXorwowStateX, pxXorwowStateCounter);
+ __m256 pRngY = rpp_host_rng_xorwow_8_f32_avx(pxXorwowStateX, pxXorwowStateCounter);
+ __m256 pX = _mm256_mul_ps(pRngX, pKernelSize);
+ __m256 pY = _mm256_mul_ps(pRngY, pKernelSize);
+ pX = _mm256_max_ps(_mm256_min_ps(_mm256_floor_ps(_mm256_add_ps(pRow, _mm256_sub_ps(pX, pBound))), pHeightLimit), avx_p0);
+ pY = _mm256_max_ps(_mm256_min_ps(_mm256_floor_ps(_mm256_add_ps(pCol, _mm256_sub_ps(pY, pBound))), pWidthLimit), avx_p0);
+ __m256i pxSrcLoc = _mm256_cvtps_epi32(_mm256_fmadd_ps(pX, pStride, _mm256_mul_ps(pY, pChannel)));
+ _mm256_storeu_si256((__m256i*) srcLoc, pxSrcLoc);
+}
+
+inline void compute_jitter_src_loc(RpptXorwowStateBoxMuller *xorwowState, Rpp32s row, Rpp32s col, Rpp32s kSize, Rpp32s heightLimit, Rpp32s widthLimit, Rpp32s stride, Rpp32s bound, Rpp32s channels, Rpp32s &loc)
+{
+ Rpp32u heightIncrement = rpp_host_rng_xorwow_f32(xorwowState) * kSize;
+ Rpp32u widthIncrement = rpp_host_rng_xorwow_f32(xorwowState) * kSize;
+ loc = std::max(std::min(static_cast<Rpp32s>(row + heightIncrement - bound), heightLimit), 0) * stride;
+ loc += std::max(std::min(static_cast<Rpp32s>(col + widthIncrement - bound), (widthLimit - 1)), 0) * channels;
+}
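+
+// Editor's note on the two compute_jitter_src_loc variants above: both draw a
+// random source pixel from a kernelSize x kernelSize window around (row, col)
+// (bound is assumed here to be kernelSize / 2), clamp it to the valid image
+// extent, and flatten it to an element offset as row * stride + col * channels.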
inline void compute_sum_16_host(__m256i *p, __m256i *pSum)
{
pSum[0] = _mm256_add_epi32(_mm256_add_epi32(p[0], p[1]), pSum[0]); //add 16 values to 8
@@ -6501,4 +6535,32 @@ inline void compute_remap_src_loc(Rpp32f rowLoc, Rpp32f colLoc, Rpp32s &srcLoc,
srcLoc = (rowLoc * stride) + colLoc * channels;
}
+inline void compute_log_16_host(__m256 *p)
+{
+ p[0] = log_ps(p[0]); // compute natural log of 8 packed floats
+ p[1] = log_ps(p[1]); // compute natural log of 8 packed floats
+}
+
+inline void compute_transpose4x8_avx(__m256 *pSrc, __m128 *pDst)
+{
+ __m256 tmp0, tmp1, tmp2, tmp3;
+ tmp0 = _mm256_shuffle_ps(pSrc[0], pSrc[1], 0x44); /* shuffle to get [P01|P02|P09|P10|P05|P06|P13|P14] */
+ tmp2 = _mm256_shuffle_ps(pSrc[0], pSrc[1], 0xEE); /* shuffle to get [P03|P04|P11|P12|P07|P08|P15|P16] */
+ tmp1 = _mm256_shuffle_ps(pSrc[2], pSrc[3], 0x44); /* shuffle to get [P17|P18|P25|P26|P21|P22|P29|P30] */
+ tmp3 = _mm256_shuffle_ps(pSrc[2], pSrc[3], 0xEE); /* shuffle to get [P19|P20|P27|P28|P23|P24|P31|P32] */
+ pSrc[0] = _mm256_shuffle_ps(tmp0, tmp1, 0x88); /* shuffle to get [P01|P09|P17|P25|P05|P13|P21|P29] */
+ pSrc[1] = _mm256_shuffle_ps(tmp0, tmp1, 0xDD); /* shuffle to get [P02|P10|P18|P26|P06|P14|P22|P30] */
+ pSrc[2] = _mm256_shuffle_ps(tmp2, tmp3, 0x88); /* shuffle to get [P03|P11|P19|P27|P07|P15|P23|P31] */
+ pSrc[3] = _mm256_shuffle_ps(tmp2, tmp3, 0xDD); /* shuffle to get [P04|P12|P20|P28|P08|P16|P24|P32] */
+
+ pDst[0] = _mm256_castps256_ps128(pSrc[0]); /* extract [P01|P09|P17|P25] */
+ pDst[1] = _mm256_castps256_ps128(pSrc[1]); /* extract [P02|P10|P18|P26] */
+ pDst[2] = _mm256_castps256_ps128(pSrc[2]); /* extract [P03|P11|P19|P27] */
+ pDst[3] = _mm256_castps256_ps128(pSrc[3]); /* extract [P04|P12|P20|P28] */
+ pDst[4] = _mm256_extractf128_ps(pSrc[0], 1); /* extract [P05|P13|P21|P29] */
+ pDst[5] = _mm256_extractf128_ps(pSrc[1], 1); /* extract [P06|P14|P22|P30] */
+ pDst[6] = _mm256_extractf128_ps(pSrc[2], 1); /* extract [P07|P15|P23|P31] */
+ pDst[7] = _mm256_extractf128_ps(pSrc[3], 1); /* extract [P08|P16|P24|P32] */
+}
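+
+// Editor's sketch of the scalar equivalent of compute_transpose4x8_avx, viewing
+// pSrc as a row-major 4x8 float matrix and pDst as eight 4-float columns:
+//   for (int c = 0; c < 8; c++)
+//       for (int r = 0; r < 4; r++)
+//           ((float *)pDst)[c * 4 + r] = ((float *)pSrc)[r * 8 + c];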
+
#endif //RPP_CPU_COMMON_H
diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp
index babc6f55c..b9e79c146 100644
--- a/src/include/cpu/rpp_cpu_simd.hpp
+++ b/src/include/cpu/rpp_cpu_simd.hpp
@@ -185,6 +185,10 @@ const __m256i avx_pxShufflePkd = _mm256_setr_m128(xmm_pxStore4Pkd, xmm_pxStore4P
const __m128i xmm_pxMask00 = _mm_setr_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0, 1, 2, 3);
const __m128i xmm_pxMask04To11 = _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+const __m256i avx_pxMaskR = _mm256_setr_epi8(0, 0x80, 0x80, 3, 0x80, 0x80, 6, 0x80, 0x80, 9, 0x80, 0x80, 12, 0x80, 0x80, 15, 0x80, 0x80, 18, 0x80, 0x80, 21, 0x80, 0x80, 24, 0x80, 0x80, 27, 0x80, 0x80, 0x80, 0x80);
+const __m256i avx_pxMaskG = _mm256_setr_epi8(0x80, 1, 0x80, 0x80, 4, 0x80, 0x80, 7, 0x80, 0x80, 10, 0x80, 0x80, 13, 0x80, 0x80, 16, 0x80, 0x80, 19, 0x80, 0x80, 22, 0x80, 0x80, 25, 0x80, 0x80, 28, 0x80, 0x80, 0x80);
+const __m256i avx_pxMaskB = _mm256_setr_epi8(0x80, 0x80, 2, 0x80, 0x80, 5, 0x80, 0x80, 8, 0x80, 0x80, 11, 0x80, 0x80, 14, 0x80, 0x80, 17, 0x80, 0x80, 20, 0x80, 0x80, 23, 0x80, 0x80, 26, 0x80, 0x80, 29, 0x80, 0x80);
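+
+// Editor's note: in packed RGB, byte 3*i is the R sample of pixel i (G and B
+// follow at 3*i + 1 and 3*i + 2), and 0x80 in a shuffle-control byte zeroes
+// that output byte. avx_pxMaskR therefore keeps only the R bytes in their
+// packed positions and zeroes the G/B slots (likewise avx_pxMaskG and
+// avx_pxMaskB), so three shuffled vectors can be OR-ed back together into one
+// packed-RGB vector whose channels come from three different source locations.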
+
// Print helpers
inline void rpp_mm_print_epi8(__m128i vPrintArray)
@@ -1021,6 +1025,99 @@ inline void rpp_load48_u8pkd3_to_f32pln3_avx(Rpp8u *srcPtr, __m256 *p)
p[5] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[2], xmm_pxMaskB), _mm_shuffle_epi8(px[3], xmm_pxMaskB))); /* Contains B09-16 */
}
+inline void rpp_glitch_load24_u8pkd3_to_f32pln3_avx(Rpp8u *srcPtr, __m256 *p, int *srcLocs)
+{
+ __m128i px[2];
+ px[0] = _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[0])); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need R01-04 */
+ px[1] = _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[0] + 12)); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need R05-08 */
+ p[0] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[0], xmm_pxMaskR), _mm_shuffle_epi8(px[1], xmm_pxMaskR))); /* Contains R01-08 */
+
+ px[0] = _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[1])); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need G01-04 */
+ px[1] = _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[1] + 12)); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need G05-08 */
+ p[1] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[0], xmm_pxMaskG), _mm_shuffle_epi8(px[1], xmm_pxMaskG))); /* Contains G01-08 */
+
+ px[0] = _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[2])); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need B01-04 */
+ px[1] = _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[2] + 12)); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need B05-08 */
+ p[2] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[0], xmm_pxMaskB), _mm_shuffle_epi8(px[1], xmm_pxMaskB))); /* Contains B01-08 */
+}
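+
+// Editor's note: unlike the plain pkd3-to-pln3 loads above, the glitch load
+// variants read each channel from its own offset (srcLocs[0]/[1]/[2]); this
+// per-channel displacement is what produces the glitch effect.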
+
+inline void rpp_glitch_load24_f32pkd3_to_f32pln3_avx(Rpp32f *srcPtr, __m256 *p, int *srcLocs)
+{
+ Rpp32f *srcPtrTemp = srcPtr + srcLocs[0];
+ p[0] = _mm256_setr_ps(*srcPtrTemp, *(srcPtrTemp + 3), *(srcPtrTemp + 6), *(srcPtrTemp + 9),
+ *(srcPtrTemp + 12), *(srcPtrTemp + 15), *(srcPtrTemp + 18), *(srcPtrTemp + 21));
+ srcPtrTemp = srcPtr + srcLocs[1];
+ p[1] = _mm256_setr_ps(*(srcPtrTemp + 1), *(srcPtrTemp + 4), *(srcPtrTemp + 7), *(srcPtrTemp + 10),
+ *(srcPtrTemp + 13), *(srcPtrTemp + 16), *(srcPtrTemp + 19), *(srcPtrTemp + 22));
+ srcPtrTemp = srcPtr + srcLocs[2];
+ p[2] = _mm256_setr_ps(*(srcPtrTemp + 2), *(srcPtrTemp + 5), *(srcPtrTemp + 8), *(srcPtrTemp + 11),
+ *(srcPtrTemp + 14), *(srcPtrTemp + 17), *(srcPtrTemp + 20), *(srcPtrTemp + 23));
+}
+
+inline void rpp_glitch_load24_i8pkd3_to_f32pln3_avx(Rpp8s *srcPtr, __m256 *p, int *srcLocs)
+{
+ __m128i px[2];
+ px[0] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[0]))); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need R01-04 */
+ px[1] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[0] + 12))); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need R05-08 */
+ p[0] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[0], xmm_pxMaskR), _mm_shuffle_epi8(px[1], xmm_pxMaskR))); /* Contains R01-08 */
+
+ px[0] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[1]))); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need G01-04 */
+ px[1] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[1] + 12))); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need G05-08 */
+ p[1] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[0], xmm_pxMaskG), _mm_shuffle_epi8(px[1], xmm_pxMaskG))); /* Contains G01-08 */
+
+ px[0] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[2]))); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need B01-04 */
+ px[1] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[2] + 12))); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need B05-08 */
+ p[2] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[0], xmm_pxMaskB), _mm_shuffle_epi8(px[1], xmm_pxMaskB))); /* Contains B01-08 */
+}
+
+inline void rpp_glitch_load30_u8pkd3_to_u8pkd3_avx(Rpp8u *srcPtr, int *srcLocs, __m256i &p)
+{
+ __m256i px[3];
+ px[0] = _mm256_loadu_si256((__m256i *)(srcPtr + srcLocs[0])); // Load the source location1 values passed
+ px[1] = _mm256_loadu_si256((__m256i *)(srcPtr + srcLocs[1])); // Load the source location2 values passed
+ px[2] = _mm256_loadu_si256((__m256i *)(srcPtr + srcLocs[2])); // Load the source location3 values passed
+ px[0] = _mm256_shuffle_epi8(px[0], avx_pxMaskR); /* Shuffle to obtain R channel values */
+ px[1] = _mm256_shuffle_epi8(px[1], avx_pxMaskG); /* Shuffle to obtain G channel values */
+ px[2] = _mm256_shuffle_epi8(px[2], avx_pxMaskB); /* Shuffle to obtain B channel values */
+ px[0] = _mm256_or_si256(px[0], px[1]); /* Pack R and G channels to obtain RG format */
+ p = _mm256_or_si256(px[0], px[2]); /* Pack RG values and B channel to obtain RGB format */
+}
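+
+// The OR-packing above relies on an assumed property of avx_pxMaskR/G/B: each mask keeps
+// only its own channel's bytes, already placed at their packed-RGB positions, and writes
+// zeroing (0x80) indices everywhere else, so OR-ing the three shuffled vectors
+// re-interleaves 10 RGB pixels (30 bytes) into p.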
+
+inline void rpp_glitch_load30_i8pkd3_to_i8pkd3_avx(Rpp8s *srcPtr, int *srcLocs, __m256i &p)
+{
+ __m256i px[3];
+ px[0] = _mm256_loadu_si256((__m256i *)(srcPtr + srcLocs[0])); // Load the source location1 values passed
+ px[1] = _mm256_loadu_si256((__m256i *)(srcPtr + srcLocs[1])); // Load the source location2 values passed
+ px[2] = _mm256_loadu_si256((__m256i *)(srcPtr + srcLocs[2])); // Load the source location3 values passed
+ px[0] = _mm256_shuffle_epi8(px[0], avx_pxMaskR); /* Shuffle to obtain R channel values */
+ px[1] = _mm256_shuffle_epi8(px[1], avx_pxMaskG); /* Shuffle to obtain G channel values */
+ px[2] = _mm256_shuffle_epi8(px[2], avx_pxMaskB); /* Shuffle to obtain B channel values */
+ px[0] = _mm256_or_si256(px[0], px[1]); /* Pack R and G channels to obtain RG format */
+ p = _mm256_or_si256(px[0], px[2]); /* Pack RG values and B channel to obtain RGB format */
+}
+
+inline void rpp_glitch_load6_f32pkd3_to_f32pkd3_avx(Rpp32f *srcPtr, int *srcLocs, __m256 &p)
+{
+ p = _mm256_setr_ps(*(srcPtr + srcLocs[0]), *(srcPtr + srcLocs[1] + 1), *(srcPtr + srcLocs[2] + 2), *(srcPtr + srcLocs[0] + 3),
+ *(srcPtr + srcLocs[1] + 4), *(srcPtr + srcLocs[2] + 5), 0.0f, 0.0f);
+}
+
+inline void rpp_glitch_load48_u8pln3_to_f32pln3_avx(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *srcPtrB, __m256 *p, int *srcLocs)
+{
+ __m128i px[3];
+
+ px[0] = _mm_loadu_si128((__m128i *)(srcPtrR + srcLocs[0])); /* load [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */
+ px[1] = _mm_loadu_si128((__m128i *)(srcPtrG + srcLocs[1])); /* load [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */
+ px[2] = _mm_loadu_si128((__m128i *)(srcPtrB + srcLocs[2])); /* load [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */
+ p[0] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[0], xmm_pxMask00To03), _mm_shuffle_epi8(px[0], xmm_pxMask04To07))); /* Contains R01-08 */
+ p[1] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[0], xmm_pxMask08To11), _mm_shuffle_epi8(px[0], xmm_pxMask12To15))); /* Contains R09-16 */
+ p[2] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[1], xmm_pxMask00To03), _mm_shuffle_epi8(px[1], xmm_pxMask04To07))); /* Contains G01-08 */
+ p[3] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[1], xmm_pxMask08To11), _mm_shuffle_epi8(px[1], xmm_pxMask12To15))); /* Contains G09-16 */
+ p[4] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[2], xmm_pxMask00To03), _mm_shuffle_epi8(px[2], xmm_pxMask04To07))); /* Contains B01-08 */
+ p[5] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[2], xmm_pxMask08To11), _mm_shuffle_epi8(px[2], xmm_pxMask12To15))); /* Contains B09-16 */
+}
+
inline void rpp_load48_u8pkd3_to_f32pln3_mirror_avx(Rpp8u *srcPtr, __m256 *p)
{
__m128i px[4];
@@ -3762,6 +3859,20 @@ inline void rpp_resize_nn_load_u8pkd3(Rpp8u *srcRowPtrsForInterp, Rpp32s *loc, _
p = _mm_shuffle_epi8(px[0], xmm_pkd_mask); // Shuffle to obtain 4 RGB [R01|G01|B01|R11|G11|B11|R21|G21|B21|R31|G31|B31|00|00|00|00]
}
+template <typename T>
+inline void rpp_resize_nn_extract_pkd3_avx(T *srcRowPtrsForInterp, Rpp32s *loc, __m256i &p)
+{
+ p = _mm256_setr_epi8(*(srcRowPtrsForInterp + loc[0]), *(srcRowPtrsForInterp + loc[0] + 1), *(srcRowPtrsForInterp + loc[0] + 2),
+ *(srcRowPtrsForInterp + loc[1]), *(srcRowPtrsForInterp + loc[1] + 1), *(srcRowPtrsForInterp + loc[1] + 2),
+ *(srcRowPtrsForInterp + loc[2]), *(srcRowPtrsForInterp + loc[2] + 1), *(srcRowPtrsForInterp + loc[2] + 2),
+ *(srcRowPtrsForInterp + loc[3]), *(srcRowPtrsForInterp + loc[3] + 1), *(srcRowPtrsForInterp + loc[3] + 2),
+ *(srcRowPtrsForInterp + loc[4]), *(srcRowPtrsForInterp + loc[4] + 1), *(srcRowPtrsForInterp + loc[4] + 2),
+ *(srcRowPtrsForInterp + loc[5]), *(srcRowPtrsForInterp + loc[5] + 1), *(srcRowPtrsForInterp + loc[5] + 2),
+ *(srcRowPtrsForInterp + loc[6]), *(srcRowPtrsForInterp + loc[6] + 1), *(srcRowPtrsForInterp + loc[6] + 2),
+ *(srcRowPtrsForInterp + loc[7]), *(srcRowPtrsForInterp + loc[7] + 1), *(srcRowPtrsForInterp + loc[7] + 2),
+ 0, 0, 0, 0, 0, 0, 0, 0);
+}
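+
+// Gathers 8 packed RGB pixels (24 bytes) addressed by the per-pixel offsets in loc; the
+// top 8 bytes of the result stay zero. T is assumed to be a byte type (Rpp8u or Rpp8s),
+// since _mm256_setr_epi8 takes 8-bit arguments.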
+
inline void rpp_resize_nn_load_u8pln1(Rpp8u *srcRowPtrsForInterp, Rpp32s *loc, __m128i &p)
{
__m128i px[4];
@@ -3774,6 +3885,16 @@ inline void rpp_resize_nn_load_u8pln1(Rpp8u *srcRowPtrsForInterp, Rpp32s *loc, _
p = _mm_unpacklo_epi8(px[0], px[1]); // unpack to obtain [R01|R11|R21|R31|00|00|00|00|00|00|00|00|00|00|00|00]
}
+template <typename T>
+inline void rpp_resize_nn_extract_pln1_avx(T *srcRowPtrsForInterp, Rpp32s *loc, __m256i &p)
+{
+ p = _mm256_setr_epi8(*(srcRowPtrsForInterp + loc[0]), *(srcRowPtrsForInterp + loc[1]),
+ *(srcRowPtrsForInterp + loc[2]), *(srcRowPtrsForInterp + loc[3]),
+ *(srcRowPtrsForInterp + loc[4]), *(srcRowPtrsForInterp + loc[5]),
+ *(srcRowPtrsForInterp + loc[6]), *(srcRowPtrsForInterp + loc[7]),
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
inline void rpp_resize_nn_load_f32pkd3_to_f32pln3(Rpp32f *srcRowPtrsForInterp, Rpp32s *loc, __m128 *p)
{
p[0] = _mm_loadu_ps(srcRowPtrsForInterp + loc[0]); // LOC0 load [R01|G01|B01|R02] - Need RGB 01
@@ -3783,6 +3904,42 @@ inline void rpp_resize_nn_load_f32pkd3_to_f32pln3(Rpp32f *srcRowPtrsForInterp, R
_MM_TRANSPOSE4_PS(p[0], p[1], p[2], pTemp); // Transpose to obtain RGB in each vector
}
+inline void rpp_resize_nn_load_f32pkd3_to_f32pln3_avx(Rpp32f *srcRowPtrsForInterp, Rpp32s *loc, __m256 *p)
+{
+ __m128 p128[8];
+ p128[0] = _mm_loadu_ps(srcRowPtrsForInterp + loc[0]);
+ p128[1] = _mm_loadu_ps(srcRowPtrsForInterp + loc[1]);
+ p128[2] = _mm_loadu_ps(srcRowPtrsForInterp + loc[2]);
+ p128[3] = _mm_loadu_ps(srcRowPtrsForInterp + loc[3]);
+ _MM_TRANSPOSE4_PS(p128[0], p128[1], p128[2], p128[3]);
+ p128[4] = _mm_loadu_ps(srcRowPtrsForInterp + loc[4]);
+ p128[5] = _mm_loadu_ps(srcRowPtrsForInterp + loc[5]);
+ p128[6] = _mm_loadu_ps(srcRowPtrsForInterp + loc[6]);
+ p128[7] = _mm_loadu_ps(srcRowPtrsForInterp + loc[7]);
+ _MM_TRANSPOSE4_PS(p128[4], p128[5], p128[6], p128[7]);
+ p[0] = _mm256_setr_m128(p128[0], p128[4]);
+ p[1] = _mm256_setr_m128(p128[1], p128[5]);
+ p[2] = _mm256_setr_m128(p128[2], p128[6]);
+}
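+
+// Lane sketch for the transpose above: before _MM_TRANSPOSE4_PS,
+//   p128[0..3] = {R0 G0 B0 x}, {R1 G1 B1 x}, {R2 G2 B2 x}, {R3 G3 B3 x}
+// (each load grabs 4 floats starting at a pixel, so the 4th lane is a don't-care), and
+// afterwards p128[0] = {R0 R1 R2 R3}, p128[1] = {G0..G3}, p128[2] = {B0..B3};
+// _mm256_setr_m128 then fuses the two transposed halves into 8-wide planar vectors.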
+
+inline void rpp_resize_nn_load_f16pkd3_to_f32pln3_avx(Rpp16f *srcRowPtrsForInterp, Rpp32s *loc, __m256 *p)
+{
+ p[0] = _mm256_setr_ps((Rpp32f)*(srcRowPtrsForInterp + loc[0]), (Rpp32f)*(srcRowPtrsForInterp + loc[1]),
+ (Rpp32f)*(srcRowPtrsForInterp + loc[2]), (Rpp32f)*(srcRowPtrsForInterp + loc[3]),
+ (Rpp32f)*(srcRowPtrsForInterp + loc[4]), (Rpp32f)*(srcRowPtrsForInterp + loc[5]),
+ (Rpp32f)*(srcRowPtrsForInterp + loc[6]), (Rpp32f)*(srcRowPtrsForInterp + loc[7]));
+
+ p[1] = _mm256_setr_ps((Rpp32f)*(srcRowPtrsForInterp + loc[0] + 1), (Rpp32f)*(srcRowPtrsForInterp + loc[1] + 1),
+ (Rpp32f)*(srcRowPtrsForInterp + loc[2] + 1), (Rpp32f)*(srcRowPtrsForInterp + loc[3] + 1),
+ (Rpp32f)*(srcRowPtrsForInterp + loc[4] + 1), (Rpp32f)*(srcRowPtrsForInterp + loc[5] + 1),
+ (Rpp32f)*(srcRowPtrsForInterp + loc[6] + 1), (Rpp32f)*(srcRowPtrsForInterp + loc[7] + 1));
+
+ p[2] = _mm256_setr_ps((Rpp32f)*(srcRowPtrsForInterp + loc[0] + 2), (Rpp32f)*(srcRowPtrsForInterp + loc[1] + 2),
+ (Rpp32f)*(srcRowPtrsForInterp + loc[2] + 2), (Rpp32f)*(srcRowPtrsForInterp + loc[3] + 2),
+ (Rpp32f)*(srcRowPtrsForInterp + loc[4] + 2), (Rpp32f)*(srcRowPtrsForInterp + loc[5] + 2),
+ (Rpp32f)*(srcRowPtrsForInterp + loc[6] + 2), (Rpp32f)*(srcRowPtrsForInterp + loc[7] + 2));
+}
+
inline void rpp_resize_nn_load_f32pln1(Rpp32f *srcRowPtrsForInterp, Rpp32s *loc, __m128 &p)
{
__m128 pTemp[4];
@@ -3795,6 +3952,22 @@ inline void rpp_resize_nn_load_f32pln1(Rpp32f *srcRowPtrsForInterp, Rpp32s *loc,
p = _mm_unpacklo_ps(pTemp[0], pTemp[1]); // Unpack to obtain [R01|R11|R21|R31]
}
+inline void rpp_resize_nn_load_f32pln1_avx(Rpp32f *srcRowPtrsForInterp, Rpp32s *loc, __m256 &p)
+{
+ p = _mm256_setr_ps(*(srcRowPtrsForInterp + loc[0]), *(srcRowPtrsForInterp + loc[1]),
+ *(srcRowPtrsForInterp + loc[2]), *(srcRowPtrsForInterp + loc[3]),
+ *(srcRowPtrsForInterp + loc[4]), *(srcRowPtrsForInterp + loc[5]),
+ *(srcRowPtrsForInterp + loc[6]), *(srcRowPtrsForInterp + loc[7]));
+}
+
+inline void rpp_resize_nn_load_f16pln1_avx(Rpp16f *srcRowPtrsForInterp, Rpp32s *loc, __m256 &p)
+{
+ p = _mm256_setr_ps((Rpp32f)*(srcRowPtrsForInterp + loc[0]), (Rpp32f)*(srcRowPtrsForInterp + loc[1]),
+ (Rpp32f)*(srcRowPtrsForInterp + loc[2]), (Rpp32f)*(srcRowPtrsForInterp + loc[3]),
+ (Rpp32f)*(srcRowPtrsForInterp + loc[4]), (Rpp32f)*(srcRowPtrsForInterp + loc[5]),
+ (Rpp32f)*(srcRowPtrsForInterp + loc[6]), (Rpp32f)*(srcRowPtrsForInterp + loc[7]));
+}
+
inline void rpp_resize_nn_load_i8pkd3(Rpp8s *srcRowPtrsForInterp, Rpp32s *loc, __m128i &p)
{
__m128i px[4];
diff --git a/src/include/hip/rpp_hip_common.hpp b/src/include/hip/rpp_hip_common.hpp
index 3f32dbc04..721800c80 100644
--- a/src/include/hip/rpp_hip_common.hpp
+++ b/src/include/hip/rpp_hip_common.hpp
@@ -55,7 +55,7 @@ typedef union { float f1[5];
typedef union { float f1[6]; float2 f2[3]; } d_float6;
typedef union { float f1[7]; } d_float7;
typedef union { float f1[8]; float2 f2[4]; float4 f4[2]; } d_float8;
-typedef union { float f1[9]; } d_float9;
+typedef union { float f1[9]; float3 f3[3]; } d_float9;
typedef union { float f1[12]; float4 f4[3]; } d_float12;
typedef union { float f1[16]; float4 f4[4]; d_float8 f8[2]; } d_float16;
typedef union { float f1[24]; float2 f2[12]; float3 f3[8]; float4 f4[6]; d_float8 f8[3]; } d_float24;
@@ -1776,6 +1776,22 @@ __device__ __forceinline__ void rpp_hip_math_multiply24_const(d_float24 *src_f24
dst_f24->f4[5] = src_f24->f4[5] * multiplier_f4;
}
+// d_float8 divide
+
+__device__ __forceinline__ void rpp_hip_math_divide8(d_float8 *src1Ptr_f8, d_float8 *src2Ptr_f8, d_float8 *dstPtr_f8)
+{
+ dstPtr_f8->f4[0] = src1Ptr_f8->f4[0] / src2Ptr_f8->f4[0];
+ dstPtr_f8->f4[1] = src1Ptr_f8->f4[1] / src2Ptr_f8->f4[1];
+}
+
+// d_float8 divide of a constant by each element (dst = divisor / src)
+
+__device__ __forceinline__ void rpp_hip_math_divide8_const(d_float8 *src_f8, d_float8 *dst_f8, float4 divisor_f4)
+{
+ dst_f8->f4[0] = divisor_f4 / src_f8->f4[0];
+ dst_f8->f4[1] = divisor_f4 / src_f8->f4[1];
+}
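+
+// Usage sketch (illustrative, assuming a, b and dst are d_float8 values in registers):
+//   rpp_hip_math_divide8(&a, &b, &dst);            // dst = a / b, element-wise
+//   rpp_hip_math_divide8_const(&a, &dst, divisor); // dst = divisor / a, element-wise
+// Both lean on the float4 arithmetic operator overloads available to this header.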
+
// d_float8 bitwiseAND
__device__ __forceinline__ void rpp_hip_math_bitwiseAnd8(d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8)
@@ -1869,6 +1885,21 @@ __device__ __forceinline__ float rpp_hip_math_sinc(float x)
return (fabsf(x) < 1e-5f) ? (1.0f - x * x * ONE_OVER_6) : sinf(x) / x;
}
+__device__ __forceinline__ void rpp_hip_math_log(d_float8 *src_f8, d_float8 *dst_f8)
+{
+ // sanitize in place so __logf always receives a positive argument: zeros become the
+ // smallest positive float, negative values contribute their magnitude
+ for(int i = 0; i < 8; i++)
+ src_f8->f1[i] = (!src_f8->f1[i]) ? std::nextafter(0.0f, 1.0f) : fabsf(src_f8->f1[i]);
+
+ dst_f8->f1[0] = __logf(src_f8->f1[0]);
+ dst_f8->f1[1] = __logf(src_f8->f1[1]);
+ dst_f8->f1[2] = __logf(src_f8->f1[2]);
+ dst_f8->f1[3] = __logf(src_f8->f1[3]);
+ dst_f8->f1[4] = __logf(src_f8->f1[4]);
+ dst_f8->f1[5] = __logf(src_f8->f1[5]);
+ dst_f8->f1[6] = __logf(src_f8->f1[6]);
+ dst_f8->f1[7] = __logf(src_f8->f1[7]);
+}
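+
+// Note: __logf is the fast device intrinsic (reduced precision relative to logf); the
+// sanitization loop above trades exactness at zero and negative inputs for a defined
+// result, and it mutates src_f8 in place.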
+
// /******************** DEVICE RANDOMIZATION HELPER FUNCTIONS ********************/
template <typename T>
@@ -1913,7 +1944,8 @@ __device__ __forceinline__ float rpp_hip_rng_xorwow_f32(T *xorwowState)
return outFloat - 1; // return 0 <= outFloat < 1
}
-__device__ __forceinline__ void rpp_hip_rng_8_xorwow_f32(RpptXorwowState *xorwowState, d_float8 *randomNumbersPtr_f8)
+template <typename T>
+__device__ __forceinline__ void rpp_hip_rng_8_xorwow_f32(T *xorwowState, d_float8 *randomNumbersPtr_f8)
{
randomNumbersPtr_f8->f1[0] = rpp_hip_rng_xorwow_f32(xorwowState);
randomNumbersPtr_f8->f1[1] = rpp_hip_rng_xorwow_f32(xorwowState);
diff --git a/src/modules/cpu/host_tensor_arithmetic_operations.hpp b/src/modules/cpu/host_tensor_arithmetic_operations.hpp
index b98145be0..466e51e09 100644
--- a/src/modules/cpu/host_tensor_arithmetic_operations.hpp
+++ b/src/modules/cpu/host_tensor_arithmetic_operations.hpp
@@ -30,5 +30,6 @@ SOFTWARE.
#include "kernel/subtract_scalar.hpp"
#include "kernel/multiply_scalar.hpp"
#include "kernel/magnitude.hpp"
+#include "kernel/log.hpp"
#endif // HOST_TENSOR_ARITHMETIC_OPERATIONS_HPP
diff --git a/src/modules/cpu/host_tensor_effects_augmentations.hpp b/src/modules/cpu/host_tensor_effects_augmentations.hpp
index 9388ed6bd..ce7450aab 100644
--- a/src/modules/cpu/host_tensor_effects_augmentations.hpp
+++ b/src/modules/cpu/host_tensor_effects_augmentations.hpp
@@ -31,6 +31,8 @@ SOFTWARE.
#include "kernel/noise_shot.hpp"
#include "kernel/noise_gaussian.hpp"
#include "kernel/non_linear_blend.hpp"
+#include "kernel/jitter.hpp"
+#include "kernel/glitch.hpp"
#include "kernel/water.hpp"
#include "kernel/ricap.hpp"
#include "kernel/vignette.hpp"
diff --git a/src/modules/cpu/host_tensor_geometric_augmentations.hpp b/src/modules/cpu/host_tensor_geometric_augmentations.hpp
index cc7a22c8f..9facb0d78 100644
--- a/src/modules/cpu/host_tensor_geometric_augmentations.hpp
+++ b/src/modules/cpu/host_tensor_geometric_augmentations.hpp
@@ -35,6 +35,8 @@ SOFTWARE.
#include "kernel/warp_affine.hpp"
#include "kernel/phase.hpp"
#include "kernel/slice.hpp"
+#include "kernel/lens_correction.hpp"
+#include "kernel/transpose.hpp"
#include "kernel/crop_and_patch.hpp"
#include "kernel/flip_voxel.hpp"
diff --git a/src/modules/cpu/kernel/color_temperature.hpp b/src/modules/cpu/kernel/color_temperature.hpp
index 1358ac800..dbe33a51e 100644
--- a/src/modules/cpu/kernel/color_temperature.hpp
+++ b/src/modules/cpu/kernel/color_temperature.hpp
@@ -30,7 +30,7 @@ RppStatus color_temperature_u8_u8_host_tensor(Rpp8u *srcPtr,
RpptDescPtr srcDescPtr,
Rpp8u *dstPtr,
RpptDescPtr dstDescPtr,
- Rpp8s *adjustmentValueTensor,
+ Rpp32s *adjustmentValueTensor,
RpptROIPtr roiTensorPtrSrc,
RpptRoiType roiType,
RppLayoutParams layoutParams)
@@ -269,7 +269,7 @@ RppStatus color_temperature_f32_f32_host_tensor(Rpp32f *srcPtr,
RpptDescPtr srcDescPtr,
Rpp32f *dstPtr,
RpptDescPtr dstDescPtr,
- Rpp8s *adjustmentValueTensor,
+ Rpp32s *adjustmentValueTensor,
RpptROIPtr roiTensorPtrSrc,
RpptRoiType roiType,
RppLayoutParams layoutParams)
@@ -508,7 +508,7 @@ RppStatus color_temperature_f16_f16_host_tensor(Rpp16f *srcPtr,
RpptDescPtr srcDescPtr,
Rpp16f *dstPtr,
RpptDescPtr dstDescPtr,
- Rpp8s *adjustmentValueTensor,
+ Rpp32s *adjustmentValueTensor,
RpptROIPtr roiTensorPtrSrc,
RpptRoiType roiType,
RppLayoutParams layoutParams)
@@ -799,7 +799,7 @@ RppStatus color_temperature_i8_i8_host_tensor(Rpp8s *srcPtr,
RpptDescPtr srcDescPtr,
Rpp8s *dstPtr,
RpptDescPtr dstDescPtr,
- Rpp8s *adjustmentValueTensor,
+ Rpp32s *adjustmentValueTensor,
RpptROIPtr roiTensorPtrSrc,
RpptRoiType roiType,
RppLayoutParams layoutParams)
diff --git a/src/modules/cpu/kernel/glitch.hpp b/src/modules/cpu/kernel/glitch.hpp
new file mode 100644
index 000000000..9a8e33410
--- /dev/null
+++ b/src/modules/cpu/kernel/glitch.hpp
@@ -0,0 +1,690 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+
+inline void compute_src_loc(int row, int col, Rpp32s *locArray, RpptDescPtr srcDescPtr, RpptChannelOffsets *rgbOffsets, RpptROI roi, int batchCount, int channelValue)
+{
+ int xR, yR, xG, yG, xB, yB;
+ xR = col + rgbOffsets[batchCount].r.x;
+ yR = row + rgbOffsets[batchCount].r.y;
+ xG = col + rgbOffsets[batchCount].g.x;
+ yG = row + rgbOffsets[batchCount].g.y;
+ xB = col + rgbOffsets[batchCount].b.x;
+ yB = row + rgbOffsets[batchCount].b.y;
+
+ if (xR >= roi.xywhROI.roiWidth || xR < roi.xywhROI.xy.x || yR >= roi.xywhROI.roiHeight || yR < roi.xywhROI.xy.y)
+ {
+ xR = col;
+ yR = row;
+ }
+
+ if (xG >= roi.xywhROI.roiWidth || xG < roi.xywhROI.xy.x || yG >= roi.xywhROI.roiHeight || yG < roi.xywhROI.xy.y)
+ {
+ xG = col;
+ yG = row;
+ }
+
+ if (xB >= roi.xywhROI.roiWidth || xB < roi.xywhROI.xy.x || yB >= roi.xywhROI.roiHeight || yB < roi.xywhROI.xy.y)
+ {
+ xB = col;
+ yB = row;
+ }
+
+ locArray[0] = yR * srcDescPtr->strides.hStride + xR * channelValue;
+ locArray[1] = yG * srcDescPtr->strides.hStride + xG * channelValue;
+ locArray[2] = yB * srcDescPtr->strides.hStride + xB * channelValue;
+}
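+
+// Worked example: for row = 8, col = 20 and per-image offsets r = (+4, -2), g = (0, 0),
+// b = (-4, +2), the red channel samples (24, 6), green (20, 8) and blue (16, 10); any
+// channel whose shifted point leaves the ROI falls back to the unshifted (col, row), so
+// the glitch never reads outside the image.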
+
+RppStatus glitch_u8_u8_host_tensor(Rpp8u *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp8u *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptChannelOffsets *rgbOffsets,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for (int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32s glitchSrcLocArray[3] = {0}; // three source locations are computed per iteration, one for each of the R, G and B channels
+
+ Rpp8u *srcPtrImage, *dstPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+ Rpp8u *srcPtrChannel, *dstPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp8u *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ Rpp32u alignedLength = ((roi.xywhROI.roiWidth / 8) * 8) - 8;
+ for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp8u* dstRowPtrTempR = dstPtrRow;
+ Rpp8u* dstRowPtrTempG = dstPtrRow + dstDescPtr->strides.cStride;
+ Rpp8u* dstRowPtrTempB = dstPtrRow + 2 * dstDescPtr->strides.cStride;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += 8)
+ {
+ __m256 p[3];
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3);
+ rpp_simd_load(rpp_glitch_load24_u8pkd3_to_f32pln3_avx, srcPtrChannel, p, glitchSrcLocArray);
+ rpp_simd_store(rpp_store24_f32pln3_to_u8pln3_avx, dstRowPtrTempR, dstRowPtrTempG, dstRowPtrTempB, p); // simd stores
+
+ dstRowPtrTempR += 8;
+ dstRowPtrTempG += 8;
+ dstRowPtrTempB += 8;
+ }
+#endif
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3);
+ *dstRowPtrTempR++ = *(srcPtrChannel + glitchSrcLocArray[0] + 0);
+ *dstRowPtrTempG++ = *(srcPtrChannel + glitchSrcLocArray[1] + 1);
+ *dstRowPtrTempB++ = *(srcPtrChannel + glitchSrcLocArray[2] + 2);
+ }
+
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp8u *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ Rpp32u vectorIncrement = 16;
+ Rpp32u alignedLength = ((roi.xywhROI.roiWidth / 16) * 16) - 16;
+ for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp8u* dstPtrTemp = dstPtrRow;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += 16)
+ {
+ __m256 p[6];
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1);
+ Rpp32u rLoc = glitchSrcLocArray[0];
+ Rpp32u gLoc = srcDescPtr->strides.cStride + glitchSrcLocArray[1];
+ Rpp32u bLoc = 2 * srcDescPtr->strides.cStride + glitchSrcLocArray[2];
+ rpp_simd_load(rpp_load48_u8pln3_to_f32pln3_avx, srcPtrChannel + rLoc, srcPtrChannel + gLoc, srcPtrChannel + bLoc, p);
+ rpp_simd_store(rpp_store48_f32pln3_to_u8pkd3_avx, dstPtrTemp, p); // simd stores
+ dstPtrTemp += 48;
+ }
+#endif
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1);
+ for (int c = 0; c < 3; c++)
+ *(dstPtrTemp + c) = *(srcPtrChannel + glitchSrcLocArray[c] + c * srcDescPtr->strides.cStride);
+ dstPtrTemp += 3;
+ }
+
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW ))
+ {
+ Rpp8u *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ Rpp32u vectorIncrement = 32;
+ Rpp32u alignedLength = ((roi.xywhROI.roiWidth / 32) * 32) - 32;
+ for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp8u* dstPtrTemp = dstPtrRow;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1);
+ for (int c = 0; c < 3; c++)
+ {
+ __m256i p;
+ p = _mm256_loadu_si256((__m256i *)(srcPtrChannel + (glitchSrcLocArray[c] + (c * srcDescPtr->strides.cStride))));
+ _mm256_storeu_si256((__m256i *)(dstPtrTemp + (c * dstDescPtr->strides.cStride)), p);
+ }
+ dstPtrTemp += 32;
+ }
+#endif
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1);
+ for (int c = 0; c < 3; c++)
+ *(dstPtrTemp + c * dstDescPtr->strides.cStride) = *(srcPtrChannel + glitchSrcLocArray[c] + c * srcDescPtr->strides.cStride);
+ dstPtrTemp += 1;
+ }
+
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+
+ }
+ else if((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp8u *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ Rpp32u alignedLength = ((roi.xywhROI.roiWidth / 10) * 10) - 10;
+ Rpp32s vectorIncrement = 10;
+ Rpp32s vectorIncrementPkd = 30;
+ for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp8u* dstPtrTemp = dstPtrRow;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += 10)
+ {
+ __m256i p;
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3);
+ rpp_simd_load(rpp_glitch_load30_u8pkd3_to_u8pkd3_avx, srcPtrChannel, glitchSrcLocArray, p);
+ _mm256_storeu_si256((__m256i *)(dstPtrTemp), p);
+ dstPtrTemp += 30;
+ }
+#endif
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3);
+ for (int c = 0; c < 3; c++)
+ *dstPtrTemp++ = *(srcPtrChannel + glitchSrcLocArray[c] + c);
+ }
+
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+
+ }
+ }
+ return RPP_SUCCESS;
+}
+
+RppStatus glitch_f32_f32_host_tensor(Rpp32f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptChannelOffsets *rgbOffsets,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for (int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32s glitchSrcLocArray[3] = {0}; // three source locations are computed per iteration, one for each of the R, G and B channels
+
+ Rpp32f *srcPtrImage, *dstPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+ Rpp32f *srcPtrChannel, *dstPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32f *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ Rpp32u alignedLength = ((roi.xywhROI.roiWidth / 8) * 8) - 8;
+ for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp32f* dstRowPtrTempR = dstPtrRow;
+ Rpp32f* dstRowPtrTempG = dstPtrRow + dstDescPtr->strides.cStride;
+ Rpp32f* dstRowPtrTempB = dstPtrRow + 2 * dstDescPtr->strides.cStride;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += 8)
+ {
+ __m256 p[3];
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3);
+ rpp_simd_load(rpp_glitch_load24_f32pkd3_to_f32pln3_avx, srcPtrChannel, p, glitchSrcLocArray);
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstRowPtrTempR, dstRowPtrTempG, dstRowPtrTempB, p); // simd stores
+
+ dstRowPtrTempR += 8;
+ dstRowPtrTempG += 8;
+ dstRowPtrTempB += 8;
+ }
+#endif
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3);
+ *dstRowPtrTempR++ = *(srcPtrChannel + glitchSrcLocArray[0] + 0);
+ *dstRowPtrTempG++ = *(srcPtrChannel + glitchSrcLocArray[1] + 1);
+ *dstRowPtrTempB++ = *(srcPtrChannel + glitchSrcLocArray[2] + 2);
+ }
+
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32f *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ Rpp32u vectorIncrement = 8;
+ Rpp32u alignedLength = ((roi.xywhROI.roiWidth / 8) * 8) - 8;
+
+ for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp32f* dstPtrTemp = dstPtrRow;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += 8)
+ {
+ __m256 p[3];
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1);
+ p[0] = _mm256_loadu_ps(srcPtrChannel + glitchSrcLocArray[0]);
+ p[1] = _mm256_loadu_ps(srcPtrChannel + srcDescPtr->strides.cStride + glitchSrcLocArray[1]);
+ p[2] = _mm256_loadu_ps(srcPtrChannel + 2 * srcDescPtr->strides.cStride + glitchSrcLocArray[2]);
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, p); // simd stores
+ dstPtrTemp += 24;
+ }
+#endif
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1);
+ for (int c = 0; c < 3; c++)
+ *(dstPtrTemp + c) = *(srcPtrChannel + glitchSrcLocArray[c] + c * srcDescPtr->strides.cStride);
+ dstPtrTemp += 3;
+ }
+
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW ))
+ {
+ Rpp32f *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ Rpp32u vectorIncrement = 8;
+ Rpp32u alignedLength = ((roi.xywhROI.roiWidth / 8) * 8) - 8;
+
+ for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp32f* dstPtrTemp = dstPtrRow;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1);
+ for (int c = 0; c < 3; c++)
+ {
+ __m256 p;
+ p = _mm256_loadu_ps(srcPtrChannel + (glitchSrcLocArray[c] + c * srcDescPtr->strides.cStride));
+ _mm256_storeu_ps((dstPtrTemp + c * dstDescPtr->strides.cStride), p);
+ }
+ dstPtrTemp += 8;
+ }
+#endif
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1);
+ for (int c = 0; c < 3; c++)
+ *(dstPtrTemp + c * dstDescPtr->strides.cStride) = *(srcPtrChannel + glitchSrcLocArray[c] + c * srcDescPtr->strides.cStride);
+ dstPtrTemp += 1;
+ }
+
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+
+ }
+ else if((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32f *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ Rpp32u alignedLength = ((roi.xywhROI.roiWidth / 2) * 2) - 2;
+ Rpp32s vectorIncrement = 2;
+ Rpp32s vectorIncrementPkd = 6;
+ for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp32f* dstPtrTemp = dstPtrRow;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += 2)
+ {
+ __m256 p;
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3);
+ rpp_simd_load(rpp_glitch_load6_f32pkd3_to_f32pkd3_avx, srcPtrChannel, glitchSrcLocArray, p);
+ _mm256_storeu_ps(dstPtrTemp, p);
+ dstPtrTemp += 6;
+ }
+#endif
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3);
+ for (int c = 0; c < 3; c++)
+ *dstPtrTemp++ = *(srcPtrChannel + glitchSrcLocArray[c] + c);
+ }
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+
+ }
+ }
+ return RPP_SUCCESS;
+}
+
+RppStatus glitch_f16_f16_host_tensor(Rpp16f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp16f *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptChannelOffsets *rgbOffsets,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for (int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32s glitchSrcLocArray[3] = {0}; // three source locations are computed per iteration, one for each of the R, G and B channels
+
+ Rpp16f *srcPtrImage, *dstPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+ Rpp16f *srcPtrChannel, *dstPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp16f *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp16f* dstRowPtrTempR = dstPtrRow;
+ Rpp16f* dstRowPtrTempG = dstPtrRow + dstDescPtr->strides.cStride;
+ Rpp16f* dstRowPtrTempB = dstPtrRow + 2 * dstDescPtr->strides.cStride;
+ for (int vectorLoopCount = 0; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3);
+ *dstRowPtrTempR++ = *(srcPtrChannel + glitchSrcLocArray[0] + 0);
+ *dstRowPtrTempG++ = *(srcPtrChannel + glitchSrcLocArray[1] + 1);
+ *dstRowPtrTempB++ = *(srcPtrChannel + glitchSrcLocArray[2] + 2);
+ }
+
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp16f *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+
+ for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp16f* dstPtrTemp = dstPtrRow;
+ for (int vectorLoopCount = 0; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1);
+ for (int c = 0; c < 3; c++)
+ *(dstPtrTemp + c) = *(srcPtrChannel + glitchSrcLocArray[c] + c * srcDescPtr->strides.cStride);
+ dstPtrTemp += 3;
+ }
+
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW ))
+ {
+ Rpp16f *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+
+ for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp16f* dstPtrTemp = dstPtrRow;
+ for (int i = 0; i < roi.xywhROI.roiWidth; i++)
+ {
+ compute_src_loc(dstLocRow, i, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1);
+ for (int c = 0; c < 3; c++)
+ *(dstPtrTemp + c * dstDescPtr->strides.cStride) = *(srcPtrChannel + glitchSrcLocArray[c] + c * srcDescPtr->strides.cStride);
+ dstPtrTemp += 1;
+ }
+
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+
+ }
+ else if((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp16f *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp16f* dstPtrTemp = dstPtrRow;
+ for (int i = 0; i < roi.xywhROI.roiWidth; i++)
+ {
+ compute_src_loc(dstLocRow, i, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3);
+ for (int c = 0; c < 3; c++)
+ *dstPtrTemp++ = *(srcPtrChannel + glitchSrcLocArray[c] + c);
+ }
+
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+
+ }
+ }
+ return RPP_SUCCESS;
+}
+
+RppStatus glitch_i8_i8_host_tensor(Rpp8s *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp8s *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptChannelOffsets *rgbOffsets,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for (int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32s glitchSrcLocArray[3] = {0}; // three source locations are computed per iteration, one for each of the R, G and B channels
+
+ Rpp8s *srcPtrImage, *dstPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+ Rpp8s *srcPtrChannel, *dstPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp8s *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ Rpp32u alignedLength = ((roi.xywhROI.roiWidth / 8) * 8) - 8;
+ for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp8s* dstRowPtrTempR = dstPtrRow;
+ Rpp8s* dstRowPtrTempG = dstPtrRow + dstDescPtr->strides.cStride;
+ Rpp8s* dstRowPtrTempB = dstPtrRow + 2 * dstDescPtr->strides.cStride;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += 8)
+ {
+ __m256 p[3];
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3);
+ rpp_simd_load(rpp_glitch_load24_i8pkd3_to_f32pln3_avx, srcPtrChannel, p, glitchSrcLocArray);
+ rpp_simd_store(rpp_store24_f32pln3_to_i8pln3_avx, dstRowPtrTempR, dstRowPtrTempG, dstRowPtrTempB, p); // simd stores
+
+ dstRowPtrTempR += 8;
+ dstRowPtrTempG += 8;
+ dstRowPtrTempB += 8;
+ }
+#endif
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3);
+ *dstRowPtrTempR++ = *(srcPtrChannel + glitchSrcLocArray[0] + 0);
+ *dstRowPtrTempG++ = *(srcPtrChannel + glitchSrcLocArray[1] + 1);
+ *dstRowPtrTempB++ = *(srcPtrChannel + glitchSrcLocArray[2] + 2);
+ }
+
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp8s *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ Rpp32u vectorIncrement = 16;
+ Rpp32u alignedLength = ((roi.xywhROI.roiWidth / 16) * 16) - 16;
+ for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp8s* dstPtrTemp = dstPtrRow;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += 16)
+ {
+ __m256 p[6];
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1);
+ Rpp32u rLoc = glitchSrcLocArray[0];
+ Rpp32u gLoc = srcDescPtr->strides.cStride + glitchSrcLocArray[1];
+ Rpp32u bLoc = 2 * srcDescPtr->strides.cStride + glitchSrcLocArray[2];
+ rpp_simd_load(rpp_load48_i8pln3_to_f32pln3_avx, srcPtrChannel + rLoc, srcPtrChannel + gLoc, srcPtrChannel + bLoc, p);
+ rpp_simd_store(rpp_store48_f32pln3_to_i8pkd3_avx, dstPtrTemp, p); // simd stores
+ dstPtrTemp += 48;
+ }
+#endif
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1);
+ for (int c = 0; c < 3; c++)
+ *(dstPtrTemp + c) = *(srcPtrChannel + glitchSrcLocArray[c] + c * srcDescPtr->strides.cStride);
+ dstPtrTemp += 3;
+ }
+
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW ))
+ {
+ Rpp8s *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ Rpp32u vectorIncrement = 32;
+ Rpp32u alignedLength = ((roi.xywhROI.roiWidth / 32) * 32) - 32;
+
+ for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp8s* dstPtrTemp = dstPtrRow;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1);
+ for (int c = 0; c < 3; c++)
+ {
+ __m256i p;
+ p = _mm256_loadu_si256((__m256i *)(srcPtrChannel + (glitchSrcLocArray[c] + (c * srcDescPtr->strides.cStride))));
+ _mm256_storeu_si256((__m256i *)(dstPtrTemp + (c * dstDescPtr->strides.cStride)), p);
+ }
+ dstPtrTemp += 32;
+ }
+#endif
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1);
+ for (int c = 0; c < 3; c++)
+ *(dstPtrTemp + c * dstDescPtr->strides.cStride) = *(srcPtrChannel + glitchSrcLocArray[c] + c * srcDescPtr->strides.cStride);
+ dstPtrTemp += 1;
+ }
+
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+
+ }
+ else if((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp8s *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ Rpp32u alignedLength = ((roi.xywhROI.roiWidth / 10) * 10) - 10;
+ Rpp32s vectorIncrement = 10;
+ Rpp32s vectorIncrementPkd = 30;
+ for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp8s* dstPtrTemp = dstPtrRow;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += 10)
+ {
+ __m256i p;
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3);
+ rpp_simd_load(rpp_glitch_load30_i8pkd3_to_i8pkd3_avx, srcPtrChannel, glitchSrcLocArray, p);
+ _mm256_storeu_si256((__m256i *)(dstPtrTemp), p);
+ dstPtrTemp += 30;
+ }
+#endif
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3);
+ for (int c = 0; c < 3; c++)
+ *dstPtrTemp++ = *(srcPtrChannel + glitchSrcLocArray[c] + c);
+ }
+
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+
+ }
+ }
+ return RPP_SUCCESS;
+}
diff --git a/src/modules/cpu/kernel/jitter.hpp b/src/modules/cpu/kernel/jitter.hpp
new file mode 100644
index 000000000..ec717150a
--- /dev/null
+++ b/src/modules/cpu/kernel/jitter.hpp
@@ -0,0 +1,929 @@
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+
+RppStatus jitter_u8_u8_host_tensor(Rpp8u *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp8u *dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32u *kernelSizeTensor,
+ RpptXorwowStateBoxMuller *xorwowInitialStatePtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32u kernelSize = kernelSizeTensor[batchCount];
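+ // bound is the jitter kernel's half-width; heightLimit trims the row range so that a
+ // displacement of up to bound rows cannot sample below the ROI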
+ Rpp32u bound = (kernelSize - 1) / 2;
+ Rpp32u heightLimit = roi.xywhROI.roiHeight - bound;
+ Rpp32u offset = batchCount * srcDescPtr->strides.nStride;
+
+ Rpp8u *srcPtrImage, *dstPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp8u *srcPtrChannel, *dstPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+ Rpp32u alignedLength = roi.xywhROI.roiWidth & ~7; // round dst width down to a multiple of 8, since 8 dst pixels are processed per vector iteration
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+ RpptXorwowStateBoxMuller xorwowState;
+ Rpp32s srcLocArray[8] = {0};
+
+ __m256i pxXorwowStateX[5], pxXorwowStateCounter;
+ rpp_host_rng_xorwow_state_offsetted_avx(xorwowInitialStatePtr, xorwowState, offset, pxXorwowStateX, &pxXorwowStateCounter);
+ __m256 pKernelSize = _mm256_set1_ps(kernelSize);
+ __m256 pChannel = _mm256_set1_ps(layoutParams.bufferMultiplier);
+ __m256 pHStride = _mm256_set1_ps(srcDescPtr->strides.hStride);
+ __m256 pHeightLimit = _mm256_set1_ps(heightLimit);
+ __m256 pWidthLimit = _mm256_set1_ps(roi.xywhROI.roiWidth - 1);
+ __m256 pBound = _mm256_set1_ps(bound);
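+
+ // The broadcasts above feed compute_jitter_src_loc_avx, which is expected to draw a
+ // xorwow-based random displacement within the kernel window for each of 8 destination
+ // columns, clamp the jittered (row, col) against pHeightLimit/pWidthLimit, and emit
+ // flattened offsets (row * hStride + col * channels) into srcLocArray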
+
+ // Jitter with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp8u *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp8u *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i pxRow;
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+ rpp_resize_nn_extract_pkd3_avx(srcPtrChannel, srcLocArray, pxRow);
+ rpp_simd_store(rpp_store24_u8pkd3_to_u8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, pxRow);
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, srcDescPtr->c, loc);
+ *dstPtrTempR++ = *(srcPtrChannel + loc);
+ *dstPtrTempG++ = *(srcPtrChannel + 1 + loc);
+ *dstPtrTempB++ = *(srcPtrChannel + 2 + loc);
+ }
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Jitter with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp8u *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp8u *dstPtrTemp;
+ dstPtrTemp = dstPtrRow;
+
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i pxRow[3];
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+ rpp_resize_nn_extract_pln1_avx(srcPtrRowR, srcLocArray, pxRow[0]);
+ rpp_resize_nn_extract_pln1_avx(srcPtrRowG, srcLocArray, pxRow[1]);
+ rpp_resize_nn_extract_pln1_avx(srcPtrRowB, srcLocArray, pxRow[2]);
+ rpp_simd_store(rpp_store24_u8pln3_to_u8pkd3_avx, dstPtrTemp, pxRow);
+ dstPtrTemp += vectorIncrement;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ *dstPtrTemp++ = *(srcPtrRowR + loc);
+ *dstPtrTemp++ = *(srcPtrRowG + loc);
+ *dstPtrTemp++ = *(srcPtrRowB + loc);
+ }
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Jitter without fused output-layout toggle (NHWC -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp8u *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrChannel;
+ dstPtrRow = dstPtrChannel;
+
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp8u *dstPtrTemp;
+ dstPtrTemp = dstPtrRow;
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i pxRow;
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+ rpp_resize_nn_extract_pkd3_avx(srcPtrRow, srcLocArray, pxRow);
+ rpp_simd_store(rpp_store24_u8_to_u8_avx, dstPtrTemp, pxRow);
+ dstPtrTemp += vectorIncrement;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ *dstPtrTemp++ = *(srcPtrRow + loc);
+ *dstPtrTemp++ = *(srcPtrRow + 1 + loc);
+ *dstPtrTemp++ = *(srcPtrRow + 2 + loc);
+ }
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Jitter without fused output-layout toggle (NCHW -> NCHW)
+ else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp8u *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp8u *dstPtrTemp;
+ dstPtrTemp = dstPtrRow;
+
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ Rpp8u *dstPtrTempChn, *srcPtrTempChn;
+ srcPtrTempChn = srcPtrChannel;
+ dstPtrTempChn = dstPtrTemp;
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+ for(int c = 0; c < srcDescPtr->c; c++)
+ {
+ __m256i pxRow;
+ rpp_resize_nn_extract_pln1_avx(srcPtrTempChn, srcLocArray, pxRow);
+ rpp_storeu_si64((__m128i *)(dstPtrTempChn), _mm256_castsi256_si128(pxRow));
+ srcPtrTempChn += srcDescPtr->strides.cStride;
+ dstPtrTempChn += dstDescPtr->strides.cStride;
+ }
+ dstPtrTemp += vectorIncrementPerChannel;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ for (;vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp8u *dstPtrTempChn = dstPtrTemp;
+ Rpp8u *srcPtrTempChn = srcPtrChannel;
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ for(int c = 0; c < srcDescPtr->c; c++)
+ {
+ *dstPtrTempChn = *(srcPtrTempChn + loc);
+ srcPtrTempChn += srcDescPtr->strides.cStride;
+ dstPtrTempChn += dstDescPtr->strides.cStride;
+ }
+ dstPtrTemp++;
+ }
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+
+RppStatus jitter_f32_f32_host_tensor(Rpp32f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32u *kernelSizeTensor,
+ RpptXorwowStateBoxMuller *xorwowInitialStatePtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32u kernelSize = kernelSizeTensor[batchCount];
+ Rpp32u bound = (kernelSize - 1) / 2;
+ Rpp32u heightLimit = roi.xywhROI.roiHeight - bound;
+ Rpp32u offset = batchCount * srcDescPtr->strides.nStride;
+
+ Rpp32f *srcPtrImage, *dstPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32f *srcPtrChannel, *dstPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+ Rpp32u alignedLength = roi.xywhROI.roiWidth & ~7; // round dst width down to a multiple of 8, since 8 dst pixels are processed per vector iteration
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+ RpptXorwowStateBoxMuller xorwowState;
+ Rpp32s srcLocArray[8] = {0};
+
+ __m256i pxXorwowStateX[5], pxXorwowStateCounter;
+ rpp_host_rng_xorwow_state_offsetted_avx(xorwowInitialStatePtr, xorwowState, offset, pxXorwowStateX, &pxXorwowStateCounter);
+ __m256 pKernelSize = _mm256_set1_ps(kernelSize);
+ __m256 pChannel = _mm256_set1_ps(layoutParams.bufferMultiplier);
+ __m256 pHStride = _mm256_set1_ps(srcDescPtr->strides.hStride);
+ __m256 pHeightLimit = _mm256_set1_ps(heightLimit);
+ __m256 pWidthLimit = _mm256_set1_ps(roi.xywhROI.roiWidth - 1);
+ __m256 pBound = _mm256_set1_ps(bound);
+
+ // Jitter with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32f *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp32f *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 pxRow[3];
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+ rpp_simd_load(rpp_resize_nn_load_f32pkd3_to_f32pln3_avx, srcPtrChannel, srcLocArray, pxRow);
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, pxRow);
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ *dstPtrTempR++ = *(srcPtrChannel + loc);
+ *dstPtrTempG++ = *(srcPtrChannel + 1 + loc);
+ *dstPtrTempB++ = *(srcPtrChannel + 2 + loc);
+ }
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Jitter with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32f *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp32f *dstPtrTemp;
+ dstPtrTemp = dstPtrRow;
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 pxRow[3];
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+ rpp_simd_load(rpp_resize_nn_load_f32pln1_avx, srcPtrRowR, srcLocArray, pxRow[0]);
+ rpp_simd_load(rpp_resize_nn_load_f32pln1_avx, srcPtrRowG, srcLocArray, pxRow[1]);
+ rpp_simd_load(rpp_resize_nn_load_f32pln1_avx, srcPtrRowB, srcLocArray, pxRow[2]);
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, pxRow);
+ dstPtrTemp += vectorIncrement;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ *dstPtrTemp++ = *(srcPtrRowR + loc);
+ *dstPtrTemp++ = *(srcPtrRowG + loc);
+ *dstPtrTemp++ = *(srcPtrRowB + loc);
+ }
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Jitter without fused output-layout toggle (NHWC -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32f *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrChannel;
+ dstPtrRow = dstPtrChannel;
+
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp32f *dstPtrTemp;
+ dstPtrTemp = dstPtrRow;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32s loc;
+ __m256 pRow;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, (srcPtrChannel + loc), &pRow);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp, &pRow);
+ dstPtrTemp += 3;
+                }
+#endif
+                for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+                {
+                    Rpp32s loc;
+                    compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+                    *dstPtrTemp++ = *(srcPtrChannel + loc);
+                    *dstPtrTemp++ = *(srcPtrChannel + 1 + loc);
+                    *dstPtrTemp++ = *(srcPtrChannel + 2 + loc);
+                }
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+        // Jitter without fused output-layout toggle (NCHW -> NCHW)
+ else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32f *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp32f *dstPtrTemp;
+ dstPtrTemp = dstPtrRow;
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ Rpp32f *srcPtrTempChn, *dstPtrTempChn;
+ srcPtrTempChn = srcPtrChannel;
+ dstPtrTempChn = dstPtrTemp;
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+
+ for (int c = 0; c < dstDescPtr->c; c++)
+ {
+ __m256 pxRow;
+ rpp_simd_load(rpp_resize_nn_load_f32pln1_avx, srcPtrTempChn, srcLocArray, pxRow);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTempChn, &pxRow);
+ srcPtrTempChn += srcDescPtr->strides.cStride;
+ dstPtrTempChn += dstDescPtr->strides.cStride;
+ }
+ dstPtrTemp += vectorIncrementPerChannel;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ for (;vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32f *dstPtrTempChn = dstPtrTemp;
+ Rpp32f *srcPtrTempChn = srcPtrChannel;
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ for(int c = 0; c < srcDescPtr->c; c++)
+ {
+ *dstPtrTempChn = (Rpp32f)*(srcPtrTempChn + loc);
+ srcPtrTempChn += srcDescPtr->strides.cStride;
+ dstPtrTempChn += dstDescPtr->strides.cStride;
+ }
+ dstPtrTemp++;
+ }
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
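+
+// Editor's note: a minimal scalar sketch (not the RPP API) of the jitter addressing idea
+// used by the kernels in this file. Each destination pixel picks a random source pixel
+// inside a kernelSize x kernelSize window anchored 'bound' pixels up and left of it, with
+// the row clamped to heightLimit and the column to the ROI width limit, mirroring what
+// compute_jitter_src_loc() does with xorwow-generated randoms. rand01a/rand01b stand in
+// for the generator, the exact clamping may differ from RPP's internals, and std::min /
+// std::max come from <algorithm>.
+static inline int jitter_src_loc_sketch(int row, int col, int kernelSize, int bound,
+                                        int heightLimit, int widthLimit, int hStride,
+                                        int bufferMultiplier, float rand01a, float rand01b)
+{
+    int srcRow = std::min(static_cast<int>(row + (rand01a * kernelSize)) - bound, heightLimit);
+    int srcCol = std::min(static_cast<int>(col + (rand01b * kernelSize)) - bound, widthLimit);
+    srcRow = std::max(srcRow, 0);
+    srcCol = std::max(srcCol, 0);
+    return (srcRow * hStride) + (srcCol * bufferMultiplier);    // flattened source offset
+}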
+
+RppStatus jitter_f16_f16_host_tensor(Rpp16f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp16f *dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32u *kernelSizeTensor,
+ RpptXorwowStateBoxMuller *xorwowInitialStatePtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32u kernelSize = kernelSizeTensor[batchCount];
+ Rpp32u bound = (kernelSize - 1) / 2;
+ Rpp32u heightLimit = roi.xywhROI.roiHeight - bound;
+ Rpp32u offset = batchCount * srcDescPtr->strides.nStride;
+
+ Rpp16f *srcPtrImage, *dstPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp16f *srcPtrChannel, *dstPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+        Rpp32u alignedLength = roi.xywhROI.roiWidth & ~7; // align dst width to 8 to process 8 dst pixels per AVX2 iteration
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+ RpptXorwowStateBoxMuller xorwowState;
+ Rpp32s srcLocArray[8] = {0};
+
+ __m256i pxXorwowStateX[5], pxXorwowStateCounter;
+ rpp_host_rng_xorwow_state_offsetted_avx(xorwowInitialStatePtr, xorwowState, offset, pxXorwowStateX, &pxXorwowStateCounter);
+ __m256 pKernelSize = _mm256_set1_ps(kernelSize);
+ __m256 pChannel = _mm256_set1_ps(layoutParams.bufferMultiplier);
+ __m256 pHStride = _mm256_set1_ps(srcDescPtr->strides.hStride);
+ __m256 pHeightLimit = _mm256_set1_ps(heightLimit);
+ __m256 pWidthLimit = _mm256_set1_ps(roi.xywhROI.roiWidth-1);
+ __m256 pBound = _mm256_set1_ps(bound);
+
+
+ // Jitter with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp16f *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp16f *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ Rpp32f dstPtrTempR_ps[8], dstPtrTempG_ps[8], dstPtrTempB_ps[8];
+ __m256 pxRow[3];
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+ rpp_simd_load(rpp_resize_nn_load_f16pkd3_to_f32pln3_avx, srcPtrChannel, srcLocArray, pxRow);
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR_ps, dstPtrTempG_ps, dstPtrTempB_ps, pxRow);
+ for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++)
+ {
+ dstPtrTempR[cnt] = (Rpp16f) dstPtrTempR_ps[cnt];
+ dstPtrTempG[cnt] = (Rpp16f) dstPtrTempG_ps[cnt];
+ dstPtrTempB[cnt] = (Rpp16f) dstPtrTempB_ps[cnt];
+ }
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ *dstPtrTempR++ = *(srcPtrChannel + loc);
+ *dstPtrTempG++ = *(srcPtrChannel + 1 + loc);
+ *dstPtrTempB++ = *(srcPtrChannel + 2 + loc);
+ }
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Jitter with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp16f *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp16f *dstPtrTemp;
+ dstPtrTemp = dstPtrRow;
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ Rpp32f dstPtrTemp_ps[25];
+ __m256 pxRow[4];
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+ rpp_simd_load(rpp_resize_nn_load_f16pln1_avx, srcPtrRowR, srcLocArray, pxRow[0]);
+ rpp_simd_load(rpp_resize_nn_load_f16pln1_avx, srcPtrRowG, srcLocArray, pxRow[1]);
+ rpp_simd_load(rpp_resize_nn_load_f16pln1_avx, srcPtrRowB, srcLocArray, pxRow[2]);
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp_ps, pxRow);
+ for(int cnt = 0; cnt < vectorIncrement; cnt++)
+ dstPtrTemp[cnt] = (Rpp16f) dstPtrTemp_ps[cnt];
+ dstPtrTemp += vectorIncrement;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ *dstPtrTemp++ = *(srcPtrRowR + loc);
+ *dstPtrTemp++ = *(srcPtrRowG + loc);
+ *dstPtrTemp++ = *(srcPtrRowB + loc);
+ }
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Jitter without fused output-layout toggle (NHWC -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp16f *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrChannel;
+ dstPtrRow = dstPtrChannel;
+
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp16f *dstPtrTemp;
+ dstPtrTemp = dstPtrRow;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32f srcPtrTemp_ps[8], dstPtrTemp_ps[8];
+ Rpp32s loc;
+ __m256 pRow;
+
+                    // compute the jittered source location first; loc must be set before it is used to read inputs
+                    compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+
+                    for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++)
+                        srcPtrTemp_ps[cnt] = (Rpp32f)srcPtrChannel[loc + cnt];    // widen f16 inputs to f32 for the SIMD copy
+
+                    rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp_ps, &pRow);
+                    rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp_ps, &pRow);
+
+ for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++)
+ {
+ dstPtrTemp[cnt] = (Rpp16f) dstPtrTemp_ps[cnt];
+ }
+ dstPtrTemp += 3;
+                }
+#endif
+                for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+                {
+                    Rpp32s loc;
+                    compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+                    *dstPtrTemp++ = *(srcPtrChannel + loc);
+                    *dstPtrTemp++ = *(srcPtrChannel + 1 + loc);
+                    *dstPtrTemp++ = *(srcPtrChannel + 2 + loc);
+                }
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+        // Jitter without fused output-layout toggle (NCHW -> NCHW)
+ else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp16f *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp16f *dstPtrTemp;
+ dstPtrTemp = dstPtrRow;
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ Rpp16f *srcPtrTempChn, *dstPtrTempChn;
+ srcPtrTempChn = srcPtrChannel;
+ dstPtrTempChn = dstPtrTemp;
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+
+ for (int c = 0; c < dstDescPtr->c; c++)
+ {
+ Rpp32f dstPtrTemp_ps[8];
+ __m256 pxRow;
+ rpp_simd_load(rpp_resize_nn_load_f16pln1_avx, srcPtrTempChn, srcLocArray, pxRow);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp_ps, &pxRow);
+ for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++)
+ {
+ dstPtrTempChn[cnt] = (Rpp16f) dstPtrTemp_ps[cnt];
+ }
+ srcPtrTempChn += srcDescPtr->strides.cStride;
+ dstPtrTempChn += dstDescPtr->strides.cStride;
+ }
+ dstPtrTemp += vectorIncrementPerChannel;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ for (;vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp16f *dstPtrTempChn = dstPtrTemp;
+ Rpp16f *srcPtrTempChn = srcPtrChannel;
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ for(int c = 0; c < srcDescPtr->c; c++)
+ {
+ *dstPtrTempChn = (Rpp16f)*(srcPtrTempChn + loc);
+ srcPtrTempChn += srcDescPtr->strides.cStride;
+ dstPtrTempChn += dstDescPtr->strides.cStride;
+ }
+ dstPtrTemp++;
+ }
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
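+
+// Editor's note: the F16 kernel above stages all data through Rpp32f scratch buffers
+// because the AVX2 math runs on packed single-precision floats. A minimal sketch of
+// that staging pattern, assuming an 8-wide vector and any f32 SIMD routine simd_op:
+//
+//     Rpp32f in_ps[8], out_ps[8];
+//     for (int i = 0; i < 8; i++) in_ps[i] = (Rpp32f)srcF16[i];    // widen f16 -> f32
+//     simd_op(in_ps, out_ps);                                      // vector math in f32
+//     for (int i = 0; i < 8; i++) dstF16[i] = (Rpp16f)out_ps[i];   // narrow back to f16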
+
+RppStatus jitter_i8_i8_host_tensor(Rpp8s *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp8s *dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32u *kernelSizeTensor,
+ RpptXorwowStateBoxMuller *xorwowInitialStatePtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32u kernelSize = kernelSizeTensor[batchCount];
+ Rpp32u bound = (kernelSize - 1) / 2;
+ Rpp32u heightLimit = roi.xywhROI.roiHeight - bound;
+ Rpp32u offset = batchCount * srcDescPtr->strides.nStride;
+
+ Rpp8s *srcPtrImage, *dstPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp8s *srcPtrChannel, *dstPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+        Rpp32u alignedLength = roi.xywhROI.roiWidth & ~7; // align dst width to 8 to process 8 dst pixels per AVX2 iteration
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+ RpptXorwowStateBoxMuller xorwowState;
+ Rpp32s srcLocArray[8] = {0};
+
+ __m256i pxXorwowStateX[5], pxXorwowStateCounter;
+ rpp_host_rng_xorwow_state_offsetted_avx(xorwowInitialStatePtr, xorwowState, offset, pxXorwowStateX, &pxXorwowStateCounter);
+ __m256 pKernelSize = _mm256_set1_ps(kernelSize);
+ __m256 pChannel = _mm256_set1_ps(layoutParams.bufferMultiplier);
+ __m256 pHStride = _mm256_set1_ps(srcDescPtr->strides.hStride);
+ __m256 pHeightLimit = _mm256_set1_ps(heightLimit);
+ __m256 pWidthLimit = _mm256_set1_ps(roi.xywhROI.roiWidth-1);
+ __m256 pBound = _mm256_set1_ps(bound);
+
+ // Jitter with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp8s *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp8s *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i pxRow;
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+ rpp_resize_nn_extract_pkd3_avx(srcPtrChannel, srcLocArray, pxRow);
+ rpp_simd_store(rpp_store24_i8pkd3_to_i8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, pxRow);
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ *dstPtrTempR++ = *(srcPtrChannel + loc);
+ *dstPtrTempG++ = *(srcPtrChannel + 1 + loc);
+ *dstPtrTempB++ = *(srcPtrChannel + 2 + loc);
+ }
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Jitter with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp8s *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp8s *dstPtrTemp;
+ dstPtrTemp = dstPtrRow;
+
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i pxRow[3];
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+ rpp_resize_nn_extract_pln1_avx(srcPtrRowR, srcLocArray, pxRow[0]);
+ rpp_resize_nn_extract_pln1_avx(srcPtrRowG, srcLocArray, pxRow[1]);
+ rpp_resize_nn_extract_pln1_avx(srcPtrRowB, srcLocArray, pxRow[2]);
+ rpp_simd_store(rpp_store24_i8pln3_to_i8pkd3_avx, dstPtrTemp, pxRow);
+ dstPtrTemp += vectorIncrement;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ *dstPtrTemp++ = *(srcPtrRowR + loc);
+ *dstPtrTemp++ = *(srcPtrRowG + loc);
+ *dstPtrTemp++ = *(srcPtrRowB + loc);
+ }
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Jitter without fused output-layout toggle (NHWC -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp8s *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrChannel;
+ dstPtrRow = dstPtrChannel;
+
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp8s *dstPtrTemp;
+ dstPtrTemp = dstPtrRow;
+
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i pxRow;
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+ rpp_resize_nn_extract_pkd3_avx(srcPtrRow, srcLocArray, pxRow);
+ rpp_simd_store(rpp_store24_i8_to_i8_avx, dstPtrTemp, pxRow);
+ dstPtrTemp += vectorIncrement;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ *dstPtrTemp++ = (Rpp8s)*(srcPtrRow + loc);
+ *dstPtrTemp++ = (Rpp8s)*(srcPtrRow + 1 + loc);
+ *dstPtrTemp++ = (Rpp8s)*(srcPtrRow + 2 + loc);
+ }
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+        // Jitter without fused output-layout toggle (NCHW -> NCHW)
+ else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp8s *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp8s *dstPtrTemp;
+ dstPtrTemp = dstPtrRow;
+
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ Rpp8s *dstPtrTempChn, *srcPtrTempChn;
+ srcPtrTempChn = srcPtrChannel;
+ dstPtrTempChn = dstPtrTemp;
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+ for(int c = 0; c < srcDescPtr->c; c++)
+ {
+ __m256i pxRow;
+ rpp_resize_nn_extract_pln1_avx(srcPtrTempChn, srcLocArray, pxRow);
+ rpp_storeu_si64((__m128i *)(dstPtrTempChn), _mm256_castsi256_si128(pxRow));
+ srcPtrTempChn += srcDescPtr->strides.cStride;
+ dstPtrTempChn += dstDescPtr->strides.cStride;
+ }
+ dstPtrTemp += vectorIncrementPerChannel;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ for (;vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp8s *dstPtrTempChn = dstPtrTemp;
+ Rpp8s *srcPtrTempChn = srcPtrChannel;
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ for(int c = 0; c < srcDescPtr->c; c++)
+ {
+ *dstPtrTempChn = (Rpp8s)*(srcPtrTempChn + loc);
+ srcPtrTempChn += srcDescPtr->strides.cStride;
+ dstPtrTempChn += dstDescPtr->strides.cStride;
+ }
+ dstPtrTemp++;
+ }
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
diff --git a/src/modules/cpu/kernel/lens_correction.hpp b/src/modules/cpu/kernel/lens_correction.hpp
new file mode 100644
index 000000000..1632568a5
--- /dev/null
+++ b/src/modules/cpu/kernel/lens_correction.hpp
@@ -0,0 +1,178 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+#include <algorithm>
+
+// Compute Inverse matrix (3x3)
+inline void get_inverse(float *mat, float *invMat)
+{
+ float det = mat[0] * (mat[4] * mat[8] - mat[7] * mat[5]) - mat[1] * (mat[3] * mat[8] - mat[5] * mat[6]) + mat[2] * (mat[3] * mat[7] - mat[4] * mat[6]);
+ if(det != 0)
+ {
+ float invDet = 1 / det;
+ invMat[0] = (mat[4] * mat[8] - mat[7] * mat[5]) * invDet;
+ invMat[1] = (mat[2] * mat[7] - mat[1] * mat[8]) * invDet;
+ invMat[2] = (mat[1] * mat[5] - mat[2] * mat[4]) * invDet;
+ invMat[3] = (mat[5] * mat[6] - mat[3] * mat[8]) * invDet;
+ invMat[4] = (mat[0] * mat[8] - mat[2] * mat[6]) * invDet;
+ invMat[5] = (mat[3] * mat[2] - mat[0] * mat[5]) * invDet;
+ invMat[6] = (mat[3] * mat[7] - mat[6] * mat[4]) * invDet;
+ invMat[7] = (mat[6] * mat[1] - mat[0] * mat[7]) * invDet;
+ invMat[8] = (mat[0] * mat[4] - mat[3] * mat[1]) * invDet;
+ }
+}
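+
+// Editor's note: get_inverse() above is the adjugate-over-determinant formula for a
+// general 3x3 matrix. For a pinhole camera matrix K = [[fx, 0, u0], [0, fy, v0], [0, 0, 1]]
+// the inverse has the closed form below, which makes a quick sanity check for the helper
+// (this function is illustrative, not part of the RPP API):
+inline void get_inverse_camera_matrix_sketch(float fx, float fy, float u0, float v0, float *invMat)
+{
+    invMat[0] = 1.0f / fx;    invMat[1] = 0.0f;         invMat[2] = -u0 / fx;
+    invMat[3] = 0.0f;         invMat[4] = 1.0f / fy;    invMat[5] = -v0 / fy;
+    invMat[6] = 0.0f;         invMat[7] = 0.0f;         invMat[8] = 1.0f;
+}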
+
+inline void compute_lens_correction_remap_tables_host_tensor(RpptDescPtr srcDescPtr,
+ Rpp32f *rowRemapTable,
+ Rpp32f *colRemapTable,
+ RpptDescPtr tableDescPtr,
+ Rpp32f *cameraMatrixTensor,
+ Rpp32f *distortionCoeffsTensor,
+ RpptROIPtr roiTensorPtrSrc,
+ rpp::Handle& handle)
+{
+ Rpp32u numThreads = handle.GetNumThreads();
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ Rpp32f *rowRemapTableTemp, *colRemapTableTemp;
+ rowRemapTableTemp = rowRemapTable + batchCount * tableDescPtr->strides.nStride;
+ colRemapTableTemp = colRemapTable + batchCount * tableDescPtr->strides.nStride;
+
+        // cameraMatrix is a 3x3 matrix, so advance by 9 elements to move from one batch entry to the next
+ Rpp32f *cameraMatrix = cameraMatrixTensor + batchCount * 9;
+ Rpp32f *distortionCoeffs = distortionCoeffsTensor + batchCount * 8;
+ Rpp32s height = roiTensorPtrSrc[batchCount].xywhROI.roiHeight;
+ Rpp32s width = roiTensorPtrSrc[batchCount].xywhROI.roiWidth;
+ Rpp32u alignedLength = width & ~7;
+ Rpp32s vectorIncrement = 8;
+
+ Rpp32f invCameraMatrix[9];
+ std::fill(invCameraMatrix, invCameraMatrix + 9, 0.0f); // initialize all values in invCameraMatrix to zero
+ get_inverse(cameraMatrix, invCameraMatrix);
+ Rpp32f *invMat = &invCameraMatrix[0];
+
+ // Get radial and tangential distortion coefficients
+ Rpp32f rCoeff[6] = { distortionCoeffs[0], distortionCoeffs[1], distortionCoeffs[4], distortionCoeffs[5], distortionCoeffs[6], distortionCoeffs[7] };
+ Rpp32f tCoeff[2] = { distortionCoeffs[2], distortionCoeffs[3] };
+
+ __m256 pRCoeff[6], pTCoeff[2];
+ pRCoeff[0] = _mm256_set1_ps(rCoeff[0]);
+ pRCoeff[1] = _mm256_set1_ps(rCoeff[1]);
+ pRCoeff[2] = _mm256_set1_ps(rCoeff[2]);
+ pRCoeff[3] = _mm256_set1_ps(rCoeff[3]);
+ pRCoeff[4] = _mm256_set1_ps(rCoeff[4]);
+ pRCoeff[5] = _mm256_set1_ps(rCoeff[5]);
+ pTCoeff[0] = _mm256_set1_ps(tCoeff[0]);
+ pTCoeff[1] = _mm256_set1_ps(tCoeff[1]);
+
+ Rpp32f u0 = cameraMatrix[2], v0 = cameraMatrix[5];
+ Rpp32f fx = cameraMatrix[0], fy = cameraMatrix[4];
+ __m256 pFx, pFy, pU0, pV0;
+ pFx = _mm256_set1_ps(fx);
+ pFy = _mm256_set1_ps(fy);
+ pU0 = _mm256_set1_ps(u0);
+ pV0 = _mm256_set1_ps(v0);
+
+ __m256 pInvMat0, pInvMat3, pInvMat6;
+ pInvMat0 = _mm256_set1_ps(invMat[0]);
+ pInvMat3 = _mm256_set1_ps(invMat[3]);
+ pInvMat6 = _mm256_set1_ps(invMat[6]);
+
+ __m256 pXCameraInit, pYCameraInit, pZCameraInit;
+ __m256 pXCameraIncrement, pYCameraIncrement, pZCameraIncrement;
+ pXCameraInit = _mm256_mul_ps(avx_pDstLocInit, pInvMat0);
+ pYCameraInit = _mm256_mul_ps(avx_pDstLocInit, pInvMat3);
+ pZCameraInit = _mm256_mul_ps(avx_pDstLocInit, pInvMat6);
+ pXCameraIncrement = _mm256_mul_ps(pInvMat0, avx_p8);
+ pYCameraIncrement = _mm256_mul_ps(pInvMat3, avx_p8);
+ pZCameraIncrement = _mm256_mul_ps(pInvMat6, avx_p8);
+ for(int i = 0; i < height; i++)
+ {
+ Rpp32f *rowRemapTableRow = rowRemapTableTemp + i * tableDescPtr->strides.hStride;
+ Rpp32f *colRemapTableRow = colRemapTableTemp + i * tableDescPtr->strides.hStride;
+ Rpp32f xCamera = i * invMat[1] + invMat[2];
+ Rpp32f yCamera = i * invMat[4] + invMat[5];
+ Rpp32f zCamera = i * invMat[7] + invMat[8];
+ __m256 pXCamera = _mm256_add_ps(_mm256_set1_ps(xCamera), pXCameraInit);
+ __m256 pYCamera = _mm256_add_ps(_mm256_set1_ps(yCamera), pYCameraInit);
+ __m256 pZCamera = _mm256_add_ps(_mm256_set1_ps(zCamera), pZCameraInit);
+ int vectorLoopCount = 0;
+ for(; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ // float z = 1./zCamera, x = xCamera*z, y = yCamera*z;
+ __m256 pZ = _mm256_div_ps(avx_p1, pZCamera);
+ __m256 pX = _mm256_mul_ps(pXCamera, pZ);
+ __m256 pY = _mm256_mul_ps(pYCamera, pZ);
+
+ // float xSquare = x*x, ySquare = y*y, r2 = xSquare + ySquare;
+ __m256 pXSquare = _mm256_mul_ps(pX, pX);
+ __m256 pYSquare = _mm256_mul_ps(pY, pY);
+ __m256 pR2 = _mm256_add_ps(pXSquare, pYSquare);
+
+ // float xyMul2 = 2*x*y;
+ __m256 p2xy = _mm256_mul_ps(avx_p2, _mm256_mul_ps(pX, pY));
+
+ // float kr = std::fmaf(std::fmaf(std::fmaf(rCoeff[2], r2, rCoeff[1]), r2, rCoeff[0]), r2, 1) / std::fmaf(std::fmaf(std::fmaf(rCoeff[5], r2, rCoeff[4]), r2, rCoeff[3]), r2, 1);
+ __m256 pNum = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(pRCoeff[2], pR2, pRCoeff[1]), pR2, pRCoeff[0]), pR2, avx_p1);
+ __m256 pDen = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(pRCoeff[5], pR2, pRCoeff[4]), pR2, pRCoeff[3]), pR2, avx_p1);
+ __m256 pKR = _mm256_div_ps(pNum, pDen);
+
+ // float colLoc = std::fmaf(fx, (std::fmaf(tCoeff[1], (std::fmaf(2, xSquare, r2)), std::fmaf(x, kr, (tCoeff[0] * xyMul2)))), u0);
+ __m256 pColLoc = _mm256_fmadd_ps(pFx, _mm256_fmadd_ps(pTCoeff[1], _mm256_fmadd_ps(avx_p2, pXSquare, pR2), _mm256_fmadd_ps(pX, pKR, _mm256_mul_ps(pTCoeff[0], p2xy))), pU0);
+
+ // float rowLoc = std::fmaf(fy, (std::fmaf(tCoeff[0], (std::fmaf(2, ySquare, r2)), std::fmaf(y, kr, (tCoeff[1] * xyMul2)))), v0);
+ __m256 pRowLoc = _mm256_fmadd_ps(pFy, _mm256_fmadd_ps(pTCoeff[0], _mm256_fmadd_ps(avx_p2, pYSquare, pR2), _mm256_fmadd_ps(pY, pKR, _mm256_mul_ps(pTCoeff[1], p2xy))), pV0);
+
+ _mm256_storeu_ps(rowRemapTableRow, pRowLoc);
+ _mm256_storeu_ps(colRemapTableRow, pColLoc);
+ rowRemapTableRow += vectorIncrement;
+ colRemapTableRow += vectorIncrement;
+
+ // xCamera += invMat[0], yCamera += invMat[3], zCamera += invMat[6]
+ pXCamera = _mm256_add_ps(pXCamera, pXCameraIncrement);
+ pYCamera = _mm256_add_ps(pYCamera, pYCameraIncrement);
+ pZCamera = _mm256_add_ps(pZCamera, pZCameraIncrement);
+ }
+ for(; vectorLoopCount < width; vectorLoopCount++)
+ {
+ Rpp32f z = 1./zCamera, x = xCamera * z, y = yCamera * z;
+ Rpp32f xSquare = x * x, ySquare = y * y, r2 = xSquare + ySquare;
+ Rpp32f xyMul2 = 2 * x * y;
+ Rpp32f kr = std::fmaf(std::fmaf(std::fmaf(rCoeff[2], r2, rCoeff[1]), r2, rCoeff[0]), r2, 1) / std::fmaf(std::fmaf(std::fmaf(rCoeff[5], r2, rCoeff[4]), r2, rCoeff[3]), r2, 1);
+ Rpp32f colLoc = std::fmaf(fx, (std::fmaf(tCoeff[1], (std::fmaf(2, xSquare, r2)), std::fmaf(x, kr, (tCoeff[0] * xyMul2)))), u0);
+ Rpp32f rowLoc = std::fmaf(fy, (std::fmaf(tCoeff[0], (std::fmaf(2, ySquare, r2)), std::fmaf(y, kr, (tCoeff[1] * xyMul2)))), v0);
+ *rowRemapTableRow++ = rowLoc;
+ *colRemapTableRow++ = colLoc;
+ xCamera += invMat[0];
+ yCamera += invMat[3];
+ zCamera += invMat[6];
+ }
+ }
+ }
+}
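+
+// Editor's note: the remap above follows the OpenCV-style rational distortion model.
+// With normalized camera coordinates (x, y), r^2 = x^2 + y^2, radial coefficients
+// (k1..k6) = rCoeff[0..5] and tangential coefficients (p1, p2) = tCoeff[0..1]:
+//
+//     kr     = (1 + k1*r^2 + k2*r^4 + k3*r^6) / (1 + k4*r^2 + k5*r^4 + k6*r^6)
+//     colLoc = fx * (x*kr + 2*p1*x*y + p2*(r^2 + 2*x^2)) + u0
+//     rowLoc = fy * (y*kr + p1*(r^2 + 2*y^2) + 2*p2*x*y) + v0
+//
+// which is exactly what the fmaf chains in both the AVX2 and scalar paths evaluate.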
\ No newline at end of file
diff --git a/src/modules/cpu/kernel/log.hpp b/src/modules/cpu/kernel/log.hpp
new file mode 100644
index 000000000..5ec79b21c
--- /dev/null
+++ b/src/modules/cpu/kernel/log.hpp
@@ -0,0 +1,563 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_common.hpp"
+
+// Single-pixel log helper functions
+// NOTE: log(0) is undefined, so nextafter() substitutes the smallest positive float for zero inputs.
+// Negative inputs are made positive by taking their absolute value before the log.
+inline void compute_log(Rpp8u *src, Rpp32f *dst) { *dst = (!*src) ? std::log(std::nextafter(0.0f, 1.0f)) : std::log(*src); }
+inline void compute_log(Rpp8s *src, Rpp32f *dst) { *dst = (!*src) ? std::log(std::nextafter(0.0f, 1.0f)) : std::log(*src + 128); }
+inline void compute_log(Rpp16f *src, Rpp16f *dst) { *dst = (!*src) ? log(std::nextafter(0.0f, 1.0f)) : log(abs(*src)); }
+inline void compute_log(Rpp32f *src, Rpp32f *dst) { *dst = (!*src) ? std::log(std::nextafter(0.0f, 1.0f)) : std::log(abs(*src)); }
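+
+// Editor's note: std::nextafter(0.0f, 1.0f) is the smallest positive denormal float
+// (2^-149, about 1.4e-45), so zero inputs map to log(2^-149), roughly -103.28.
+// A quick illustrative check:
+//
+//     Rpp8u zero = 0; Rpp32f out;
+//     compute_log(&zero, &out);    // out == std::log(std::nextafter(0.0f, 1.0f)) ~ -103.28f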
+
+// Computes ND log recursively
+template <typename T1, typename T2>
+void log_recursive(T1 *src, Rpp32u *srcStrides, T2 *dst, Rpp32u *dstStrides, Rpp32u *dstShape, Rpp32u nDim)
+{
+ if (!nDim)
+ compute_log(src, dst);
+ else
+ {
+ for (int i = 0; i < *dstShape; i++)
+ {
+ log_recursive(src, srcStrides + 1, dst, dstStrides + 1, dstShape + 1, nDim - 1);
+ dst += *dstStrides;
+ src += *srcStrides;
+ }
+ }
+}
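+
+// Editor's note: for a concrete picture of the recursion above, a call with
+// dstShape = {2, 3} and both stride arrays = {3, 1} unrolls to the equivalent
+// nested loops below (illustrative only; the recursion handles any rank):
+//
+//     for (int i = 0; i < 2; i++)         // dstShape[0]
+//         for (int j = 0; j < 3; j++)     // dstShape[1]
+//             compute_log(src + i * 3 + j, dst + i * 3 + j);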
+
+RppStatus log_generic_host_tensor(Rpp8u *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ Rpp32f *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u *roiTensor,
+ rpp::Handle& handle)
+{
+ Rpp32u numThreads = handle.GetNumThreads();
+ Rpp32u nDim = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension.
+ Rpp32u batchSize = dstGenericDescPtr->dims[0];
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < batchSize; batchCount++)
+ {
+ Rpp32u *roi = roiTensor + batchCount * nDim * 2;
+ Rpp32u *begin = roi;
+ Rpp32u *length = &roi[nDim];
+
+ Rpp8u *srcPtr1 = srcPtr + batchCount * srcGenericDescPtr->strides[0];
+ Rpp32f *dstPtr1 = dstPtr + batchCount * dstGenericDescPtr->strides[0];
+
+ for(int i = 0; i < nDim; i++)
+ srcPtr1 += begin[i] * srcGenericDescPtr->strides[i + 1];
+ Rpp32u alignedLength;
+ Rpp32u vectorIncrement = 16;
+ if (nDim == 1)
+ {
+ alignedLength = length[0] & ~15;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+
+ rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtr1, p); // simd loads
+ compute_log_16_host(p); // log compute
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtr1, p); // simd stores
+ srcPtr1 += vectorIncrement;
+ dstPtr1 += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < length[0]; vectorLoopCount++)
+ {
+ compute_log(srcPtr1, dstPtr1);
+ srcPtr1++;
+ dstPtr1++;
+ }
+ }
+ else if(nDim == 2)
+ {
+ alignedLength = length[1] & ~15;
+ for(int i = 0; i < length[0]; i++)
+ {
+ Rpp8u *srcPtrTemp = srcPtr1;
+ Rpp32f *dstPtrTemp = dstPtr1;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+
+ rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtrTemp, p); // simd loads
+ compute_log_16_host(p); // log compute
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < length[1]; vectorLoopCount++)
+ {
+ compute_log(srcPtrTemp, dstPtrTemp);
+ srcPtrTemp++;
+ dstPtrTemp++;
+ }
+ srcPtr1 += srcGenericDescPtr->strides[1];
+ dstPtr1 += dstGenericDescPtr->strides[1];
+ }
+ }
+ else if(nDim == 3)
+ {
+ alignedLength = length[2] & ~15;
+ for(int i = 0; i < length[0]; i++)
+ {
+ Rpp8u *srcPtrRow = srcPtr1;
+ Rpp32f *dstPtrRow = dstPtr1;
+
+ for(int j = 0; j < length[1]; j++)
+ {
+ Rpp8u *srcPtrTemp = srcPtrRow;
+ Rpp32f *dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+
+ rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtrTemp, p); // simd loads
+ compute_log_16_host(p); // log compute
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < length[2]; vectorLoopCount++)
+ {
+ compute_log(srcPtrTemp, dstPtrTemp);
+ srcPtrTemp++;
+ dstPtrTemp++;
+ }
+ srcPtrRow += srcGenericDescPtr->strides[2];
+ dstPtrRow += dstGenericDescPtr->strides[2];
+ }
+ srcPtr1 += srcGenericDescPtr->strides[1];
+ dstPtr1 += dstGenericDescPtr->strides[1];
+ }
+ }
+ else
+ log_recursive(srcPtr1, srcGenericDescPtr->strides, dstPtr1, dstGenericDescPtr->strides, length, nDim);
+ }
+
+ return RPP_SUCCESS;
+}
+
+RppStatus log_generic_host_tensor(Rpp8s *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ Rpp32f *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u *roiTensor,
+ rpp::Handle& handle)
+{
+ Rpp32u numThreads = handle.GetNumThreads();
+ Rpp32u nDim = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension.
+ Rpp32u batchSize = dstGenericDescPtr->dims[0];
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < batchSize; batchCount++)
+ {
+ Rpp32u *roi = roiTensor + batchCount * nDim * 2;
+ Rpp32u *begin = roi;
+ Rpp32u *length = &roi[nDim];
+
+ Rpp8s *srcPtr1 = srcPtr + batchCount * srcGenericDescPtr->strides[0];
+ Rpp32f *dstPtr1 = dstPtr + batchCount * dstGenericDescPtr->strides[0];
+
+ for(int i = 0; i < nDim; i++)
+ srcPtr1 += begin[i] * srcGenericDescPtr->strides[i + 1];
+ Rpp32u alignedLength;
+ Rpp32u vectorIncrement = 16;
+ if (nDim == 1)
+ {
+ alignedLength = length[0] & ~15;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+
+ rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtr1, p); // simd loads
+ compute_log_16_host(p); // log compute
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtr1, p); // simd stores
+ srcPtr1 += vectorIncrement;
+ dstPtr1 += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < length[0]; vectorLoopCount++)
+ {
+ compute_log(srcPtr1, dstPtr1);
+ srcPtr1++;
+ dstPtr1++;
+ }
+ }
+ else if(nDim == 2)
+ {
+ alignedLength = length[1] & ~15;
+ for(int i = 0; i < length[0]; i++)
+ {
+ Rpp8s *srcPtrTemp = srcPtr1;
+ Rpp32f *dstPtrTemp = dstPtr1;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+
+ rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtrTemp, p); // simd loads
+ compute_log_16_host(p); // log compute
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < length[1]; vectorLoopCount++)
+ {
+ compute_log(srcPtrTemp, dstPtrTemp);
+ srcPtrTemp++;
+ dstPtrTemp++;
+ }
+ srcPtr1 += srcGenericDescPtr->strides[1];
+ dstPtr1 += dstGenericDescPtr->strides[1];
+ }
+ }
+ else if(nDim == 3)
+ {
+ alignedLength = length[2] & ~15;
+ for(int i = 0; i < length[0]; i++)
+ {
+ Rpp8s *srcPtrRow = srcPtr1;
+ Rpp32f *dstPtrRow = dstPtr1;
+
+ for(int j = 0; j < length[1]; j++)
+ {
+ Rpp8s *srcPtrTemp = srcPtrRow;
+ Rpp32f *dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+
+ rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtrTemp, p); // simd loads
+ compute_log_16_host(p); // log compute
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < length[2]; vectorLoopCount++)
+ {
+ compute_log(srcPtrTemp, dstPtrTemp);
+ srcPtrTemp++;
+ dstPtrTemp++;
+ }
+ srcPtrRow += srcGenericDescPtr->strides[2];
+ dstPtrRow += dstGenericDescPtr->strides[2];
+ }
+ srcPtr1 += srcGenericDescPtr->strides[1];
+ dstPtr1 += dstGenericDescPtr->strides[1];
+ }
+ }
+ else
+ log_recursive(srcPtr1, srcGenericDescPtr->strides, dstPtr1, dstGenericDescPtr->strides, length, nDim);
+ }
+
+ return RPP_SUCCESS;
+}
+
+RppStatus log_generic_host_tensor(Rpp32f *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ Rpp32f *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u *roiTensor,
+ rpp::Handle& handle)
+{
+ Rpp32u numThreads = handle.GetNumThreads();
+ Rpp32u nDim = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension.
+ Rpp32u batchSize = dstGenericDescPtr->dims[0];
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < batchSize; batchCount++)
+ {
+ Rpp32u *roi = roiTensor + batchCount * nDim * 2;
+ Rpp32u *begin = roi;
+ Rpp32u *length = &roi[nDim];
+
+ Rpp32f *srcPtr1 = srcPtr + batchCount * srcGenericDescPtr->strides[0];
+ Rpp32f *dstPtr1 = dstPtr + batchCount * dstGenericDescPtr->strides[0];
+
+ for(int i = 0; i < nDim; i++)
+ srcPtr1 += begin[i] * srcGenericDescPtr->strides[i + 1];
+ Rpp32u alignedLength;
+ Rpp32u vectorIncrement = 16;
+ if (nDim == 1)
+ {
+ alignedLength = length[0] & ~15;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+
+ rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtr1, p); // simd loads
+ compute_log_16_host(p); // log compute
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtr1, p); // simd stores
+ srcPtr1 += vectorIncrement;
+ dstPtr1 += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < length[0]; vectorLoopCount++)
+ {
+ compute_log(srcPtr1, dstPtr1);
+ srcPtr1++;
+ dstPtr1++;
+ }
+ }
+ else if(nDim == 2)
+ {
+ alignedLength = length[1] & ~15;
+ for(int i = 0; i < length[0]; i++)
+ {
+ Rpp32f *srcPtrTemp = srcPtr1;
+ Rpp32f *dstPtrTemp = dstPtr1;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+
+ rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads
+ compute_log_16_host(p); // log compute
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < length[1]; vectorLoopCount++)
+ {
+ compute_log(srcPtrTemp, dstPtrTemp);
+ srcPtrTemp++;
+ dstPtrTemp++;
+ }
+ srcPtr1 += srcGenericDescPtr->strides[1];
+ dstPtr1 += dstGenericDescPtr->strides[1];
+ }
+ }
+ else if(nDim == 3)
+ {
+ alignedLength = length[2] & ~15;
+ for(int i = 0; i < length[0]; i++)
+ {
+ Rpp32f *srcPtrRow = srcPtr1;
+ Rpp32f *dstPtrRow = dstPtr1;
+
+ for(int j = 0; j < length[1]; j++)
+ {
+ Rpp32f *srcPtrTemp = srcPtrRow;
+ Rpp32f *dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+
+ rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads
+ compute_log_16_host(p); // log compute
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < length[2]; vectorLoopCount++)
+ {
+ compute_log(srcPtrTemp, dstPtrTemp);
+ srcPtrTemp++;
+ dstPtrTemp++;
+ }
+ srcPtrRow += srcGenericDescPtr->strides[2];
+ dstPtrRow += dstGenericDescPtr->strides[2];
+ }
+ srcPtr1 += srcGenericDescPtr->strides[1];
+ dstPtr1 += dstGenericDescPtr->strides[1];
+ }
+ }
+ else
+ log_recursive(srcPtr1, srcGenericDescPtr->strides, dstPtr1, dstGenericDescPtr->strides, length, nDim);
+ }
+
+ return RPP_SUCCESS;
+}
+
+RppStatus log_generic_host_tensor(Rpp16f *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ Rpp16f *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u *roiTensor,
+ rpp::Handle& handle)
+{
+ Rpp32u numThreads = handle.GetNumThreads();
+ Rpp32u nDim = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension.
+ Rpp32u batchSize = dstGenericDescPtr->dims[0];
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < batchSize; batchCount++)
+ {
+ Rpp32u *roi = roiTensor + batchCount * nDim * 2;
+ Rpp32u *begin = roi;
+ Rpp32u *length = &roi[nDim];
+
+ Rpp16f *srcPtr1 = srcPtr + batchCount * srcGenericDescPtr->strides[0];
+ Rpp16f *dstPtr1 = dstPtr + batchCount * dstGenericDescPtr->strides[0];
+
+ for(int i = 0; i < nDim; i++)
+ srcPtr1 += begin[i] * srcGenericDescPtr->strides[i + 1];
+ Rpp32u alignedLength;
+ Rpp32u vectorIncrement = 16;
+ if (nDim == 1)
+ {
+            alignedLength = length[0] & ~15;
+            int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ Rpp32f srcPtrTemp_ps[16];
+ for(int cnt = 0; cnt < vectorIncrement; cnt++)
+                    srcPtrTemp_ps[cnt] = static_cast<Rpp32f>(srcPtr1[cnt]);
+
+ __m256 p[2];
+ rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp_ps, p); // simd loads
+ compute_log_16_host(p); // log compute
+ rpp_simd_store(rpp_store16_f32_to_f16_avx, dstPtr1, p); // simd stores
+ srcPtr1 += vectorIncrement;
+ dstPtr1 += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < length[0]; vectorLoopCount++)
+ {
+ compute_log(srcPtr1, dstPtr1);
+ srcPtr1++;
+ dstPtr1++;
+ }
+ }
+ else if(nDim == 2)
+ {
+ alignedLength = length[1] & ~15;
+ for(int i = 0; i < length[0]; i++)
+ {
+ Rpp16f *srcPtrTemp = srcPtr1;
+ Rpp16f *dstPtrTemp = dstPtr1;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ Rpp32f srcPtrTemp_ps[16];
+ for(int cnt = 0; cnt < vectorIncrement; cnt++)
+                        srcPtrTemp_ps[cnt] = static_cast<Rpp32f>(srcPtrTemp[cnt]);
+
+ __m256 p[2];
+ rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp_ps, p); // simd loads
+ compute_log_16_host(p); // log compute
+ rpp_simd_store(rpp_store16_f32_to_f16_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < length[1]; vectorLoopCount++)
+ {
+ compute_log(srcPtrTemp, dstPtrTemp);
+ srcPtrTemp++;
+ dstPtrTemp++;
+ }
+ srcPtr1 += srcGenericDescPtr->strides[1];
+ dstPtr1 += dstGenericDescPtr->strides[1];
+ }
+ }
+ else if(nDim == 3)
+ {
+ alignedLength = length[2] & ~15;
+ for(int i = 0; i < length[0]; i++)
+ {
+ Rpp16f *srcPtrRow = srcPtr1;
+ Rpp16f *dstPtrRow = dstPtr1;
+
+ for(int j = 0; j < length[1]; j++)
+ {
+ Rpp16f *srcPtrTemp = srcPtrRow;
+ Rpp16f *dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ Rpp32f srcPtrTemp_ps[16];
+ for(int cnt = 0; cnt < vectorIncrement; cnt++)
+                            srcPtrTemp_ps[cnt] = static_cast<Rpp32f>(srcPtrTemp[cnt]);
+
+ __m256 p[2];
+ rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp_ps, p); // simd loads
+ compute_log_16_host(p); // log compute
+ rpp_simd_store(rpp_store16_f32_to_f16_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < length[2]; vectorLoopCount++)
+ {
+ compute_log(srcPtrTemp, dstPtrTemp);
+ srcPtrTemp++;
+ dstPtrTemp++;
+ }
+ srcPtrRow += srcGenericDescPtr->strides[2];
+ dstPtrRow += dstGenericDescPtr->strides[2];
+ }
+ srcPtr1 += srcGenericDescPtr->strides[1];
+ dstPtr1 += dstGenericDescPtr->strides[1];
+ }
+ }
+ else
+ log_recursive(srcPtr1, srcGenericDescPtr->strides, dstPtr1, dstGenericDescPtr->strides, length, nDim);
+ }
+
+ return RPP_SUCCESS;
+}
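+
+// Editor's note: each overload above reads its ROI as nDim begin values followed by
+// nDim length values per batch sample. A hedged usage sketch for one 3D sample
+// (all numbers illustrative):
+//
+//     Rpp32u roiTensor[6] = {0, 0, 0,          // begin[0..2]
+//                            16, 224, 224};    // length[0..2]
+//     // log_generic_host_tensor(srcPtr, srcDesc, dstPtr, dstDesc, roiTensor, handle);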
\ No newline at end of file
diff --git a/src/modules/cpu/kernel/normalize.hpp b/src/modules/cpu/kernel/normalize.hpp
index dbe746d1a..94a1fd9fa 100644
--- a/src/modules/cpu/kernel/normalize.hpp
+++ b/src/modules/cpu/kernel/normalize.hpp
@@ -26,21 +26,6 @@ SOFTWARE.
#include "rpp_cpu_simd.hpp"
#include "rpp_cpu_common.hpp"
-// Computes strides
-void compute_strides(Rpp32u *strides, Rpp32u *shape, Rpp32u tensorDim)
-{
- if (tensorDim > 0)
- {
- Rpp32u v = 1;
- for (Rpp32u i = tensorDim - 1; i > 0; i--)
- {
- strides[i] = v;
- v *= shape[i];
- }
- strides[0] = v;
- }
-}
-
// Recursive reduction helper function to compute and accumulate the squared differences of the input from the mean
template <typename T>
void compute_diff_square_sum(Rpp32f &output, T *input, Rpp32s inputStride, Rpp32s numElements, Rpp32f mean)
diff --git a/src/modules/cpu/kernel/transpose.hpp b/src/modules/cpu/kernel/transpose.hpp
new file mode 100644
index 000000000..233db1044
--- /dev/null
+++ b/src/modules/cpu/kernel/transpose.hpp
@@ -0,0 +1,434 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+using namespace std;
+
+inline void increment_ndim_ptr(Rpp32f **dstPtr, Rpp32u tensorDims, Rpp32u increment)
+{
+ for(int i = 0; i < tensorDims; i++)
+ dstPtr[i] += increment;
+}
+
+inline void rpp_store16_f32_f32_channelwise(Rpp32f **dstPtr, __m128 *p)
+{
+ _mm_storeu_ps(dstPtr[0], p[0]);
+ _mm_storeu_ps(dstPtr[1], p[1]);
+ _mm_storeu_ps(dstPtr[2], p[2]);
+ _mm_storeu_ps(dstPtr[3], p[3]);
+}
+
+inline void compute_2d_pln1_transpose(Rpp32f *srcPtrTemp, Rpp32f *dstPtrTemp, Rpp32u height, Rpp32u width, Rpp32u srcRowStride, Rpp32u dstRowStride)
+{
+ Rpp32u alignedRows = height & ~3;
+ Rpp32u alignedCols = width & ~7;
+ Rpp32u vectorIncrement = 8;
+ Rpp32u dstRowVectorStride = vectorIncrement * dstRowStride;
+
+ Rpp32s i = 0;
+ for(Rpp32s k = 0; i < alignedRows; i += 4, k++)
+ {
+ Rpp32f *srcPtrRow[4], *dstPtrRow[8];
+ for(int j = 0; j < 4; j++)
+ srcPtrRow[j] = srcPtrTemp + (i + j) * srcRowStride;
+ for(int j = 0; j < 8; j++)
+ dstPtrRow[j] = dstPtrTemp + j * dstRowStride + i;
+
+ Rpp32u vectorLoopCount = 0;
+#if __AVX2__
+ for(; vectorLoopCount < alignedCols; vectorLoopCount += vectorIncrement)
+ {
+ __m256 pSrc[4];
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow[0], &pSrc[0]);
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow[1], &pSrc[1]);
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow[2], &pSrc[2]);
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow[3], &pSrc[3]);
+
+ __m128 pDst[8];
+ compute_transpose4x8_avx(pSrc, pDst);
+ rpp_simd_store(rpp_store4_f32_to_f32, dstPtrRow[0], &pDst[0]);
+ rpp_simd_store(rpp_store4_f32_to_f32, dstPtrRow[1], &pDst[1]);
+ rpp_simd_store(rpp_store4_f32_to_f32, dstPtrRow[2], &pDst[2]);
+ rpp_simd_store(rpp_store4_f32_to_f32, dstPtrRow[3], &pDst[3]);
+ rpp_simd_store(rpp_store4_f32_to_f32, dstPtrRow[4], &pDst[4]);
+ rpp_simd_store(rpp_store4_f32_to_f32, dstPtrRow[5], &pDst[5]);
+ rpp_simd_store(rpp_store4_f32_to_f32, dstPtrRow[6], &pDst[6]);
+ rpp_simd_store(rpp_store4_f32_to_f32, dstPtrRow[7], &pDst[7]);
+
+ srcPtrRow[0] += vectorIncrement;
+ srcPtrRow[1] += vectorIncrement;
+ srcPtrRow[2] += vectorIncrement;
+ srcPtrRow[3] += vectorIncrement;
+ dstPtrRow[0] += dstRowVectorStride;
+ dstPtrRow[1] += dstRowVectorStride;
+ dstPtrRow[2] += dstRowVectorStride;
+ dstPtrRow[3] += dstRowVectorStride;
+ dstPtrRow[4] += dstRowVectorStride;
+ dstPtrRow[5] += dstRowVectorStride;
+ dstPtrRow[6] += dstRowVectorStride;
+ dstPtrRow[7] += dstRowVectorStride;
+ }
+#endif
+ }
+
+ // handle remaining columns
+ for(Rpp32s k = 0; k < alignedRows; k++)
+ {
+ Rpp32f *srcPtrRowTemp = srcPtrTemp + k * srcRowStride + alignedCols;
+ Rpp32f *dstPtrRowTemp = dstPtrTemp + alignedCols * dstRowStride + k;
+ for(Rpp32s j = alignedCols; j < width; j++)
+ {
+ *dstPtrRowTemp = *srcPtrRowTemp++;
+ dstPtrRowTemp += dstRowStride;
+ }
+ }
+
+ // handle remaining rows
+ for( ; i < height; i++)
+ {
+ Rpp32f *srcPtrRowTemp = srcPtrTemp + i * srcRowStride;
+ Rpp32f *dstPtrRowTemp = dstPtrTemp + i;
+ for(Rpp32s j = 0; j < width; j++)
+ {
+ *dstPtrRowTemp = *srcPtrRowTemp;
+ srcPtrRowTemp++;
+ dstPtrRowTemp += dstRowStride;
+ }
+ }
+}
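+
+// Editor's note: a minimal scalar reference for compute_2d_pln1_transpose(), handy for
+// validating the 4x8 AVX2 tiling above; this helper is illustrative and not part of the
+// RPP API:
+inline void transpose_2d_scalar_sketch(const Rpp32f *src, Rpp32f *dst, Rpp32u height, Rpp32u width,
+                                       Rpp32u srcRowStride, Rpp32u dstRowStride)
+{
+    for (Rpp32u i = 0; i < height; i++)
+        for (Rpp32u j = 0; j < width; j++)
+            dst[j * dstRowStride + i] = src[i * srcRowStride + j];    // dst(j, i) = src(i, j)
+}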
+
+template <typename T>
+void transpose_generic_nd_recursive(T *dst, Rpp32u *dstStrides, T *src, Rpp32u *srcStrides, Rpp32u *dstShape, Rpp32u tensorDims)
+{
+ // exit case for recursion
+ if (tensorDims == 0)
+ {
+ *dst = *src;
+ }
+ else
+ {
+ for (int i = 0; i < *dstShape; i++)
+ {
+ transpose_generic_nd_recursive(dst, dstStrides + 1, src, srcStrides + 1, dstShape + 1, tensorDims - 1);
+ dst += *dstStrides;
+ src += *srcStrides;
+ }
+ }
+}
+
+template <typename T>
+void transpose_generic_setup_and_run(T *srcPtrTemp, T *dstPtrTemp, Rpp32u *length, Rpp32u *perm, Rpp32u tensorDims)
+{
+ Rpp32u dstShape[RPPT_MAX_DIMS];
+ Rpp32u srcStrides[RPPT_MAX_DIMS];
+ Rpp32u dstStrides[RPPT_MAX_DIMS];
+
+ // compute output shape
+ for(Rpp32u i = 0; i < tensorDims; i++)
+ dstShape[i] = length[perm[i]];
+
+ // compute output strides
+ compute_strides(dstStrides, dstShape, tensorDims);
+
+ // compute input strides and update as per the permute order
+ Rpp32u tempStrides[RPPT_MAX_DIMS];
+ compute_strides(tempStrides, length, tensorDims);
+ for(int i = 0; i < tensorDims; i++)
+ srcStrides[i] = tempStrides[perm[i]];
+
+ // perform transpose as per the permute order
+ transpose_generic_nd_recursive(dstPtrTemp, dstStrides, srcPtrTemp, srcStrides, dstShape, tensorDims);
+}
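+
+// Editor's note: a worked example of the stride permutation above. For an input of
+// shape (2, 3, 4) with row-major strides (12, 4, 1) and perm = (2, 0, 1):
+//
+//     dstShape   = (length[2], length[0], length[1]) = (4, 2, 3)
+//     dstStrides = (6, 3, 1)                          // row-major strides of dstShape
+//     srcStrides = (temp[2], temp[0], temp[1]) = (1, 12, 4)
+//
+// so writing dst[c][h][w] sequentially reads src[h][w][c], i.e. a channel-last to
+// channel-first move.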
+
+RppStatus transpose_f32_f32_host_tensor(Rpp32f *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ Rpp32f *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u *permTensor,
+ Rpp32u *roiTensor,
+ rpp::Handle& handle)
+{
+ Rpp32u numThreads = handle.GetNumThreads();
+ Rpp32u tensorDims = dstGenericDescPtr->numDims - 1; // exclude batchsize from input dims
+ Rpp32u batchSize = dstGenericDescPtr->dims[0];
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < batchSize; batchCount++)
+ {
+ Rpp32f *srcPtrTemp, *dstPtrTemp;
+ srcPtrTemp = srcPtr + batchCount * srcGenericDescPtr->strides[0];
+ dstPtrTemp = dstPtr + batchCount * dstGenericDescPtr->strides[0];
+
+ // get the starting address of begin and length values from roiTensor
+ Rpp32u *roi = roiTensor + batchCount * tensorDims * 2;
+ Rpp32u *begin = roi;
+ Rpp32u *length = &roi[tensorDims];
+ Rpp32u *perm = permTensor;
+
+ bool copyInput = true;
+ for(int i = 0; i < tensorDims; i++)
+ copyInput *= (perm[i] == i);
+
+        // copy input straight to output since the permutation is the identity
+ if(copyInput)
+ {
+ memcpy(dstPtrTemp, srcPtrTemp, (size_t)(srcGenericDescPtr->strides[0] * sizeof(Rpp32f)));
+ }
+ else
+ {
+ for(int i = 1; i < tensorDims; i++)
+ srcPtrTemp += begin[i - 1] * srcGenericDescPtr->strides[i];
+
+ if (tensorDims == 2 && perm[0] == 1 && perm[1] == 0)
+ {
+ // Optimized AVX version for 2D PLN1 inputs
+ compute_2d_pln1_transpose(srcPtrTemp, dstPtrTemp, length[0], length[1], srcGenericDescPtr->strides[1], dstGenericDescPtr->strides[1]);
+ }
+ else if (tensorDims == 3)
+ {
+                // Optimized AVX version for 3D inputs of shape (x, y, 16) with permutation order (2, 0, 1) (use case: DeepCAM training)
+ if(perm[0] == 2 && perm[1] == 0 && perm[2] == 1 && length[2] == 16)
+ {
+ Rpp32u height = length[0];
+ Rpp32u width = length[1];
+ Rpp32u channels = 16;
+ Rpp32u bufferLength = width * channels;
+ Rpp32u alignedLength = bufferLength & ~63;
+ Rpp32u vectorIncrement = 64;
+ Rpp32u vectorIncrementPerChannel = 4;
+
+ // initialize pointers for 16 channel
+ Rpp32f *dstPtrChannel[16];
+ for(int i = 0; i < 16; i++)
+ dstPtrChannel[i] = dstPtrTemp + i * dstGenericDescPtr->strides[1];
+
+ // loop over rows
+ for(int i = 0; i < height; i++)
+ {
+ Rpp32f *srcPtrRow = srcPtrTemp;
+
+ // update temporary pointers for 16 channel
+ Rpp32f *dstPtrTempChannel[16];
+ for(int k = 0; k < 16; k++)
+ dstPtrTempChannel[k] = dstPtrChannel[k];
+
+ Rpp32u vectorLoopCount = 0;
+#if __AVX2__
+ for( ; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 pSrc[8];
+ // load 64 values for source
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow, &pSrc[0]);
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow + 16, &pSrc[1]);
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow + 32, &pSrc[2]);
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow + 48, &pSrc[3]);
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow + 8, &pSrc[4]);
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow + 24, &pSrc[5]);
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow + 40, &pSrc[6]);
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow + 56, &pSrc[7]);
+
+ __m128 pDst[16];
+ compute_transpose4x8_avx(&pSrc[0], &pDst[0]);
+ compute_transpose4x8_avx(&pSrc[4], &pDst[8]);
+
+ // store 4 values in output per channel
+ rpp_store16_f32_f32_channelwise(&dstPtrTempChannel[0], &pDst[0]);
+ rpp_store16_f32_f32_channelwise(&dstPtrTempChannel[4], &pDst[4]);
+ rpp_store16_f32_f32_channelwise(&dstPtrTempChannel[8], &pDst[8]);
+ rpp_store16_f32_f32_channelwise(&dstPtrTempChannel[12], &pDst[12]);
+
+ srcPtrRow += vectorIncrement;
+ increment_ndim_ptr(dstPtrTempChannel, 16, vectorIncrementPerChannel);
+ }
+#endif
+ for( ; vectorLoopCount < bufferLength; vectorLoopCount += 16)
+ {
+ for(int k = 0; k < 16; k++)
+ *dstPtrTempChannel[k] = srcPtrRow[k];
+
+ srcPtrRow += 16;
+ increment_ndim_ptr(dstPtrTempChannel, 16, 1);
+ }
+ srcPtrTemp += srcGenericDescPtr->strides[1];
+ increment_ndim_ptr(dstPtrChannel, 16, dstGenericDescPtr->dims[3]);
+ }
+ }
+ // Optimized AVX version for 3D inputs and permutation order (1, 0, 2)
+ else if(perm[0] == 1 && perm[1] == 0 && perm[2] == 2)
+ {
+ Rpp32f *srcPtrRow = srcPtrTemp;
+ Rpp32f *dstPtrRow = dstPtrTemp;
+ Rpp32u height = length[0];
+ Rpp32u width = length[1];
+ Rpp32u channels = length[2];
+ Rpp32u copySizeInBytes = channels * sizeof(Rpp32f);
+ for(int i = 0; i < height; i++)
+ {
+ Rpp32f *srcPtrRowTemp = srcPtrRow;
+ Rpp32f *dstPtrRowTemp = dstPtrRow;
+ for(int j = 0; j < width; j++)
+ {
+ memcpy(dstPtrRowTemp, srcPtrRowTemp, copySizeInBytes);
+ srcPtrRowTemp += srcGenericDescPtr->strides[2];
+ dstPtrRowTemp += dstGenericDescPtr->strides[1];
+ }
+ srcPtrRow += srcGenericDescPtr->strides[1];
+ dstPtrRow += dstGenericDescPtr->strides[2];
+ }
+ }
+ // Optimized AVX version for 3D inputs and permutation order (0, 2, 1)
+ else if(perm[0] == 0 && perm[1] == 2 && perm[2] == 1)
+ {
+ Rpp32f *srcPtrRow = srcPtrTemp;
+ Rpp32f *dstPtrRow = dstPtrTemp;
+ for(int i = 0; i < length[0]; i++)
+ {
+ compute_2d_pln1_transpose(srcPtrTemp, dstPtrTemp, length[1], length[2], srcGenericDescPtr->strides[2], dstGenericDescPtr->strides[2]);
+
+ // increment src and dst pointers
+ srcPtrTemp += srcGenericDescPtr->strides[1];
+ dstPtrTemp += dstGenericDescPtr->strides[1];
+ }
+ }
+ else
+ {
+ transpose_generic_setup_and_run(srcPtrTemp, dstPtrTemp, length, perm, tensorDims);
+ }
+ }
+ else if (tensorDims == 4)
+ {
+ // Optimized AVX version for 4D inputs and permutation order (1, 2, 3, 0)
+ Rpp32u vectorIncrement = 8;
+ if(perm[0] == 1 && perm[1] == 2 && perm[2] == 3 && perm[3] == 0)
+ {
+ Rpp32u bufferLength = length[perm[3]];
+ Rpp32u alignedLength = bufferLength & ~7;
+ Rpp32f *srcPtr0 = srcPtrTemp;
+ Rpp32f *dstPtr0 = dstPtrTemp;
+ Rpp32u stridesIncrement[8] = {0, srcGenericDescPtr->strides[1], 2 * srcGenericDescPtr->strides[1], 3 * srcGenericDescPtr->strides[1],
+ 4 * srcGenericDescPtr->strides[1], 5 * srcGenericDescPtr->strides[1], 6 * srcGenericDescPtr->strides[1], 7 * srcGenericDescPtr->strides[1]};
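+ // stridesIncrement pre-computes gather offsets for 8 consecutive elements along source dim 1, used by the AVX gather below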
+ Rpp32u srcIncrement = vectorIncrement * srcGenericDescPtr->strides[1];
+ for(int i = 0; i < length[perm[0]]; i++)
+ {
+ Rpp32f *srcPtr1 = srcPtr0;
+ Rpp32f *dstPtr1 = dstPtr0;
+ for(int j = 0; j < length[perm[1]]; j++)
+ {
+ Rpp32f *srcPtr2 = srcPtr1;
+ Rpp32f *dstPtr2 = dstPtr1;
+ for(int k = 0; k < length[perm[2]]; k++)
+ {
+ Rpp32f *srcPtr3 = srcPtr2;
+ Rpp32f *dstPtr3 = dstPtr2;
+
+ Rpp32u vectorLoopCount = 0;
+#if __AVX2__
+ for( ; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 pSrc = _mm256_setr_ps(srcPtr3[stridesIncrement[0]], srcPtr3[stridesIncrement[1]], srcPtr3[stridesIncrement[2]], srcPtr3[stridesIncrement[3]],
+ srcPtr3[stridesIncrement[4]], srcPtr3[stridesIncrement[5]], srcPtr3[stridesIncrement[6]], srcPtr3[stridesIncrement[7]]);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtr3, &pSrc);
+ srcPtr3 += srcIncrement;
+ dstPtr3 += vectorIncrement;
+ }
+#endif
+ for( ; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtr3++ = *srcPtr3;
+ srcPtr3 += srcGenericDescPtr->strides[1];
+ }
+ srcPtr2 += 1;
+ dstPtr2 += dstGenericDescPtr->strides[3];
+ }
+ srcPtr1 += srcGenericDescPtr->strides[3];
+ dstPtr1 += dstGenericDescPtr->strides[2];
+ }
+ srcPtr0 += srcGenericDescPtr->strides[2];
+ dstPtr0 += dstGenericDescPtr->strides[1];
+ }
+ }
+ else
+ {
+ transpose_generic_setup_and_run(srcPtrTemp, dstPtrTemp, length, perm, tensorDims);
+ }
+ }
+ else
+ {
+ transpose_generic_setup_and_run(srcPtrTemp, dstPtrTemp, length, perm, tensorDims);
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+
+template <typename T>
+RppStatus transpose_generic_host_tensor(T *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ T *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u *permTensor,
+ Rpp32u *roiTensor,
+ rpp::Handle& handle)
+{
+ Rpp32u numThreads = handle.GetNumThreads();
+ Rpp32u tensorDims = dstGenericDescPtr->numDims - 1; // exclude batchsize from input dims
+ Rpp32u batchSize = dstGenericDescPtr->dims[0];
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < batchSize; batchCount++)
+ {
+ T *srcPtrTemp, *dstPtrTemp;
+ srcPtrTemp = srcPtr + batchCount * srcGenericDescPtr->strides[0];
+ dstPtrTemp = dstPtr + batchCount * dstGenericDescPtr->strides[0];
+
+ // get the starting address of begin and length values from roiTensor
+ Rpp32u *roi = roiTensor + batchCount * tensorDims * 2;
+ Rpp32u *begin = roi;
+ Rpp32u *length = &roi[tensorDims];
+ Rpp32u *perm = permTensor;
+
+ bool copyInput = true;
+ for(int i = 0; i < tensorDims; i++)
+ copyInput *= (perm[i] == i);
+
+ // do memcpy of input to output since output order is same as input order
+ if(copyInput)
+ {
+ memcpy(dstPtrTemp, srcPtrTemp, (size_t)(srcGenericDescPtr->strides[0] * sizeof(T)));
+ }
+ else
+ {
+ for(int i = 1; i < tensorDims; i++)
+ srcPtrTemp += begin[i - 1] * srcGenericDescPtr->strides[i];
+ transpose_generic_setup_and_run(srcPtrTemp, dstPtrTemp, length, perm, tensorDims);
+ }
+ }
+
+ return RPP_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/modules/hip/handlehip.cpp b/src/modules/hip/handlehip.cpp
index 42e72db98..08eb93674 100644
--- a/src/modules/hip/handlehip.cpp
+++ b/src/modules/hip/handlehip.cpp
@@ -239,7 +239,12 @@ struct HandleImpl
}
hipMalloc(&(this->initHandle->mem.mgpu.rgbArr.rgbmem), sizeof(RpptRGB) * this->nBatchSize);
- hipMalloc(&(this->initHandle->mem.mgpu.scratchBufferHip.floatmem), sizeof(Rpp32f) * 8294400); // 3840 x 2160
+
+ /* (600000 + 293 + 128) * 128 - Maximum scratch memory required for Non Silent Region Detection HIP kernel used in RNNT training (uses a batchsize 128)
+ - 600000 is the maximum size that will be required for MMS buffer based on Librispeech dataset
+ - 293 is the size required for storing reduction outputs for 600000 size sample
+ - 128 is the size required for storing cutOffDB values for batch size 128 */
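+ // total: (600000 + 293 + 128) * 128 = 76853888 floats, i.e. ~293 MiB at 4 bytes per float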
+ hipMalloc(&(this->initHandle->mem.mgpu.scratchBufferHip.floatmem), sizeof(Rpp32f) * 76853888);
}
};
diff --git a/src/modules/hip/hip_tensor_arithmetic_operations.hpp b/src/modules/hip/hip_tensor_arithmetic_operations.hpp
index 37d2220b2..59e4ba3f9 100644
--- a/src/modules/hip/hip_tensor_arithmetic_operations.hpp
+++ b/src/modules/hip/hip_tensor_arithmetic_operations.hpp
@@ -30,5 +30,6 @@ SOFTWARE.
#include "kernel/subtract_scalar.hpp"
#include "kernel/multiply_scalar.hpp"
#include "kernel/magnitude.hpp"
+#include "kernel/log.hpp"
#endif // HIP_TENSOR_ARITHMETIC_OPERATIONS_HPP
diff --git a/src/modules/hip/hip_tensor_audio_augmentations.hpp b/src/modules/hip/hip_tensor_audio_augmentations.hpp
new file mode 100644
index 000000000..6db11e222
--- /dev/null
+++ b/src/modules/hip/hip_tensor_audio_augmentations.hpp
@@ -0,0 +1,32 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#ifndef HIP_TENSOR_AUDIO_AUGMENTATIONS_HPP
+#define HIP_TENSOR_AUDIO_AUGMENTATIONS_HPP
+
+#include "kernel/non_silent_region_detection.hpp"
+#include "kernel/down_mixing.hpp"
+#include "kernel/to_decibels.hpp"
+
+#endif // HIP_TENSOR_AUDIO_AUGMENTATIONS_HPP
diff --git a/src/modules/hip/hip_tensor_effects_augmentations.hpp b/src/modules/hip/hip_tensor_effects_augmentations.hpp
index abdfd30ab..12e80a1f4 100644
--- a/src/modules/hip/hip_tensor_effects_augmentations.hpp
+++ b/src/modules/hip/hip_tensor_effects_augmentations.hpp
@@ -31,6 +31,8 @@ SOFTWARE.
#include "kernel/noise_shot.hpp"
#include "kernel/noise_gaussian.hpp"
#include "kernel/non_linear_blend.hpp"
+#include "kernel/jitter.hpp"
+#include "kernel/glitch.hpp"
#include "kernel/water.hpp"
#include "kernel/ricap.hpp"
#include "kernel/vignette.hpp"
diff --git a/src/modules/hip/hip_tensor_geometric_augmentations.hpp b/src/modules/hip/hip_tensor_geometric_augmentations.hpp
index dcd890139..102e7d686 100644
--- a/src/modules/hip/hip_tensor_geometric_augmentations.hpp
+++ b/src/modules/hip/hip_tensor_geometric_augmentations.hpp
@@ -35,6 +35,8 @@ SOFTWARE.
#include "kernel/resize_crop_mirror.hpp"
#include "kernel/phase.hpp"
#include "kernel/slice.hpp"
+#include "kernel/lens_correction.hpp"
+#include "kernel/transpose.hpp"
#include "kernel/crop_and_patch.hpp"
#include "kernel/flip_voxel.hpp"
diff --git a/src/modules/hip/kernel/down_mixing.hpp b/src/modules/hip/kernel/down_mixing.hpp
new file mode 100644
index 000000000..041780e32
--- /dev/null
+++ b/src/modules/hip/kernel/down_mixing.hpp
@@ -0,0 +1,72 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+
+__global__ void down_mixing_hip_tensor(float *srcPtr,
+ uint srcStride,
+ float *dstPtr,
+ uint dstStride,
+ int2 *srcDimsTensor)
+
+{
+ int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+ int srcLength = srcDimsTensor[id_z].x;
+ int channels = srcDimsTensor[id_z].y;
+
+ if (id_x >= srcLength)
+ return;
+
+ float outVal = 0.0f;
+ uint srcIdx = id_z * srcStride + id_x * channels;
+ int i = 0;
+ int alignedChannels = (channels / 8) * 8;
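+ // round the channel count down to a multiple of 8 so the accumulation below can run vectorized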
+
+ // process 8 channels at a time up to alignedChannels
+ if (alignedChannels)
+ {
+ d_float8 outVal_f8;
+ outVal_f8.f4[0] = static_cast<float4>(0.0f);
+ outVal_f8.f4[1] = outVal_f8.f4[0];
+ for(; i < alignedChannels; i += 8, srcIdx += 8)
+ {
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8);
+ rpp_hip_math_add8(&outVal_f8, &src_f8, &outVal_f8);
+ }
+ outVal_f8.f4[0] += outVal_f8.f4[1];
+ outVal += (outVal_f8.f1[0] + outVal_f8.f1[1] + outVal_f8.f1[2] + outVal_f8.f1[3]);
+ }
+ // process remaining channels
+ for(; i < channels; i++, srcIdx++)
+ outVal += srcPtr[srcIdx];
+ outVal *= (1.f / channels);
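+ // down mixing averages all input channels into a single mono sample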
+
+ uint dstIdx = id_z * dstStride + id_x;
+ dstPtr[dstIdx] = outVal;
+}
+
+RppStatus hip_exec_down_mixing_tensor(Rpp32f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32s *srcDimsTensor,
+ bool normalizeWeights,
+ rpp::Handle& handle)
+{
+ Rpp32s globalThreads_x = dstDescPtr->strides.nStride;
+ Rpp32s globalThreads_y = 1;
+ Rpp32s globalThreads_z = dstDescPtr->n;
+
+ hipLaunchKernelGGL(down_mixing_hip_tensor,
+ dim3(ceil((Rpp32f)globalThreads_x/LOCAL_THREADS_X_1DIM), ceil((Rpp32f)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((Rpp32f)globalThreads_z/LOCAL_THREADS_Z_1DIM)),
+ dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ srcDescPtr->strides.nStride,
+ dstPtr,
+ dstDescPtr->strides.nStride,
+ reinterpret_cast<int2 *>(srcDimsTensor));
+
+ return RPP_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/modules/hip/kernel/erase.hpp b/src/modules/hip/kernel/erase.hpp
index 2591b53f0..f18306a9d 100644
--- a/src/modules/hip/kernel/erase.hpp
+++ b/src/modules/hip/kernel/erase.hpp
@@ -117,12 +117,34 @@ RppStatus hip_exec_erase_tensor(T *srcPtr,
int globalThreads_y = dstDescPtr->h;
int globalThreads_z = handle.GetBatchSize();
- if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ if (dstDescPtr->layout == RpptLayout::NHWC)
{
- if (srcDescPtr->dataType == RpptDataType::U8)
+ // if src layout is NHWC, copy src to dst
+ if (srcDescPtr->layout == RpptLayout::NHWC)
{
- hipMemcpyAsync(dstPtr, srcPtr, static_cast<size_t>(srcDescPtr->n * srcDescPtr->strides.nStride * sizeof(Rpp8u)), hipMemcpyDeviceToDevice, handle.GetStream());
+ hipMemcpyAsync(dstPtr, srcPtr, static_cast<size_t>(srcDescPtr->n * srcDescPtr->strides.nStride * sizeof(T)), hipMemcpyDeviceToDevice, handle.GetStream());
hipStreamSynchronize(handle.GetStream());
+ }
+ // if src layout is NCHW, convert src from NCHW to NHWC
+ else if (srcDescPtr->layout == RpptLayout::NCHW)
+ {
+ globalThreads_x = (dstDescPtr->w + 7) >> 3;
+ hipLaunchKernelGGL(convert_pln3_pkd3_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride),
+ roiTensorPtrSrc);
+ globalThreads_x = dstDescPtr->w;
+ hipStreamSynchronize(handle.GetStream());
+ }
+
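+ // with the copy/convert handled above, each datatype branch below only launches the erase kernel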
+ if (srcDescPtr->dataType == RpptDataType::U8)
+ {
hipLaunchKernelGGL(erase_pkd_hip_tensor,
dim3(ceil((float)globalThreads_x / LOCAL_THREADS_X), ceil((float)globalThreads_y / LOCAL_THREADS_Y), ceil((float)globalThreads_z / LOCAL_THREADS_Z)),
dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
@@ -137,8 +159,6 @@ RppStatus hip_exec_erase_tensor(T *srcPtr,
}
else if (srcDescPtr->dataType == RpptDataType::F16)
{
- hipMemcpyAsync(dstPtr, srcPtr, static_cast<size_t>(srcDescPtr->n * srcDescPtr->strides.nStride * sizeof(Rpp16f)), hipMemcpyDeviceToDevice, handle.GetStream());
- hipStreamSynchronize(handle.GetStream());
hipLaunchKernelGGL(erase_pkd_hip_tensor,
dim3(ceil((float)globalThreads_x / LOCAL_THREADS_X), ceil((float)globalThreads_y / LOCAL_THREADS_Y), ceil((float)globalThreads_z / LOCAL_THREADS_Z)),
dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
@@ -153,8 +173,6 @@ RppStatus hip_exec_erase_tensor(T *srcPtr,
}
else if (srcDescPtr->dataType == RpptDataType::F32)
{
- hipMemcpyAsync(dstPtr, srcPtr, static_cast<size_t>(srcDescPtr->n * srcDescPtr->strides.nStride * sizeof(Rpp32f)), hipMemcpyDeviceToDevice, handle.GetStream());
- hipStreamSynchronize(handle.GetStream());
hipLaunchKernelGGL(erase_pkd_hip_tensor,
dim3(ceil((float)globalThreads_x / LOCAL_THREADS_X), ceil((float)globalThreads_y / LOCAL_THREADS_Y), ceil((float)globalThreads_z / LOCAL_THREADS_Z)),
dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
@@ -169,8 +187,6 @@ RppStatus hip_exec_erase_tensor(T *srcPtr,
}
else if (srcDescPtr->dataType == RpptDataType::I8)
{
- hipMemcpyAsync(dstPtr, srcPtr, static_cast<size_t>(srcDescPtr->n * srcDescPtr->strides.nStride * sizeof(Rpp8s)), hipMemcpyDeviceToDevice, handle.GetStream());
- hipStreamSynchronize(handle.GetStream());
hipLaunchKernelGGL(erase_pkd_hip_tensor,
dim3(ceil((float)globalThreads_x / LOCAL_THREADS_X), ceil((float)globalThreads_y / LOCAL_THREADS_Y), ceil((float)globalThreads_z / LOCAL_THREADS_Z)),
dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
@@ -245,33 +261,6 @@ RppStatus hip_exec_erase_tensor(T *srcPtr,
numBoxesTensor,
roiTensorPtrSrc);
}
- else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
- {
- globalThreads_x = (dstDescPtr->w + 7) >> 3;
- hipLaunchKernelGGL(convert_pln3_pkd3_hip_tensor,
- dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
- dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
- 0,
- handle.GetStream(),
- srcPtr,
- make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
- dstPtr,
- make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride),
- roiTensorPtrSrc);
- hipStreamSynchronize(handle.GetStream());
- globalThreads_x = dstDescPtr->w;
- hipLaunchKernelGGL(erase_pkd_hip_tensor,
- dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
- dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
- 0,
- handle.GetStream(),
- dstPtr,
- make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride),
- anchorBoxInfoTensor,
- colorsTensor,
- numBoxesTensor,
- roiTensorPtrSrc);
- }
}
return RPP_SUCCESS;
diff --git a/src/modules/hip/kernel/glitch.hpp b/src/modules/hip/kernel/glitch.hpp
new file mode 100644
index 000000000..81c7013c0
--- /dev/null
+++ b/src/modules/hip/kernel/glitch.hpp
@@ -0,0 +1,278 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+
+template <typename T>
+__device__ __forceinline__ void rpp_hip_load1_glitch(T *srcPtr, uint2 srcStrideCH, float &locSrcX, float &locSrcY, float *dst, int channels)
+{
+ int srcIdx = locSrcY * srcStrideCH.y + locSrcX * srcStrideCH.x + channels;
+ rpp_hip_interpolate1_nearest_neighbor_load_pln1(srcPtr + srcIdx, dst);
+}
+
+template <typename T>
+__device__ __forceinline__ void rpp_hip_load8_glitch(T *srcPtr, uint2 srcStrideCH, d_float8 *srcX_f8, d_float8 *srcY_f8, d_float8 *dst_f8, int channels)
+{
+ rpp_hip_load1_glitch(srcPtr, srcStrideCH, srcX_f8->f1[0], srcY_f8->f1[0], &(dst_f8->f1[0]), channels);
+ rpp_hip_load1_glitch(srcPtr, srcStrideCH, srcX_f8->f1[1], srcY_f8->f1[1], &(dst_f8->f1[1]), channels);
+ rpp_hip_load1_glitch(srcPtr, srcStrideCH, srcX_f8->f1[2], srcY_f8->f1[2], &(dst_f8->f1[2]), channels);
+ rpp_hip_load1_glitch(srcPtr, srcStrideCH, srcX_f8->f1[3], srcY_f8->f1[3], &(dst_f8->f1[3]), channels);
+ rpp_hip_load1_glitch(srcPtr, srcStrideCH, srcX_f8->f1[4], srcY_f8->f1[4], &(dst_f8->f1[4]), channels);
+ rpp_hip_load1_glitch(srcPtr, srcStrideCH, srcX_f8->f1[5], srcY_f8->f1[5], &(dst_f8->f1[5]), channels);
+ rpp_hip_load1_glitch(srcPtr, srcStrideCH, srcX_f8->f1[6], srcY_f8->f1[6], &(dst_f8->f1[6]), channels);
+ rpp_hip_load1_glitch(srcPtr, srcStrideCH, srcX_f8->f1[7], srcY_f8->f1[7], &(dst_f8->f1[7]), channels);
+}
+
+__device__ void check_locs(d_float8 &xLocVals, d_float8 &yLocVals, RppiPoint offset, RpptROI roiTensorPtrSrc)
+{
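+ // if a channel-offset location falls outside the ROI, fall back to the un-offset source pixel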
+ for(int i = 0; i < 8; i++)
+ {
+ if (xLocVals.f1[i] >= roiTensorPtrSrc.ltrbROI.rb.x || xLocVals.f1[i] < roiTensorPtrSrc.ltrbROI.lt.x || yLocVals.f1[i] >= roiTensorPtrSrc.ltrbROI.rb.y || yLocVals.f1[i] < roiTensorPtrSrc.ltrbROI.lt.y)
+ {
+ xLocVals.f1[i] -= offset.x;
+ yLocVals.f1[i] -= offset.y;
+ }
+ }
+}
+
+__device__ void compute_glitch_locs_hip(int id_x, int id_y, RpptChannelOffsets rgbOffsets, RpptROI roiTensorPtrSrc, d_float24 *srcLocsX_f24, d_float24 *srcLocsY_f24)
+{
+ float4 increment_f4;
+ increment_f4 = make_float4(0.0f, 1.0f, 2.0f, 3.0f); // 8 element vectorized kernel needs 8 increments - creating float4 for increments 0, 1, 2, 3 here, and adding (float4)4 later to get 4, 5, 6, 7 incremented srcLocs
+
+ srcLocsX_f24->f4[0] = static_cast<float4>(id_x + rgbOffsets.r.x) + increment_f4; // find R channel srcLocsX 0, 1, 2, 3
+ srcLocsX_f24->f4[1] = srcLocsX_f24->f4[0] + (float4) 4; // find R channel srcLocsX 4, 5, 6, 7
+ srcLocsY_f24->f4[0] = srcLocsY_f24->f4[1] = static_cast<float4>(id_y + rgbOffsets.r.y); // find R channel srcLocsY 0, 1, 2, 3 and 4, 5, 6, 7
+ check_locs(srcLocsX_f24->f8[0], srcLocsY_f24->f8[0], rgbOffsets.r, roiTensorPtrSrc); // check if all srcLocs in roi bounds
+
+ srcLocsX_f24->f4[2] = static_cast<float4>(id_x + rgbOffsets.g.x) + increment_f4; // find G channel srcLocsX 0, 1, 2, 3
+ srcLocsX_f24->f4[3] = srcLocsX_f24->f4[2] + (float4) 4; // find G channel srcLocsX 4, 5, 6, 7
+ srcLocsY_f24->f4[2] = srcLocsY_f24->f4[3] = static_cast<float4>(id_y + rgbOffsets.g.y); // find G channel srcLocsY 0, 1, 2, 3 and 4, 5, 6, 7
+ check_locs(srcLocsX_f24->f8[1], srcLocsY_f24->f8[1], rgbOffsets.g, roiTensorPtrSrc); // check if all srcLocs in roi bounds
+
+ srcLocsX_f24->f4[4] = static_cast<float4>(id_x + rgbOffsets.b.x) + increment_f4; // find B channel srcLocsX 0, 1, 2, 3
+ srcLocsX_f24->f4[5] = srcLocsX_f24->f4[4] + (float4) 4; // find B channel srcLocsX 4, 5, 6, 7
+ srcLocsY_f24->f4[4] = srcLocsY_f24->f4[5] = static_cast<float4>(id_y + rgbOffsets.b.y); // find B channel srcLocsY 0, 1, 2, 3 and 4, 5, 6, 7
+ check_locs(srcLocsX_f24->f8[2], srcLocsY_f24->f8[2], rgbOffsets.b, roiTensorPtrSrc); // check if all srcLocs in roi bounds
+}
+
+template <typename T>
+__global__ void glitch_pkd_hip_tensor(T *srcPtr,
+ uint2 srcStridesNH,
+ T *dstPtr,
+ uint2 dstStridesNH,
+ RpptChannelOffsets *rgbOffsetsPtr,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNH.x);
+ uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3;
+
+ RpptChannelOffsets rgbOffsets = rgbOffsetsPtr[id_z];
+ uint2 srcStrideCH = make_uint2(3, srcStridesNH.y);
+ d_float24 dst_f24, srcLocsX_f24, srcLocsY_f24;
+
+ compute_glitch_locs_hip(id_x, id_y, rgbOffsets, roiTensorPtrSrc[id_z], &srcLocsX_f24, &srcLocsY_f24);
+ rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[0], &srcLocsY_f24.f8[0], &(dst_f24.f8[0]), 0);
+ rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[1], &srcLocsY_f24.f8[1], &(dst_f24.f8[1]), 1);
+ rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[2], &srcLocsY_f24.f8[2], &(dst_f24.f8[2]), 2);
+ rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24);
+}
+
+template <typename T>
+__global__ void glitch_pln_hip_tensor(T *srcPtr,
+ uint3 srcStridesNCH,
+ T *dstPtr,
+ uint3 dstStridesNCH,
+ RpptChannelOffsets *rgbOffsetsPtr,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNCH.x);
+ uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x;
+
+ RpptChannelOffsets rgbOffsets = rgbOffsetsPtr[id_z];
+ uint2 srcStrideCH = make_uint2(1, srcStridesNCH.z);
+
+ d_float24 srcLocsX_f24, srcLocsY_f24;
+ d_float8 dst_f8;
+
+ compute_glitch_locs_hip(id_x, id_y, rgbOffsets, roiTensorPtrSrc[id_z], &srcLocsX_f24, &srcLocsY_f24);
+ rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[0], &srcLocsY_f24.f8[0], &dst_f8, 0);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+
+ srcIdx += srcStridesNCH.y;
+ dstIdx += dstStridesNCH.y;
+ rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[1], &srcLocsY_f24.f8[1], &dst_f8, 0);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+
+ srcIdx += srcStridesNCH.y;
+ dstIdx += dstStridesNCH.y;
+ rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[2], &srcLocsY_f24.f8[2], &dst_f8, 0);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+}
+
+template <typename T>
+__global__ void glitch_pkd3_pln3_hip_tensor(T *srcPtr,
+ uint2 srcStridesNH,
+ T *dstPtr,
+ uint3 dstStridesNCH,
+ RpptChannelOffsets *rgbOffsetsPtr,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNH.x);
+ uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x;
+
+ RpptChannelOffsets rgbOffsets = rgbOffsetsPtr[id_z];
+ uint2 srcStrideCH = make_uint2(3, srcStridesNH.y);
+
+ d_float24 srcLocsX_f24, srcLocsY_f24;
+ d_float8 dst_f8;
+
+ compute_glitch_locs_hip(id_x, id_y, rgbOffsets, roiTensorPtrSrc[id_z], &srcLocsX_f24, &srcLocsY_f24);
+ rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[0], &srcLocsY_f24.f8[0], &dst_f8, 0);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+
+ dstIdx += dstStridesNCH.y;
+ rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[1], &srcLocsY_f24.f8[1], &dst_f8, 1);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+
+ dstIdx += dstStridesNCH.y;
+ rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[2], &srcLocsY_f24.f8[2], &dst_f8, 2);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+}
+
+template <typename T>
+__global__ void glitch_pln3_pkd3_hip_tensor(T *srcPtr,
+ uint3 srcStridesNCH,
+ T *dstPtr,
+ uint2 dstStridesNH,
+ RpptChannelOffsets *rgbOffsetsPtr,
+ RpptROIPtr roiTensorPtrSrc)
+{
+
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNCH.x);
+ uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3;
+
+ RpptChannelOffsets rgbOffsets = rgbOffsetsPtr[id_z];
+ uint2 srcStrideCH = make_uint2(1, srcStridesNCH.z);
+
+ d_float24 dst_f24, srcLocsX_f24, srcLocsY_f24;
+ compute_glitch_locs_hip(id_x, id_y, rgbOffsets, roiTensorPtrSrc[id_z], &srcLocsX_f24, &srcLocsY_f24);
+ rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[0], &srcLocsY_f24.f8[0], &(dst_f24.f8[0]), 0);
+
+ srcIdx += srcStridesNCH.y;
+ rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[1], &srcLocsY_f24.f8[1], &(dst_f24.f8[1]), 0);
+
+ srcIdx += srcStridesNCH.y;
+ rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[2], &srcLocsY_f24.f8[2], &(dst_f24.f8[2]), 0);
+
+ rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24);
+}
+
+template <typename T>
+RppStatus hip_exec_glitch_tensor(T *srcPtr,
+ RpptDescPtr srcDescPtr,
+ T *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptChannelOffsets *rgbOffsets,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rpp::Handle& handle)
+{
+ if (roiType == RpptRoiType::LTRB)
+ hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle);
+ int globalThreads_x = (dstDescPtr->strides.hStride + 7) >> 3;
+ int globalThreads_y = dstDescPtr->h;
+ int globalThreads_z = dstDescPtr->n;
+
+ if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ hipLaunchKernelGGL(glitch_pln_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride),
+ rgbOffsets,
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ hipLaunchKernelGGL(glitch_pln3_pkd3_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride),
+ rgbOffsets,
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ hipLaunchKernelGGL(glitch_pkd3_pln3_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride),
+ rgbOffsets,
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ hipLaunchKernelGGL(glitch_pkd_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride),
+ rgbOffsets,
+ roiTensorPtrSrc);
+ }
+ return RPP_SUCCESS;
+}
diff --git a/src/modules/hip/kernel/jitter.hpp b/src/modules/hip/kernel/jitter.hpp
new file mode 100644
index 000000000..bbc407cda
--- /dev/null
+++ b/src/modules/hip/kernel/jitter.hpp
@@ -0,0 +1,314 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+#include "rng_seed_stream.hpp"
+
+__device__ __forceinline__ void jitter_roi_and_srclocs_hip_compute(int4 *srcRoiPtr_i4, RpptXorwowStateBoxMuller *xorwowState, uint kernelSize, uint bound, int id_x, int id_y, d_float16 *locSrc_f16)
+{
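+ // each pixel gets a random displacement in [0, kernelSize); subtracting bound below re-centers it around the source location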
+ d_float8 widthIncrement_f8, heightIncrement_f8;
+ rpp_hip_rng_8_xorwow_f32(xorwowState, &widthIncrement_f8);
+ rpp_hip_math_multiply8_const(&widthIncrement_f8, &widthIncrement_f8, static_cast<float4>(kernelSize));
+ rpp_hip_rng_8_xorwow_f32(xorwowState, &heightIncrement_f8);
+ rpp_hip_math_multiply8_const(&heightIncrement_f8, &heightIncrement_f8, static_cast<float4>(kernelSize));
+
+ d_float8 increment_f8, locDst_f8x, locDst_f8y;
+ increment_f8.f4[0] = make_float4(0.0f, 1.0f, 2.0f, 3.0f); // 8 element vectorized kernel needs 8 increments - creating float4 for increments 0, 1, 2, 3 here, and adding (float4)4 later to get 4, 5, 6, 7 incremented srcLocs
+ increment_f8.f4[1] = make_float4(4.0f, 5.0f, 6.0f, 7.0f);
+ locDst_f8x.f4[0] = static_cast<float4>(id_x) + increment_f8.f4[0];
+ locDst_f8x.f4[1] = static_cast<float4>(id_x) + increment_f8.f4[1];
+ locDst_f8y.f4[0] = locDst_f8y.f4[1] = (float4)id_y;
+
+ locSrc_f16->f8[0].f4[0] = static_cast<float4>(srcRoiPtr_i4->x) + locDst_f8x.f4[0] + widthIncrement_f8.f4[0] - static_cast<float4>(bound);
+ locSrc_f16->f8[0].f4[1] = static_cast<float4>(srcRoiPtr_i4->x) + locDst_f8x.f4[1] + widthIncrement_f8.f4[1] - static_cast<float4>(bound);
+ locSrc_f16->f8[1].f4[0] = static_cast<float4>(srcRoiPtr_i4->y) + locDst_f8y.f4[0] + heightIncrement_f8.f4[0] - static_cast<float4>(bound);
+ locSrc_f16->f8[1].f4[1] = static_cast<float4>(srcRoiPtr_i4->y) + locDst_f8y.f4[1] + heightIncrement_f8.f4[1] - static_cast<float4>(bound);
+
+ // Apply boundary checks and adjustments
+ for(int i = 0; i < 8; ++i)
+ {
+ locSrc_f16->f1[i] = fmaxf(fminf(floorf(locSrc_f16->f1[i]), static_cast<float>(srcRoiPtr_i4->z - 1)), 0.0f);
+ locSrc_f16->f1[i + 8] = fmaxf(fminf(floorf(locSrc_f16->f1[i + 8]), static_cast<float>(srcRoiPtr_i4->w - bound)), 0.0f);
+ }
+}
+
+template <typename T>
+__global__ void jitter_pkd_tensor(T *srcPtr,
+ uint2 srcStridesNH,
+ T *dstPtr,
+ uint2 dstStridesNH,
+ uint *kernelsize,
+ RpptXorwowStateBoxMuller *xorwowInitialStatePtr,
+ uint *xorwowSeedStream,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNH.x);
+ uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + (id_x * 3);
+ uint seedStreamIdx = (id_y * dstStridesNH.y) + (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x;
+ uint kernelSize = kernelsize[id_z];
+ uint bound = (kernelSize - 1) / 2;
+
+ RpptXorwowStateBoxMuller xorwowState;
+ uint xorwowSeed = xorwowSeedStream[seedStreamIdx % SEED_STREAM_MAX_SIZE];
+ xorwowState.x[0] = xorwowInitialStatePtr->x[0] + xorwowSeed;
+ xorwowState.x[1] = xorwowInitialStatePtr->x[1] + xorwowSeed;
+ xorwowState.x[2] = xorwowInitialStatePtr->x[2] + xorwowSeed;
+ xorwowState.x[3] = xorwowInitialStatePtr->x[3] + xorwowSeed;
+ xorwowState.x[4] = xorwowInitialStatePtr->x[4] + xorwowSeed;
+ xorwowState.counter = xorwowInitialStatePtr->counter + xorwowSeed;
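+ // offsetting every xorwow state word with a per-pixel seed decorrelates the RNG streams across the image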
+
+ int4 srcRoi_i4 = *(int4 *)&roiTensorPtrSrc[id_z];
+ d_float16 locSrc_f16;
+ jitter_roi_and_srclocs_hip_compute(&srcRoi_i4, &xorwowState, kernelSize, bound, id_x, id_y, &locSrc_f16);
+
+ d_float24 dst_f24;
+ rpp_hip_interpolate24_nearest_neighbor_pkd3(srcPtr + srcIdx, srcStridesNH.y, &locSrc_f16, &srcRoi_i4, &dst_f24);
+ rpp_hip_pack_float24_pkd3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24);
+}
+
+template <typename T>
+__global__ void jitter_pln_tensor(T *srcPtr,
+ uint3 srcStridesNCH,
+ T *dstPtr,
+ uint3 dstStridesNCH,
+ int channelsDst,
+ uint *kernelsize,
+ RpptXorwowStateBoxMuller *xorwowInitialStatePtr,
+ uint *xorwowSeedStream,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNCH.x);
+ uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x;
+ uint seedStreamIdx = (id_y * dstStridesNCH.z) + (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x;
+ uint kernelSize = kernelsize[id_z];
+ uint bound = (kernelSize - 1) / 2;
+
+ RpptXorwowStateBoxMuller xorwowState;
+ uint xorwowSeed = xorwowSeedStream[seedStreamIdx % SEED_STREAM_MAX_SIZE];
+ xorwowState.x[0] = xorwowInitialStatePtr->x[0] + xorwowSeed;
+ xorwowState.x[1] = xorwowInitialStatePtr->x[1] + xorwowSeed;
+ xorwowState.x[2] = xorwowInitialStatePtr->x[2] + xorwowSeed;
+ xorwowState.x[3] = xorwowInitialStatePtr->x[3] + xorwowSeed;
+ xorwowState.x[4] = xorwowInitialStatePtr->x[4] + xorwowSeed;
+ xorwowState.counter = xorwowInitialStatePtr->counter + xorwowSeed;
+
+ int4 srcRoi_i4 = *(int4 *)&roiTensorPtrSrc[id_z];
+ d_float16 locSrc_f16;
+ jitter_roi_and_srclocs_hip_compute(&srcRoi_i4, &xorwowState, kernelSize, bound, id_x, id_y, &locSrc_f16);
+
+ d_float8 dst_f8;
+ rpp_hip_interpolate8_nearest_neighbor_pln1(srcPtr + srcIdx, srcStridesNCH.z, &locSrc_f16, &srcRoi_i4, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+
+ if (channelsDst == 3)
+ {
+ srcIdx += srcStridesNCH.y;
+ dstIdx += dstStridesNCH.y;
+
+ rpp_hip_interpolate8_nearest_neighbor_pln1(srcPtr + srcIdx, srcStridesNCH.z, &locSrc_f16, &srcRoi_i4, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+
+ srcIdx += srcStridesNCH.y;
+ dstIdx += dstStridesNCH.y;
+
+ rpp_hip_interpolate8_nearest_neighbor_pln1(srcPtr + srcIdx, srcStridesNCH.z, &locSrc_f16, &srcRoi_i4, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+ }
+}
+
+template <typename T>
+__global__ void jitter_pkd3_pln3_tensor(T *srcPtr,
+ uint2 srcStridesNH,
+ T *dstPtr,
+ uint3 dstStridesNCH,
+ uint *kernelsize,
+ RpptXorwowStateBoxMuller *xorwowInitialStatePtr,
+ uint *xorwowSeedStream,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNH.x);
+ uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x;
+ uint seedStreamIdx = (id_y * dstStridesNCH.z) + (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x;
+ uint kernelSize = kernelsize[id_z];
+ uint bound = (kernelSize - 1) / 2;
+
+ RpptXorwowStateBoxMuller xorwowState;
+ uint xorwowSeed = xorwowSeedStream[seedStreamIdx % SEED_STREAM_MAX_SIZE];
+ xorwowState.x[0] = xorwowInitialStatePtr->x[0] + xorwowSeed;
+ xorwowState.x[1] = xorwowInitialStatePtr->x[1] + xorwowSeed;
+ xorwowState.x[2] = xorwowInitialStatePtr->x[2] + xorwowSeed;
+ xorwowState.x[3] = xorwowInitialStatePtr->x[3] + xorwowSeed;
+ xorwowState.x[4] = xorwowInitialStatePtr->x[4] + xorwowSeed;
+ xorwowState.counter = xorwowInitialStatePtr->counter + xorwowSeed;
+
+ int4 srcRoi_i4 = *(int4 *)&roiTensorPtrSrc[id_z];
+ d_float16 locSrc_f16;
+ jitter_roi_and_srclocs_hip_compute(&srcRoi_i4, &xorwowState, kernelSize, bound, id_x, id_y, &locSrc_f16);
+
+ d_float24 dst_f24;
+ rpp_hip_interpolate24_nearest_neighbor_pkd3(srcPtr + srcIdx, srcStridesNH.y, &locSrc_f16, &srcRoi_i4, &dst_f24);
+ rpp_hip_pack_float24_pkd3_and_store24_pln3(dstPtr + dstIdx, dstStridesNCH.y, &dst_f24);
+}
+
+template <typename T>
+__global__ void jitter_pln3_pkd3_tensor(T *srcPtr,
+ uint3 srcStridesNCH,
+ T *dstPtr,
+ uint2 dstStridesNH,
+ uint *kernelsize,
+ RpptXorwowStateBoxMuller *xorwowInitialStatePtr,
+ uint *xorwowSeedStream,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNCH.x);
+ uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + (id_x * 3);
+ uint seedStreamIdx = (id_y * dstStridesNH.y) + (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x;
+ uint kernelSize = kernelsize[id_z];
+ uint bound = (kernelSize - 1) / 2;
+
+ RpptXorwowStateBoxMuller xorwowState;
+ uint xorwowSeed = xorwowSeedStream[seedStreamIdx % SEED_STREAM_MAX_SIZE];
+ xorwowState.x[0] = xorwowInitialStatePtr->x[0] + xorwowSeed;
+ xorwowState.x[1] = xorwowInitialStatePtr->x[1] + xorwowSeed;
+ xorwowState.x[2] = xorwowInitialStatePtr->x[2] + xorwowSeed;
+ xorwowState.x[3] = xorwowInitialStatePtr->x[3] + xorwowSeed;
+ xorwowState.x[4] = xorwowInitialStatePtr->x[4] + xorwowSeed;
+ xorwowState.counter = xorwowInitialStatePtr->counter + xorwowSeed;
+
+ int4 srcRoi_i4 = *(int4 *)&roiTensorPtrSrc[id_z];
+ d_float16 locSrc_f16;
+ jitter_roi_and_srclocs_hip_compute(&srcRoi_i4, &xorwowState, kernelSize, bound, id_x, id_y, &locSrc_f16);
+
+ d_float24 dst_f24;
+ rpp_hip_interpolate24_nearest_neighbor_pln3(srcPtr + srcIdx, &srcStridesNCH, &locSrc_f16, &srcRoi_i4, &dst_f24);
+ rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24);
+}
+
+template <typename T>
+RppStatus hip_exec_jitter_tensor(T *srcPtr,
+ RpptDescPtr srcDescPtr,
+ T *dstPtr,
+ RpptDescPtr dstDescPtr,
+ uint *kernelSizeTensor,
+ RpptXorwowStateBoxMuller *xorwowInitialStatePtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rpp::Handle& handle)
+{
+ if (roiType == RpptRoiType::LTRB)
+ hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle);
+
+ int globalThreads_x = (dstDescPtr->strides.hStride + 7) >> 3;
+ int globalThreads_y = dstDescPtr->h;
+ int globalThreads_z = dstDescPtr->n;
+
+ Rpp32u *xorwowSeedStream;
+ xorwowSeedStream = (Rpp32u *)&xorwowInitialStatePtr[1];
+ CHECK_RETURN_STATUS(hipMemcpyAsync(xorwowSeedStream, rngSeedStream4050, SEED_STREAM_MAX_SIZE * sizeof(Rpp32u), hipMemcpyHostToDevice, handle.GetStream()));
+
+ if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ hipLaunchKernelGGL(jitter_pkd_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride),
+ kernelSizeTensor,
+ xorwowInitialStatePtr,
+ xorwowSeedStream,
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ hipLaunchKernelGGL(jitter_pln_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride),
+ dstDescPtr->c,
+ kernelSizeTensor,
+ xorwowInitialStatePtr,
+ xorwowSeedStream,
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->c == 3) && (dstDescPtr->c == 3))
+ {
+ if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ hipLaunchKernelGGL(jitter_pkd3_pln3_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride),
+ kernelSizeTensor,
+ xorwowInitialStatePtr,
+ xorwowSeedStream,
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ globalThreads_x = (srcDescPtr->strides.hStride + 7) >> 3;
+ hipLaunchKernelGGL(jitter_pln3_pkd3_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride),
+ kernelSizeTensor,
+ xorwowInitialStatePtr,
+ xorwowSeedStream,
+ roiTensorPtrSrc);
+ }
+ }
+
+ return RPP_SUCCESS;
+}
diff --git a/src/modules/hip/kernel/lens_correction.hpp b/src/modules/hip/kernel/lens_correction.hpp
new file mode 100644
index 000000000..0d53db7e1
--- /dev/null
+++ b/src/modules/hip/kernel/lens_correction.hpp
@@ -0,0 +1,183 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+
+// -------------------- Set 0 - lens_correction device helpers --------------------
+
+__device__ __forceinline__ void camera_coordinates_hip_compute(d_float8 *cameraCoords_f8, int id_y, d_float8 *locDst_f8x, float3 *inverseMatrix)
+{
+ float4 inverseCoord1_f4 = static_cast<float4>(id_y * inverseMatrix->y + inverseMatrix->z);
+ float4 inverseCoord2_f4 = static_cast<float4>(inverseMatrix->x);
+ cameraCoords_f8->f4[0] = inverseCoord1_f4 + locDst_f8x->f4[0] * inverseCoord2_f4;
+ cameraCoords_f8->f4[1] = inverseCoord1_f4 + locDst_f8x->f4[1] * inverseCoord2_f4;
+}
+
+// -------------------- Set 1 - lens_correction kernels --------------------
+
+// compute inverse of 3x3 camera matrix
+__global__ void compute_inverse_matrix_hip_tensor(d_float9 *matTensor, d_float9 *invMatTensor)
+{
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+ d_float9 *mat_f9 = &matTensor[id_z];
+ d_float9 *invMat_f9 = &invMatTensor[id_z];
+
+ // initialize all values in invMat_f9 to zero
+ invMat_f9->f3[0] = static_cast<float3>(0.0f);
+ invMat_f9->f3[1] = invMat_f9->f3[0];
+ invMat_f9->f3[2] = invMat_f9->f3[0];
+
+ // compute determinant mat_f9
+ float det = (mat_f9->f1[0] * ((mat_f9->f1[4] * mat_f9->f1[8]) - (mat_f9->f1[7] * mat_f9->f1[5])))
+ - (mat_f9->f1[1] * ((mat_f9->f1[3] * mat_f9->f1[8]) - (mat_f9->f1[5] * mat_f9->f1[6])))
+ + (mat_f9->f1[2] * ((mat_f9->f1[3] * mat_f9->f1[7]) - (mat_f9->f1[4] * mat_f9->f1[6])));
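+ // invert via the adjugate: inv(M)[i][j] = cofactor(M)[j][i] / det(M)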
+ if(det != 0)
+ {
+ float invDet = 1 / det;
+ invMat_f9->f1[0] = (mat_f9->f1[4] * mat_f9->f1[8] - mat_f9->f1[7] * mat_f9->f1[5]) * invDet;
+ invMat_f9->f1[1] = (mat_f9->f1[2] * mat_f9->f1[7] - mat_f9->f1[1] * mat_f9->f1[8]) * invDet;
+ invMat_f9->f1[2] = (mat_f9->f1[1] * mat_f9->f1[5] - mat_f9->f1[2] * mat_f9->f1[4]) * invDet;
+ invMat_f9->f1[3] = (mat_f9->f1[5] * mat_f9->f1[6] - mat_f9->f1[3] * mat_f9->f1[8]) * invDet;
+ invMat_f9->f1[4] = (mat_f9->f1[0] * mat_f9->f1[8] - mat_f9->f1[2] * mat_f9->f1[6]) * invDet;
+ invMat_f9->f1[5] = (mat_f9->f1[3] * mat_f9->f1[2] - mat_f9->f1[0] * mat_f9->f1[5]) * invDet;
+ invMat_f9->f1[6] = (mat_f9->f1[3] * mat_f9->f1[7] - mat_f9->f1[6] * mat_f9->f1[4]) * invDet;
+ invMat_f9->f1[7] = (mat_f9->f1[6] * mat_f9->f1[1] - mat_f9->f1[0] * mat_f9->f1[7]) * invDet;
+ invMat_f9->f1[8] = (mat_f9->f1[0] * mat_f9->f1[4] - mat_f9->f1[3] * mat_f9->f1[1]) * invDet;
+ }
+}
+
+// compute remap tables from the camera matrix and distortion coefficients
+__global__ void compute_remap_tables_hip_tensor(float *rowRemapTable,
+ float *colRemapTable,
+ d_float9 *cameraMatrixTensor,
+ d_float9 *inverseMatrixTensor,
+ d_float8 *distortionCoeffsTensor,
+ uint2 remapTableStridesNH,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ return;
+
+ d_float9 cameraMatrix_f9 = cameraMatrixTensor[id_z];
+ d_float9 inverseMatrix_f9 = inverseMatrixTensor[id_z];
+ d_float8 distortionCoeffs_f8 = distortionCoeffsTensor[id_z];
+
+ // Get radial and tangential distortion coefficients
+ float radialCoeff[6] = {distortionCoeffs_f8.f1[0], distortionCoeffs_f8.f1[1], distortionCoeffs_f8.f1[4], distortionCoeffs_f8.f1[5], distortionCoeffs_f8.f1[6], distortionCoeffs_f8.f1[7]};
+ float tangentialCoeff[2] = {distortionCoeffs_f8.f1[2], distortionCoeffs_f8.f1[3]};
+
+ uint dstIdx = id_z * remapTableStridesNH.x + id_y * remapTableStridesNH.y + id_x;
+ d_float8 locDst_f8x;
+ locDst_f8x.f4[0] = static_cast<float4>(id_x) + make_float4(0, 1, 2, 3);
+ locDst_f8x.f4[1] = static_cast<float4>(id_x) + make_float4(4, 5, 6, 7);
+
+ float4 one_f4 = static_cast<float4>(1.0f);
+ float4 two_f4 = static_cast<float4>(2.0f);
+ d_float8 z_f8, y_f8, x_f8;
+ camera_coordinates_hip_compute(&z_f8, id_y, &locDst_f8x, &inverseMatrix_f9.f3[2]); // float zCamera = id_y * inverseMatrix.f1[7] + inverseMatrix.f1[8] + id_x * inverseMatrix.f1[6]
+ camera_coordinates_hip_compute(&y_f8, id_y, &locDst_f8x, &inverseMatrix_f9.f3[1]); // float yCamera = id_y * inverseMatrix.f1[4] + inverseMatrix.f1[5] + id_x * inverseMatrix.f1[3]
+ camera_coordinates_hip_compute(&x_f8, id_y, &locDst_f8x, &inverseMatrix_f9.f3[0]); // float xCamera = id_y * inverseMatrix.f1[1] + inverseMatrix.f1[2] + id_x * inverseMatrix.f1[0]
+ rpp_hip_math_divide8_const(&z_f8, &z_f8, one_f4); // float z = 1./zCamera
+ rpp_hip_math_multiply8(&y_f8, &z_f8, &y_f8); // float y = yCamera * z;
+ rpp_hip_math_multiply8(&x_f8, &z_f8, &x_f8); // float x = xCamera * z;
+
+ d_float8 ySquare_f8, xSquare_f8;
+ rpp_hip_math_multiply8(&y_f8, &y_f8, &ySquare_f8); // float ySquare = y * y
+ rpp_hip_math_multiply8(&x_f8, &x_f8, &xSquare_f8); // float xSquare = x * x
+
+ d_float8 r2_f8, kr_f8, kr1_f8, kr2_f8;
+ rpp_hip_math_add8(&xSquare_f8, &ySquare_f8, &r2_f8); // float r2 = xSquare + ySquare
+
+ d_float8 r2Cube_f8, r2Square_f8;
+ rpp_hip_math_multiply8(&r2_f8, &r2_f8, &r2Square_f8); // float r2Square = r2 * r2;
+ rpp_hip_math_multiply8(&r2Square_f8, &r2_f8, &r2Cube_f8); // float r2Cube = r2Square * r2;
+
+ d_float24 radialCoeff_f24;
+ radialCoeff_f24.f4[0] = static_cast<float4>(radialCoeff[0]);
+ radialCoeff_f24.f4[1] = static_cast<float4>(radialCoeff[1]);
+ radialCoeff_f24.f4[2] = static_cast<float4>(radialCoeff[2]);
+ radialCoeff_f24.f4[3] = static_cast<float4>(radialCoeff[3]);
+ radialCoeff_f24.f4[4] = static_cast<float4>(radialCoeff[4]);
+ radialCoeff_f24.f4[5] = static_cast<float4>(radialCoeff[5]);
+
+ // float kr = (1 + (radialCoeff[2] * r2Cube) + (radialCoeff[1] * r2Square) + (radialCoeff[0] * r2)) / (1 + (radialCoeff[5] * r2Cube) + (radialCoeff[4] * r2Square) + (radialCoeff[3] * r2))
+ kr1_f8.f4[0] = (one_f4 + (radialCoeff_f24.f4[2] * r2Cube_f8.f4[0]) + (radialCoeff_f24.f4[1] * r2Square_f8.f4[0]) + (radialCoeff_f24.f4[0] * r2_f8.f4[0]));
+ kr1_f8.f4[1] = (one_f4 + (radialCoeff_f24.f4[2] * r2Cube_f8.f4[1]) + (radialCoeff_f24.f4[1] * r2Square_f8.f4[1]) + (radialCoeff_f24.f4[0] * r2_f8.f4[1]));
+ kr2_f8.f4[0] = (one_f4 + (radialCoeff_f24.f4[5] * r2Cube_f8.f4[0]) + (radialCoeff_f24.f4[4] * r2Square_f8.f4[0]) + (radialCoeff_f24.f4[3] * r2_f8.f4[0]));
+ kr2_f8.f4[1] = (one_f4 + (radialCoeff_f24.f4[5] * r2Cube_f8.f4[1]) + (radialCoeff_f24.f4[4] * r2Square_f8.f4[1]) + (radialCoeff_f24.f4[3] * r2_f8.f4[1]));
+ rpp_hip_math_divide8(&kr1_f8, &kr2_f8, &kr_f8);
+
+ d_float8 xyMul2_f8;
+ rpp_hip_math_multiply8(&x_f8, &y_f8, &xyMul2_f8);
+ rpp_hip_math_multiply8_const(&xyMul2_f8, &xyMul2_f8, two_f4); // float xyMul2 = 2 * x * y
+
+ d_float8 colLoc_f8, rowLoc_f8;
+ rpp_hip_math_multiply8_const(&xSquare_f8, &xSquare_f8, two_f4); // xSquare = xSquare * 2;
+ rpp_hip_math_multiply8_const(&ySquare_f8, &ySquare_f8, two_f4); // ySquare = ySquare * 2;
+
+ d_float16 cameraMatrix_f16;
+ cameraMatrix_f16.f4[0] = static_cast<float4>(cameraMatrix_f9.f1[0]);
+ cameraMatrix_f16.f4[1] = static_cast<float4>(cameraMatrix_f9.f1[2]);
+ cameraMatrix_f16.f4[2] = static_cast<float4>(cameraMatrix_f9.f1[4]);
+ cameraMatrix_f16.f4[3] = static_cast<float4>(cameraMatrix_f9.f1[5]);
+
+ d_float8 tangentialCoeff_f8;
+ tangentialCoeff_f8.f4[0] = static_cast<float4>(tangentialCoeff[0]);
+ tangentialCoeff_f8.f4[1] = static_cast<float4>(tangentialCoeff[1]);
+
+ // float colLoc = cameraMatrix[0] * (x * kr + tangentialCoeff[0] * xyMul2 + tangentialCoeff[1] * (r2 + 2 * xSquare)) + cameraMatrix[2];
+ colLoc_f8.f4[0] = cameraMatrix_f16.f4[0] * ((x_f8.f4[0] * kr_f8.f4[0]) + (tangentialCoeff_f8.f4[0] * xyMul2_f8.f4[0]) + (tangentialCoeff_f8.f4[1] * (r2_f8.f4[0] + xSquare_f8.f4[0]))) + cameraMatrix_f16.f4[1];
+ colLoc_f8.f4[1] = cameraMatrix_f16.f4[0] * ((x_f8.f4[1] * kr_f8.f4[1]) + (tangentialCoeff_f8.f4[0] * xyMul2_f8.f4[1]) + (tangentialCoeff_f8.f4[1] * (r2_f8.f4[1] + xSquare_f8.f4[1]))) + cameraMatrix_f16.f4[1];
+
+ // float rowLoc = cameraMatrix[4] * (y * kr + tangentialCoeff[1] * xyMul2 + tangentialCoeff[0] * (r2 + 2 * ySquare)) + cameraMatrix[5];
+ rowLoc_f8.f4[0] = cameraMatrix_f16.f4[2] * ((y_f8.f4[0] * kr_f8.f4[0]) + (tangentialCoeff_f8.f4[1] * xyMul2_f8.f4[0]) + (tangentialCoeff_f8.f4[0] * (r2_f8.f4[0] + ySquare_f8.f4[0]))) + cameraMatrix_f16.f4[3];
+ rowLoc_f8.f4[1] = cameraMatrix_f16.f4[2] * ((y_f8.f4[1] * kr_f8.f4[1]) + (tangentialCoeff_f8.f4[1] * xyMul2_f8.f4[1]) + (tangentialCoeff_f8.f4[0] * (r2_f8.f4[1] + ySquare_f8.f4[1]))) + cameraMatrix_f16.f4[3];
+
+ rpp_hip_pack_float8_and_store8(colRemapTable + dstIdx, &colLoc_f8);
+ rpp_hip_pack_float8_and_store8(rowRemapTable + dstIdx, &rowLoc_f8);
+}
+
+// -------------------- Set 2 - Kernel Executors --------------------
+
+RppStatus hip_exec_lens_correction_tensor(RpptDescPtr dstDescPtr,
+ Rpp32f *rowRemapTable,
+ Rpp32f *colRemapTable,
+ RpptDescPtr remapTableDescPtr,
+ Rpp32f *cameraMatrix,
+ Rpp32f *distanceCoeffs,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rpp::Handle& handle)
+{
+ if (roiType == RpptRoiType::LTRB)
+ hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle);
+
+ int globalThreads_x = (dstDescPtr->w + 7) >> 3;
+ int globalThreads_y = dstDescPtr->h;
+ int globalThreads_z = dstDescPtr->n;
+
+ float *inverseMatrix = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem;
+ hipLaunchKernelGGL(compute_inverse_matrix_hip_tensor,
+ dim3(1, 1, ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(1, 1, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ reinterpret_cast<d_float9 *>(cameraMatrix),
+ reinterpret_cast<d_float9 *>(inverseMatrix));
+ hipLaunchKernelGGL(compute_remap_tables_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ rowRemapTable,
+ colRemapTable,
+ reinterpret_cast<d_float9 *>(cameraMatrix),
+ reinterpret_cast<d_float9 *>(inverseMatrix),
+ reinterpret_cast<d_float8 *>(distanceCoeffs),
+ make_uint2(remapTableDescPtr->strides.nStride, remapTableDescPtr->strides.hStride),
+ roiTensorPtrSrc);
+
+ return RPP_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/modules/hip/kernel/log.hpp b/src/modules/hip/kernel/log.hpp
new file mode 100644
index 000000000..a481a1e07
--- /dev/null
+++ b/src/modules/hip/kernel/log.hpp
@@ -0,0 +1,232 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+
+// -------------------- Set 1 - helper kernels --------------------
+template <typename T>
+__device__ void log_hip_compute(T *srcPtr, d_float8 *src_f8, d_float8 *dst_f8)
+{
+ if constexpr (std::is_same<T, Rpp8s>::value)
+ rpp_hip_math_add8_const(src_f8, src_f8, (float4)128);
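+ // shifting I8 inputs by +128 maps [-128, 127] to [0, 255] so the log argument is non-negative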
+
+ rpp_hip_math_log(src_f8, dst_f8);
+}
+
+// -------------------- Set 2 - log kernels --------------------
+template <typename T, typename U>
+__global__ void log_1d_hip_tensor(T *srcPtr,
+ uint srcStrides,
+ U *dstPtr,
+ uint dstStrides,
+ uint *roiTensor)
+{
+ uint id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // width
+ uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // batchsize
+
+ uint *roi = &roiTensor[id_z * 2];
+ uint beginX = roi[0];
+ uint width = roi[1];
+
+ if (id_x >= width)
+ return;
+
+ uint srcIdx = (id_z * srcStrides) + id_x + beginX;
+ uint dstIdx = (id_z * dstStrides) + id_x;
+
+ d_float8 src_f8, dst_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8);
+ log_hip_compute(srcPtr, &src_f8, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+}
+
+template <typename T, typename U>
+__global__ void log_2d_hip_tensor(T *srcPtr,
+ uint2 srcStridesNH,
+ U *dstPtr,
+ uint2 dstStridesNH,
+ uint *roiTensor)
+{
+ uint id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // width
+ uint id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // height
+ uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // batchsize
+
+ uint *roi = &roiTensor[id_z * 4];
+ uint beginY = roi[0];
+ uint beginX = roi[1];
+ uint height = roi[2];
+ uint width = roi[3];
+
+ if (id_x >= width || id_y >= height)
+ return;
+
+ uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + beginY) * srcStridesNH.y) + id_x + beginX;
+ uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x;
+
+ d_float8 src_f8, dst_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8);
+ log_hip_compute(srcPtr, &src_f8, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+}
+
+template <typename T, typename U>
+__global__ void log_3d_hip_tensor(T *srcPtr,
+ uint2 srcStridesDH,
+ U *dstPtr,
+ uint2 dstStridesDH,
+ uint *roiTensor)
+{
+ uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; // lengthX
+ uint id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // lengthY
+ uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // lengthZ
+
+ uint *roi = roiTensor;
+ uint beginZ = roi[0];
+ uint beginY = roi[1];
+ uint beginX = roi[2];
+ uint lengthZ = roi[3];
+ uint lengthY = roi[4];
+ uint lengthX = roi[5];
+
+ if (id_x >= lengthX || id_y >= lengthY || id_z >= lengthZ)
+ return;
+
+ uint srcIdx = ((id_z + beginZ) * srcStridesDH.x) + ((id_y + beginY) * srcStridesDH.y) + id_x + beginX;
+ uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x;
+
+ d_float8 src_f8, dst_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8);
+ log_hip_compute(srcPtr, &src_f8, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+}
+
+template <typename T, typename U>
+__global__ void log_nd_hip_tensor(T *srcPtr,
+ uint *srcStrides,
+ uint *srcDims,
+ uint numDims,
+ U *dstPtr,
+ uint *dstStrides,
+ Rpp32u *roiTensor)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // batchsize
+
+ if(id_x >= srcStrides[0])
+ return;
+
+ uint *roi = roiTensor + id_z * numDims * 2;
+ uint *begin = roi;
+ uint *length = &roi[numDims];
+ uint dstIdx = (id_z * *dstStrides++);
+ uint srcIdx = (id_z * *srcStrides++);
+ uint coords[RPPT_MAX_DIMS];
+
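+ // decompose the flat index id_x into per-dimension coordinates using the source strides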
+ for (int i = 0; i < numDims; i++)
+ {
+ coords[i] = (id_x / srcStrides[i]) % srcDims[i];
+ if(coords[i] >= length[i])
+ return;
+ }
+
+ for (int i = 0; i < numDims; i++)
+ {
+ dstIdx += (coords[i] * dstStrides[i]);
+ srcIdx += (begin[i] + (coords[i] * srcStrides[i]));
+ }
+
+ d_float8 src_f8, dst_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8);
+ log_hip_compute(srcPtr, &src_f8, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+}
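+
+ /* Worked example (illustrative): for one 3D sample with srcDims = {2, 3, 4} and inner
+    srcStrides = {12, 4, 1}, the flat index id_x = 17 decomposes to
+    coords = {(17 / 12) % 2, (17 / 4) % 3, 17 % 4} = {1, 1, 1}; the same coords are then
+    re-projected through dstStrides (plus the per-dim ROI begin on the source side) to
+    form dstIdx and srcIdx. */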
+
+// -------------------- Set 3 - executor kernels --------------------
+template <typename T, typename U>
+RppStatus hip_exec_log_generic_tensor(T *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ U *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ uint *roiTensor,
+ rpp::Handle& handle)
+{
+ Rpp32u numDims = srcGenericDescPtr->numDims - 1; // exclude batchsize from input dims
+ // based on number of dimensions call the corresponding kernel
+ if (numDims == 1)
+ {
+ // NW
+ int globalThreads_x = dstGenericDescPtr->dims[1];
+ int globalThreads_y = 1;
+ int globalThreads_z = dstGenericDescPtr->dims[0];
+
+ hipLaunchKernelGGL(log_1d_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ srcGenericDescPtr->strides[0],
+ dstPtr,
+ dstGenericDescPtr->strides[0],
+ roiTensor);
+ }
+ else if (numDims == 2)
+ {
+ // NHW
+ int globalThreads_x = dstGenericDescPtr->dims[2];
+ int globalThreads_y = dstGenericDescPtr->dims[1];
+ int globalThreads_z = dstGenericDescPtr->dims[0];
+
+ hipLaunchKernelGGL(log_2d_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1]),
+ dstPtr,
+ make_uint2(dstGenericDescPtr->strides[0], dstGenericDescPtr->strides[1]),
+ roiTensor);
+ }
+ else if (numDims == 3)
+ {
+ // NDHW
+ int globalThreads_x = dstGenericDescPtr->dims[3];
+ int globalThreads_y = dstGenericDescPtr->dims[2];
+ int globalThreads_z = dstGenericDescPtr->dims[1];
+
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ hipLaunchKernelGGL(log_3d_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr + (batchCount * srcGenericDescPtr->strides[0]),
+ make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]),
+ &roiTensor[batchCount * 6]);
+ }
+ }
+ else
+ {
+ // interpret the input as a 1D tensor
+ int globalThreads_x = (dstGenericDescPtr->strides[0] + 7) >> 3;
+ int globalThreads_y = 1;
+ int globalThreads_z = dstGenericDescPtr->dims[0];
+
+ hipLaunchKernelGGL(log_nd_hip_tensor,
+ dim3(ceil((float)globalThreads_x/1024), ceil((float)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((float)globalThreads_z/LOCAL_THREADS_Z_1DIM)),
+ dim3(1024, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ srcGenericDescPtr->strides,
+ srcGenericDescPtr->dims + 1,
+ srcGenericDescPtr->numDims - 1,
+ dstPtr,
+ dstGenericDescPtr->strides,
+ roiTensor);
+ }
+
+ return RPP_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/modules/hip/kernel/non_silent_region_detection.hpp b/src/modules/hip/kernel/non_silent_region_detection.hpp
new file mode 100644
index 000000000..80511464b
--- /dev/null
+++ b/src/modules/hip/kernel/non_silent_region_detection.hpp
@@ -0,0 +1,426 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+
+// -------------------- Set 0 - moving mean square kernel device helpers --------------------
+
+// calculate the position in shared memory to avoid bank conflicts
+__host__ __device__ __forceinline__ int compute_pos_in_smem(int pos)
+{
+ return pos + (pos >> 5); // since shared memory banks considered is 32
+}
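+
+ // Illustrative mapping with 32 banks: pos + (pos >> 5) inserts one padding slot per 32
+ // elements, so pos = 0..31 stay put, pos = 32 maps to 33 and pos = 64 to 66; strided
+ // accesses that would otherwise collide on one bank are spread across different banks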
+
+/* compute the exclusive prefix sum, in place, of the buffer passed in shared memory:
+   each output element is the sum of all elements before it in the input, excluding the current element */
+__device__ __forceinline__ void compute_prefix_sum(float *input, uint bufferLength)
+{
+ int offset = 1;
+ int2 offset_i2 = static_cast<int2>(offset);
+ int2 offsetAB_i2 = make_int2(offset - 1, 2 * offset - 1);
+ int threadIdxMul2 = 2 * hipThreadIdx_x;
+ int blockDimMul2 = 2 * hipBlockDim_x;
+
+ /* compute intermediate prefix sums in an up-sweep manner
+ (each level in the hierarchy doubles the distance between the pairs of elements being added) */
+ for (int d = bufferLength >> 1; d > 0; d >>= 1)
+ {
+ // syncthreads before proceeding to next iteration
+ __syncthreads();
+ int dMul2 = 2 * d;
+ for (int idxMul2 = threadIdxMul2; idxMul2 < dMul2; idxMul2 += blockDimMul2)
+ {
+ int2 pos_i2 = (offset_i2 * static_cast<int2>(idxMul2)) + offsetAB_i2;
+ input[compute_pos_in_smem(pos_i2.y)] += input[compute_pos_in_smem(pos_i2.x)];
+ }
+ offset <<= 1;
+ offset_i2 = static_cast<int2>(offset);
+ offsetAB_i2 = make_int2(offset - 1, 2 * offset - 1);
+ }
+
+ if (hipThreadIdx_x == 0)
+ {
+ int last = bufferLength - 1;
+ input[compute_pos_in_smem(last)] = 0;
+ }
+
+ /* compute final prefix sums in a down-sweep manner
+ (each level in the hierarchy halves the distance between the pairs of elements being added) */
+ for (int d = 1; d < bufferLength; d <<= 1)
+ {
+ offset >>= 1;
+ offset_i2 = static_cast<int2>(offset);
+ offsetAB_i2 = make_int2(offset - 1, 2 * offset - 1);
+ // syncthreads before proceeding to next iteration
+ __syncthreads();
+
+ int dMul2 = 2 * d;
+ for (int idxMul2 = threadIdxMul2; idxMul2 < dMul2; idxMul2 += blockDimMul2)
+ {
+ int2 pos_i2 = offset_i2 * static_cast<int2>(idxMul2) + offsetAB_i2;
+ int posA = compute_pos_in_smem(pos_i2.x);
+ int posB = compute_pos_in_smem(pos_i2.y);
+ float t = input[posA];
+ input[posA] = input[posB];
+ input[posB] += t;
+ }
+ }
+ __syncthreads();
+}
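+
+ /* Sketch of the scan on an 8-element buffer {3, 1, 7, 0, 4, 1, 6, 3}: the up-sweep builds
+    partial sums in place, the last element is then zeroed, and the down-sweep propagates the
+    running totals back down, leaving the exclusive prefix sums {0, 3, 4, 11, 11, 15, 16, 22}. */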
+
+// -------------------- Set 1 - moving mean square compute kernel --------------------
+
+__global__ void moving_mean_square_hip_tensor(float *srcPtr,
+ uint nStride,
+ float *mmsArr,
+ int *srcLengthTensor,
+ int outputTileLength,
+ int windowLength,
+ float windowFactor,
+ int inputTileLength)
+{
+ int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+ uint srcLength = srcLengthTensor[id_z];
+ uint batchStride = id_z * nStride;
+ int blockStart = hipBlockIdx_x * outputTileLength;
+
+ if (blockStart >= srcLength)
+ return;
+
+ float *input = srcPtr + batchStride;
+ extern __shared__ float squaredPrefixSum_smem[];
+
+ float *inBlockPtr = srcPtr + batchStride + blockStart;
+ float *outBlockPtr = mmsArr + batchStride + blockStart;
+
+ // find the valid output tile length for the given block
+ int validOutputTileLength = std::min<int>(outputTileLength, srcLength - blockStart);
+
+ // assign pointers to the block begin and block end locations
+ float *extendedBlockStart = inBlockPtr - windowLength;
+ float *extendedBlockEnd = inBlockPtr + validOutputTileLength;
+
+ // load input data to shared memory
+ for(int pos = hipThreadIdx_x; pos < inputTileLength; pos += hipBlockDim_x)
+ {
+ float val = 0.0f;
+ auto extendedBlockPtr = extendedBlockStart + pos;
+
+ /* check if extendedBlockPtr is within the valid region of input
+ and load the value from extendedBlockPtr if it is within valid region */
+ if (extendedBlockPtr >= input && extendedBlockPtr < extendedBlockEnd)
+ val = *extendedBlockPtr;
+ squaredPrefixSum_smem[compute_pos_in_smem(pos)] = val * val;
+ }
+
+ // compute prefix sum
+ compute_prefix_sum(squaredPrefixSum_smem, inputTileLength);
+
+ // compute the mms value here
+ for(int pos = hipThreadIdx_x; pos < validOutputTileLength; pos += hipBlockDim_x)
+ outBlockPtr[pos] = windowFactor * ((inBlockPtr[pos] * inBlockPtr[pos]) + squaredPrefixSum_smem[compute_pos_in_smem(windowLength + pos)] - squaredPrefixSum_smem[compute_pos_in_smem(pos + 1)]);
+}
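+
+ /* The per-position mean square is recovered from the exclusive prefix sums of squares:
+    squaredPrefixSum[windowLength + pos] - squaredPrefixSum[pos + 1] yields the first
+    windowLength - 1 squares of the window, inBlockPtr[pos]^2 adds the newest sample, and
+    windowFactor (1 / windowLength) turns the sum into a mean. */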
+
+// -------------------- Set 2 - kernels for finding cutoffmag value --------------------
+
+__global__ void max_reduction_hip_tensor(float *srcPtr,
+ uint nStride,
+ float *maxArr,
+ int *srcLengthTensor)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+ uint srcLength = srcLengthTensor[id_z];
+
+ uint srcIdx = id_z * nStride;
+ __shared__ float max_smem[256]; // 256 values of src in a 256 x 1 thread block
+ max_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 256 threads
+
+ if (id_x >= srcLength)
+ return;
+
+ if (id_x + 8 > srcLength)
+ id_x -= (id_x + 8 - srcLength);
+
+ srcIdx += id_x;
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+ rpp_hip_math_max8(&src_f8, &max_smem[hipThreadIdx_x]);
+ __syncthreads(); // syncthreads after max compute
+
+ // Reduction of 256 floats on 256 threads per block in x dimension
+ for (int threadMax = 128; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ max_smem[hipThreadIdx_x] = fmaxf(max_smem[hipThreadIdx_x], max_smem[hipThreadIdx_x + threadMax]);
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_x == 0)
+ {
+ int dstIdx = id_z * hipGridDim_x + hipBlockIdx_x;
+ maxArr[dstIdx] = max_smem[0];
+ }
+}
+
+__global__ void cutoffmag_hip_tensor(float *srcPtr,
+ int maxLength,
+ float *cutOffMagPtr,
+ float cutOff,
+ float referencePower,
+ bool referenceMax)
+{
+ int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ // if referenceMax is set to true, perform final max reduction on srcPtr and compute cutOffMag
+ if(referenceMax)
+ {
+ uint srcIdx = id_z * maxLength;
+ __shared__ float max_smem[256]; // 256 values of src in a 256 x 1 thread block
+ max_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 256 threads
+
+ if (id_x >= maxLength)
+ return;
+
+ srcIdx += id_x;
+ float maxVal = srcPtr[srcIdx];
+ while (id_x < maxLength)
+ {
+ maxVal = fmaxf(maxVal, srcPtr[srcIdx]);
+ id_x += hipBlockDim_x;
+ srcIdx += hipBlockDim_x;
+ }
+ max_smem[hipThreadIdx_x] = maxVal;
+ __syncthreads(); // syncthreads after max compute
+
+ // Reduction of 256 floats on 256 threads per block in x dimension
+ for (int threadMax = 128; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ max_smem[hipThreadIdx_x] = max(max_smem[hipThreadIdx_x], max_smem[hipThreadIdx_x + threadMax]);
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_x == 0)
+ cutOffMagPtr[id_z] = max_smem[0] * cutOff;
+ }
+ else
+ {
+ if (hipThreadIdx_x == 0)
+ cutOffMagPtr[id_z] = referencePower * cutOff;
+ }
+}
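+
+ /* cutOff arrives as the linear threshold 10^(cutOffDB / 10) computed on the host; e.g.
+    cutOffDB = -60 gives cutOff = 1e-6, so with referenceMax the final threshold becomes
+    1e-6 * max(mms), i.e. a -60 dB cutoff relative to the loudest point in the sample. */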
+
+// -------------------- Set 3 - kernels for finding begin and length of NSR in inputs --------------------
+
+__global__ void find_region_hip_tensor(float *srcPtr,
+ uint nStride,
+ int *beginTensor,
+ int *lengthTensor,
+ float *cutOffMagPtr,
+ int *srcLengthTensor,
+ float windowLength)
+{
+ int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+ uint srcLength = srcLengthTensor[id_z];
+ float cutOffMag = cutOffMagPtr[id_z];
+
+ __shared__ int beginResult;
+ __shared__ int endResult;
+ beginResult = srcLength;
+ endResult = 0;
+ __syncthreads();
+
+ int beginIdx = srcLength;
+ int endIdx = 0;
+ uint stridePerSample = id_z * nStride;
+
+ // Find the begin index in src whose value is >= cutOffMag
+ for (int i = id_x; i < srcLength; i += hipBlockDim_x)
+ {
+ uint srcIdx = stridePerSample + i;
+ if (srcPtr[srcIdx] >= cutOffMag)
+ {
+ beginIdx = i;
+ atomicMin(&beginResult, beginIdx);
+ if(beginResult != srcLength)
+ break;
+ }
+ }
+
+ // Find the end index in src whose value is >= cutOffMag
+ for (int i = id_x; i < srcLength; i += hipBlockDim_x)
+ {
+ uint srcIdx = stridePerSample + srcLength - 1 - i;
+ if (srcPtr[srcIdx] >= cutOffMag)
+ {
+ endIdx = srcLength - 1 - i;
+ atomicMax(&endResult, endIdx);
+ if(endResult != 0)
+ break;
+ }
+ }
+
+ // Final store to dst
+ if(hipThreadIdx_x == 0)
+ {
+ if(beginResult == srcLength || endResult == 0)
+ {
+ beginTensor[id_z] = 0;
+ lengthTensor[id_z] = 0;
+ }
+ else
+ {
+ int detectBegin = beginResult;
+ int detectEnd = endResult - beginResult + 1;
+
+ // if both the starting index and the length of the non-silent region are non-zero,
+ // adjust the values as per the windowLength
+ if(detectBegin != 0 && detectEnd != 0)
+ {
+ int newBegin = max(detectBegin - (windowLength - 1), 0);
+ detectEnd += detectBegin - newBegin;
+ detectBegin = newBegin;
+ }
+ beginTensor[id_z] = detectBegin;
+ lengthTensor[id_z] = detectEnd;
+ }
+ }
+}
+
+// -------------------- Set 4 - host helpers for kernel executor --------------------
+
+// return the nearest previous power of 2 for the given number
+inline Rpp32s prev_pow2(Rpp32s n)
+{
+ Rpp32s pow2 = 1;
+ while (n - pow2 > pow2)
+ pow2 += pow2;
+
+ return pow2;
+}
+
+// return the nearest next power of 2 for the given number
+inline Rpp32s next_pow2(Rpp32s n)
+{
+ Rpp32s pow2 = 1;
+ while (n > pow2)
+ pow2 += pow2;
+
+ return pow2;
+}
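+
+ // Examples: prev_pow2(20) = 16 and next_pow2(20) = 32; for an exact power of two,
+ // next_pow2(16) = 16 while prev_pow2(16) = 8, since the loop only grows pow2 while n > 2 * pow2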
+
+// -------------------- Set 5 - non silent region kernels executor --------------------
+
+RppStatus hip_exec_non_silent_region_detection_tensor(Rpp32f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32s *srcLengthTensor,
+ Rpp32s *detectedIndexTensor,
+ Rpp32s *detectionLengthTensor,
+ Rpp32f cutOffDB,
+ Rpp32s windowLength,
+ Rpp32f referencePower,
+ Rpp32s resetInterval,
+ rpp::Handle& handle)
+{
+ // check if scratch memory size required for moving mean square is within the limits
+ if ((srcDescPtr->n * srcDescPtr->strides.nStride) > MMS_MAX_SCRATCH_MEMORY)
+ return RPP_ERROR_OUT_OF_BOUND_SCRATCH_MEMORY_SIZE;
+
+ Rpp32f *mmsArr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem;
+ Rpp32s maxSharedMemoryInBytes = handle.GetLocalMemorySize();
+ Rpp32s maxSharedMemoryElements = maxSharedMemoryInBytes / sizeof(Rpp32f);
+ Rpp32s kSharedMemBanks = 32;
+ Rpp32s inputTileLength = prev_pow2(maxSharedMemoryElements * kSharedMemBanks / (kSharedMemBanks + 1));
+
+ if (resetInterval > 0 && resetInterval < inputTileLength)
+ {
+ Rpp32s p = prev_pow2(resetInterval);
+ Rpp32s n = next_pow2(resetInterval);
+ if (p > windowLength)
+ inputTileLength = p;
+ else if (n < inputTileLength)
+ inputTileLength = n;
+ }
+
+ Rpp32s sharedMemorySizeInBytes = compute_pos_in_smem(inputTileLength) * sizeof(Rpp32f);
+ Rpp32s outputTileLength = inputTileLength - windowLength;
+ Rpp32f windowFactor = 1.0f / windowLength;
+
+ if (outputTileLength <= 0)
+ return RPP_ERROR_INVALID_OUTPUT_TILE_LENGTH;
+
+ if (sharedMemorySizeInBytes > maxSharedMemoryInBytes)
+ return RPP_ERROR_OUT_OF_BOUND_SHARED_MEMORY_SIZE;
+
+ // launch kernel to compute the values needed for MMS Array
+ Rpp32s globalThreads_x = ceil(static_cast<Rpp32f>(srcDescPtr->strides.nStride) / outputTileLength);
+ Rpp32s globalThreads_y = 1;
+ Rpp32s globalThreads_z = srcDescPtr->n;
+
+ hipLaunchKernelGGL(moving_mean_square_hip_tensor,
+ dim3(globalThreads_x, globalThreads_y, globalThreads_z),
+ dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM),
+ sharedMemorySizeInBytes,
+ handle.GetStream(),
+ srcPtr,
+ srcDescPtr->strides.nStride,
+ mmsArr,
+ srcLengthTensor,
+ outputTileLength,
+ windowLength,
+ windowFactor,
+ inputTileLength);
+
+ const Rpp32f cutOff = std::pow(10.0f, cutOffDB * 0.1f);
+ bool referenceMax = (!referencePower);
+ Rpp32f *partialMaxArr = mmsArr + srcDescPtr->n * srcDescPtr->strides.nStride;
+
+ Rpp32s numBlocksPerSample = ceil(static_cast<Rpp32f>(srcDescPtr->strides.nStride) / (LOCAL_THREADS_X_1DIM * 8));
+ Rpp32s cutOffMagKernelBlockSize = 1;
+ if (referenceMax)
+ {
+ // compute max value in MMS buffer
+ hipLaunchKernelGGL(max_reduction_hip_tensor,
+ dim3(numBlocksPerSample, 1, globalThreads_z),
+ dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM),
+ 0,
+ handle.GetStream(),
+ mmsArr,
+ srcDescPtr->strides.nStride,
+ partialMaxArr,
+ srcLengthTensor);
+ cutOffMagKernelBlockSize = 256;
+ }
+ // find the cutoff value in magnitude
+ Rpp32f *cutOffMagPtr = partialMaxArr + globalThreads_z * numBlocksPerSample;
+ hipLaunchKernelGGL(cutoffmag_hip_tensor,
+ dim3(1, 1, globalThreads_z),
+ dim3(cutOffMagKernelBlockSize, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM),
+ 0,
+ handle.GetStream(),
+ partialMaxArr,
+ numBlocksPerSample,
+ cutOffMagPtr,
+ cutOff,
+ referencePower,
+ referenceMax);
+
+ // find the begin and length values of NSR in inputs
+ hipLaunchKernelGGL(find_region_hip_tensor,
+ dim3(1, 1, globalThreads_z),
+ dim3(1024, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM),
+ 0,
+ handle.GetStream(),
+ mmsArr,
+ srcDescPtr->strides.nStride,
+ detectedIndexTensor,
+ detectionLengthTensor,
+ cutOffMagPtr,
+ srcLengthTensor,
+ windowLength);
+ return RPP_SUCCESS;
+}
diff --git a/src/modules/hip/kernel/to_decibels.hpp b/src/modules/hip/kernel/to_decibels.hpp
new file mode 100644
index 000000000..e1d45d098
--- /dev/null
+++ b/src/modules/hip/kernel/to_decibels.hpp
@@ -0,0 +1,312 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+
+// -------------------- Set 0 - to_decibels device helpers --------------------
+
+__device__ __forceinline__ void to_decibels_hip_compute(d_float8 *src_f8, d_float8 *dst_f8, double minRatio, float multiplier, float inverseMagnitude)
+{
+ dst_f8->f1[0] = multiplier * log2(max(minRatio, (static_cast<double>(src_f8->f1[0]) * inverseMagnitude)));
+ dst_f8->f1[1] = multiplier * log2(max(minRatio, (static_cast<double>(src_f8->f1[1]) * inverseMagnitude)));
+ dst_f8->f1[2] = multiplier * log2(max(minRatio, (static_cast<double>(src_f8->f1[2]) * inverseMagnitude)));
+ dst_f8->f1[3] = multiplier * log2(max(minRatio, (static_cast<double>(src_f8->f1[3]) * inverseMagnitude)));
+ dst_f8->f1[4] = multiplier * log2(max(minRatio, (static_cast<double>(src_f8->f1[4]) * inverseMagnitude)));
+ dst_f8->f1[5] = multiplier * log2(max(minRatio, (static_cast<double>(src_f8->f1[5]) * inverseMagnitude)));
+ dst_f8->f1[6] = multiplier * log2(max(minRatio, (static_cast<double>(src_f8->f1[6]) * inverseMagnitude)));
+ dst_f8->f1[7] = multiplier * log2(max(minRatio, (static_cast<double>(src_f8->f1[7]) * inverseMagnitude)));
+}
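+
+ /* multiplier arrives pre-scaled by log10(2) on the host, so multiplier * log2(x) equals the
+    caller's multiplier_dB * log10(x); e.g. with multiplier_dB = 10, a ratio of 0.01 maps to -20 dB. */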
+
+// -------------------- Set 1 - kernels for finding inverse magnitude value --------------------
+
+__global__ void inverse_magnitude_hip_tensor(float *srcPtr,
+ int maxLength,
+ bool computeMax,
+ float *inverseMagnitudeTensor)
+{
+ int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ // do the final reduction on the block-wise max values
+ if (computeMax)
+ {
+ uint srcIdx = id_z * maxLength;
+ __shared__ float max_smem[256]; // 256 values of src in a 256 x 1 thread block
+ max_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 256 threads
+
+ if (id_x >= maxLength)
+ return;
+
+ srcIdx += id_x;
+ float maxVal = srcPtr[srcIdx];
+ while (id_x < maxLength)
+ {
+ maxVal = fmaxf(maxVal, srcPtr[srcIdx]);
+ id_x += hipBlockDim_x;
+ srcIdx += hipBlockDim_x;
+ }
+ max_smem[hipThreadIdx_x] = maxVal;
+ __syncthreads(); // syncthreads after max compute
+
+ // Reduction of 256 floats on 256 threads per block in x dimension
+ for (int threadMax = 128; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ max_smem[hipThreadIdx_x] = max(max_smem[hipThreadIdx_x], max_smem[hipThreadIdx_x + threadMax]);
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_x == 0)
+ inverseMagnitudeTensor[id_z] = 1.f / max_smem[0];
+ }
+ else
+ {
+ inverseMagnitudeTensor[id_z] = 1.0f;
+ }
+}
+
+__global__ void max_reduction_1d_hip_tensor(float *srcPtr,
+ uint2 srcStridesNH,
+ RpptImagePatchPtr srcDims,
+ float *maxArr)
+{
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+
+ uint srcLength = srcDims[id_z].height;
+ uint srcIdx = id_z * srcStridesNH.x;
+ __shared__ float max_smem[256]; // 256 values of src in a 256 x 1 thread block
+ max_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 256 threads
+
+ if (id_x >= srcLength)
+ return;
+
+ srcIdx += id_x;
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+ rpp_hip_math_max8(&src_f8, &max_smem[hipThreadIdx_x]);
+ __syncthreads(); // syncthreads after max compute
+
+ // Reduction of 256 floats on 256 threads per block in x dimension
+ for (int threadMax = 128; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ max_smem[hipThreadIdx_x] = fmaxf(max_smem[hipThreadIdx_x], max_smem[hipThreadIdx_x + threadMax]);
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_x == 0)
+ maxArr[id_z * hipGridDim_x + hipBlockIdx_x] = max_smem[0];
+}
+
+__global__ void max_reduction_2d_hip_tensor(float *srcPtr,
+ uint2 srcStridesNH,
+ RpptImagePatchPtr srcDims,
+ float *maxArr)
+{
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+ int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+
+ __shared__ float partialMax_smem[16][16]; // 16 rows of src, 16 reduced cols of src in a 16 x 16 thread block
+ uint srcIdx = (id_z * srcStridesNH.x);
+ float *partialMaxRowPtr_smem = &partialMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS
+ partialMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 16 x 16 threads
+
+ if ((id_y >= srcDims[id_z].height) || (id_x >= srcDims[id_z].width))
+ return;
+
+ srcIdx += ((id_y * srcStridesNH.y) + id_x);
+ partialMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx];
+ __syncthreads(); // syncthreads
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ for (int threadMax = 8; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ partialMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialMaxRowPtr_smem[hipThreadIdx_x], partialMaxRowPtr_smem[hipThreadIdx_x + threadMax]);
+ __syncthreads();
+ }
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in y dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ partialMaxRowPtr_smem[0] = fmaxf(partialMaxRowPtr_smem[0], partialMaxRowPtr_smem[increment]);
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ maxArr[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialMaxRowPtr_smem[0];
+ }
+}
+
+// -------------------- Set 2 - to decibels kernels --------------------
+
+__global__ void to_decibels_1d_hip_tensor(float *srcPtr,
+ uint srcStride,
+ float *dstPtr,
+ uint dstStride,
+ RpptImagePatchPtr srcDims,
+ double minRatio,
+ float multiplier,
+ float *inverseMagnitudeTensor)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if (id_x >= srcDims[id_z].height)
+ return;
+
+ uint srcIdx = (id_z * srcStride) + id_x;
+ float inverseMagnitude = inverseMagnitudeTensor[id_z];
+
+ d_float8 src_f8, dst_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8);
+ to_decibels_hip_compute(&src_f8, &dst_f8, minRatio, multiplier, inverseMagnitude);
+
+ uint dstIdx = (id_z * dstStride) + id_x;
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+}
+
+__global__ void to_decibels_2d_hip_tensor(float *srcPtr,
+ uint2 srcStridesNH,
+ float *dstPtr,
+ uint2 dstStridesNH,
+ RpptImagePatchPtr srcDims,
+ double minRatio,
+ float multiplier,
+ float *inverseMagnitudeTensor)
+{
+ int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if (id_x >= srcDims[id_z].width || id_y >= srcDims[id_z].height)
+ return;
+
+ uint srcIdx = (id_z * srcStridesNH.x) + (id_y * srcStridesNH.y) + id_x;
+ uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x;
+ float inverseMagnitude = inverseMagnitudeTensor[id_z];
+ dstPtr[dstIdx] = multiplier * log2(max(minRatio, (static_cast<double>(srcPtr[srcIdx]) * inverseMagnitude)));
+}
+
+// -------------------- Set 3 - to decibels kernels executor --------------------
+
+RppStatus hip_exec_to_decibels_tensor(Rpp32f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptImagePatchPtr srcDims,
+ Rpp32f cutOffDB,
+ Rpp32f multiplier,
+ Rpp32f referenceMagnitude,
+ rpp::Handle& handle)
+{
+ Rpp32u numDims = srcDescPtr->numDims - 1; // exclude batchSize from input dims
+
+ // Calculate the intermediate values needed for DB conversion
+ Rpp32f minRatio = std::pow(10, cutOffDB / multiplier);
+ if(!minRatio)
+ minRatio = std::nextafter(0.0f, 1.0f);
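+ // e.g. cutOffDB = -80 with multiplier = 10 gives minRatio = 1e-8, clamping the log argument
+ // so no output falls below the -80 dB floor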
+ const Rpp32f log10Factor = 0.3010299956639812; // log10(2), i.e. 1 / log2(10)
+ multiplier *= log10Factor;
+
+ // calculate max in input if referenceMagnitude = 0
+ Rpp32f *partialMaxArr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem;
+ Rpp32s numBlocksPerSample = 0;
+ Rpp32s globalThreads_z = dstDescPtr->n;
+
+ // find the invReferenceMagnitude value
+ bool computeMax = (!referenceMagnitude);
+ if(computeMax)
+ {
+ if (numDims == 1)
+ {
+ numBlocksPerSample = ceil(static_cast<Rpp32f>((srcDescPtr->strides.nStride + 7) >> 3) / LOCAL_THREADS_X_1DIM);
+ hipLaunchKernelGGL(max_reduction_1d_hip_tensor,
+ dim3(numBlocksPerSample, 1, globalThreads_z),
+ dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcDescPtr->strides.nStride, 1),
+ srcDims,
+ partialMaxArr);
+ }
+ else if (numDims == 2)
+ {
+ Rpp32s gridDim_x = ceil(static_cast<Rpp32f>(srcDescPtr->strides.hStride) / LOCAL_THREADS_X);
+ Rpp32s gridDim_y = ceil(static_cast<Rpp32f>(srcDescPtr->h) / LOCAL_THREADS_Y);
+ Rpp32s gridDim_z = ceil(static_cast<Rpp32f>(globalThreads_z) / LOCAL_THREADS_Z);
+ numBlocksPerSample = gridDim_x * gridDim_y * gridDim_z;
+ hipLaunchKernelGGL(max_reduction_2d_hip_tensor,
+ dim3(gridDim_x, gridDim_y, gridDim_z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ srcDims,
+ partialMaxArr);
+ }
+ hipStreamSynchronize(handle.GetStream());
+ }
+ Rpp32u blockSize = (computeMax) ? 256 : 1;
+ Rpp32f *inverseMagnitudeTensor = partialMaxArr + globalThreads_z * numBlocksPerSample;
+ hipLaunchKernelGGL(inverse_magnitude_hip_tensor,
+ dim3(1, 1, globalThreads_z),
+ dim3(blockSize, 1, 1),
+ 0,
+ handle.GetStream(),
+ partialMaxArr,
+ numBlocksPerSample,
+ computeMax,
+ inverseMagnitudeTensor);
+ hipStreamSynchronize(handle.GetStream());
+
+ // launch kernel for todecibels
+ if (numDims == 1)
+ {
+ Rpp32s globalThreads_x = (srcDescPtr->strides.nStride + 7) >> 3;
+ Rpp32s globalThreads_y = 1;
+ hipLaunchKernelGGL(to_decibels_1d_hip_tensor,
+ dim3(ceil((Rpp32f)globalThreads_x/LOCAL_THREADS_X_1DIM), ceil((Rpp32f)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((Rpp32f)globalThreads_z/LOCAL_THREADS_Z_1DIM)),
+ dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ srcDescPtr->strides.nStride,
+ dstPtr,
+ dstDescPtr->strides.nStride,
+ srcDims,
+ static_cast<double>(minRatio),
+ multiplier,
+ inverseMagnitudeTensor);
+ }
+ else if (numDims == 2)
+ {
+ Rpp32s globalThreads_x = srcDescPtr->strides.hStride;
+ Rpp32s globalThreads_y = srcDescPtr->h;
+ hipLaunchKernelGGL(to_decibels_2d_hip_tensor,
+ dim3(ceil((Rpp32f)globalThreads_x/LOCAL_THREADS_X), ceil((Rpp32f)globalThreads_y/LOCAL_THREADS_Y), ceil((Rpp32f)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride),
+ srcDims,
+ static_cast<double>(minRatio),
+ multiplier,
+ inverseMagnitudeTensor);
+ }
+
+ return RPP_SUCCESS;
+}
diff --git a/src/modules/hip/kernel/transpose.hpp b/src/modules/hip/kernel/transpose.hpp
new file mode 100644
index 000000000..83f2ba700
--- /dev/null
+++ b/src/modules/hip/kernel/transpose.hpp
@@ -0,0 +1,105 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+
+// Vectorized dst->src mapping
+template <typename T>
+__global__ void transpose_generic_hip_tensor(T *srcPtr,
+ uint *srcStrides,
+ T *dstPtr,
+ uint *dstStrides,
+ uint *dstDims,
+ uint tensorDims,
+ uint *permTensor)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+
+ if(id_x >= dstStrides[0])
+ return;
+
+ int maxLength = dstStrides[0];
+ int xDiff = maxLength - (maxLength & ~7); // difference between maxLength and alignedLength. (alignedLength = maxLength & ~7)
+
+ // Point dstIdx and srcIdx to be at the start of given input tensor in batch
+ uint dstIdx = (id_y * *dstStrides++); // post-increment dstStrides pointer by 1 to exclude outermost batch-dimension stride (for example exclude nStride in an NCDHW tensor)
+ uint srcIdx = (id_y * *srcStrides++); // post-increment srcStrides pointer by 1 to exclude outermost batch-dimension stride (for example exclude nStride in an NCDHW tensor)
+
+ d_uint8 dstCoords[RPPT_MAX_DIMS], srcIdxs;
+ uint4 idx0123 = make_uint4(id_x, id_x + 1, id_x + 2, id_x + 3); // get idx for elements 0, 1, 2, 3 in the 8-element vectorized kernel
+ uint4 idx4567 = make_uint4(id_x + 4, id_x + 5, id_x + 6, id_x + 7); // get idx for elements 4, 5, 6, 7 in the 8-element vectorized kernel
+ srcIdxs.ui4[0] = srcIdxs.ui4[1] = make_uint4(srcIdx, srcIdx, srcIdx, srcIdx); // create 8-element vectorized srcIdxs
+
+ // Compute 8 dstCoords given idx0123 and idx4567, corresponding to the 8 srcCoords processed in a thread
+ for (int i = 0; i < tensorDims; i++)
+ {
+ dstCoords[i].ui4[0] = (idx0123 / dstStrides[i]) % dstDims[i]; // transpose 4 srcCoords using idx0123 to 4 dstCoords in dstCoords[i].ui4[0] for the ith tensor dimension
+ dstCoords[i].ui4[1] = (idx4567 / dstStrides[i]) % dstDims[i]; // transpose 4 srcCoords using idx4567 to 4 dstCoords in dstCoords[i].ui4[1] for the ith tensor dimension
+ }
+
+ // Compute corresponding 8 srcIdxs given id_x
+ for (int i = 0; i < tensorDims; i++)
+ {
+ uint4 srcStrides_ui4 = static_cast<uint4>(srcStrides[permTensor[permTensor[i]]]);
+ srcIdxs.ui4[0] += (dstCoords[permTensor[i]].ui4[0] * srcStrides_ui4); // incrementally adding respective (coordinate value * stride) to get srcIdxs for 0, 1, 2, 3 elements
+ srcIdxs.ui4[1] += (dstCoords[permTensor[i]].ui4[1] * srcStrides_ui4); // incrementally adding respective (coordinate value * stride) to get srcIdxs for 4, 5, 6, 7 elements
+ dstIdx += (dstCoords[i].ui1[0] * dstStrides[i]);
+ }
+
+ // Move srcIdx to access next input tensor once id_x goes beyond present tensor
+ if((id_x + 8) > maxLength)
+ for(int i = xDiff; i < 8; i++)
+ srcIdxs.ui1[i] += maxLength;
+
+ // Load corresponding 8 src pixels from computed src idx values
+ d_float8 dst_f8;
+ dst_f8.f1[0] = static_cast<float>(srcPtr[srcIdxs.ui1[0]]);
+ dst_f8.f1[1] = static_cast<float>(srcPtr[srcIdxs.ui1[1]]);
+ dst_f8.f1[2] = static_cast<float>(srcPtr[srcIdxs.ui1[2]]);
+ dst_f8.f1[3] = static_cast<float>(srcPtr[srcIdxs.ui1[3]]);
+ dst_f8.f1[4] = static_cast<float>(srcPtr[srcIdxs.ui1[4]]);
+ dst_f8.f1[5] = static_cast<float>(srcPtr[srcIdxs.ui1[5]]);
+ dst_f8.f1[6] = static_cast<float>(srcPtr[srcIdxs.ui1[6]]);
+ dst_f8.f1[7] = static_cast<float>(srcPtr[srcIdxs.ui1[7]]);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+}
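+
+ /* Illustrative mapping: for a 2D sample with srcDims = {3, 4} (strides {4, 1}) and
+    permTensor = {1, 0}, dstDims = {4, 3} (strides {3, 1}); the dst element at coords {x, y}
+    then reads src at coords {y, x}, i.e. dstIdx = x * 3 + y is filled from srcIdx = y * 4 + x. */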
+
+template <typename T>
+RppStatus hip_exec_transpose_tensor(T *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ T *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u *permTensor,
+ Rpp32u *roiTensor,
+ rpp::Handle& handle)
+{
+ // Check for feasibility of direct copy from input to output if no permutation detected
+ bool copyInput = true;
+ for(int i = 0; i < dstGenericDescPtr->numDims - 1; i++)
+ copyInput *= (permTensor[i] == i);
+
+ if (copyInput)
+ {
+ CHECK_RETURN_STATUS(hipMemcpyAsync(dstPtr, srcPtr, dstGenericDescPtr->dims[0] * dstGenericDescPtr->strides[0] * sizeof(T), hipMemcpyDeviceToDevice, handle.GetStream()));
+ }
+ else
+ {
+ int globalThreads_x = (dstGenericDescPtr->strides[0] + 7) >> 3;
+ int globalThreads_y = dstGenericDescPtr->dims[0];
+ int globalThreads_z = 1;
+
+ hipLaunchKernelGGL(transpose_generic_hip_tensor,
+ dim3(ceil((float)globalThreads_x/1024), ceil((float)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((float)globalThreads_z/LOCAL_THREADS_Z_1DIM)),
+ dim3(1024, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ srcGenericDescPtr->strides,
+ dstPtr,
+ dstGenericDescPtr->strides,
+ dstGenericDescPtr->dims + 1,
+ dstGenericDescPtr->numDims - 1,
+ permTensor);
+ }
+
+ return RPP_SUCCESS;
+}
diff --git a/src/modules/rppt_tensor_arithmetic_operations.cpp b/src/modules/rppt_tensor_arithmetic_operations.cpp
index 8f88ba90f..bac68a4a1 100644
--- a/src/modules/rppt_tensor_arithmetic_operations.cpp
+++ b/src/modules/rppt_tensor_arithmetic_operations.cpp
@@ -255,6 +255,57 @@ RppStatus rppt_magnitude_host(RppPtr_t srcPtr1,
return RPP_SUCCESS;
}
+/******************** log ********************/
+
+RppStatus rppt_log_host(RppPtr_t srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ RppPtr_t dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u *roiTensor,
+ rppHandle_t rppHandle)
+{
+ if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE;
+ else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8)) return RPP_ERROR_INVALID_DST_DATATYPE;
+ else if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::F32))
+ {
+ log_generic_host_tensor(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes,
+ srcGenericDescPtr,
+ reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16))
+ {
+ log_generic_host_tensor(reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+ srcGenericDescPtr,
+ reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32))
+ {
+ log_generic_host_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+ srcGenericDescPtr,
+ reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::F32))
+ {
+ log_generic_host_tensor(static_cast<Rpp8s*>(srcPtr) + srcGenericDescPtr->offsetInBytes,
+ srcGenericDescPtr,
+ reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+}
+
/********************************************************************************************************************/
/*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/
/********************************************************************************************************************/
@@ -454,4 +505,59 @@ RppStatus rppt_magnitude_gpu(RppPtr_t srcPtr1,
#endif // backend
}
+/******************** log ********************/
+
+RppStatus rppt_log_gpu(RppPtr_t srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ RppPtr_t dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u *roiTensor,
+ rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+ if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE;
+ else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8)) return RPP_ERROR_INVALID_DST_DATATYPE;
+ else if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::F32))
+ {
+ hip_exec_log_generic_tensor(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes,
+ srcGenericDescPtr,
+ reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16))
+ {
+ hip_exec_log_generic_tensor(reinterpret_cast<half*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+ srcGenericDescPtr,
+ reinterpret_cast<half*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32))
+ {
+ hip_exec_log_generic_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+ srcGenericDescPtr,
+ reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::F32))
+ {
+ hip_exec_log_generic_tensor(static_cast<Rpp8s*>(srcPtr) + srcGenericDescPtr->offsetInBytes,
+ srcGenericDescPtr,
+ reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
#endif // GPU_SUPPORT
diff --git a/src/modules/rppt_tensor_audio_augmentations.cpp b/src/modules/rppt_tensor_audio_augmentations.cpp
index 0267985e5..c98832f3c 100644
--- a/src/modules/rppt_tensor_audio_augmentations.cpp
+++ b/src/modules/rppt_tensor_audio_augmentations.cpp
@@ -22,11 +22,17 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
+#ifdef AUDIO_SUPPORT
+
#include "rppdefs.h"
#include "rppi_validate.hpp"
#include "rppt_tensor_audio_augmentations.h"
#include "cpu/host_tensor_audio_augmentations.hpp"
+#ifdef HIP_COMPILE
+ #include "hip/hip_tensor_audio_augmentations.hpp"
+#endif // HIP_COMPILE
+
/******************** non_silent_region_detection ********************/
RppStatus rppt_non_silent_region_detection_host(RppPtr_t srcPtr,
@@ -268,3 +274,129 @@ RppStatus rppt_resample_host(RppPtr_t srcPtr,
return RPP_ERROR_NOT_IMPLEMENTED;
}
}
+
+/********************************************************************************************************************/
+/*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/
+/********************************************************************************************************************/
+
+#ifdef GPU_SUPPORT
+
+/******************** non_silent_region_detection ********************/
+
+RppStatus rppt_non_silent_region_detection_gpu(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32s *srcLengthTensor,
+ Rpp32s *detectedIndexTensor,
+ Rpp32s *detectionLengthTensor,
+ Rpp32f cutOffDB,
+ Rpp32s windowLength,
+ Rpp32f referencePower,
+ Rpp32s resetInterval,
+ rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+ if (srcDescPtr->dataType == RpptDataType::F32)
+ {
+ return hip_exec_non_silent_region_detection_tensor(static_cast<Rpp32f*>(srcPtr),
+ srcDescPtr,
+ srcLengthTensor,
+ detectedIndexTensor,
+ detectionLengthTensor,
+ cutOffDB,
+ windowLength,
+ referencePower,
+ resetInterval,
+ rpp::deref(rppHandle));
+ }
+ else
+ {
+ return RPP_ERROR_NOT_IMPLEMENTED;
+ }
+
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
+/******************** to_decibels ********************/
+
+RppStatus rppt_to_decibels_gpu(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptImagePatchPtr srcDims,
+ Rpp32f cutOffDB,
+ Rpp32f multiplier,
+ Rpp32f referenceMagnitude,
+ rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+ Rpp32u tensorDims = srcDescPtr->numDims - 1; // exclude batchsize from input dims
+ if (tensorDims != 1 && tensorDims != 2)
+ return RPP_ERROR_INVALID_SRC_DIMS;
+
+ if (!multiplier)
+ return RPP_ERROR_ZERO_DIVISION;
+
+ if (srcDescPtr->dataType == RpptDataType::F32)
+ {
+ hip_exec_to_decibels_tensor(static_cast<Rpp32f*>(srcPtr),
+ srcDescPtr,
+ static_cast<Rpp32f*>(dstPtr),
+ dstDescPtr,
+ srcDims,
+ cutOffDB,
+ multiplier,
+ referenceMagnitude,
+ rpp::deref(rppHandle));
+ }
+ else
+ {
+ return RPP_ERROR_NOT_IMPLEMENTED;
+ }
+
+ return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
+/******************** down_mixing ********************/
+
+RppStatus rppt_down_mixing_gpu(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32s *srcDimsTensor,
+ bool normalizeWeights,
+ rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+ Rpp32u tensorDims = srcDescPtr->numDims - 1; // exclude batchsize from input dims
+ if (tensorDims != 1 && tensorDims != 2)
+ return RPP_ERROR_INVALID_SRC_DIMS;
+
+ if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+ {
+ hip_exec_down_mixing_tensor(static_cast<Rpp32f*>(srcPtr),
+ srcDescPtr,
+ static_cast<Rpp32f*>(dstPtr),
+ dstDescPtr,
+ srcDimsTensor,
+ normalizeWeights,
+ rpp::deref(rppHandle));
+ }
+ else
+ {
+ return RPP_ERROR_NOT_IMPLEMENTED;
+ }
+
+ return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
+#endif // GPU_SUPPORT
+#endif // AUDIO_SUPPORT
\ No newline at end of file
diff --git a/src/modules/rppt_tensor_color_augmentations.cpp b/src/modules/rppt_tensor_color_augmentations.cpp
index 3023973fc..e866fe949 100644
--- a/src/modules/rppt_tensor_color_augmentations.cpp
+++ b/src/modules/rppt_tensor_color_augmentations.cpp
@@ -677,7 +677,7 @@ RppStatus rppt_color_temperature_host(RppPtr_t srcPtr,
RpptDescPtr srcDescPtr,
RppPtr_t dstPtr,
RpptDescPtr dstDescPtr,
- Rpp8s *adjustmentValueTensor,
+ Rpp32s *adjustmentValueTensor,
RpptROIPtr roiTensorPtrSrc,
RpptRoiType roiType,
rppHandle_t rppHandle)
diff --git a/src/modules/rppt_tensor_effects_augmentations.cpp b/src/modules/rppt_tensor_effects_augmentations.cpp
index 8829a4ee0..8fc2d00ee 100644
--- a/src/modules/rppt_tensor_effects_augmentations.cpp
+++ b/src/modules/rppt_tensor_effects_augmentations.cpp
@@ -868,6 +868,142 @@ RppStatus rppt_ricap_host(RppPtr_t srcPtr,
return RPP_SUCCESS;
}
+/******************** glitch ********************/
+
+RppStatus rppt_glitch_host(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptChannelOffsets *rgbOffsets,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+ RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c);
+ if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
+ {
+ glitch_u8_u8_host_tensor(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ rgbOffsets,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
+ {
+ glitch_f16_f16_host_tensor(reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ rgbOffsets,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+ {
+ glitch_f32_f32_host_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ rgbOffsets,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
+ {
+ glitch_i8_i8_host_tensor(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ rgbOffsets,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+}
+
+/******************** jitter ********************/
+
+RppStatus rppt_jitter_host(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32u *kernelSizeTensor,
+ Rpp32u seed,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+ RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c);
+ RpptXorwowStateBoxMuller xorwowInitialState[SIMD_FLOAT_VECTOR_LENGTH];
+ rpp_host_rng_xorwow_f32_initialize_multiseed_stream_boxmuller(xorwowInitialState, seed);
+
+ if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
+ {
+ jitter_u8_u8_host_tensor(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ kernelSizeTensor,
+ xorwowInitialState,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
+ {
+ jitter_f16_f16_host_tensor(reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ kernelSizeTensor,
+ xorwowInitialState,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+ {
+ jitter_f32_f32_host_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ kernelSizeTensor,
+ xorwowInitialState,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
+ {
+ jitter_i8_i8_host_tensor(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ kernelSizeTensor,
+ xorwowInitialState,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+}
+
/********************************************************************************************************************/
/*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/
/********************************************************************************************************************/
@@ -1441,6 +1577,8 @@ RppStatus rppt_non_linear_blend_gpu(RppPtr_t srcPtr1,
#endif // backend
}
+/******************** water ********************/
+
RppStatus rppt_water_gpu(RppPtr_t srcPtr,
RpptDescPtr srcDescPtr,
RppPtr_t dstPtr,
@@ -1511,80 +1649,6 @@ RppStatus rppt_water_gpu(RppPtr_t srcPtr,
#endif // backend
}
-/******************** ricap ********************/
-
-RppStatus rppt_ricap_gpu(RppPtr_t srcPtr,
- RpptDescPtr srcDescPtr,
- RppPtr_t dstPtr,
- RpptDescPtr dstDescPtr,
- Rpp32u *permutationTensor,
- RpptROIPtr roiPtrInputCropRegion,
- RpptRoiType roiType,
- rppHandle_t rppHandle)
-{
-#ifdef HIP_COMPILE
- if(srcDescPtr->n == 1) // BatchSize should always be greater than 1
- return RPP_ERROR;
- Rpp32u *permutationHipTensor = reinterpret_cast<Rpp32u*>(rpp::deref(rppHandle).GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem);
- CHECK_RETURN_STATUS(hipMemcpy(permutationHipTensor, permutationTensor, sizeof(Rpp32u)* 4 * dstDescPtr->n, hipMemcpyHostToDevice));
-
- if ((check_roi_out_of_bounds(&roiPtrInputCropRegion[0],srcDescPtr,roiType) == -1)
- || (check_roi_out_of_bounds(&roiPtrInputCropRegion[1],srcDescPtr,roiType) == -1)
- || (check_roi_out_of_bounds(&roiPtrInputCropRegion[2],srcDescPtr,roiType) == -1)
- || (check_roi_out_of_bounds(&roiPtrInputCropRegion[3],srcDescPtr,roiType) == -1))
- return RPP_ERROR_OUT_OF_BOUND_SRC_ROI;
-
- if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
- {
- hip_exec_ricap_tensor(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes,
- srcDescPtr,
- static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
- dstDescPtr,
- permutationHipTensor,
- roiPtrInputCropRegion,
- roiType,
- rpp::deref(rppHandle));
- }
- else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
- {
- hip_exec_ricap_tensor(reinterpret_cast<half*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
- srcDescPtr,
- (half*) (static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
- dstDescPtr,
- permutationHipTensor,
- roiPtrInputCropRegion,
- roiType,
- rpp::deref(rppHandle));
- }
- else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
- {
- hip_exec_ricap_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
- srcDescPtr,
- (Rpp32f*) (static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
- dstDescPtr,
- permutationHipTensor,
- roiPtrInputCropRegion,
- roiType,
- rpp::deref(rppHandle));
- }
- else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
- {
- hip_exec_ricap_tensor(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes,
- srcDescPtr,
- static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
- dstDescPtr,
- permutationHipTensor,
- roiPtrInputCropRegion,
- roiType,
- rpp::deref(rppHandle));
- }
-
- return RPP_SUCCESS;
-#elif defined(OCL_COMPILE)
- return RPP_ERROR_NOT_IMPLEMENTED;
-#endif // backend
-}
-
/******************** vignette ********************/
RppStatus rppt_vignette_gpu(RppPtr_t srcPtr,
@@ -1649,6 +1713,8 @@ RppStatus rppt_vignette_gpu(RppPtr_t srcPtr,
#endif // backend
}
+/******************** erase ********************/
+
RppStatus rppt_erase_gpu(RppPtr_t srcPtr,
RpptDescPtr srcDescPtr,
RppPtr_t dstPtr,
@@ -1721,4 +1787,224 @@ RppStatus rppt_erase_gpu(RppPtr_t srcPtr,
#endif // backend
}
+/******************** ricap ********************/
+
+RppStatus rppt_ricap_gpu(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32u *permutationTensor,
+ RpptROIPtr roiPtrInputCropRegion,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+ if(srcDescPtr->n == 1) // BatchSize should always be greater than 1
+ return RPP_ERROR;
+ Rpp32u *permutationHipTensor = reinterpret_cast<Rpp32u*>(rpp::deref(rppHandle).GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem);
+ CHECK_RETURN_STATUS(hipMemcpy(permutationHipTensor, permutationTensor, sizeof(Rpp32u)* 4 * dstDescPtr->n, hipMemcpyHostToDevice));
+
+ if ((check_roi_out_of_bounds(&roiPtrInputCropRegion[0],srcDescPtr,roiType) == -1)
+ || (check_roi_out_of_bounds(&roiPtrInputCropRegion[1],srcDescPtr,roiType) == -1)
+ || (check_roi_out_of_bounds(&roiPtrInputCropRegion[2],srcDescPtr,roiType) == -1)
+ || (check_roi_out_of_bounds(&roiPtrInputCropRegion[3],srcDescPtr,roiType) == -1))
+ return RPP_ERROR_OUT_OF_BOUND_SRC_ROI;
+
+ if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
+ {
+ hip_exec_ricap_tensor(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ permutationHipTensor,
+ roiPtrInputCropRegion,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
+ {
+ hip_exec_ricap_tensor(reinterpret_cast<half*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ (half*) (static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ permutationHipTensor,
+ roiPtrInputCropRegion,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+ {
+ hip_exec_ricap_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ (Rpp32f*) (static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ permutationHipTensor,
+ roiPtrInputCropRegion,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
+ {
+ hip_exec_ricap_tensor(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ permutationHipTensor,
+ roiPtrInputCropRegion,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
+/******************** glitch ********************/
+
+RppStatus rppt_glitch_gpu(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptChannelOffsets *rgbOffsets,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+ if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
+ {
+ hip_exec_glitch_tensor(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ rgbOffsets,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
+ {
+ hip_exec_glitch_tensor(reinterpret_cast<half*>((static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes)),
+ srcDescPtr,
+ reinterpret_cast<half*>((static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes)),
+ dstDescPtr,
+ rgbOffsets,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+ {
+ hip_exec_glitch_tensor(reinterpret_cast<Rpp32f*>((static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes)),
+ srcDescPtr,
+ reinterpret_cast<Rpp32f*>((static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes)),
+ dstDescPtr,
+ rgbOffsets,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
+ {
+ hip_exec_glitch_tensor(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ rgbOffsets,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
+/******************** jitter ********************/
+
+RppStatus rppt_jitter_gpu(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32u *kernelSizeTensor,
+ Rpp32u seed,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+
+ RpptXorwowStateBoxMuller xorwowInitialState;
+ xorwowInitialState.x[0] = 0x75BCD15 + seed;
+ xorwowInitialState.x[1] = 0x159A55E5 + seed;
+ xorwowInitialState.x[2] = 0x1F123BB5 + seed;
+ xorwowInitialState.x[3] = 0x5491333 + seed;
+ xorwowInitialState.x[4] = 0x583F19 + seed;
+ xorwowInitialState.counter = 0x64F0C9 + seed;
+ xorwowInitialState.boxMullerFlag = 0;
+ xorwowInitialState.boxMullerExtra = 0.0f;
+
+ RpptXorwowStateBoxMuller *d_xorwowInitialStatePtr;
+ d_xorwowInitialStatePtr = reinterpret_cast<RpptXorwowStateBoxMuller*>(rpp::deref(rppHandle).GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem);
+ CHECK_RETURN_STATUS(hipMemcpy(d_xorwowInitialStatePtr, &xorwowInitialState, sizeof(RpptXorwowStateBoxMuller), hipMemcpyHostToDevice));
+
+ if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
+ {
+ hip_exec_jitter_tensor(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ kernelSizeTensor,
+ d_xorwowInitialStatePtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
+ {
+ hip_exec_jitter_tensor(reinterpret_cast<half*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ (half*) (static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ kernelSizeTensor,
+ d_xorwowInitialStatePtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+ {
+        hip_exec_jitter_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+                               srcDescPtr,
+                               reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ kernelSizeTensor,
+ d_xorwowInitialStatePtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
+ {
+        hip_exec_jitter_tensor(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes,
+                               srcDescPtr,
+                               static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ kernelSizeTensor,
+ d_xorwowInitialStatePtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
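+
+// A minimal host-side usage sketch (illustrative only; buffers, descriptors and the handle
+// are assumed to be prepared as for the other GPU entry points, and batchSize is hypothetical):
+//     Rpp32u kernelSizeTensor[batchSize];
+//     for (int i = 0; i < batchSize; i++)
+//         kernelSizeTensor[i] = 5;       // 5x5 jitter neighbourhood per image
+//     rppt_jitter_gpu(d_src, srcDescPtr, d_dst, dstDescPtr, kernelSizeTensor, 19870521U, roiTensorPtrSrc, RpptRoiType::XYWH, handle);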
+
#endif // GPU_SUPPORT
diff --git a/src/modules/rppt_tensor_geometric_augmentations.cpp b/src/modules/rppt_tensor_geometric_augmentations.cpp
index 6d573ffcc..325881c54 100644
--- a/src/modules/rppt_tensor_geometric_augmentations.cpp
+++ b/src/modules/rppt_tensor_geometric_augmentations.cpp
@@ -1099,7 +1099,6 @@ RppStatus rppt_slice_host(RppPtr_t srcPtr,
layoutParams,
rpp::deref(rppHandle));
}
-
return RPP_SUCCESS;
}
@@ -1301,6 +1300,145 @@ RppStatus rppt_remap_host(RppPtr_t srcPtr,
return RPP_SUCCESS;
}
+/******************** lens_correction ********************/
+
+RppStatus rppt_lens_correction_host(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32f *rowRemapTable,
+ Rpp32f *colRemapTable,
+ RpptDescPtr tableDescPtr,
+ Rpp32f *cameraMatrixTensor,
+ Rpp32f *distortionCoeffsTensor,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+ RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c);
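+
+    // Lens correction runs in two passes: first build per-image row/column remap tables from the
+    // camera matrix and distortion coefficients, then apply a bilinear remap using those tables.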
+ compute_lens_correction_remap_tables_host_tensor(srcDescPtr,
+ rowRemapTable,
+ colRemapTable,
+ tableDescPtr,
+ cameraMatrixTensor,
+ distortionCoeffsTensor,
+ roiTensorPtrSrc,
+ rpp::deref(rppHandle));
+
+ if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
+ {
+        remap_bilinear_u8_u8_host_tensor(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes,
+                                         srcDescPtr,
+                                         static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ rowRemapTable,
+ colRemapTable,
+ tableDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
+ {
+        remap_bilinear_f16_f16_host_tensor(reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+                                           srcDescPtr,
+                                           reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ rowRemapTable,
+ colRemapTable,
+ tableDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+ {
+        remap_bilinear_f32_f32_host_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+                                           srcDescPtr,
+                                           reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ rowRemapTable,
+ colRemapTable,
+ tableDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
+ {
+        remap_bilinear_i8_i8_host_tensor(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes,
+                                         srcDescPtr,
+                                         static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ rowRemapTable,
+ colRemapTable,
+ tableDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+}
+
+/******************** transpose ********************/
+
+RppStatus rppt_transpose_host(RppPtr_t srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ RppPtr_t dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u *permTensor,
+ Rpp32u *roiTensor,
+ rppHandle_t rppHandle)
+{
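+    // permTensor holds one dimension permutation per image in the batch;
+    // roiTensor carries the generic per-image ROI consumed by the generic tensor kernels.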
+ if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8))
+ {
+        transpose_generic_host_tensor(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes,
+                                      srcGenericDescPtr,
+                                      static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes,
+ dstGenericDescPtr,
+ permTensor,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16))
+ {
+        transpose_generic_host_tensor(reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+                                      srcGenericDescPtr,
+                                      reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ permTensor,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32))
+ {
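+        // F32 dispatches to a dedicated transpose_f32_f32 path; the other data types go through the generic template kernel.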
+        transpose_f32_f32_host_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+                                      srcGenericDescPtr,
+                                      reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ permTensor,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8))
+ {
+        transpose_generic_host_tensor(static_cast<Rpp8s*>(srcPtr) + srcGenericDescPtr->offsetInBytes,