diff --git a/.Doxyfile b/.Doxyfile index 066a53c02..dac8a3acc 100644 --- a/.Doxyfile +++ b/.Doxyfile @@ -960,16 +960,16 @@ INPUT = README.md \ include/rppi_logical_operations.h \ include/rppi_morphological_transforms.h \ include/rppi_statistical_operations.h \ + include/rppt_tensor_arithmetic_operations.h \ + include/rppt_tensor_audio_augmentations.h \ include/rppt_tensor_color_augmentations.h \ include/rppt_tensor_data_exchange_operations.h \ include/rppt_tensor_effects_augmentations.h \ include/rppt_tensor_filter_augmentations.h \ include/rppt_tensor_geometric_augmentations.h \ + include/rppt_tensor_logical_operations.h \ include/rppt_tensor_morphological_operations.h \ - include/rppt_tensor_statistical_operations.h \ - include/rppt_tensor_arithmetic_operations.h \ - include/rppt_tensor_audio_augmentations.h \ - include/rppt_tensor_logical_operations.h + include/rppt_tensor_statistical_operations.h # This tag can be used to specify the character encoding of the source files @@ -2381,7 +2381,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = GPU_SUPPORT RPP_BACKEND_HIP HIP_COMPILE +PREDEFINED = GPU_SUPPORT RPP_BACKEND_HIP HIP_COMPILE AUDIO_SUPPORT # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The diff --git a/.azuredevops/rocm-ci.yml b/.azuredevops/rocm-ci.yml index 3a33cebd7..1e11589cd 100644 --- a/.azuredevops/rocm-ci.yml +++ b/.azuredevops/rocm-ci.yml @@ -13,6 +13,8 @@ trigger: batch: true branches: include: + - develop + - mainline - master paths: exclude: @@ -27,8 +29,9 @@ pr: autoCancel: true branches: include: - - master - develop + - mainline + - master paths: exclude: - .github diff --git a/CHANGELOG.md b/CHANGELOG.md index 16c4251f4..ca19c7eb0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,8 @@ # Changelog for RPP -Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/rpp/en/latest/). 
+Full documentation for RPP is available at [https://rocm.docs.amd.com/projects/rpp/en/latest](https://rocm.docs.amd.com/projects/rpp/en/latest) -### RPP 1.8.0 (unreleased) +## (Unreleased) RPP 1.8.0 ### Changes @@ -25,7 +25,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * CMake - Version `3.22.3` * IEEE 754-based half-precision floating-point library - Version `1.12.0` -### RPP 1.5.0 +### RPP 1.5.0 for ROCm 6.1.1 ### Changes @@ -42,7 +42,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * CMake - Version `3.22.3` * IEEE 754-based half-precision floating-point library - Version `1.12.0` -## RPP 1.4.0 +## RPP 1.4.0 for ROCm 6.0.0 ### Additions @@ -76,7 +76,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * CMake - Version `3.22.3` * IEEE 754-based half-precision floating-point library - Version `1.12.0` -## RPP 1.3.0 +## RPP 1.3.0 for ROCm 5.7.1 ### Additions @@ -106,7 +106,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * Boost - Version `1.72` * IEEE 754-based half-precision floating-point library - Version `1.12.0` -## RPP 1.2.0 +## RPP 1.2.0 for ROCm 5.7.1 ### Additions @@ -137,7 +137,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * Boost - Version `1.72` * IEEE 754-based half-precision floating-point library - Version `1.12.0` -## RPP 1.1.0 +## RPP 1.1.0 for ROCm 5.7.0 ### Additions @@ -172,7 +172,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * SLES - the Clang package is missing in the latest updates, which means Clang must be manually installed. -## RPP 1.0.0 +## RPP 1.0.0 for ROCm 5.7.0 ### Additions @@ -212,7 +212,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * SLES - the Clang package is missing in the latest updates, which means Clang must be manually installed. 
-## RPP 0.99 +## RPP 0.99 for ROCm 5.7.0 ### Additions @@ -241,7 +241,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * Boost - Version `1.72` * IEEE 754-based half-precision floating-point library - Version `1.12.0` -## RPP 0.98 +## RPP 0.98 for ROCm 5.7.0 ### Additions * Dockers @@ -251,11 +251,11 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * Readme updates -#### Changes +### Changes * CMakeList -#### Fixes +### Fixes * Minor bugs and warnings @@ -270,7 +270,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * Boost - Version `1.72` * IEEE 754-based half-precision floating-point library - Version `1.12.0` -## RPP 0.97 +## RPP 0.97 for ROCm 5.7.0 ### Additions @@ -301,7 +301,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * Boost - Version `1.72` * IEEE 754-based half-precision floating-point library - Version `1.12.0` -## RPP 0.96 +## RPP 0.96 for ROCm 5.7.0 ### Additions @@ -334,7 +334,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * RPP is not supported on CentOS 7 and SLES SP2 -## RPP 0.95 +## RPP 0.95 for ROCm 5.7.0 ### Additions @@ -368,7 +368,7 @@ Full documentation for RPP is available at (https://rocm.docs.amd.com/projects/r * ROCm reorganization: install updates no longer match ROCm specifications -## RPP 0.93 +## RPP 0.93 for ROCm 5.7.0 ### Additions diff --git a/CMakeLists.txt b/CMakeLists.txt index 7963ff864..df233e5dc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,6 +58,9 @@ endif(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) # RPP Default Options set(DEFAULT_BUILD_TYPE "Release") +### RPP_AUDIO_SUPPORT - default = ON, NOTE: support currently only on Ubuntu - user to set to OFF otherwise +option(RPP_AUDIO_SUPPORT "Build RPP with Audio Support" ON) +option(BUILD_WITH_AMD_ADVANCE "Build RPP for advanced AMD GPU Architecture" OFF) # Set message options if(NOT WIN32) @@ -77,6 +80,7 @@ endif() if(APPLE) set(CMAKE_MACOSX_RPATH 1) set(BACKEND "CPU") + set(RPP_AUDIO_SUPPORT OFF) message("-- ${Magenta}Apple macOS Detected -- GPU Support turned OFF${ColourReset}") endif() @@ -134,9 +138,16 @@ list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) find_package(HALF REQUIRED) include_directories(${HALF_INCLUDE_DIRS}) +if (RPP_AUDIO_SUPPORT) + add_definitions(-DAUDIO_SUPPORT) # For compile flags in RPP + set(RPP_AUDIO_AUGMENTATIONS_SUPPORT 1) # For cmakedefine01 in rpp_audio_augmentations_support.h.in +endif() + message("-- ${Cyan}RPP Developer Options${ColourReset}") message("-- ${Cyan} -D BACKEND=${BACKEND} [Select RPP Backend [options:CPU/OPENCL/HIP](default:HIP)]${ColourReset}") message("-- ${Cyan} -D CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} [Select RPP build type [options:Debug/Release](default:Release)]${ColourReset}") +message("-- ${Cyan} -D RPP_AUDIO_SUPPORT=${RPP_AUDIO_SUPPORT} [Select RPP audio support [options:ON/OFF](default:ON)]${ColourReset}") +message("-- ${Cyan} -D BUILD_WITH_AMD_ADVANCE=${BUILD_WITH_AMD_ADVANCE} [Turn ON/OFF Build for AMD advanced GPUs(default:OFF)]${ColourReset}") # OpenMP find_package(OpenMP REQUIRED) @@ -207,8 +218,23 @@ if("${BACKEND}" STREQUAL "HIP") endif() list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/hip) + + # Set supported GPU Targets set(DEFAULT_AMDGPU_TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1031;gfx1032;gfx1100;gfx1101;gfx1102") - set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types 
for library to target") + if (BUILD_WITH_AMD_ADVANCE) + set(DEFAULT_AMDGPU_TARGETS ${DEFAULT_AMDGPU_TARGETS} "gfx1200;gfx1201") + endif() + + # Set AMDGPU_TARGETS + if(DEFINED ENV{AMDGPU_TARGETS}) + set(AMDGPU_TARGETS $ENV{AMDGPU_TARGETS} CACHE STRING "List of specific machine types for library to target") + elseif(AMDGPU_TARGETS) + message("-- ${White}${PROJECT_NAME} -- AMDGPU_TARGETS set with -D option${ColourReset}") + else() + set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for library to target") + endif() + message("-- ${White}${PROJECT_NAME} -- AMDGPU_TARGETS: ${AMDGPU_TARGETS}${ColourReset}") + find_package(HIP QUIET) if(HIP_FOUND) message("-- ${White}${PROJECT_NAME} -- Using HIP - Path:" ${HIP_PATH} "\tVersion:" ${HIP_VERSION} "\tCompiler:" ${HIP_COMPILER}${ColourReset}) @@ -273,6 +299,7 @@ if("${BACKEND}" STREQUAL "CPU") endif() configure_file("${PROJECT_SOURCE_DIR}/include/rpp_backend.h.in" "${PROJECT_BINARY_DIR}/include/rpp_backend.h") +configure_file("${PROJECT_SOURCE_DIR}/include/rpp_audio_augmentations_support.h.in" "${PROJECT_BINARY_DIR}/include/rpp_audio_augmentations_support.h") # Enable SIMD for HOST code (in both OpenCL and HIP backends) if(NOT DEFINED SIMD_ENABLE) @@ -294,7 +321,12 @@ message("-- ${White}${PROJECT_NAME} -- Link Libraries: ${LINK_LIBRARY_LIST}${Col target_link_libraries(${PROJECT_NAME} ${LINK_LIBRARY_LIST}) set_target_properties(${PROJECT_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(${PROJECT_NAME} PROPERTIES LINKER_LANGUAGE CXX) -target_link_libraries(${PROJECT_NAME} ${PROJECT_SOURCE_DIR}/libs/third_party/ffts/libffts.a) +if(RPP_AUDIO_SUPPORT) + target_link_libraries(${PROJECT_NAME} ${PROJECT_SOURCE_DIR}/libs/third_party/ffts/libffts.a) + message("-- ${Green}${PROJECT_NAME} set to build with RPP_AUDIO_SUPPORT${ColourReset}") +else() + message("-- ${Yellow}${PROJECT_NAME} set to build without RPP_AUDIO_SUPPORT${ColourReset}") +endif() set_target_properties(${PROJECT_NAME} PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION ${PROJECT_VERSION_MAJOR}) target_include_directories(${PROJECT_NAME} @@ -335,6 +367,9 @@ install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION ${CMAKE_INSTA install(FILES ${PROJECT_BINARY_DIR}/include/rpp_backend.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rpp COMPONENT dev) +install(FILES ${PROJECT_BINARY_DIR}/include/rpp_audio_augmentations_support.h + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rpp + COMPONENT dev) # install Test install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cmake DESTINATION ${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}/test COMPONENT test) diff --git a/docs/data/doxygenInputs/lens_img640x480.png b/docs/data/doxygenInputs/lens_img640x480.png new file mode 100644 index 000000000..897955d77 Binary files /dev/null and b/docs/data/doxygenInputs/lens_img640x480.png differ diff --git a/docs/data/doxygenOutputs/effects_augmentations_glitch_img150x150.png b/docs/data/doxygenOutputs/effects_augmentations_glitch_img150x150.png new file mode 100644 index 000000000..d4d5b749b Binary files /dev/null and b/docs/data/doxygenOutputs/effects_augmentations_glitch_img150x150.png differ diff --git a/docs/data/doxygenOutputs/effects_augmentations_jitter_150x150.png b/docs/data/doxygenOutputs/effects_augmentations_jitter_150x150.png new file mode 100644 index 000000000..8aef1cbe6 Binary files /dev/null and b/docs/data/doxygenOutputs/effects_augmentations_jitter_150x150.png differ diff --git 
a/docs/data/doxygenOutputs/geometric_augmentations_lens_correction_img_640x480.png b/docs/data/doxygenOutputs/geometric_augmentations_lens_correction_img_640x480.png new file mode 100644 index 000000000..63a52819d Binary files /dev/null and b/docs/data/doxygenOutputs/geometric_augmentations_lens_correction_img_640x480.png differ diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile index 18d9a73bc..9773637df 100644 --- a/docs/doxygen/Doxyfile +++ b/docs/doxygen/Doxyfile @@ -962,14 +962,16 @@ INPUT = ../../README.md \ ../../include/rppi_logical_operations.h \ ../../include/rppi_morphological_transforms.h \ ../../include/rppi_statistical_operations.h \ + ../../include/rppt_tensor_arithmetic_operations.h \ + ../../include/rppt_tensor_audio_augmentations.h \ ../../include/rppt_tensor_color_augmentations.h \ ../../include/rppt_tensor_data_exchange_operations.h \ ../../include/rppt_tensor_effects_augmentations.h \ ../../include/rppt_tensor_filter_augmentations.h \ ../../include/rppt_tensor_geometric_augmentations.h \ + ../../include/rppt_tensor_logical_operations.h \ ../../include/rppt_tensor_morphological_operations.h \ - ../../include/rppt_tensor_statistical_operations.h \ - ../../include/rppt_tensor_logical_operations.h + ../../include/rppt_tensor_statistical_operations.h # This tag can be used to specify the character encoding of the source files @@ -2381,7 +2383,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = GPU_SUPPORT RPP_BACKEND_HIP HIP_COMPILE +PREDEFINED = GPU_SUPPORT RPP_BACKEND_HIP HIP_COMPILE AUDIO_SUPPORT # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index 221c93045..c316de276 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1 +1 @@ -rocm-docs-core[api_reference]==1.4.0 +rocm-docs-core[api_reference]==1.5.1 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 8d0f37727..2c9286b18 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -110,7 +110,7 @@ requests==2.28.2 # via # pygithub # sphinx -rocm-docs-core[api-reference]==1.4.0 +rocm-docs-core[api-reference]==1.5.1 # via -r requirements.in smmap==5.0.0 # via gitdb diff --git a/include/rpp_audio_augmentations_support.h.in b/include/rpp_audio_augmentations_support.h.in new file mode 100644 index 000000000..6e8e8c66f --- /dev/null +++ b/include/rpp_audio_augmentations_support.h.in @@ -0,0 +1,6 @@ +#ifndef GUARD_RPP_AUDIO_AUGMENTATIONS_SUPPORT_H_IN +#define GUARD_RPP_AUDIO_AUGMENTATIONS_SUPPORT_H_IN + +#cmakedefine01 RPP_AUDIO_AUGMENTATIONS_SUPPORT + +#endif \ No newline at end of file diff --git a/include/rppdefs.h b/include/rppdefs.h index 28876d7f5..6eb025665 100644 --- a/include/rppdefs.h +++ b/include/rppdefs.h @@ -64,6 +64,7 @@ SOFTWARE. const float ONE_OVER_6 = 1.0f / 6; const float ONE_OVER_3 = 1.0f / 3; const float ONE_OVER_255 = 1.0f / 255; +const uint MMS_MAX_SCRATCH_MEMORY = 76800000; // maximum scratch memory size (number of floats) needed for MMS buffer in RNNT training /******************** RPP typedefs ********************/ @@ -136,7 +137,15 @@ typedef enum /*! \brief src and dst layout mismatch \ingroup group_rppdefs */ RPP_ERROR_LAYOUT_MISMATCH = -18, /*! \brief Number of channels is invalid. 
(Needs to adhere to function specification.) \ingroup group_rppdefs */ - RPP_ERROR_INVALID_CHANNELS = -19 + RPP_ERROR_INVALID_CHANNELS = -19, + /*! \brief Invalid output tile length (Needs to adhere to function specification.) \ingroup group_rppdefs */ + RPP_ERROR_INVALID_OUTPUT_TILE_LENGTH = -20, + /*! \brief Shared memory size needed is beyond the bounds (Needs to adhere to function specification.) \ingroup group_rppdefs */ + RPP_ERROR_OUT_OF_BOUND_SHARED_MEMORY_SIZE = -21, + /*! \brief Scratch memory size needed is beyond the bounds (Needs to adhere to function specification.) \ingroup group_rppdefs */ + RPP_ERROR_OUT_OF_BOUND_SCRATCH_MEMORY_SIZE = -22, + /*! \brief Number of src dims is invalid. (Needs to adhere to function specification.) \ingroup group_rppdefs */ + RPP_ERROR_INVALID_SRC_DIMS = -23 } RppStatus; /*! \brief RPP rppStatus_t type enums @@ -446,6 +455,16 @@ typedef struct } RpptRoiLtrb; +/*! \brief RPPT Tensor Channel Offsets struct + * \ingroup group_rppdefs + */ +typedef struct +{ + RppiPoint r; + RppiPoint g; + RppiPoint b; +} RpptChannelOffsets; + /*! \brief RPPT Tensor 3D ROI LTFRBB struct * \ingroup group_rppdefs */ diff --git a/include/rppt_tensor_arithmetic_operations.h b/include/rppt_tensor_arithmetic_operations.h index d34bdd1dd..d091f50ba 100644 --- a/include/rppt_tensor_arithmetic_operations.h +++ b/include/rppt_tensor_arithmetic_operations.h @@ -47,8 +47,8 @@ extern "C" { * It multiplies each element of the source tensor by a corresponding element in the 'mulTensor', * adds a corresponding element from the 'addTensor', and stores the result in the destination tensor. * Support added for f32 -> f32 dataype. - * \image html input150x150x4.gif Sample Input - * \image html arithmetic_operations_fused_multiply_add_scalar_150x150x4.gif Sample Output + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/arithmetic_operations_fused_multiply_add_scalar_150x150x4.gif Sample Output * \param [in] srcPtr source tensor in HOST memory * \param[in] srcGenericDescPtr source tensor descriptor * \param[out] dstPtr destination tensor in HOST memory @@ -70,8 +70,8 @@ RppStatus rppt_fused_multiply_add_scalar_host(RppPtr_t srcPtr, RpptGenericDescPt * It multiplies each element of the source tensor by a corresponding element in the 'mulTensor', * adds a corresponding element from the 'addTensor', and stores the result in the destination tensor. * Support added for f32 -> f32 dataype. - * \image html input150x150x4.gif Sample Input - * \image html arithmetic_operations_fused_multiply_add_scalar_150x150x4.gif Sample Output + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/arithmetic_operations_fused_multiply_add_scalar_150x150x4.gif Sample Output * \param [in] srcPtr source tensor in HIP memory * \param[in] srcGenericDescPtr source tensor descriptor * \param[out] dstPtr destination tensor in HIP memory @@ -92,8 +92,8 @@ RppStatus rppt_fused_multiply_add_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr * \details This function performs the addition operation on a batch of 4D tensors. * It adds a corresponding element from the 'addTensor' to source tensor, and stores the result in the destination tensor. * Support added for f32 -> f32 dataype. 
- * \image html input150x150x4.gif Sample Input - * \image html arithmetic_operations_add_scalar_150x150x4.gif Sample Output + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/arithmetic_operations_add_scalar_150x150x4.gif Sample Output * \param [in] srcPtr source tensor in HOST memory * \param[in] srcGenericDescPtr source tensor descriptor * \param[out] dstPtr destination tensor in HOST memory @@ -113,8 +113,8 @@ RppStatus rppt_add_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDes * \details This function performs the addition operation on a batch of 4D tensors. * It adds a corresponding element from the 'addTensor' to source tensor, and stores the result in the destination tensor. * Support added for f32 -> f32 dataype. - * \image html input150x150x4.gif Sample Input - * \image html arithmetic_operations_add_scalar_150x150x4.gif Sample Output + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/arithmetic_operations_add_scalar_150x150x4.gif Sample Output * \param [in] srcPtr source tensor in HIP memory * \param[in] srcGenericDescPtr source tensor descriptor * \param[out] dstPtr destination tensor in HIP memory @@ -134,8 +134,8 @@ RppStatus rppt_add_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDesc * \details This function performs the subtraction operation on a batch of 4D tensors. * It takes a corresponding element from 'subtractTensor' and subtracts it from source tensor. Result is stored in the destination tensor. * Support added for f32 -> f32 dataype. - * \image html input150x150x4.gif Sample Input - * \image html arithmetic_operations_subtract_scalar_150x150x4.gif Sample Output + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/arithmetic_operations_subtract_scalar_150x150x4.gif Sample Output * \param [in] srcPtr source tensor in HOST memory * \param[in] srcGenericDescPtr source tensor descriptor * \param[out] dstPtr destination tensor in HOST memory @@ -155,8 +155,8 @@ RppStatus rppt_subtract_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGener * \details This function performs the subtraction operation on a batch of 4D tensors. * It takes a corresponding element from 'subtractTensor' and subtracts it from source tensor. Result is stored in the destination tensor. * Support added for f32 -> f32 dataype. - * \image html input150x150x4.gif Sample Input - * \image html arithmetic_operations_subtract_scalar_150x150x4.gif Sample Output + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/arithmetic_operations_subtract_scalar_150x150x4.gif Sample Output * \param [in] srcPtr source tensor in HIP memory * \param[in] srcGenericDescPtr source tensor descriptor * \param[out] dstPtr destination tensor in HIP memory @@ -176,8 +176,8 @@ RppStatus rppt_subtract_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGeneri * \details This function performs the multiplication operation on a batch of 4D tensors. 
* It takes a corresponding element from 'multiplyTensor' and multiplies it with source tensor. Result is stored in the destination tensor. * Support added for f32 -> f32 dataype. - * \image html input150x150x4.gif Sample Input - * \image html arithmetic_operations_multiply_scalar_150x150x4.gif Sample Output + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/arithmetic_operations_multiply_scalar_150x150x4.gif Sample Output * \param [in] srcPtr source tensor in HOST memory * \param[in] srcGenericDescPtr source tensor descriptor * \param[out] dstPtr destination tensor in HOST memory @@ -190,15 +190,15 @@ RppStatus rppt_subtract_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGeneri * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. */ -RppStatus rppt_multiply_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *subtractTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); +RppStatus rppt_multiply_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); #ifdef GPU_SUPPORT /*! \brief Multiply scalar augmentation on HIP backend * \details This function performs the multiplication operation on a batch of 4D tensors. * It takes a corresponding element from 'multiplyTensor' and multiplies it with source tensor. Result is stored in the destination tensor. * Support added for f32 -> f32 dataype. - * \image html input150x150x4.gif Sample Input - * \image html arithmetic_operations_multiply_scalar_150x150x4.gif Sample Output + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/arithmetic_operations_multiply_scalar_150x150x4.gif Sample Output * \param [in] srcPtr source tensor in HIP memory * \param[in] srcGenericDescPtr source tensor descriptor * \param[out] dstPtr destination tensor in HIP memory @@ -226,7 +226,7 @@ RppStatus rppt_multiply_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGeneri * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
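The corrected rppt_multiply_scalar_host declaration above now takes mulTensor rather than subtractTensor. As a hedged illustration (not part of the patch), the sketch below wraps that exact signature; the descriptors, f32 buffers, mulTensor and 3D ROI array are assumed to be prepared by the caller as described in rppdefs.h, and the "rpp.h" include path is an assumption about how the installed headers are consumed.

```cpp
// Minimal illustrative sketch, not part of this patch: invoking the corrected
// rppt_multiply_scalar_host() declared above. Descriptor and buffer setup is
// assumed to be done by the caller per rppdefs.h (f32 -> f32 only).
#include "rpp.h"
#include <cstdio>

bool multiplyScalarHost(RppPtr_t src, RpptGenericDescPtr srcDesc,
                        RppPtr_t dst, RpptGenericDescPtr dstDesc,
                        Rpp32f *mulTensor,        // one multiplier per 4D tensor in the batch
                        RpptROI3DPtr roi,         // per-tensor 3D ROI (HOST memory)
                        RpptRoi3DType roiType,    // ROI format selector from rppdefs.h
                        rppHandle_t handle)       // created with rppCreateWithBatchSize()
{
    RppStatus status = rppt_multiply_scalar_host(src, srcDesc, dst, dstDesc,
                                                 mulTensor, roi, roiType, handle);
    if (status != RPP_SUCCESS)
        std::fprintf(stderr, "rppt_multiply_scalar_host failed (%d)\n", static_cast<int>(status));
    return status == RPP_SUCCESS;
}
```

rppt_add_scalar_host and rppt_subtract_scalar_host documented above take the same argument shape, with addTensor and subtractTensor in place of mulTensor.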
@@ -248,7 +248,7 @@ RppStatus rppt_magnitude_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr sr * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -258,6 +258,40 @@ RppStatus rppt_magnitude_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr sr RppStatus rppt_magnitude_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/*! \brief Logarithm operation on HOST backend + * \details Computes Log to base e(natural log) of the input for a given ND Tensor. + * Supports u8->f32, i8->f32, f16->f16 and f32->f32 datatypes. + * Uses Absolute of input for log computation and uses nextafter() if input is 0 to avoid undefined result. + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcGenericDescPtr source tensor descriptor + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstGenericDescPtr destination tensor descriptor + * \param [in] roiTensor values to represent dimensions of input tensor + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_log_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u *roiTensor, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Logarithm operation on HIP backend + * \details Computes Log to base e(natural log) of the input for a given ND Tensor. + * Supports u8->f32, i8->f32, f16->f16 and f32->f32 datatypes. + * Uses Absolute of input for log computation and uses nextafter() if input is 0 to avoid undefined result. + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcGenericDescPtr source tensor descriptor + * \param [out] dstPtr destination tensor in HIP memory + * \param [in] dstGenericDescPtr destination tensor descriptor + * \param [in] roiTensor values to represent dimensions of input tensor + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_log_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u *roiTensor, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! 
@} */ diff --git a/include/rppt_tensor_audio_augmentations.h b/include/rppt_tensor_audio_augmentations.h index f6349ae95..db52b073f 100644 --- a/include/rppt_tensor_audio_augmentations.h +++ b/include/rppt_tensor_audio_augmentations.h @@ -25,6 +25,8 @@ SOFTWARE. #ifndef RPPT_TENSOR_AUDIO_AUGMENTATIONS_H #define RPPT_TENSOR_AUDIO_AUGMENTATIONS_H +#ifdef AUDIO_SUPPORT + #include "rpp.h" #include "rppdefs.h" #ifdef __cplusplus @@ -46,49 +48,90 @@ extern "C" { * \details Non Silent Region Detection augmentation for 1D audio buffer \n Finds the starting index and length of non silent region in the audio buffer by comparing the calculated short-term power with cutoff value passed - * \param[in] srcPtr source tensor in HOST memory - * \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) - * \param[in] srcLengthTensor source audio buffer length (1D tensor in HOST memory, of size batchSize) - * \param[out] detectedIndexTensor beginning index of non silent region (1D tensor in HOST memory, of size batchSize) - * \param[out] detectionLengthTensor length of non silent region (1D tensor in HOST memory, of size batchSize) - * \param[in] cutOffDB cutOff in dB below which the signal is considered silent - * \param[in] windowLength window length used for computing short-term power of the signal - * \param[in] referencePower reference power that is used to convert the signal to dB - * \param[in] resetInterval number of samples after which the moving mean average is recalculated to avoid precision loss - * \param[in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) + * \param [in] srcLengthTensor source audio buffer length (1D tensor in HOST memory, of size batchSize) + * \param [out] detectedIndexTensor beginning index of non silent region (1D tensor in HOST memory, of size batchSize) + * \param [out] detectionLengthTensor length of non silent region (1D tensor in HOST memory, of size batchSize) + * \param [in] cutOffDB cutOff in dB below which the signal is considered silent + * \param [in] windowLength window length used for computing short-term power of the signal + * \param [in] referencePower reference power that is used to convert the signal to dB + * \param [in] resetInterval number of samples after which the moving mean average is recalculated to avoid precision loss + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. */ RppStatus rppt_non_silent_region_detection_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcLengthTensor, Rpp32s *detectedIndexTensor, Rpp32s *detectionLengthTensor, Rpp32f cutOffDB, Rpp32s windowLength, Rpp32f referencePower, Rpp32s resetInterval, rppHandle_t rppHandle); +#ifdef GPU_SUPPORT +/*! 
\brief Non Silent Region Detection augmentation on HIP backend + * \details Non Silent Region Detection augmentation for 1D audio buffer + \n Finds the starting index and length of non silent region in the audio buffer by comparing the + calculated short-term power with cutoff value passed + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) + * \param [in] srcLengthTensor source audio buffer length (1D tensor in Pinned/HIP memory, of size batchSize) + * \param [out] detectedIndexTensor beginning index of non silent region (1D tensor in Pinned/HIP memory, of size batchSize) + * \param [out] detectionLengthTensor length of non silent region (1D tensor in Pinned/HIP memory, of size batchSize) + * \param [in] cutOffDB cutOff in dB below which the signal is considered silent + * \param [in] windowLength window length used for computing short-term power of the signal + * \param [in] referencePower reference power that is used to convert the signal to dB + * \param [in] resetInterval number of samples after which the moving mean average is recalculated to avoid precision loss + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_non_silent_region_detection_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcLengthTensor, Rpp32s *detectedIndexTensor, Rpp32s *detectionLengthTensor, Rpp32f cutOffDB, Rpp32s windowLength, Rpp32f referencePower, Rpp32s resetInterval, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! \brief To Decibels augmentation on HOST backend - * \details To Decibels augmentation for 1D audio buffer converts magnitude values to decibel values - * \param[in] srcPtr source tensor in HOST memory - * \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) - * \param[out] dstPtr destination tensor in HOST memory - * \param[in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) - * \param[in] srcDims source tensor sizes for each element in batch (2D tensor in HOST memory, of size batchSize * 2) - * \param[in] cutOffDB minimum or cut-off ratio in dB - * \param[in] multiplier factor by which the logarithm is multiplied - * \param[in] referenceMagnitude Reference magnitude if not provided maximum value of input used as reference - * \param[in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \details To Decibels augmentation for 1D/2D audio buffer converts magnitude values to decibel values + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel/2D audio tensor with 1 channel), offsetInBytes >= 0, dataType = F32) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel/2D audio tensor with 1 channel), offsetInBytes >= 0, dataType = F32) + * \param [in] srcDims source tensor sizes for each element in batch (2D tensor in HOST memory, of size batchSize * 2) + * \param [in] cutOffDB minimum or cut-off ratio in dB + * \param [in] multiplier factor by which the logarithm is multiplied + * \param [in] 
referenceMagnitude Reference magnitude if not provided maximum value of input used as reference + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. */ RppStatus rppt_to_decibels_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptImagePatchPtr srcDims, Rpp32f cutOffDB, Rpp32f multiplier, Rpp32f referenceMagnitude, rppHandle_t rppHandle); +#ifdef GPU_SUPPORT +/*! \brief To Decibels augmentation on HIP backend + * \details To Decibels augmentation for 1D/2D audio buffer converts magnitude values to decibel values + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel/2D audio tensor with 1 channel), offsetInBytes >= 0, dataType = F32) + * \param [out] dstPtr destination tensor in HIP memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel/2D audio tensor with 1 channel), offsetInBytes >= 0, dataType = F32) + * \param [in] srcDims source tensor sizes for each element in batch (2D tensor in Pinned/HIP memory, of size batchSize * 2) + * \param [in] cutOffDB minimum or cut-off ratio in dB + * \param [in] multiplier factor by which the logarithm is multiplied + * \param [in] referenceMagnitude Reference magnitude if not provided maximum value of input used as reference + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_to_decibels_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptImagePatchPtr srcDims, Rpp32f cutOffDB, Rpp32f multiplier, Rpp32f referenceMagnitude, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! 
\brief Pre Emphasis Filter augmentation on HOST backend * \details Pre Emphasis Filter augmentation for audio data - * \param[in] srcPtr source tensor in HOST memory - * \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) - * \param[out] dstPtr destination tensor in HOST memory - * \param[in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) - * \param[in] srcLengthTensor source audio buffer length (1D tensor in HOST memory, of size batchSize) - * \param[in] coeffTensor preemphasis coefficient (1D tensor in HOST memory, of size batchSize) - * \param[in] borderType border value policy - * \param[in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) + * \param [in] srcLengthTensor source audio buffer length (1D tensor in HOST memory, of size batchSize) + * \param [in] coeffTensor preemphasis coefficient (1D tensor in HOST memory, of size batchSize) + * \param [in] borderType border value policy + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. @@ -97,19 +140,36 @@ RppStatus rppt_pre_emphasis_filter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, /*! \brief Down Mixing augmentation on HOST backend * \details Down Mixing augmentation for audio data -* \param[in] srcPtr source tensor in HOST memory -* \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) -* \param[out] dstPtr destination tensor in HOST memory -* \param[in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) -* \param[in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HOST memory, of size batchSize * 2) -* \param[in] normalizeWeights bool flag to specify if normalization of weights is needed -* \param[in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() +* \param [in] srcPtr source tensor in HOST memory +* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel audio tensor), offsetInBytes >= 0, dataType = F32) +* \param [out] dstPtr destination tensor in HOST memory +* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 2, offsetInBytes >= 0, dataType = F32) +* \param [in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HOST memory, of size batchSize * 2) +* \param [in] normalizeWeights bool flag to specify if normalization of weights is needed +* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. */ RppStatus rppt_down_mixing_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcDimsTensor, bool normalizeWeights, rppHandle_t rppHandle); +#ifdef GPU_SUPPORT +/*! 
\brief Down Mixing augmentation on HIP backend +* \details Down Mixing augmentation for audio data +* \param [in] srcPtr source tensor in HIP memory +* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel audio tensor), offsetInBytes >= 0, dataType = F32) +* \param [out] dstPtr destination tensor in HIP memory +* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 2, offsetInBytes >= 0, dataType = F32) +* \param [in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HIP/Pinned memory, of size batchSize * 2) +* \param [in] normalizeWeights bool flag to specify if normalization of weights is needed +* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() +* \return A \ref RppStatus enumeration. +* \retval RPP_SUCCESS Successful completion. +* \retval RPP_ERROR* Unsuccessful completion. +*/ +RppStatus rppt_down_mixing_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *srcDimsTensor, bool normalizeWeights, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! \brief Produces a spectrogram from a 1D audio buffer on HOST backend * \details Spectrogram for 1D audio buffer * \param [in] srcPtr source tensor in HOST memory @@ -153,15 +213,15 @@ RppStatus rppt_mel_filter_bank_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp /*! \brief Resample augmentation on HOST backend * \details Resample augmentation for audio data -* \param[in] srcPtr source tensor in HOST memory -* \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) -* \param[out] dstPtr destination tensor in HOST memory -* \param[in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) -* \param[in] inRate Input sampling rate (1D tensor in HOST memory, of size batchSize) -* \param[in] outRate Output sampling rate (1D tensor in HOST memory, of size batchSize) -* \param[in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HOST memory, of size batchSize * 2) -* \param[in] window Resampling window (struct of type RpptRpptResamplingWindow) -* \param[in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() +* \param [in] srcPtr source tensor in HOST memory +* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) +* \param [out] dstPtr destination tensor in HOST memory +* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) +* \param [in] inRate Input sampling rate (1D tensor in HOST memory, of size batchSize) +* \param [in] outRate Output sampling rate (1D tensor in HOST memory, of size batchSize) +* \param [in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HOST memory, of size batchSize * 2) +* \param [in] window Resampling window (struct of type RpptRpptResamplingWindow) +* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. 
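For the newly added HIP audio entry points, a hedged usage sketch (not from the patch) follows. It assumes the library was built with RPP_AUDIO_SUPPORT and the HIP backend, that src/dst are F32 tensors in HIP memory matching their descriptors, and that srcDimsTensor lives in HIP or pinned memory as the parameter notes above require.

```cpp
// Illustrative sketch only: calling the new rppt_down_mixing_gpu() declared
// above. AUDIO_SUPPORT and GPU_SUPPORT guard the declaration in the header,
// so the call is guarded the same way here.
#include "rpp.h"

#if defined(AUDIO_SUPPORT) && defined(GPU_SUPPORT)
RppStatus downMixBatch(RppPtr_t src, RpptDescPtr srcDesc,
                       RppPtr_t dst, RpptDescPtr dstDesc,
                       Rpp32s *srcDimsTensor,   // batchSize * 2 entries: {numSamples, numChannels}
                       rppHandle_t handle)      // created with rppCreateWithStreamAndBatchSize()
{
    // normalizeWeights = false leaves the down-mixing weights un-normalized
    return rppt_down_mixing_gpu(src, srcDesc, dst, dstDesc,
                                srcDimsTensor, /*normalizeWeights=*/false, handle);
}
#endif // AUDIO_SUPPORT && GPU_SUPPORT
```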
@@ -174,4 +234,7 @@ RppStatus rppt_resample_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d #ifdef __cplusplus } #endif + +#endif // AUDIO_SUPPORT + #endif // RPPT_TENSOR_AUDIO_AUGMENTATIONS_H diff --git a/include/rppt_tensor_color_augmentations.h b/include/rppt_tensor_color_augmentations.h index b01a12dca..62ef13715 100644 --- a/include/rppt_tensor_color_augmentations.h +++ b/include/rppt_tensor_color_augmentations.h @@ -54,7 +54,7 @@ extern "C" { * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] alphaTensor alpha values for brightness calculation (1D tensor in HOST memory, of size batchSize, with 0 <= alpha <= 20 for each image in batch) * \param [in] betaTensor beta values for brightness calculation (1D tensor in HOST memory, of size batchSize, with 0 <= beta <= 255 for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -76,7 +76,7 @@ RppStatus rppt_brightness_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] alphaTensor alpha values for brightness calculation (1D tensor in pinned/HOST memory, of size batchSize, with 0 <= alpha <= 20 for each image in batch) * \param [in] betaTensor beta values for brightness calculation (1D tensor in pinned/HOST memory, of size batchSize, with 0 <= beta <= 255 for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
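Because rppt_tensor_audio_augmentations.h is now wrapped in AUDIO_SUPPORT and a generated rpp_audio_augmentations_support.h is installed alongside rpp_backend.h, applications can check at compile time whether the RPP build they link against was configured with RPP_AUDIO_SUPPORT. A minimal hedged sketch (the include path is an assumption):

```cpp
// Illustrative sketch, not part of this patch: compile-time detection of
// audio support via the new generated header. #cmakedefine01 expands
// RPP_AUDIO_AUGMENTATIONS_SUPPORT to 1 when RPP_AUDIO_SUPPORT was ON at
// build time, and to 0 otherwise.
#include "rpp_audio_augmentations_support.h"

constexpr bool rppHasAudioAugmentations()
{
    return RPP_AUDIO_AUGMENTATIONS_SUPPORT == 1;
}
```

Since #cmakedefine01 always defines the macro as 0 or 1, plain #if checks on RPP_AUDIO_AUGMENTATIONS_SUPPORT also work.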
@@ -97,7 +97,7 @@ RppStatus rppt_brightness_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] gammaTensor gamma values for gamma correction calculation (1D tensor in HOST memory, of size batchSize with gamma >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -118,7 +118,7 @@ RppStatus rppt_gamma_correction_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rp * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] gammaTensor gamma values for gamma correction calculation (1D tensor in pinned/HOST memory, of size batchSize with gamma >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -141,7 +141,7 @@ RppStatus rppt_gamma_correction_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] alphaTensor alpha values for alpha-blending (1D tensor in HOST memory, of size batchSize with the transparency factor transparency factor 0 <= alpha <= 1 for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -164,7 +164,7 @@ RppStatus rppt_blend_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDes * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] alphaTensor alpha values for alpha-blending (1D tensor in pinned/HOST memory, of size batchSize with the transparency factor transparency factor 0 <= alpha <= 1 for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -188,7 +188,7 @@ RppStatus rppt_blend_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDesc * \param [in] contrastTensor contrast modification parameter for color_twist calculation (1D tensor in HOST memory, of size batchSize with 0 < contrastTensor[i] <= 255 for each image in batch) * \param [in] hueTensor hue modification parameter for color_twist calculation (1D tensor in HOST memory, of size batchSize with 0 <= hueTensor[i] <= 359 for each image in batch) * \param [in] saturationTensor saturation modification parameter for color_twist calculation (1D tensor in HOST memory, of size batchSize with saturationTensor[i] >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
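The blend documentation above (host and HIP variants) describes two source tensors, a per-image alpha tensor, and the renamed roiTensorPtrSrc parameter. As a hedged illustration, the sketch below calls rppt_blend_gpu with an argument order inferred from that parameter documentation; the exact declaration should be verified against rppt_tensor_color_augmentations.h.

```cpp
// Illustrative sketch only: alpha-blending two image batches with
// rppt_blend_gpu(). The argument order (srcPtr1, srcPtr2, shared srcDescPtr,
// dstPtr, dstDescPtr, alphaTensor, roiTensorPtrSrc, roiType, rppHandle) is
// inferred from the parameter documentation above, not copied from the header.
#include "rpp.h"

#ifdef GPU_SUPPORT
RppStatus blendBatch(RppPtr_t src1, RppPtr_t src2, RpptDescPtr srcDesc,
                     RppPtr_t dst, RpptDescPtr dstDesc,
                     Rpp32f *alphaTensor,          // pinned/HOST memory, 0 <= alpha <= 1 per image
                     RpptROIPtr roiTensorPtrSrc,   // per-image ROIs in HIP memory
                     rppHandle_t handle)           // created with rppCreateWithStreamAndBatchSize()
{
    return rppt_blend_gpu(src1, src2, srcDesc, dst, dstDesc, alphaTensor,
                          roiTensorPtrSrc, RpptRoiType::XYWH, handle);
}
#endif // GPU_SUPPORT
```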
@@ -212,7 +212,7 @@ RppStatus rppt_color_twist_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_ * \param [in] contrastTensor contrast modification parameter for color_twist calculation (1D tensor in pinned/HOST memory, of size batchSize with 0 < contrastTensor[i] <= 255 for each image in batch) * \param [in] hueTensor hue modification parameter for color_twist calculation (1D tensor in pinned/HOST memory, of size batchSize with 0 <= hueTensor[i] <= 359 for each image in batch) * \param [in] saturationTensor saturation modification parameter for color_twist calculation (1D tensor in pinned/HOST memory, of size batchSize with saturationTensor[i] >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -236,7 +236,7 @@ RppStatus rppt_color_twist_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] contrastTensor contrast modification parameter for color_jitter calculation (1D tensor in HOST memory, of size batchSize with 0 < contrastTensor[i] <= 255 for each image in batch) * \param [in] hueTensor hue modification parameter for color_jitter calculation (1D tensor in HOST memory, of size batchSize with 0 <= hueTensor[i] <= 359 for each image in batch) * \param [in] saturationTensor saturation modification parameter for color_jitter calculation (1D tensor in HOST memory, of size batchSize with saturationTensor[i] >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -257,7 +257,7 @@ RppStatus rppt_color_jitter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] rgbTensor R/G/B values for color casting calculation (2D tensor in HOST memory, of size sizeof(RpptRGB) * batchSize with 0 <= rgbTensor[n]. 
<= 255 for each image in batch) * \param [in] alphaTensor alpha values for color casting calculation (1D tensor in HOST memory, of size sizeof(Rpp32f) * batchSize with alphaTensor[i] >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -279,7 +279,7 @@ RppStatus rppt_color_cast_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] rgbTensor R/G/B values for color casting calculation (2D tensor in pinned/HOST memory, of size sizeof(RpptRGB) * batchSize with 0 <= rgbTensor[n]. <= 255 for each image in batch) * \param [in] alphaTensor alpha values for color casting calculation (1D tensor in pinned/HOST memory, of size sizeof(Rpp32f) * batchSize with alphaTensor[i] >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -300,7 +300,7 @@ RppStatus rppt_color_cast_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] exposureFactorTensor exposure factor values for exposure adjustment (1D tensor in HOST memory, of size batchSize, with exposureFactorTensor[n] >= 0 for each image in the batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -321,7 +321,7 @@ RppStatus rppt_exposure_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] exposureFactorTensor exposure factor values for exposure adjustment (1D tensor in pinned/HOST memory, of size batchSize, with exposureFactorTensor[n] >= 0 for each image in the batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -343,7 +343,7 @@ RppStatus rppt_exposure_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t ds * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] contrastFactorTensor contrast factor values for contrast calculation (1D tensor in HOST memory, of size batchSize with contrastFactorTensor[n] > 0 for each image in a batch)) * \param [in] contrastCenterTensor contrast center values for contrast calculation (1D tensor in HOST memory, of size batchSize) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -365,7 +365,7 @@ RppStatus rppt_contrast_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] contrastFactorTensor contrast factor values for contrast calculation (1D tensor in pinned/HOST memory, of size batchSize with contrastFactorTensor[n] > 0 for each image in a batch)) * \param [in] contrastCenterTensor contrast center values for contrast calculation (1D tensor in pinned/HOST memory, of size batchSize) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -386,7 +386,7 @@ RppStatus rppt_contrast_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t ds * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] lutPtr lut Array in HOST memory, containing a single integer look up table of length 65536, to be used for all images in the batch - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -407,7 +407,7 @@ RppStatus rppt_lut_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] lutPtr lut Array in pinned/HOST memory, containing a single integer look up table of length 65536, to be used for all images in the batch - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -427,15 +427,15 @@ RppStatus rppt_lut_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] adjustmentValueTensor adjustment values for color temperature calculation (1D tensor of size sizeof(Rpp8s) * batchSize with -100 <= adjustmentValueTensor[i] >= 100 for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] adjustmentValueTensor adjustment values for color temperature calculation (1D tensor of size batchSize with -100 <= adjustmentValueTensor[i] <= 100 for each image in batch) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. */ -RppStatus rppt_color_temperature_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp8s *adjustmentValueTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +RppStatus rppt_color_temperature_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32s *adjustmentValueTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #ifdef GPU_SUPPORT /*!
\brief Color Temperature augmentation on HIP backend for a NCHW/NHWC layout tensor @@ -448,8 +448,8 @@ RppStatus rppt_color_temperature_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, R * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] adjustmentValueTensor adjustment values for color temperature calculation (1D tensor of size sizeof(Rpp8s) * batchSize with -100 <= adjustmentValueTensor[i] >= 100 for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] adjustmentValueTensor adjustment values for color temperature calculation (1D tensor of size batchSize with -100 <= adjustmentValueTensor[i] <= 100 for each image in batch) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. diff --git a/include/rppt_tensor_effects_augmentations.h b/include/rppt_tensor_effects_augmentations.h index 708f318bf..a4c2b41ba 100644 --- a/include/rppt_tensor_effects_augmentations.h +++ b/include/rppt_tensor_effects_augmentations.h @@ -56,7 +56,7 @@ extern "C" { * \param [in] gridRatio gridRatio value for gridmask calculation = black square width / tileWidth (a single Rpp32f number with 0 <= gridRatio <= 1 that applies to all images in the batch) * \param [in] gridAngle gridAngle value for gridmask calculation = grid rotation angle in radians (a single Rpp32f number that applies to all images in the batch) * \param [in] translateVector translateVector for gridmask calculation = grid X and Y translation lengths in pixels (a single RpptUintVector2D x,y value pair that applies to all images in the batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration.
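The rppt_color_temperature_* hunks above switch adjustmentValueTensor from Rpp8s to Rpp32s values in the documented -100..100 range. Below is a minimal hedged sketch of driving the updated HOST signature; the buffers, descriptors, ROI tensor and handle are assumed to have been created elsewhere as described in the parameter docs, and the chosen values and rpp.h include path are illustrative assumptions.

```cpp
// Hedged sketch: per-image colour-temperature adjustments for the updated Rpp32s signature.
#include <vector>
#include <rpp.h>

RppStatus colorTemperatureExample(RppPtr_t srcPtr, RpptDescPtr srcDescPtr,
                                  RppPtr_t dstPtr, RpptDescPtr dstDescPtr,
                                  RpptROIPtr roiTensorPtrSrc, rppHandle_t handle,
                                  int batchSize)
{
    // One adjustment value per image, kept within the documented -100..100 range.
    std::vector<Rpp32s> adjustmentValueTensor(batchSize);
    for (int i = 0; i < batchSize; i++)
        adjustmentValueTensor[i] = (i % 2 == 0) ? 70 : -70;   // alternate warm / cool shifts

    return rppt_color_temperature_host(srcPtr, srcDescPtr, dstPtr, dstDescPtr,
                                       adjustmentValueTensor.data(),
                                       roiTensorPtrSrc, RpptRoiType::XYWH, handle);
}
```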
@@ -80,7 +80,7 @@ RppStatus rppt_gridmask_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d * \param [in] gridRatio gridRatio value for gridmask calculation = black square width / tileWidth (a single Rpp32f number with 0 <= gridRatio <= 1 that applies to all images in the batch) * \param [in] gridAngle gridAngle value for gridmask calculation = grid rotation angle in radians (a single Rpp32f number that applies to all images in the batch) * \param [in] translateVector translateVector for gridmask calculation = grid X and Y translation lengths in pixels (a single RpptUintVector2D x,y value pair that applies to all images in the batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -103,7 +103,7 @@ RppStatus rppt_gridmask_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t ds * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] spatterColor RGB values to use for the spatter augmentation (A single set of 3 Rpp8u values as RpptRGB that applies to all images in the batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 1920 and roiTensorSrc[i].xywhROI.roiHeight <= 1080) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 1920 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 1080) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration.
@@ -126,7 +126,7 @@ RppStatus rppt_spatter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t ds * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] spatterColor RGB values to use for the spatter augmentation (A single set of 3 Rpp8u values as RpptRGB that applies to all images in the batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 1920 and roiTensorSrc[i].xywhROI.roiHeight <= 1080) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 1920 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 1080) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -151,7 +151,7 @@ RppStatus rppt_spatter_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dst * \param [in] saltValueTensor A user-defined salt noise value (1D tensor in HOST memory, of size batchSize with 0 <= saltValueTensor[i] <= 1 for each image in batch) * \param [in] pepperValueTensor A user-defined pepper noise value (1D tensor in HOST memory, of size batchSize with 0 <= pepperValueTensor[i] <= 1 for each image in batch) * \param [in] seed A user-defined seed value (single Rpp32u value) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration.
@@ -176,7 +176,7 @@ RppStatus rppt_salt_and_pepper_noise_host(RppPtr_t srcPtr, RpptDescPtr srcDescPt * \param [in] saltValueTensor A user-defined salt noise value (1D tensor in pinned/HOST memory, of size batchSize with 0 <= saltValueTensor[i] <= 1 for each image in batch) * \param [in] pepperValueTensor A user-defined pepper noise value (1D tensor in pinned/HOST memory, of size batchSize with 0 <= pepperValueTensor[i] <= 1 for each image in batch) * \param [in] seed A user-defined seed value (single Rpp32u value) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -198,7 +198,7 @@ RppStatus rppt_salt_and_pepper_noise_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] shotNoiseFactorTensor shotNoiseFactor values for each image, which are used to compute the lambda values in a poisson distribution (1D tensor in HOST memory, of size batchSize with shotNoiseFactorTensor[i] >= 0 for each image in batch) * \param [in] seed A user-defined seed value (single Rpp32u value) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -220,7 +220,7 @@ RppStatus rppt_shot_noise_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] shotNoiseFactorTensor shotNoiseFactor values for each image, which are used to compute the lambda values in a poisson distribution (1D tensor in pinned/HOST memory, of size batchSize with shotNoiseFactorTensor[i] >= 0 for each image in batch) * \param [in] seed A user-defined seed value (single Rpp32u value) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -243,7 +243,7 @@ RppStatus rppt_shot_noise_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] meanTensor mean values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in HOST memory, of size batchSize with meanTensor[i] >= 0 for each image in batch) * \param [in] stdDevTensor stdDev values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in HOST memory, of size batchSize with stdDevTensor[i] >= 0 for each image in batch) * \param [in] seed A user-defined seed value (single Rpp32u value) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -266,7 +266,7 @@ RppStatus rppt_gaussian_noise_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppP * \param [in] meanTensor mean values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in pinned/HOST memory, of size batchSize with meanTensor[i] >= 0 for each image in batch) * \param [in] stdDevTensor stdDev values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in pinned/HOST memory, of size batchSize with stdDevTensor[i] >= 0 for each image in batch) * \param [in] seed A user-defined seed value (single Rpp32u value) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -289,7 +289,7 @@ RppStatus rppt_gaussian_noise_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPt * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] stdDevTensor stdDev values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in HOST memory, of size batchSize with stdDevTensor[i] >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -312,7 +312,7 @@ RppStatus rppt_non_linear_blend_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDes * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] stdDevTensor stdDev values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in pinned/HOST memory, of size batchSize with stdDevTensor[i] >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -338,7 +338,7 @@ RppStatus rppt_non_linear_blend_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDesc * \param[in] freqYTensor freqY values for water effect (1D tensor in HOST memory, of size batchSize) * \param[in] phaseXTensor amplitudeY values for water effect (1D tensor in HOST memory, of size batchSize) * \param[in] phaseYTensor amplitudeY values for water effect (1D tensor in HOST memory, of size batchSize) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -364,7 +364,7 @@ RppStatus rppt_water_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP * \param[in] freqYTensor freqY values for water effect (1D tensor in pinned/HOST memory, of size batchSize) * \param[in] phaseXTensor amplitudeY values for water effect (1D tensor in pinned/HOST memory, of size batchSize) * \param[in] phaseYTensor amplitudeY values for water effect (1D tensor in pinned/HOST memory, of size batchSize) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -433,7 +433,7 @@ RppStatus rppt_ricap_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param[in] vignetteIntensityTensor intensity values to quantify vignette effect (1D tensor of size batchSize with 0 < vignetteIntensityTensor[n] for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -455,7 +455,7 @@ RppStatus rppt_vignette_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param[in] vignetteIntensityTensor intensity values to quantify vignette effect (1D tensor of size batchSize with 0 < vignetteIntensityTensor[n] for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -465,11 +465,55 @@ RppStatus rppt_vignette_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d RppStatus rppt_vignette_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *vignetteIntensityTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/******************** jitter ********************/ + +/*! \brief Jitter augmentation on HOST backend for a NCHW/NHWC layout tensor + * \details The jitter augmentation adds a jitter effect for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be same depth as srcPtr. + * \image html img150x150.png Sample Input + * \image html effects_augmentations_jitter_img150x150.png Sample Output + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] kernelSizeTensor kernel size values for jitter calculation (1D tensor in HOST memory, of size batchSize, with kernelSizeTensor[i] = 3/5/7 for optimal use) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_jitter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32u *kernelSizeTensor, Rpp32u seed, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Jitter augmentation on HIP backend for a NCHW/NHWC layout tensor + * \details The jitter augmentation adds a jitter effect for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be same depth as srcPtr. + * \image html img150x150.png Sample Input + * \image html effects_augmentations_jitter_img150x150.png Sample Output + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HIP memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] kernelSizeTensor kernel size values for jitter calculation (1D tensor of size batchSize, with kernelSizeTensor[i] = 3/5/7 for optimal use) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_jitter_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32u *kernelSizeTensor, Rpp32u seed, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! \brief Gaussian noise augmentation on HOST backend * \details This function adds gaussian noise to a batch of 4D tensors. * Support added for u8 -> u8, f32 -> f32 datatypes. - * \image html input150x150x4.gif Sample Input - * \image html effects_augmentations_gaussian_noise_150x150x4.gif Sample Output + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/effects_augmentations_gaussian_noise_150x150x4.gif Sample Output * \param [in] srcPtr source tensor in HOST memory * \param [in] srcGenericDescPtr source tensor descriptor * \param [out] dstPtr destination tensor in HOST memory @@ -490,8 +534,8 @@ RppStatus rppt_gaussian_noise_voxel_host(RppPtr_t srcPtr, RpptGenericDescPtr src /*! \brief Gaussian noise augmentation on HIP backend * \details This function adds gaussian noise to a batch of 4D tensors. * Support added for u8 -> u8, f32 -> f32 datatypes. - * \image html input150x150x4.gif Sample Input - * \image html effects_augmentations_gaussian_noise_150x150x4.gif Sample Output + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/effects_augmentations_gaussian_noise_150x150x4.gif Sample Output * \param [in] srcPtr source tensor in HIP memory * \param [in] srcGenericDescPtr source tensor descriptor * \param [out] dstPtr destination tensor in HIP memory @@ -524,7 +568,7 @@ RppStatus rppt_gaussian_noise_voxel_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcD - Erase-region anchor boxes on each image given by the user must not overlap * \param [in] colorsTensor RGB values to use for each erase-region inside each image in the batch.
(colors[i] will have range equivalent of srcPtr) * \param [in] numBoxesTensor number of erase-regions per image, for each image in the batch. (numBoxesTensor[n] >= 0) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -549,7 +593,7 @@ RppStatus rppt_erase_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP - Erase-region anchor boxes on each image given by the user must not overlap * \param [in] colorsTensor RGB values to use for each erase-region inside each image in the batch. (colors[i] will have range equivalent of srcPtr) * \param [in] numBoxesTensor number of erase-regions per image, for each image in the batch. (numBoxesTensor[n] >= 0) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -559,6 +603,50 @@ RppStatus rppt_erase_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP RppStatus rppt_erase_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptRoiLtrb *anchorBoxInfoTensor, RppPtr_t colorsTensor, Rpp32u *numBoxesTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/*! \brief Glitch augmentation on HOST backend for a NCHW/NHWC layout tensor + * \details The glitch augmentation adds a glitch effect for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be same depth as srcPtr. + * \image html img150x150.jpg Sample Input + * \image html effects_augmentations_glitch_img150x150.jpg Sample Output + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] rgbOffsets RGB offset values to use for the glitch augmentation (A single set of 3 Rppi point values that applies to all images in the batch. + * For each point and for each image in the batch: 0 < point.x < width, 0 < point.y < height) + * \param [in] roiTensorPtrSrc ROI data for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_glitch_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptChannelOffsets *rgbOffsets, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Glitch augmentation on HIP backend for a NCHW/NHWC layout tensor + * \details The glitch augmentation adds a glitch effect for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be same depth as srcPtr. + * \image html img150x150.jpg Sample Input + * \image html effects_augmentations_glitch_img150x150.jpg Sample Output + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HIP memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] rgbOffsets RGB offset values to use for the glitch augmentation (A 1D tensor in pinned/HOST memory containing a single set of 3 Rppi point values that applies to all images in the batch. + * For each point and for each image in the batch: 0 < point.x < width, 0 < point.y < height) + * \param [in] roiTensorPtrSrc ROI data for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_glitch_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptChannelOffsets *rgbOffsets, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! @} */ diff --git a/include/rppt_tensor_filter_augmentations.h b/include/rppt_tensor_filter_augmentations.h index 7ea8d00c6..992631c49 100644 --- a/include/rppt_tensor_filter_augmentations.h +++ b/include/rppt_tensor_filter_augmentations.h @@ -57,7 +57,7 @@ extern "C" { * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] kernelSize kernel size for box filter (a single Rpp32u odd number with kernelSize = 3/5/7/9 that applies to all images in the batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration.
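The newly added jitter entry points above take a per-image kernel-size tensor plus a user-defined seed. A minimal hedged sketch of a HOST-side call follows; the signature is taken directly from the diff, while the batch size, kernel sizes, seed and the surrounding buffer/descriptor/ROI/handle setup are illustrative assumptions.

```cpp
// Hedged sketch: calling the new rppt_jitter_host entry point documented above.
#include <vector>
#include <rpp.h>

RppStatus jitterExample(RppPtr_t srcPtr, RpptDescPtr srcDescPtr,
                        RppPtr_t dstPtr, RpptDescPtr dstDescPtr,
                        RpptROIPtr roiTensorPtrSrc, rppHandle_t handle,
                        int batchSize)
{
    std::vector<Rpp32u> kernelSizeTensor(batchSize, 5);   // 3/5/7 are the suggested kernel sizes
    Rpp32u seed = 1234;                                   // user-defined seed for the jitter pattern

    return rppt_jitter_host(srcPtr, srcDescPtr, dstPtr, dstDescPtr,
                            kernelSizeTensor.data(), seed,
                            roiTensorPtrSrc, RpptRoiType::XYWH, handle);
}
```

The HIP variant rppt_jitter_gpu takes the same argument list but, per the docs above, expects device-side src/dst tensors and a handle created with rppCreateWithStreamAndBatchSize(); the new glitch entry points follow the same calling pattern with an RpptChannelOffsets argument in place of the kernel-size tensor and seed.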
@@ -83,7 +83,7 @@ RppStatus rppt_box_filter_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] stdDevTensor stdDev values for gaussian calculation (1D tensor in pinned/HOST memory, of size batchSize, for each image in batch) * \param [in] kernelSize kernel size for gaussian filter (a single Rpp32u odd number with kernelSize = 3/5/7/9 that applies to all images in the batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. diff --git a/include/rppt_tensor_geometric_augmentations.h b/include/rppt_tensor_geometric_augmentations.h index a3e6d2d7f..28dd516e6 100644 --- a/include/rppt_tensor_geometric_augmentations.h +++ b/include/rppt_tensor_geometric_augmentations.h @@ -52,7 +52,7 @@ extern "C" { * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -72,7 +72,7 @@ RppStatus rppt_crop_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -95,7 +95,7 @@ RppStatus rppt_crop_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr * \param [in] offsetTensor offset values for normalization (1D tensor in HOST memory, of size batchSize, with offsetTensor[n] <= 0) * \param [in] multiplierTensor multiplier values for normalization (1D tensor in HOST memory, of size batchSize, with multiplierTensor[n] > 0) * \param [in] mirrorTensor mirror flag values to set mirroring on/off (1D tensor in HOST memory, of size batchSize, with mirrorTensor[n] = 0/1) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -118,7 +118,7 @@ RppStatus rppt_crop_mirror_normalize_host(RppPtr_t srcPtr, RpptDescPtr srcDescPt * \param [in] offsetTensor offset values for normalization (1D tensor in pinned/HOST memory, of size batchSize, with offsetTensor[n] <= 0) * \param [in] multiplierTensor multiplier values for normalization (1D tensor in pinned/HOST memory, of size batchSize, with multiplierTensor[n] > 0) * \param [in] mirrorTensor mirror flag values to set mirroring on/off (1D tensor in pinned/HOST memory, of size batchSize, with mirrorTensor[n] = 0/1) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
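The crop_mirror_normalize hunks above document three per-image parameter tensors with explicit constraints (offsetTensor[n] <= 0, multiplierTensor[n] > 0, mirrorTensor[n] = 0/1). As a hedged sketch, this is one way a caller might prepare them on the host; the Rpp32f/Rpp32u element types are assumptions (the full signature is not shown in these hunks), and the concrete values are illustrative only.

```cpp
// Hedged sketch: per-image normalization parameters respecting the documented constraints.
#include <vector>
#include <rpp.h>

void fillCmnParams(std::vector<Rpp32f> &offsetTensor,
                   std::vector<Rpp32f> &multiplierTensor,
                   std::vector<Rpp32u> &mirrorTensor,
                   int batchSize)
{
    offsetTensor.assign(batchSize, -128.0f);            // shift term, must be <= 0
    multiplierTensor.assign(batchSize, 1.0f / 255.0f);  // scale factor, must be > 0
    mirrorTensor.assign(batchSize, 0);                  // 1 enables mirroring for that image
}
```

For the *_gpu variant, the docs above state these tensors are read from pinned/HOST memory while the ROI data lives in HIP memory.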
@@ -140,7 +140,7 @@ RppStatus rppt_crop_mirror_normalize_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] affineTensor affine matrix values for transformation calculation (2D tensor in HOST memory, of size batchSize * 6 for each image in batch) * \param [in] interpolationType Interpolation type used (RpptInterpolationType::XYWH or RpptRoiType::LTRB) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -162,7 +162,7 @@ RppStatus rppt_warp_affine_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] affineTensor affine matrix values for transformation calculation (2D tensor in pinned/HOST memory, of size batchSize * 6 for each image in batch) * \param [in] interpolationType Interpolation type used (RpptInterpolationType::XYWH or RpptRoiType::LTRB) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -184,7 +184,7 @@ RppStatus rppt_warp_affine_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] horizontalTensor horizontal flag values to set horizontal flip on/off (1D tensor in HOST memory, of size batchSize, with horizontalTensor[i] = 0/1) * \param [in] verticalTensor vertical flag values to set vertical flip on/off (1D tensor in HOST memory, of size batchSize, with verticalTensor[i] = 0/1) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -206,7 +206,7 @@ RppStatus rppt_flip_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] horizontalTensor horizontal flag values to set horizontal flip on/off (1D tensor in pinned/HOST memory, of size batchSize, with horizontalTensor[i] = 0/1) * \param [in] verticalTensor vertical flag values to set vertical flip on/off (1D tensor in pinned/HOST memory, of size batchSize, with verticalTensor[i] = 0/1) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -228,7 +228,7 @@ RppStatus rppt_flip_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] dstImgSizes destination image sizes ( \ref RpptImagePatchPtr type pointer to array, in HOST memory, of size batchSize) * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -250,7 +250,7 @@ RppStatus rppt_resize_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dst * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] dstImgSizes destination image sizes ( \ref RpptImagePatchPtr type pointer to array, in pinned/HOST memory, of size batchSize) * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -275,7 +275,7 @@ RppStatus rppt_resize_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP * \param [in] meanTensor mean value for each image in the batch (meanTensor[n] >= 0, 1D tensor in HOST memory, of size = batchSize for greyscale images, size = batchSize * 3 for RGB images)) * \param [in] stdDevTensor standard deviation value for each image in the batch (stdDevTensor[n] >= 0, 1D tensor in HOST memory, of size = batchSize for greyscale images, size = batchSize * 3 for RGB images) * \param [in] mirrorTensor mirror flag value to set mirroring on/off (1D tensor in HOST memory, of size batchSize, with mirrorTensor[n] = 0/1) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -300,7 +300,7 @@ RppStatus rppt_resize_mirror_normalize_host(RppPtr_t srcPtr, RpptDescPtr srcDesc * \param [in] meanTensor mean value for each image in the batch (meanTensor[n] >= 0, 1D tensor in pinned/HOST memory, of size = batchSize for greyscale images, size = batchSize * 3 for RGB images)) * \param [in] stdDevTensor standard deviation value for each image in the batch (stdDevTensor[n] >= 0, 1D tensor in pinned/HOST memory, of size = batchSize for greyscale images, size = batchSize * 3 for RGB images) * \param [in] mirrorTensor mirror flag value to set mirroring on/off (1D tensor in pinned/HOST memory, of size batchSize, with mirrorTensor[n] = 0/1) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -323,7 +323,7 @@ RppStatus rppt_resize_mirror_normalize_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescP * \param [in] dstImgSizes destination image sizes ( \ref RpptImagePatchPtr type pointer to array, in HOST memory, of size batchSize) * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType * \param [in] mirrorTensor mirror flag value to set mirroring on/off (1D tensor in HOST memory, of size batchSize, with mirrorTensor[n] = 0/1) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -346,7 +346,7 @@ RppStatus rppt_resize_crop_mirror_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, * \param [in] dstImgSizes destination image sizes ( \ref RpptImagePatchPtr type pointer to array, in pinned/HOST memory, of size batchSize) * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType * \param [in] mirrorTensor mirror flag value to set mirroring on/off (1D tensor in pinned/HOST memory, of size batchSize, with mirrorTensor[n] = 0/1) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -368,7 +368,7 @@ RppStatus rppt_resize_crop_mirror_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, R * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] angle image rotation angle in degrees - positive deg-anticlockwise/negative deg-clockwise (1D tensor in HOST memory, of size batchSize) * \param [in] interpolationType Interpolation type used (RpptInterpolationType::XYWH or RpptRoiType::LTRB) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -390,7 +390,7 @@ RppStatus rppt_rotate_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dst * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] angle image rotation angle in degrees - positive deg-anticlockwise/negative deg-clockwise (1D tensor in pinned/HOST memory, of size batchSize) * \param [in] interpolationType Interpolation type used (RpptInterpolationType::XYWH or RpptRoiType::LTRB) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -412,7 +412,7 @@ RppStatus rppt_rotate_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
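The rotate variants documented above follow the same batched-parameter pattern with one rotation angle per image; a small sketch using the sign convention stated in the parameter description:

    // Hypothetical sketch, not part of this patch
    std::vector<Rpp32f> angle(batchSize);
    for (int i = 0; i < batchSize; i++)
        angle[i] = (i & 1) ? -15.0f : 15.0f;   // positive = anticlockwise, negative = clockwise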
@@ -434,7 +434,7 @@ RppStatus rppt_phase_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDes * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -500,7 +500,7 @@ RppStatus rppt_slice_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] cropRoiTensor crop co-ordinates in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] patchRoiTensor patch co-ordinates in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) @@ -526,7 +526,7 @@ RppStatus rppt_crop_and_patch_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescP * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] cropRoiTensor crop co-ordinates in HIP memory, for each image in source 
tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] patchRoiTensor patch co-ordinates in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) @@ -541,8 +541,8 @@ RppStatus rppt_crop_and_patch_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPt /*! \brief Flip voxel augmentation HOST * \details The flip voxel augmentation performs a mask-controlled horizontal/vertical/depth flip on a generic 4D tensor.
Support added for f32 -> f32 and u8 -> u8 datatypes. - * \image html input150x150x4.gif Sample Input - * \image html geometric_augmentations_flip_150x150x4.gif Sample Output + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/geometric_augmentations_flip_150x150x4.gif Sample Output * \param [in] srcPtr source tensor in HOST memory * \param [in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3) * \param [out] dstPtr destination tensor in HOST memory @@ -564,8 +564,8 @@ RppStatus rppt_flip_voxel_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDes /*! \brief Flip voxel augmentation GPU * \details The flip voxel augmentation performs a mask-controlled horizontal/vertical/depth flip on a generic 4D tensor.
Support added for f32 -> f32 and u8 -> u8 datatypes. - * \image html input150x150x4.gif Sample Input - * \image html geometric_augmentations_flip_150x150x4.gif Sample Output + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenInputs/input150x150x4.gif Sample Input + * \image html https://raw.githubusercontent.com/ROCm/rpp/develop/docs/data/doxygenOutputs/geometric_augmentations_flip_150x150x4.gif Sample Output * \param [in] srcPtr source tensor in HIP memory * \param [in] srcGenericDescPtr source tensor descriptor (Restrictions - numDims = 5, offsetInBytes >= 0, dataType = U8/F32, layout = NCDHW/NDHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory @@ -598,7 +598,7 @@ RppStatus rppt_flip_voxel_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDesc * \param [in] colRemapTable Rpp32f column numbers in HOST memory for every pixel in the input batch of images (Restrictions - rois in the colRemapTable data for each image in batch must match roiTensorSrc) * \param [in] tableDescPtr rowRemapTable and colRemapTable common tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = F32, layout = NHWC, c = 1) * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType (Restrictions - Supports only NEAREST_NEIGHBOR and BILINEAR) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -623,7 +623,7 @@ RppStatus rppt_remap_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP * \param [in] colRemapTable Rpp32f column numbers in HIP memory for every pixel in the input batch of images (Restrictions - rois in the colRemapTable data for each image in batch must match roiTensorSrc) * \param [in] tableDescPtr rowRemapTable and colRemapTable common tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = F32, layout = NHWC, c = 1) * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType (Restrictions - Supports only NEAREST_NEIGHBOR and BILINEAR) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration.
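The remap tables above carry one source (row, column) pair per output pixel; a sketch of building tables that mirror every image in the batch horizontally. A width * height * batchSize layout is assumed here, matching the table size stated for the lens-correction functions added later in this patch:

    // Hypothetical sketch, not part of this patch
    std::vector<Rpp32f> rowRemapTable((size_t)width * height * batchSize);
    std::vector<Rpp32f> colRemapTable((size_t)width * height * batchSize);
    for (int n = 0; n < batchSize; n++)
        for (int r = 0; r < height; r++)
            for (int c = 0; c < width; c++)
            {
                size_t idx = ((size_t)n * height + r) * width + c;
                rowRemapTable[idx] = (Rpp32f)r;                 // keep the source row
                colRemapTable[idx] = (Rpp32f)(width - 1 - c);   // read pixels right to left
            }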
@@ -634,6 +634,94 @@ RppStatus rppt_remap_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP RppStatus rppt_remap_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *rowRemapTable, Rpp32f *colRemapTable, RpptDescPtr tableDescPtr, RpptInterpolationType interpolationType, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/*! \brief Lens correction transformation on HOST backend for an NCHW/NHWC layout tensor + * \details Performs lens correction transforms on an image to compensate for barrel lens distortion of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be the same depth as srcPtr. + * Note: Returns a black image if the passed camera matrix has a 0 determinant. + * \image html lens_img640x480.png Sample Input + * \image html geometric_augmentations_lens_correction_img_640x480.png Sample Output + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] rowRemapTable Rpp32f row numbers in HOST memory for every pixel in the input batch of images (1D tensor of size width * height * batchSize) + * \param [in] colRemapTable Rpp32f column numbers in HOST memory for every pixel in the input batch of images (1D tensor of size width * height * batchSize) + * \param [in] tableDescPtr table tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = F32, layout = NHWC, c = 1) + * \param [in] cameraMatrixTensor contains camera intrinsic parameters required to compute lens corrected image. (1D tensor of size 9 * batchSize) + * \param [in] distortionCoeffsTensor contains distortion coefficients required to compute lens corrected image. (1D tensor of size 8 * batchSize) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + * \ingroup group_tensor_geometric + */ +RppStatus rppt_lens_correction_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *rowRemapTable, Rpp32f *colRemapTable, RpptDescPtr tableDescPtr, Rpp32f *cameraMatrixTensor, Rpp32f *distortionCoeffsTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Lens correction transformation on HIP backend for an NCHW/NHWC layout tensor + * \details Performs lens correction transforms on an image to compensate for barrel lens distortion of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be the same depth as srcPtr. + * Note: Returns a black image if the passed camera matrix has a 0 determinant. + * \image html lens_img640x480.png Sample Input + * \image html geometric_augmentations_lens_correction_img_640x480.png Sample Output + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HIP memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] rowRemapTable Rpp32f row numbers in HIP memory for every pixel in the input batch of images (1D tensor of size width * height * batchSize) + * \param [in] colRemapTable Rpp32f column numbers in HIP memory for every pixel in the input batch of images (1D tensor of size width * height * batchSize) + * \param [in] tableDescPtr table tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = F32, layout = NHWC, c = 1) + * \param [in] cameraMatrixTensor contains camera intrinsic parameters required to compute lens corrected image. (1D tensor of size 9 * batchSize) + * \param [in] distortionCoeffsTensor contains distortion coefficients required to compute lens corrected image. (1D tensor of size 8 * batchSize) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + * \ingroup group_tensor_geometric + */ +RppStatus rppt_lens_correction_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *rowRemapTable, Rpp32f *colRemapTable, RpptDescPtr tableDescPtr, Rpp32f *cameraMatrixTensor, Rpp32f *distortionCoeffsTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + +/*! \brief Transpose Generic augmentation on HOST backend + * \details The transpose augmentation performs an input-permutation based transpose on a generic ND Tensor. + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcGenericDescPtr source tensor descriptor + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstGenericDescPtr destination tensor descriptor + * \param [in] permTensor permutation tensor for transpose operation + * \param [in] roiTensor ROI data for each element in source tensor (tensor of batchSize * number of dimensions * 2 values) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion.
+ * \ingroup group_tensor_geometric + */ +RppStatus rppt_transpose_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u *permTensor, Rpp32u *roiTensor, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Transpose Generic augmentation on HIP backend + * \details The transpose augmentation performs an input-permutation based transpose on a generic ND Tensor. + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcGenericDescPtr source tensor descriptor + * \param [out] dstPtr destination tensor in HIP memory + * \param [in] dstGenericDescPtr destination tensor descriptor + * \param [in] permTensor permutation tensor for transpose operation in pinned memory + * \param [in] roiTensor ROI data for each element in source tensor (tensor of batchSize * number of dimensions * 2 values) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + * \ingroup group_tensor_geometric + */ +RppStatus rppt_transpose_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u *permTensor, Rpp32u *roiTensor, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! @} */ diff --git a/include/rppt_tensor_logical_operations.h b/include/rppt_tensor_logical_operations.h index 3a4685167..28dff69ce 100644 --- a/include/rppt_tensor_logical_operations.h +++ b/include/rppt_tensor_logical_operations.h @@ -54,7 +54,7 @@ extern "C" { * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration.
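For the lens-correction functions introduced above, a sketch of the per-image parameter layout. Only the 9- and 8-element sizes come from the parameter descriptions; the row-major 3x3 camera-matrix ordering and the OpenCV-style distortion-coefficient order are assumptions, and fx, fy, cx, cy, k1, k2 are placeholders:

    // Hypothetical sketch, not part of this patch
    std::vector<Rpp32f> cameraMatrixTensor(9 * batchSize);
    std::vector<Rpp32f> distortionCoeffsTensor(8 * batchSize, 0.0f);
    for (int i = 0; i < batchSize; i++)
    {
        Rpp32f *K = &cameraMatrixTensor[i * 9];
        K[0] = fx;   K[1] = 0.0f; K[2] = cx;      // assumed row-major 3x3 intrinsic matrix;
        K[3] = 0.0f; K[4] = fy;   K[5] = cy;      // a singular matrix produces a black image
        K[6] = 0.0f; K[7] = 0.0f; K[8] = 1.0f;    // per the note above
        distortionCoeffsTensor[i * 8 + 0] = k1;   // radial terms; unused entries stay zero
        distortionCoeffsTensor[i * 8 + 1] = k2;
    }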
@@ -76,7 +76,7 @@ RppStatus rppt_bitwise_and_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -98,7 +98,7 @@ RppStatus rppt_bitwise_and_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr s * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -120,7 +120,7 @@ RppStatus rppt_bitwise_or_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr s * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -136,4 +136,4 @@ RppStatus rppt_bitwise_or_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr sr #ifdef __cplusplus } #endif -#endif // RPPT_TENSOR_LOGICAL_OPERATIONS_H \ No newline at end of file +#endif // RPPT_TENSOR_LOGICAL_OPERATIONS_H diff --git a/include/rppt_tensor_morphological_operations.h b/include/rppt_tensor_morphological_operations.h index eb879af5c..126c4757a 100644 --- a/include/rppt_tensor_morphological_operations.h +++ b/include/rppt_tensor_morphological_operations.h @@ -57,7 +57,7 @@ extern "C" { * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] kernelSize kernel size for box filter (a single Rpp32u odd number with kernelSize = 3/5/7/9 that applies to all images in the batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -82,7 +82,7 @@ RppStatus rppt_erode_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] kernelSize kernel size for box filter (a single Rpp32u odd number with kernelSize = 3/5/7/9 that applies to all images in the batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -98,4 +98,4 @@ RppStatus rppt_dilate_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP #ifdef __cplusplus } #endif -#endif // RPPT_TENSOR_MORPHOLOGICAL_OPERATIONS_H \ No newline at end of file +#endif // RPPT_TENSOR_MORPHOLOGICAL_OPERATIONS_H diff --git a/include/rppt_tensor_statistical_operations.h b/include/rppt_tensor_statistical_operations.h index 441816ea3..ca464340b 100644 --- a/include/rppt_tensor_statistical_operations.h +++ b/include/rppt_tensor_statistical_operations.h @@ -50,7 +50,7 @@ extern "C" { * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] tensorSumArr destination array in HOST memory * \param [in] tensorSumArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -68,7 +68,7 @@ RppStatus rppt_tensor_sum_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] tensorSumArr destination array in HIP memory * \param [in] tensorSumArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
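The array-length restriction above reflects that the reduction writes one value per channel plus an image-level result for 3-channel inputs, analogous to the (MeanR, MeanG, MeanB, MeanImage) layout documented for the mean and stddev functions later in this header; as a sketch:

    // Hypothetical sketch, not part of this patch
    Rpp32u tensorSumArrLength = (srcDescPtr->c == 1) ? srcDescPtr->n        // one sum per image
                                                     : srcDescPtr->n * 4;  // per-channel sums plus an overall sum per image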
@@ -86,7 +86,7 @@ RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] minArr destination array in HOST memory * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -104,7 +104,7 @@ RppStatus rppt_tensor_min_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] minArr destination array in HIP memory * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -122,7 +122,7 @@ RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] maxArr destination array in HOST memory * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -140,7 +140,7 @@ RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] maxArr destination array in HIP memory * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -201,7 +201,7 @@ RppStatus rppt_normalize_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescP * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] tensorMeanArr destination array in HOST memory * \param [in] tensorMeanArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorMeanArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorMeanArrLength = srcDescPtr->n * 4) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -219,7 +219,7 @@ RppStatus rppt_tensor_mean_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] tensorMeanArr destination array in HIP memory * \param [in] tensorMeanArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorMeanArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorMeanArrLength = srcDescPtr->n * 4) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -238,7 +238,7 @@ RppStatus rppt_tensor_mean_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [out] tensorStddevArr destination array in HOST memory * \param [in] tensorStddevArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorStddevArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorStddevArrLength = srcDescPtr->n * 4) * \param [in] meanTensor mean values for stddev calculation (1D tensor of size batchSize * 4 in format (MeanR, MeanG, MeanB, MeanImage) for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -257,7 +257,7 @@ RppStatus rppt_tensor_stddev_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPt * \param [out] tensorStddevArr destination array in HIP memory * \param [in] tensorStddevArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorStddevArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorStddevArrLength = srcDescPtr->n * 4) * \param [in] meanTensor mean values for stddev calculation (1D tensor of size batchSize * 4 in format (MeanR, MeanG, MeanB, MeanImage) for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
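Because rppt_tensor_stddev consumes means in the same (MeanR, MeanG, MeanB, MeanImage) layout that rppt_tensor_mean produces, the two host calls chain naturally for a 3-channel batch; a sketch, with the Rpp32f element type of both arrays assumed and the ROI/handle setup taken as given:

    // Hypothetical sketch, not part of this patch
    std::vector<Rpp32f> tensorMeanArr(batchSize * 4), tensorStddevArr(batchSize * 4);
    rppt_tensor_mean_host(srcPtr, srcDescPtr, tensorMeanArr.data(), tensorMeanArr.size(),
                          roiTensorPtrSrc, RpptRoiType::XYWH, handle);
    rppt_tensor_stddev_host(srcPtr, srcDescPtr, tensorStddevArr.data(), tensorStddevArr.size(),
                            tensorMeanArr.data(), roiTensorPtrSrc, RpptRoiType::XYWH, handle);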
@@ -273,4 +273,4 @@ RppStatus rppt_tensor_stddev_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr #ifdef __cplusplus } #endif -#endif // RPPT_TENSOR_STATISTICAL_OPERATIONS_H \ No newline at end of file +#endif // RPPT_TENSOR_STATISTICAL_OPERATIONS_H diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp index bb06713b9..be8eaeeaa 100644 --- a/src/include/cpu/rpp_cpu_common.hpp +++ b/src/include/cpu/rpp_cpu_common.hpp @@ -177,6 +177,21 @@ struct RPPTensorFunctionMetaData }; #endif // GPU_SUPPORT +// Computes strides for ND Tensor +inline void compute_strides(Rpp32u *strides, Rpp32u *shape, Rpp32u tensorDim) +{ + if (tensorDim > 0) + { + Rpp32u v = 1; + for (Rpp32u i = tensorDim - 1; i > 0; i--) + { + strides[i] = v; + v *= shape[i]; + } + strides[0] = v; + } +} + // Uses fast inverse square root algorithm from Lomont, C., 2003. FAST INVERSE SQUARE ROOT. [online] lomont.org. Available at: inline float rpp_host_math_inverse_sqrt_1(float x) { @@ -6096,6 +6111,25 @@ inline void compute_separable_horizontal_resample(Rpp32f *inputPtr, T *outputPtr } } +inline void compute_jitter_src_loc_avx(__m256i *pxXorwowStateX, __m256i *pxXorwowStateCounter, __m256 &pRow, __m256 &pCol, __m256 &pKernelSize, __m256 &pBound, __m256 &pHeightLimit, __m256 &pWidthLimit, __m256 &pStride, __m256 &pChannel, Rpp32s *srcLoc) +{ + __m256 pRngX = rpp_host_rng_xorwow_8_f32_avx(pxXorwowStateX, pxXorwowStateCounter); + __m256 pRngY = rpp_host_rng_xorwow_8_f32_avx(pxXorwowStateX, pxXorwowStateCounter); + __m256 pX = _mm256_mul_ps(pRngX, pKernelSize); + __m256 pY = _mm256_mul_ps(pRngY, pKernelSize); + pX = _mm256_max_ps(_mm256_min_ps(_mm256_floor_ps(_mm256_add_ps(pRow, _mm256_sub_ps(pX, pBound))), pHeightLimit), avx_p0); + pY = _mm256_max_ps(_mm256_min_ps(_mm256_floor_ps(_mm256_add_ps(pCol, _mm256_sub_ps(pY, pBound))), pWidthLimit), avx_p0); + __m256i pxSrcLoc = _mm256_cvtps_epi32(_mm256_fmadd_ps(pX, pStride, _mm256_mul_ps(pY, pChannel))); + _mm256_storeu_si256((__m256i*) srcLoc, pxSrcLoc); +} + +inline void compute_jitter_src_loc(RpptXorwowStateBoxMuller *xorwowState, Rpp32s row, Rpp32s col, Rpp32s kSize, Rpp32s heightLimit, Rpp32s widthLimit, Rpp32s stride, Rpp32s bound, Rpp32s channels, Rpp32s &loc) +{ + Rpp32u heightIncrement = rpp_host_rng_xorwow_f32(xorwowState) * kSize; + Rpp32u widthIncrement = rpp_host_rng_xorwow_f32(xorwowState) * kSize; + loc = std::max(std::min(static_cast(row + heightIncrement - bound), heightLimit), 0) * stride; + loc += std::max(std::min(static_cast(col + widthIncrement - bound), (widthLimit - 1)), 0) * channels; +} inline void compute_sum_16_host(__m256i *p, __m256i *pSum) { pSum[0] = _mm256_add_epi32(_mm256_add_epi32(p[0], p[1]), pSum[0]); //add 16 values to 8 @@ -6501,4 +6535,32 @@ inline void compute_remap_src_loc(Rpp32f rowLoc, Rpp32f colLoc, Rpp32s &srcLoc, srcLoc = (rowLoc * stride) + colLoc * channels; } +inline void compute_log_16_host(__m256 *p) +{ + p[0] = log_ps(p[0]); // log compute + p[1] = log_ps(p[1]); // log compute +} + +inline void compute_transpose4x8_avx(__m256 *pSrc, __m128 *pDst) +{ + __m256 tmp0, tmp1, tmp2, tmp3; + tmp0 = _mm256_shuffle_ps(pSrc[0], pSrc[1], 0x44); /* shuffle to get [P01|P02|P09|P10|P05|P06|P13|P14] */ + tmp2 = _mm256_shuffle_ps(pSrc[0], pSrc[1], 0xEE); /* shuffle to get [P03|P04|P11|P12|P07|P08|P15|P16] */ + tmp1 = _mm256_shuffle_ps(pSrc[2], pSrc[3], 0x44); /* shuffle to get [P17|P18|P25|P26|P21|P22|P29|P30] */ + tmp3 = _mm256_shuffle_ps(pSrc[2], pSrc[3], 0xEE); /* shuffle to get [P19|P20|P27|P28|P23|P24|P31|P32] */ + 
pSrc[0] = _mm256_shuffle_ps(tmp0, tmp1, 0x88); /* shuffle to get [P01|P09|P17|P25|P05|P13|P21|P29] */ + pSrc[1] = _mm256_shuffle_ps(tmp0, tmp1, 0xDD); /* shuffle to get [P02|P10|P18|P26|P06|P14|P22|P30] */ + pSrc[2] = _mm256_shuffle_ps(tmp2, tmp3, 0x88); /* shuffle to get [P03|P11|P19|P27|P07|P15|P23|P31] */ + pSrc[3] = _mm256_shuffle_ps(tmp2, tmp3, 0xDD); /* shuffle to get [P04|P12|P20|P28|P08|P16|P24|P32] */ + + pDst[0] = _mm256_castps256_ps128(pSrc[0]); /* extract [P01|P09|P17|P25] */ + pDst[1] = _mm256_castps256_ps128(pSrc[1]); /* extract [P02|P10|P18|P26] */ + pDst[2] = _mm256_castps256_ps128(pSrc[2]); /* extract [P03|P11|P19|P27] */ + pDst[3] = _mm256_castps256_ps128(pSrc[3]); /* extract [P04|P12|P20|P28] */ + pDst[4] = _mm256_extractf128_ps(pSrc[0], 1); /* extract [P05|P13|P21|P29] */ + pDst[5] = _mm256_extractf128_ps(pSrc[1], 1); /* extract [P06|P14|P22|P30] */ + pDst[6] = _mm256_extractf128_ps(pSrc[2], 1); /* extract [P07|P15|P23|P31] */ + pDst[7] = _mm256_extractf128_ps(pSrc[3], 1); /* extract [P08|P16|P24|P32] */ +} + #endif //RPP_CPU_COMMON_H diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp index babc6f55c..b9e79c146 100644 --- a/src/include/cpu/rpp_cpu_simd.hpp +++ b/src/include/cpu/rpp_cpu_simd.hpp @@ -185,6 +185,10 @@ const __m256i avx_pxShufflePkd = _mm256_setr_m128(xmm_pxStore4Pkd, xmm_pxStore4P const __m128i xmm_pxMask00 = _mm_setr_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0, 1, 2, 3); const __m128i xmm_pxMask04To11 = _mm_setr_epi8(4, 5, 6, 7, 8, 9, 10, 11, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); +const __m256i avx_pxMaskR = _mm256_setr_epi8(0, 0x80, 0x80, 3, 0x80, 0x80, 6, 0x80, 0x80, 9, 0x80, 0x80, 12, 0x80, 0x80, 15, 0x80, 0x80, 18, 0x80, 0x80, 21, 0x80, 0x80, 24, 0x80, 0x80, 27, 0x80, 0x80, 0x80, 0x80); +const __m256i avx_pxMaskG = _mm256_setr_epi8(0x80, 1, 0x80, 0x80, 4, 0x80, 0x80, 7, 0x80, 0x80, 10, 0x80, 0x80, 13, 0x80, 0x80, 16, 0x80, 0x80, 19, 0x80, 0x80, 22, 0x80, 0x80, 25, 0x80, 0x80, 28, 0x80, 0x80, 0x80); +const __m256i avx_pxMaskB = _mm256_setr_epi8(0x80, 0x80, 2, 0x80, 0x80, 5, 0x80, 0x80, 8, 0x80, 0x80, 11, 0x80, 0x80, 14, 0x80, 0x80, 17, 0x80, 0x80, 20, 0x80, 0x80, 23, 0x80, 0x80, 26, 0x80, 0x80, 29, 0x80, 0x80); + // Print helpers inline void rpp_mm_print_epi8(__m128i vPrintArray) @@ -1021,6 +1025,99 @@ inline void rpp_load48_u8pkd3_to_f32pln3_avx(Rpp8u *srcPtr, __m256 *p) p[5] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[2], xmm_pxMaskB), _mm_shuffle_epi8(px[3], xmm_pxMaskB))); /* Contains B09-16 */ } +inline void rpp_glitch_load24_u8pkd3_to_f32pln3_avx(Rpp8u *srcPtr, __m256 *p, int *srcLocs) +{ + __m128i px[2]; + px[0] = _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[0])); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need R01-04 */ + px[1] = _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[0] + 12)); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need R05-08 */ + p[0] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[0], xmm_pxMaskR), _mm_shuffle_epi8(px[1], xmm_pxMaskR))); /* Contains R01-08 */ + + px[0] = _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[1])); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need G01-04 */ + px[1] = _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[1] + 12)); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need G05-08 */ + p[1] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[0], xmm_pxMaskG), 
_mm_shuffle_epi8(px[1], xmm_pxMaskG))); /* Contains G01-08 */ + + px[0] = _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[2])); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need B01-04 */ + px[1] = _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[2] + 12)); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need B05-08 */ + p[2] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[0], xmm_pxMaskB), _mm_shuffle_epi8(px[1], xmm_pxMaskB))); /* Contains B01-08 */ +} + +inline void rpp_glitch_load24_f32pkd3_to_f32pln3_avx(Rpp32f *srcPtr, __m256 *p, int *srcLocs) +{ + __m128 p128[8]; + Rpp32f *srcPtrTemp = srcPtr + srcLocs[0]; + p[0] = _mm256_setr_ps(*srcPtrTemp, *(srcPtrTemp + 3), *(srcPtrTemp + 6), *(srcPtrTemp + 9), + *(srcPtrTemp + 12), *(srcPtrTemp + 15), *(srcPtrTemp + 18), *(srcPtrTemp + 21)); + srcPtrTemp = srcPtr + srcLocs[1]; + p[1] = _mm256_setr_ps(*(srcPtrTemp + 1), *(srcPtrTemp + 4), *(srcPtrTemp + 7), *(srcPtrTemp + 10), + *(srcPtrTemp + 13), *(srcPtrTemp + 16), *(srcPtrTemp + 19), *(srcPtrTemp + 22)); + srcPtrTemp = srcPtr + srcLocs[2]; + p[2] = _mm256_setr_ps(*(srcPtrTemp + 2), *(srcPtrTemp + 5), *(srcPtrTemp + 8), *(srcPtrTemp + 11), + *(srcPtrTemp + 14), *(srcPtrTemp + 17), *(srcPtrTemp + 20), *(srcPtrTemp + 23)); +} + +inline void rpp_glitch_load24_i8pkd3_to_f32pln3_avx(Rpp8s *srcPtr, __m256 *p, int *srcLocs) +{ + __m128i px[2]; + px[0] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[0]))); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need R01-04 */ + px[1] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[0] + 12))); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need R05-08 */ + p[0] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[0], xmm_pxMaskR), _mm_shuffle_epi8(px[1], xmm_pxMaskR))); /* Contains R01-08 */ + + px[0] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[1]))); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need G01-04 */ + px[1] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[1] + 12))); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need G05-08 */ + p[1] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[0], xmm_pxMaskG), _mm_shuffle_epi8(px[1], xmm_pxMaskG))); /* Contains G01-08 */ + + px[0] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[2]))); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need B01-04 */ + px[1] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)(srcPtr + srcLocs[2] + 12))); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need B05-08 */ + p[2] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[0], xmm_pxMaskB), _mm_shuffle_epi8(px[1], xmm_pxMaskB))); /* Contains B01-08 */ +} + +inline void rpp_glitch_load30_u8pkd3_to_u8pkd3_avx(Rpp8u *srcPtr, int *srcLocs, __m256i &p) +{ + __m256i px[3]; + px[0] = _mm256_loadu_si256((__m256i *)(srcPtr + srcLocs[0])); // Load the source location1 values passed + px[1] = _mm256_loadu_si256((__m256i *)(srcPtr + srcLocs[1])); // Load the source location2 values passed + px[2] = _mm256_loadu_si256((__m256i *)(srcPtr + srcLocs[2])); // Load the source location3 values passed + px[0] = _mm256_shuffle_epi8(px[0], avx_pxMaskR); /* Shuffle to obtain R channel values */ + px[1] = _mm256_shuffle_epi8(px[1], 
avx_pxMaskG); /* Shuffle to obtain G channel values */ + px[2] = _mm256_shuffle_epi8(px[2], avx_pxMaskB); /* Shuffle to obtain B channel values */ + px[0] = _mm256_or_si256(px[0], px[1]); /* Pack R and G channels to obtain RG format */ + p = _mm256_or_si256(px[0], px[2]); /* Pack RG values and B channel to obtain RGB format */ +} + +inline void rpp_glitch_load30_i8pkd3_to_i8pkd3_avx(Rpp8s *srcPtr, int * srcLocs, __m256i &p) +{ + __m256i px[3]; + px[0] = _mm256_loadu_si256((__m256i *)(srcPtr + srcLocs[0])); // Load the source location1 values passed + px[1] = _mm256_loadu_si256((__m256i *)(srcPtr + srcLocs[1])); // Load the source location2 values passed + px[2] = _mm256_loadu_si256((__m256i *)(srcPtr + srcLocs[2])); // Load the source location3 values passed + px[0] = _mm256_shuffle_epi8(px[0], avx_pxMaskR); /* Shuffle to obtain R channel values */ + px[1] = _mm256_shuffle_epi8(px[1], avx_pxMaskG); /* Shuffle to obtain G channel values */ + px[2] = _mm256_shuffle_epi8(px[2], avx_pxMaskB); /* Shuffle to obtain B channel values */ + px[0] = _mm256_or_si256(px[0], px[1]); /* Pack R and G channels to obtain RG format */ + p = _mm256_or_si256(px[0], px[2]); /* Pack RG values and B channel to obtain RGB format */ +} + +inline void rpp_glitch_load6_f32pkd3_to_f32pkd3_avx(Rpp32f *srcPtr, int * srcLocs, __m256 &p) +{ + p =_mm256_setr_ps(*(srcPtr + srcLocs[0]), *(srcPtr + srcLocs[1] + 1), *(srcPtr + srcLocs[2] + 2), *(srcPtr + srcLocs[0] + 3), + *(srcPtr + srcLocs[1] + 4), *(srcPtr + srcLocs[2] + 5), 0.0f, 0.0f); +} + +inline void rpp_glitch_load48_u8pln3_to_f32pln3_avx(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *srcPtrB, __m256 *p, int *srcLocs) +{ + __m128i px[3]; + + px[0] = _mm_loadu_si128((__m128i *)srcPtrR + srcLocs[0]); /* load [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ + px[1] = _mm_loadu_si128((__m128i *)srcPtrG + srcLocs[1]); /* load [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */ + px[2] = _mm_loadu_si128((__m128i *)srcPtrB + srcLocs[2]); /* load [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ + p[0] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[0], xmm_pxMask00To03), _mm_shuffle_epi8(px[0], xmm_pxMask04To07))); /* Contains R01-08 */ + p[1] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[0], xmm_pxMask08To11), _mm_shuffle_epi8(px[0], xmm_pxMask12To15))); /* Contains R09-16 */ + p[2] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[1], xmm_pxMask00To03), _mm_shuffle_epi8(px[1], xmm_pxMask04To07))); /* Contains G01-08 */ + p[3] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[1], xmm_pxMask08To11), _mm_shuffle_epi8(px[1], xmm_pxMask12To15))); /* Contains G09-16 */ + p[4] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[2], xmm_pxMask00To03), _mm_shuffle_epi8(px[2], xmm_pxMask04To07))); /* Contains B01-08 */ + p[5] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px[2], xmm_pxMask08To11), _mm_shuffle_epi8(px[2], xmm_pxMask12To15))); /* Contains B09-16 */ +} + inline void rpp_load48_u8pkd3_to_f32pln3_mirror_avx(Rpp8u *srcPtr, __m256 *p) { __m128i px[4]; @@ -3762,6 +3859,20 @@ inline void rpp_resize_nn_load_u8pkd3(Rpp8u *srcRowPtrsForInterp, Rpp32s *loc, _ p = _mm_shuffle_epi8(px[0], xmm_pkd_mask); // Shuffle to obtain 4 RGB [R01|G01|B01|R11|G11|B11|R21|G21|B21|R31|G31|B31|00|00|00|00] } +template +inline void rpp_resize_nn_extract_pkd3_avx(T *srcRowPtrsForInterp, Rpp32s *loc, __m256i &p) +{ + p = _mm256_setr_epi8(*(srcRowPtrsForInterp + loc[0]), *(srcRowPtrsForInterp + 
loc[0] + 1), *(srcRowPtrsForInterp + loc[0] + 2), + *(srcRowPtrsForInterp + loc[1]), *(srcRowPtrsForInterp + loc[1] + 1), *(srcRowPtrsForInterp + loc[1] + 2), + *(srcRowPtrsForInterp + loc[2]), *(srcRowPtrsForInterp + loc[2] + 1), *(srcRowPtrsForInterp + loc[2] + 2), + *(srcRowPtrsForInterp + loc[3]), *(srcRowPtrsForInterp + loc[3] + 1), *(srcRowPtrsForInterp + loc[3] + 2), + *(srcRowPtrsForInterp + loc[4]), *(srcRowPtrsForInterp + loc[4] + 1), *(srcRowPtrsForInterp + loc[4] + 2), + *(srcRowPtrsForInterp + loc[5]), *(srcRowPtrsForInterp + loc[5] + 1), *(srcRowPtrsForInterp + loc[5] + 2), + *(srcRowPtrsForInterp + loc[6]), *(srcRowPtrsForInterp + loc[6] + 1), *(srcRowPtrsForInterp + loc[6] + 2), + *(srcRowPtrsForInterp + loc[7]), *(srcRowPtrsForInterp + loc[7] + 1), *(srcRowPtrsForInterp + loc[7] + 2), + 0, 0, 0, 0, 0, 0, 0, 0); +} + inline void rpp_resize_nn_load_u8pln1(Rpp8u *srcRowPtrsForInterp, Rpp32s *loc, __m128i &p) { __m128i px[4]; @@ -3774,6 +3885,16 @@ inline void rpp_resize_nn_load_u8pln1(Rpp8u *srcRowPtrsForInterp, Rpp32s *loc, _ p = _mm_unpacklo_epi8(px[0], px[1]); // unpack to obtain [R01|R11|R21|R31|00|00|00|00|00|00|00|00|00|00|00|00] } +template +inline void rpp_resize_nn_extract_pln1_avx(T *srcRowPtrsForInterp, Rpp32s *loc, __m256i &p) +{ + p = _mm256_setr_epi8(*(srcRowPtrsForInterp + loc[0]), *(srcRowPtrsForInterp + loc[1]), + *(srcRowPtrsForInterp + loc[2]), *(srcRowPtrsForInterp + loc[3]), + *(srcRowPtrsForInterp + loc[4]), *(srcRowPtrsForInterp + loc[5]), + *(srcRowPtrsForInterp + loc[6]), *(srcRowPtrsForInterp + loc[7]), + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} + inline void rpp_resize_nn_load_f32pkd3_to_f32pln3(Rpp32f *srcRowPtrsForInterp, Rpp32s *loc, __m128 *p) { p[0] = _mm_loadu_ps(srcRowPtrsForInterp + loc[0]); // LOC0 load [R01|G01|B01|R02] - Need RGB 01 @@ -3783,6 +3904,42 @@ inline void rpp_resize_nn_load_f32pkd3_to_f32pln3(Rpp32f *srcRowPtrsForInterp, R _MM_TRANSPOSE4_PS(p[0], p[1], p[2], pTemp); // Transpose to obtain RGB in each vector } +inline void rpp_resize_nn_load_f32pkd3_to_f32pln3_avx(Rpp32f *srcRowPtrsForInterp, Rpp32s *loc, __m256 *p) +{ + __m128 p128[8]; + p128[0] = _mm_loadu_ps(srcRowPtrsForInterp + loc[0]); + p128[1] = _mm_loadu_ps(srcRowPtrsForInterp + loc[1]); + p128[2] = _mm_loadu_ps(srcRowPtrsForInterp + loc[2]); + p128[3] = _mm_loadu_ps(srcRowPtrsForInterp + loc[3]); + _MM_TRANSPOSE4_PS(p128[0], p128[1], p128[2], p128[3]); + p128[4] = _mm_loadu_ps(srcRowPtrsForInterp + loc[4]); + p128[5] = _mm_loadu_ps(srcRowPtrsForInterp + loc[5]); + p128[6] = _mm_loadu_ps(srcRowPtrsForInterp + loc[6]); + p128[7] = _mm_loadu_ps(srcRowPtrsForInterp + loc[7]); + _MM_TRANSPOSE4_PS(p128[4], p128[5], p128[6], p128[7]); + p[0] = _mm256_setr_m128(p128[0], p128[4]); + p[1] = _mm256_setr_m128(p128[1], p128[5]); + p[2] = _mm256_setr_m128(p128[2], p128[6]); +} + +inline void rpp_resize_nn_load_f16pkd3_to_f32pln3_avx(Rpp16f *srcRowPtrsForInterp, Rpp32s *loc, __m256 *p) +{ + p[0] = _mm256_setr_ps((Rpp32f)*(srcRowPtrsForInterp + loc[0]), (Rpp32f)*(srcRowPtrsForInterp + loc[1]), + (Rpp32f)*(srcRowPtrsForInterp + loc[2]), (Rpp32f)*(srcRowPtrsForInterp + loc[3]), + (Rpp32f)*(srcRowPtrsForInterp + loc[4]), (Rpp32f)*(srcRowPtrsForInterp + loc[5]), + (Rpp32f)*(srcRowPtrsForInterp + loc[6]), (Rpp32f)*(srcRowPtrsForInterp + loc[7])); + + p[1] = _mm256_setr_ps((Rpp32f)*(srcRowPtrsForInterp + loc[0] + 1), (Rpp32f)*(srcRowPtrsForInterp + loc[1] + 1), + (Rpp32f)*(srcRowPtrsForInterp + loc[2] + 1), (Rpp32f)*(srcRowPtrsForInterp + loc[3] + 1), + 
(Rpp32f)*(srcRowPtrsForInterp + loc[4] + 1), (Rpp32f)*(srcRowPtrsForInterp + loc[5] + 1), + (Rpp32f)*(srcRowPtrsForInterp + loc[6] + 1), (Rpp32f)*(srcRowPtrsForInterp + loc[7] + 1)); + + p[2] = _mm256_setr_ps((Rpp32f)*(srcRowPtrsForInterp + loc[0] + 2), (Rpp32f)*(srcRowPtrsForInterp + loc[1] + 2), + (Rpp32f)*(srcRowPtrsForInterp + loc[2] + 2), (Rpp32f)*(srcRowPtrsForInterp + loc[3] + 2), + (Rpp32f)*(srcRowPtrsForInterp + loc[4] + 2), (Rpp32f)*(srcRowPtrsForInterp + loc[5] + 2), + (Rpp32f)*(srcRowPtrsForInterp + loc[6] + 2), (Rpp32f)*(srcRowPtrsForInterp + loc[7] + 2)); +} + inline void rpp_resize_nn_load_f32pln1(Rpp32f *srcRowPtrsForInterp, Rpp32s *loc, __m128 &p) { __m128 pTemp[4]; @@ -3795,6 +3952,22 @@ inline void rpp_resize_nn_load_f32pln1(Rpp32f *srcRowPtrsForInterp, Rpp32s *loc, p = _mm_unpacklo_ps(pTemp[0], pTemp[1]); // Unpack to obtain [R01|R11|R21|R31] } +inline void rpp_resize_nn_load_f32pln1_avx(Rpp32f *srcRowPtrsForInterp, Rpp32s *loc, __m256 &p) +{ + p = _mm256_setr_ps(*(srcRowPtrsForInterp + loc[0]), *(srcRowPtrsForInterp + loc[1]), + *(srcRowPtrsForInterp + loc[2]), *(srcRowPtrsForInterp + loc[3]), + *(srcRowPtrsForInterp + loc[4]), *(srcRowPtrsForInterp + loc[5]), + *(srcRowPtrsForInterp + loc[6]), *(srcRowPtrsForInterp + loc[7])); +} + +inline void rpp_resize_nn_load_f16pln1_avx(Rpp16f *srcRowPtrsForInterp, Rpp32s *loc, __m256 &p) +{ + p = _mm256_setr_ps((Rpp32f)*(srcRowPtrsForInterp + loc[0]), (Rpp32f)*(srcRowPtrsForInterp + loc[1]), + (Rpp32f)*(srcRowPtrsForInterp + loc[2]), (Rpp32f)*(srcRowPtrsForInterp + loc[3]), + (Rpp32f)*(srcRowPtrsForInterp + loc[4]), (Rpp32f)*(srcRowPtrsForInterp + loc[5]), + (Rpp32f)*(srcRowPtrsForInterp + loc[6]), (Rpp32f)*(srcRowPtrsForInterp + loc[7])); +} + inline void rpp_resize_nn_load_i8pkd3(Rpp8s *srcRowPtrsForInterp, Rpp32s *loc, __m128i &p) { __m128i px[4]; diff --git a/src/include/hip/rpp_hip_common.hpp b/src/include/hip/rpp_hip_common.hpp index 3f32dbc04..721800c80 100644 --- a/src/include/hip/rpp_hip_common.hpp +++ b/src/include/hip/rpp_hip_common.hpp @@ -55,7 +55,7 @@ typedef union { float f1[5]; typedef union { float f1[6]; float2 f2[3]; } d_float6; typedef union { float f1[7]; } d_float7; typedef union { float f1[8]; float2 f2[4]; float4 f4[2]; } d_float8; -typedef union { float f1[9]; } d_float9; +typedef union { float f1[9]; float3 f3[3]; } d_float9; typedef union { float f1[12]; float4 f4[3]; } d_float12; typedef union { float f1[16]; float4 f4[4]; d_float8 f8[2]; } d_float16; typedef union { float f1[24]; float2 f2[12]; float3 f3[8]; float4 f4[6]; d_float8 f8[3]; } d_float24; @@ -1776,6 +1776,22 @@ __device__ __forceinline__ void rpp_hip_math_multiply24_const(d_float24 *src_f24 dst_f24->f4[5] = src_f24->f4[5] * multiplier_f4; } +// d_float8 divide + +__device__ __forceinline__ void rpp_hip_math_divide8(d_float8 *src1Ptr_f8, d_float8 *src2Ptr_f8, d_float8 *dstPtr_f8) +{ + dstPtr_f8->f4[0] = src1Ptr_f8->f4[0] / src2Ptr_f8->f4[0]; + dstPtr_f8->f4[1] = src1Ptr_f8->f4[1] / src2Ptr_f8->f4[1]; +} + +// d_float8 divide with constant + +__device__ __forceinline__ void rpp_hip_math_divide8_const(d_float8 *src_f8, d_float8 *dst_f8, float4 divisor_f4) +{ + dst_f8->f4[0] = divisor_f4 / src_f8->f4[0]; + dst_f8->f4[1] = divisor_f4 / src_f8->f4[1]; +} + // d_float8 bitwiseAND __device__ __forceinline__ void rpp_hip_math_bitwiseAnd8(d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8) @@ -1869,6 +1885,21 @@ __device__ __forceinline__ float rpp_hip_math_sinc(float x) return (fabsf(x) < 1e-5f) ? 
(1.0f - x * x * ONE_OVER_6) : sinf(x) / x; } +__device__ __forceinline__ void rpp_hip_math_log(d_float8 *src_f8, d_float8 *dst_f8) +{ + for(int i = 0; i < 8; i++) + src_f8->f1[i] = (!src_f8->f1[i]) ? std::nextafter(0.0f, 1.0f) : fabsf(src_f8->f1[i]); + + dst_f8->f1[0] = __logf(src_f8->f1[0]); + dst_f8->f1[1] = __logf(src_f8->f1[1]); + dst_f8->f1[2] = __logf(src_f8->f1[2]); + dst_f8->f1[3] = __logf(src_f8->f1[3]); + dst_f8->f1[4] = __logf(src_f8->f1[4]); + dst_f8->f1[5] = __logf(src_f8->f1[5]); + dst_f8->f1[6] = __logf(src_f8->f1[6]); + dst_f8->f1[7] = __logf(src_f8->f1[7]); +} + // /******************** DEVICE RANDOMIZATION HELPER FUNCTIONS ********************/ template @@ -1913,7 +1944,8 @@ __device__ __forceinline__ float rpp_hip_rng_xorwow_f32(T *xorwowState) return outFloat - 1; // return 0 <= outFloat < 1 } -__device__ __forceinline__ void rpp_hip_rng_8_xorwow_f32(RpptXorwowState *xorwowState, d_float8 *randomNumbersPtr_f8) +template +__device__ __forceinline__ void rpp_hip_rng_8_xorwow_f32(T *xorwowState, d_float8 *randomNumbersPtr_f8) { randomNumbersPtr_f8->f1[0] = rpp_hip_rng_xorwow_f32(xorwowState); randomNumbersPtr_f8->f1[1] = rpp_hip_rng_xorwow_f32(xorwowState); diff --git a/src/modules/cpu/host_tensor_arithmetic_operations.hpp b/src/modules/cpu/host_tensor_arithmetic_operations.hpp index b98145be0..466e51e09 100644 --- a/src/modules/cpu/host_tensor_arithmetic_operations.hpp +++ b/src/modules/cpu/host_tensor_arithmetic_operations.hpp @@ -30,5 +30,6 @@ SOFTWARE. #include "kernel/subtract_scalar.hpp" #include "kernel/multiply_scalar.hpp" #include "kernel/magnitude.hpp" +#include "kernel/log.hpp" #endif // HOST_TENSOR_ARITHMETIC_OPERATIONS_HPP diff --git a/src/modules/cpu/host_tensor_effects_augmentations.hpp b/src/modules/cpu/host_tensor_effects_augmentations.hpp index 9388ed6bd..ce7450aab 100644 --- a/src/modules/cpu/host_tensor_effects_augmentations.hpp +++ b/src/modules/cpu/host_tensor_effects_augmentations.hpp @@ -31,6 +31,8 @@ SOFTWARE. #include "kernel/noise_shot.hpp" #include "kernel/noise_gaussian.hpp" #include "kernel/non_linear_blend.hpp" +#include "kernel/jitter.hpp" +#include "kernel/glitch.hpp" #include "kernel/water.hpp" #include "kernel/ricap.hpp" #include "kernel/vignette.hpp" diff --git a/src/modules/cpu/host_tensor_geometric_augmentations.hpp b/src/modules/cpu/host_tensor_geometric_augmentations.hpp index cc7a22c8f..9facb0d78 100644 --- a/src/modules/cpu/host_tensor_geometric_augmentations.hpp +++ b/src/modules/cpu/host_tensor_geometric_augmentations.hpp @@ -35,6 +35,8 @@ SOFTWARE. 
#include "kernel/warp_affine.hpp" #include "kernel/phase.hpp" #include "kernel/slice.hpp" +#include "kernel/lens_correction.hpp" +#include "kernel/transpose.hpp" #include "kernel/crop_and_patch.hpp" #include "kernel/flip_voxel.hpp" diff --git a/src/modules/cpu/kernel/color_temperature.hpp b/src/modules/cpu/kernel/color_temperature.hpp index 1358ac800..dbe33a51e 100644 --- a/src/modules/cpu/kernel/color_temperature.hpp +++ b/src/modules/cpu/kernel/color_temperature.hpp @@ -30,7 +30,7 @@ RppStatus color_temperature_u8_u8_host_tensor(Rpp8u *srcPtr, RpptDescPtr srcDescPtr, Rpp8u *dstPtr, RpptDescPtr dstDescPtr, - Rpp8s *adjustmentValueTensor, + Rpp32s *adjustmentValueTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, RppLayoutParams layoutParams) @@ -269,7 +269,7 @@ RppStatus color_temperature_f32_f32_host_tensor(Rpp32f *srcPtr, RpptDescPtr srcDescPtr, Rpp32f *dstPtr, RpptDescPtr dstDescPtr, - Rpp8s *adjustmentValueTensor, + Rpp32s *adjustmentValueTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, RppLayoutParams layoutParams) @@ -508,7 +508,7 @@ RppStatus color_temperature_f16_f16_host_tensor(Rpp16f *srcPtr, RpptDescPtr srcDescPtr, Rpp16f *dstPtr, RpptDescPtr dstDescPtr, - Rpp8s *adjustmentValueTensor, + Rpp32s *adjustmentValueTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, RppLayoutParams layoutParams) @@ -799,7 +799,7 @@ RppStatus color_temperature_i8_i8_host_tensor(Rpp8s *srcPtr, RpptDescPtr srcDescPtr, Rpp8s *dstPtr, RpptDescPtr dstDescPtr, - Rpp8s *adjustmentValueTensor, + Rpp32s *adjustmentValueTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, RppLayoutParams layoutParams) diff --git a/src/modules/cpu/kernel/glitch.hpp b/src/modules/cpu/kernel/glitch.hpp new file mode 100644 index 000000000..9a8e33410 --- /dev/null +++ b/src/modules/cpu/kernel/glitch.hpp @@ -0,0 +1,690 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +inline void compute_src_loc(int row , int col, Rpp32s *locArray, RpptDescPtr srcDescPtr, RpptChannelOffsets *rgbOffsets, RpptROI roi, int batchCount, int channelValue) +{ + int xR, yR, xG, yG, xB, yB; + xR = col + rgbOffsets[batchCount].r.x; + yR = row + rgbOffsets[batchCount].r.y; + xG = col + rgbOffsets[batchCount].g.x; + yG = row + rgbOffsets[batchCount].g.y; + xB = col + rgbOffsets[batchCount].b.x; + yB = row + rgbOffsets[batchCount].b.y; + + if (xR >= roi.xywhROI.roiWidth || xR < roi.xywhROI.xy.x || yR >= roi.xywhROI.roiHeight || yR < roi.xywhROI.xy.y) + { + xR = col; + yR = row; + } + + if (xG >= roi.xywhROI.roiWidth || xG < roi.xywhROI.xy.x || yG >= roi.xywhROI.roiHeight || yG < roi.xywhROI.xy.y) + { + xG = col; + yG = row; + } + + if (xB >= roi.xywhROI.roiWidth || xB < roi.xywhROI.xy.x || yB >= roi.xywhROI.roiHeight || yB < roi.xywhROI.xy.y) + { + xB = col; + yB = row; + } + + locArray[0] = yR * srcDescPtr->strides.hStride + xR * channelValue; + locArray[1] = yG * srcDescPtr->strides.hStride + xG * channelValue; + locArray[2] = yB * srcDescPtr->strides.hStride + xB * channelValue; +} + +RppStatus glitch_u8_u8_host_tensor(Rpp8u *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8u *dstPtr, + RpptDescPtr dstDescPtr, + RpptChannelOffsets *rgbOffsets, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for (int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32s glitchSrcLocArray[3] = {0}; // Since 3 destination pixels, one for each channel, are processed per iteration. 
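+        // Note on the helper above: compute_src_loc() fills glitchSrcLocArray with one source offset per
+        // R/G/B channel by shifting (col, row) with the per-image rgbOffsets, and falls back to the
+        // unshifted (col, row) whenever a shifted coordinate lands outside the ROI.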
+ + Rpp8u *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + Rpp8u *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8u *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32u alignedLength = (((roi.xywhROI.roiWidth)/ 8) * 8) - 8; + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8u* dstRowPtrTempR = dstPtrRow; + Rpp8u* dstRowPtrTempG = dstPtrRow + dstDescPtr->strides.cStride; + Rpp8u* dstRowPtrTempB = dstPtrRow + 2 * dstDescPtr->strides.cStride; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += 8) + { + __m256 p[3]; + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + rpp_simd_load(rpp_glitch_load24_u8pkd3_to_f32pln3_avx, srcPtrChannel, p, glitchSrcLocArray); + rpp_simd_store(rpp_store24_f32pln3_to_u8pln3_avx, dstRowPtrTempR, dstRowPtrTempG, dstRowPtrTempB, p); // simd stores + + dstRowPtrTempR += 8; + dstRowPtrTempG += 8; + dstRowPtrTempB += 8; + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + *dstRowPtrTempR++ = *(srcPtrChannel + glitchSrcLocArray[0] + 0); + *dstRowPtrTempG++ = *(srcPtrChannel + glitchSrcLocArray[1] + 1); + *dstRowPtrTempB++ = *(srcPtrChannel + glitchSrcLocArray[2] + 2); + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8u *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32u vectorIncrement = 16; + Rpp32u alignedLength = (((roi.xywhROI.roiWidth)/ 16) * 16) - 16; + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8u* dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += 16) + { + __m256 p[6]; + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + Rpp32u rLoc = glitchSrcLocArray[0]; + Rpp32u gLoc = srcDescPtr->strides.cStride + glitchSrcLocArray[1]; + Rpp32u bLoc = 2 * srcDescPtr->strides.cStride + glitchSrcLocArray[2]; + rpp_simd_load(rpp_load48_u8pln3_to_f32pln3_avx, srcPtrChannel + rLoc, srcPtrChannel + gLoc, srcPtrChannel + bLoc, p); + rpp_simd_store(rpp_store48_f32pln3_to_u8pkd3_avx, dstPtrTemp, p); // simd stores + dstPtrTemp += 48; + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + for (int c = 0; c < 3; c++) + *(dstPtrTemp + c) = *(srcPtrChannel + glitchSrcLocArray[c] + c *srcDescPtr->strides.cStride); + dstPtrTemp += 3; + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW )) + { + Rpp8u *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32u vectorIncrement = 32; + Rpp32u 
alignedLength = (((roi.xywhROI.roiWidth)/ 32) * 32) - 32; + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8u* dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + for (int c = 0; c < 3; c++) + { + __m256i p; + p = _mm256_loadu_si256((__m256i *)(srcPtrChannel + (glitchSrcLocArray[c] + (c * srcDescPtr->strides.cStride)))); + _mm256_storeu_si256((__m256i *)(dstPtrTemp + (c * srcDescPtr->strides.cStride)), p); + } + dstPtrTemp += 32; + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + for (int c = 0; c < 3; c++) + *(dstPtrTemp + c * dstDescPtr->strides.cStride) = *(srcPtrChannel + glitchSrcLocArray[c] + c *srcDescPtr->strides.cStride); + dstPtrTemp += 1; + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + + } + else if((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8u *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32u alignedLength = (((roi.xywhROI.roiWidth)/ 10) * 10) - 10; + Rpp32s vectorIncrement = 10; + Rpp32s vectorIncrementPkd = 30; + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8u* dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += 10) + { + __m256i p; + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + rpp_simd_load(rpp_glitch_load30_u8pkd3_to_u8pkd3_avx, srcPtrChannel, glitchSrcLocArray, p); + _mm256_storeu_si256((__m256i *)(dstPtrTemp), p); + dstPtrTemp += 30; + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + for (int c = 0; c < 3; c++) + *dstPtrTemp++ = *(srcPtrChannel + glitchSrcLocArray[c] + c); + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + + } + } + return RPP_SUCCESS; +} + +RppStatus glitch_f32_f32_host_tensor(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *dstPtr, + RpptDescPtr dstDescPtr, + RpptChannelOffsets *rgbOffsets, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for (int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32s glitchSrcLocArray[3] = {0}; // Since 3 destination pixels, one for each channel, are processed per iteration. 
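+        // The float path below mirrors the U8 kernel: the planar/packed conversion branches process
+        // 8 pixels per AVX2 iteration (a __m256 holds eight floats), the packed-to-packed branch
+        // handles 2 pixels at a time, and the scalar tail loops cover the remaining columns.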
+ + Rpp32f *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + Rpp32f *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32f *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32u alignedLength = (((roi.xywhROI.roiWidth)/ 8) * 8) - 8; + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp32f* dstRowPtrTempR = dstPtrRow; + Rpp32f* dstRowPtrTempG = dstPtrRow + dstDescPtr->strides.cStride; + Rpp32f* dstRowPtrTempB = dstPtrRow + 2 * dstDescPtr->strides.cStride; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += 8) + { + __m256 p[3]; + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + rpp_simd_load(rpp_glitch_load24_f32pkd3_to_f32pln3_avx, srcPtrChannel, p, glitchSrcLocArray); + rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstRowPtrTempR, dstRowPtrTempG, dstRowPtrTempB, p); // simd stores + + dstRowPtrTempR += 8; + dstRowPtrTempG += 8; + dstRowPtrTempB += 8; + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + *dstRowPtrTempR++ = *(srcPtrChannel + glitchSrcLocArray[0] + 0); + *dstRowPtrTempG++ = *(srcPtrChannel + glitchSrcLocArray[1] + 1); + *dstRowPtrTempB++ = *(srcPtrChannel + glitchSrcLocArray[2] + 2); + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32f *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32u vectorIncrement = 8; + Rpp32u alignedLength = (((roi.xywhROI.roiWidth)/ 8) * 8) - 8; + + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp32f* dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += 8) + { + __m256 p[3]; + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + p[0] = _mm256_loadu_ps(srcPtrChannel + glitchSrcLocArray[0]); + p[1] = _mm256_loadu_ps(srcPtrChannel + srcDescPtr->strides.cStride + glitchSrcLocArray[1]); + p[2] = _mm256_loadu_ps(srcPtrChannel + 2 * srcDescPtr->strides.cStride + glitchSrcLocArray[2]); + rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, p); // simd stores + dstPtrTemp += 24; + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + for (int c = 0; c < 3; c++) + *(dstPtrTemp + c) = *(srcPtrChannel + glitchSrcLocArray[c] + c *srcDescPtr->strides.cStride); + dstPtrTemp += 3; + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW )) + { + Rpp32f *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32u vectorIncrement = 8; + Rpp32u alignedLength = 
(((roi.xywhROI.roiWidth)/ 8) * 8) - 8; + + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp32f* dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + for (int c = 0; c < 3; c++) + { + __m256 p; + p = _mm256_loadu_ps(srcPtrChannel + (glitchSrcLocArray[c] + c * srcDescPtr->strides.cStride)); + _mm256_storeu_ps((dstPtrTemp + c * srcDescPtr->strides.cStride), p); + } + dstPtrTemp += 8; + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + for (int c = 0; c < 3; c++) + *(dstPtrTemp + c * dstDescPtr->strides.cStride) = *(srcPtrChannel + glitchSrcLocArray[c] + c *srcDescPtr->strides.cStride); + dstPtrTemp += 1; + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + + } + else if((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32f *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32u alignedLength = (((roi.xywhROI.roiWidth)/ 2) * 2) - 2; + Rpp32s vectorIncrement = 2; + Rpp32s vectorIncrementPkd = 6; + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp32f* dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += 2) + { + __m256 p; + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + rpp_simd_load(rpp_glitch_load6_f32pkd3_to_f32pkd3_avx, srcPtrChannel, glitchSrcLocArray, p); + _mm256_storeu_si256((__m256i *)(dstPtrTemp), p); + dstPtrTemp += 6; + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + for (int c = 0; c < 3; c++) + *dstPtrTemp++ = *(srcPtrChannel + glitchSrcLocArray[c] + c); + } + dstPtrRow += dstDescPtr->strides.hStride; + } + + } + } + return RPP_SUCCESS; +} + +RppStatus glitch_f16_f16_host_tensor(Rpp16f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp16f *dstPtr, + RpptDescPtr dstDescPtr, + RpptChannelOffsets *rgbOffsets, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for (int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32s glitchSrcLocArray[3] = {0}; // Since 3 destination pixels, one for each channel, are processed per iteration. 
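+        // The half-precision variant has no vectorised path; every pixel goes through the scalar
+        // per-channel gather below for all four layout combinations.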
+ + Rpp16f *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + Rpp16f *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp16f *dstPtrRow; + dstPtrRow = dstPtrChannel; + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp16f* dstRowPtrTempR = dstPtrRow; + Rpp16f* dstRowPtrTempG = dstPtrRow + dstDescPtr->strides.cStride; + Rpp16f* dstRowPtrTempB = dstPtrRow + 2 * dstDescPtr->strides.cStride; + for (int vectorLoopCount = 0; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + *dstRowPtrTempR++ = *(srcPtrChannel + glitchSrcLocArray[0] + 0); + *dstRowPtrTempG++ = *(srcPtrChannel + glitchSrcLocArray[1] + 1); + *dstRowPtrTempB++ = *(srcPtrChannel + glitchSrcLocArray[2] + 2); + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp16f *dstPtrRow; + dstPtrRow = dstPtrChannel; + + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp16f* dstPtrTemp = dstPtrRow; + for (int vectorLoopCount = 0; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + for (int c = 0; c < 3; c++) + *(dstPtrTemp + c) = *(srcPtrChannel + glitchSrcLocArray[c] + c *srcDescPtr->strides.cStride); + dstPtrTemp += 3; + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW )) + { + Rpp16f *dstPtrRow; + dstPtrRow = dstPtrChannel; + + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp16f* dstPtrTemp = dstPtrRow; + for (int i = 0; i < roi.xywhROI.roiWidth; i++) + { + compute_src_loc(dstLocRow, i, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + for (int c = 0; c < 3; c++) + *(dstPtrTemp + c * dstDescPtr->strides.cStride) = *(srcPtrChannel + glitchSrcLocArray[c] + c *srcDescPtr->strides.cStride); + dstPtrTemp += 1; + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + + } + else if((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp16f *dstPtrRow; + dstPtrRow = dstPtrChannel; + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp16f* dstPtrTemp = dstPtrRow; + for (int i = 0; i < roi.xywhROI.roiWidth; i++) + { + compute_src_loc(dstLocRow, i, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + for (int c = 0; c < 3; c++) + *dstPtrTemp++ = *(srcPtrChannel + glitchSrcLocArray[c] + c); + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + + } + } + return RPP_SUCCESS; +} + +RppStatus glitch_i8_i8_host_tensor(Rpp8s *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8s *dstPtr, + RpptDescPtr dstDescPtr, + RpptChannelOffsets *rgbOffsets, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + 
RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for (int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32s glitchSrcLocArray[3] = {0}; // Since 3 destination pixels, one for each channel, are processed per iteration. + + Rpp8s *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier; + Rpp8s *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8s *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32u alignedLength = (((roi.xywhROI.roiWidth)/ 8) * 8) - 8; + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8s* dstRowPtrTempR = dstPtrRow; + Rpp8s* dstRowPtrTempG = dstPtrRow + dstDescPtr->strides.cStride; + Rpp8s* dstRowPtrTempB = dstPtrRow + 2 * dstDescPtr->strides.cStride; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += 8) + { + __m256 p[3]; + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + rpp_simd_load(rpp_glitch_load24_i8pkd3_to_f32pln3_avx, srcPtrChannel, p, glitchSrcLocArray); + rpp_simd_store(rpp_store24_f32pln3_to_i8pln3_avx, dstRowPtrTempR, dstRowPtrTempG, dstRowPtrTempB, p); // simd stores + + dstRowPtrTempR += 8; + dstRowPtrTempG += 8; + dstRowPtrTempB += 8; + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + *dstRowPtrTempR++ = *(srcPtrChannel + glitchSrcLocArray[0] + 0); + *dstRowPtrTempG++ = *(srcPtrChannel + glitchSrcLocArray[1] + 1); + *dstRowPtrTempB++ = *(srcPtrChannel + glitchSrcLocArray[2] + 2); + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8s *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32u vectorIncrement = 16; + Rpp32u alignedLength = (((roi.xywhROI.roiWidth)/ 16) * 16) - 16; + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8s* dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += 16) + { + __m256 p[6]; + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + Rpp32u rLoc = glitchSrcLocArray[0]; + Rpp32u gLoc = srcDescPtr->strides.cStride + glitchSrcLocArray[1]; + Rpp32u bLoc = 2 * srcDescPtr->strides.cStride + glitchSrcLocArray[2]; + rpp_simd_load(rpp_load48_i8pln3_to_f32pln3_avx, srcPtrChannel + rLoc, srcPtrChannel + gLoc, srcPtrChannel + bLoc, p); + rpp_simd_store(rpp_store48_f32pln3_to_i8pkd3_avx, dstPtrTemp, p); // simd stores + dstPtrTemp += 48; + } +#endif + for (; 
vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + for (int c = 0; c < 3; c++) + *(dstPtrTemp + c) = *(srcPtrChannel + glitchSrcLocArray[c] + c *srcDescPtr->strides.cStride); + dstPtrTemp += 3; + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + } + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW )) + { + Rpp8s *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32u vectorIncrement = 32; + Rpp32u alignedLength = (((roi.xywhROI.roiWidth)/ 32) * 32) - 32; + + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8s* dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + for (int c = 0; c < 3; c++) + { + __m256i p; + p = _mm256_loadu_si256((__m256i *)(srcPtrChannel + (glitchSrcLocArray[c] + (c * srcDescPtr->strides.cStride)))); + _mm256_storeu_si256((__m256i *)(dstPtrTemp + (c * srcDescPtr->strides.cStride)), p); + } + dstPtrTemp += 32; + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 1); + for (int c = 0; c < 3; c++) + *(dstPtrTemp + c * dstDescPtr->strides.cStride) = *(srcPtrChannel + glitchSrcLocArray[c] + c *srcDescPtr->strides.cStride); + dstPtrTemp += 1; + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + + } + else if((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8s *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32u alignedLength = (((roi.xywhROI.roiWidth)/ 10) * 10) - 10; + Rpp32s vectorIncrement = 10; + Rpp32s vectorIncrementPkd = 30; + for (int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8s* dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += 10) + { + __m256i p; + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + rpp_simd_load(rpp_glitch_load30_i8pkd3_to_i8pkd3_avx, srcPtrChannel, glitchSrcLocArray, p); + _mm256_storeu_si256((__m256i *)(dstPtrTemp), p); + dstPtrTemp += 30; + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + compute_src_loc(dstLocRow, vectorLoopCount, glitchSrcLocArray, srcDescPtr, rgbOffsets, roi, batchCount, 3); + for (int c = 0; c < 3; c++) + *dstPtrTemp++ = *(srcPtrChannel + glitchSrcLocArray[c] + c); + } + + dstPtrRow += dstDescPtr->strides.hStride; + } + + } + } + return RPP_SUCCESS; +} diff --git a/src/modules/cpu/kernel/jitter.hpp b/src/modules/cpu/kernel/jitter.hpp new file mode 100644 index 000000000..ec717150a --- /dev/null +++ b/src/modules/cpu/kernel/jitter.hpp @@ -0,0 +1,929 @@ +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +RppStatus jitter_u8_u8_host_tensor(Rpp8u *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8u *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32u *kernelSizeTensor, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, 
(Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32u kernelSize = kernelSizeTensor[batchCount]; + Rpp32u bound = (kernelSize - 1) / 2; + Rpp32u heightLimit = roi.xywhROI.roiHeight - bound; + Rpp32u offset = batchCount * srcDescPtr->strides.nStride; + + Rpp8u *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp8u *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + Rpp32u alignedLength = roi.xywhROI.roiWidth & ~7; // Align dst width to process 4 dst pixels per iteration + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + RpptXorwowStateBoxMuller xorwowState; + Rpp32s srcLocArray[8] = {0}; + + __m256i pxXorwowStateX[5], pxXorwowStateCounter; + rpp_host_rng_xorwow_state_offsetted_avx(xorwowInitialStatePtr, xorwowState, offset, pxXorwowStateX, &pxXorwowStateCounter); + __m256 pKernelSize = _mm256_set1_ps(kernelSize); + __m256 pChannel = _mm256_set1_ps(layoutParams.bufferMultiplier); + __m256 pHStride = _mm256_set1_ps(srcDescPtr->strides.hStride); + __m256 pHeightLimit = _mm256_set1_ps(heightLimit); + __m256 pWidthLimit = _mm256_set1_ps(roi.xywhROI.roiWidth - 1); + __m256 pBound = _mm256_set1_ps(bound); + + // Jitter with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8u *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8u *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i pxRow; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_resize_nn_extract_pkd3_avx(srcPtrChannel, srcLocArray, pxRow); + rpp_simd_store(rpp_store24_u8pkd3_to_u8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, pxRow); + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, srcDescPtr->c, loc); + *dstPtrTempR++ = *(srcPtrChannel + loc); + *dstPtrTempG++ = *(srcPtrChannel + 1 + loc); + *dstPtrTempB++ = *(srcPtrChannel + 2 + loc); + } + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += 
dstDescPtr->strides.hStride; + } + } + + // Jitter with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8u *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8u *dstPtrTemp; + dstPtrTemp = dstPtrRow; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i pxRow[3]; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_resize_nn_extract_pln1_avx(srcPtrRowR, srcLocArray, pxRow[0]); + rpp_resize_nn_extract_pln1_avx(srcPtrRowG, srcLocArray, pxRow[1]); + rpp_resize_nn_extract_pln1_avx(srcPtrRowB, srcLocArray, pxRow[2]); + rpp_simd_store(rpp_store24_u8pln3_to_u8pkd3_avx, dstPtrTemp, pxRow); + dstPtrTemp += vectorIncrement; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + *dstPtrTemp++ = *(srcPtrRowR + loc); + *dstPtrTemp++ = *(srcPtrRowG + loc); + *dstPtrTemp++ = *(srcPtrRowB + loc); + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Jitter without fused output-layout toggle (NHWC -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8u *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrChannel; + dstPtrRow = dstPtrChannel; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8u *dstPtrTemp; + dstPtrTemp = dstPtrRow; + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i pxRow; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_resize_nn_extract_pkd3_avx(srcPtrRow, srcLocArray, pxRow); + rpp_simd_store(rpp_store24_u8_to_u8_avx, dstPtrTemp, pxRow); + dstPtrTemp += vectorIncrement; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + *dstPtrTemp++ = *(srcPtrRow + loc); + *dstPtrTemp++ = *(srcPtrRow + 1 + loc); + *dstPtrTemp++ = *(srcPtrRow + 2 + loc); + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Jitter with fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8u *dstPtrRow; + dstPtrRow = dstPtrChannel; + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8u *dstPtrTemp; 
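+                // The eight random source offsets in srcLocArray are computed once per group and reused
+                // for every channel below, so the jitter displacement is identical across channels.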
+ dstPtrTemp = dstPtrRow; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp8u *dstPtrTempChn, *srcPtrTempChn; + srcPtrTempChn = srcPtrChannel; + dstPtrTempChn = dstPtrTemp; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + for(int c = 0; c < srcDescPtr->c; c++) + { + __m256i pxRow; + rpp_resize_nn_extract_pln1_avx(srcPtrTempChn, srcLocArray, pxRow); + rpp_storeu_si64((__m128i *)(dstPtrTempChn), _mm256_castsi256_si128(pxRow)); + srcPtrTempChn += srcDescPtr->strides.cStride; + dstPtrTempChn += dstDescPtr->strides.cStride; + } + dstPtrTemp += vectorIncrementPerChannel; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (;vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp8u *dstPtrTempChn = dstPtrTemp; + Rpp8u *srcPtrTempChn = srcPtrChannel; + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + for(int c = 0; c < srcDescPtr->c; c++) + { + *dstPtrTempChn = *(srcPtrTempChn + loc); + srcPtrTempChn += srcDescPtr->strides.cStride; + dstPtrTempChn += dstDescPtr->strides.cStride; + } + dstPtrTemp++; + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + } + + return RPP_SUCCESS; +} + +RppStatus jitter_f32_f32_host_tensor(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32u *kernelSizeTensor, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32u kernelSize = kernelSizeTensor[batchCount]; + Rpp32u bound = (kernelSize - 1) / 2; + Rpp32u heightLimit = roi.xywhROI.roiHeight - bound; + Rpp32u offset = batchCount * srcDescPtr->strides.nStride; + + Rpp32f *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32f *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + Rpp32u alignedLength = roi.xywhROI.roiWidth & ~7; // Align dst width to process 4 dst pixels per iteration + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + RpptXorwowStateBoxMuller xorwowState; + Rpp32s srcLocArray[8] = {0}; + + __m256i pxXorwowStateX[5], pxXorwowStateCounter; + rpp_host_rng_xorwow_state_offsetted_avx(xorwowInitialStatePtr, xorwowState, offset, pxXorwowStateX, &pxXorwowStateCounter); + __m256 pKernelSize = _mm256_set1_ps(kernelSize); + __m256 pChannel = _mm256_set1_ps(layoutParams.bufferMultiplier); + __m256 pHStride = _mm256_set1_ps(srcDescPtr->strides.hStride); + __m256 pHeightLimit = _mm256_set1_ps(heightLimit); + __m256 pWidthLimit = 
_mm256_set1_ps(roi.xywhROI.roiWidth-1); + __m256 pBound = _mm256_set1_ps(bound); + + + // Jitter with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32f *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp32f *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 pxRow[3]; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_simd_load(rpp_resize_nn_load_f32pkd3_to_f32pln3_avx, srcPtrChannel, srcLocArray, pxRow); + rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, pxRow); + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + *dstPtrTempR++ = *(srcPtrChannel + loc); + *dstPtrTempG++ = *(srcPtrChannel + 1 + loc); + *dstPtrTempB++ = *(srcPtrChannel + 2 + loc); + } + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Jitter with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32f *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp32f *dstPtrTemp; + dstPtrTemp = dstPtrRow; + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 pxRow[4]; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_simd_load(rpp_resize_nn_load_f32pln1_avx, srcPtrRowR, srcLocArray, pxRow[0]); + rpp_simd_load(rpp_resize_nn_load_f32pln1_avx, srcPtrRowG, srcLocArray, pxRow[1]); + rpp_simd_load(rpp_resize_nn_load_f32pln1_avx, srcPtrRowB, srcLocArray, pxRow[2]); + rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, pxRow); + dstPtrTemp += vectorIncrement; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, 
srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + *dstPtrTemp++ = *(srcPtrRowR + loc); + *dstPtrTemp++ = *(srcPtrRowG + loc); + *dstPtrTemp++ = *(srcPtrRowB + loc); + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Jitter without fused output-layout toggle (NHWC -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrChannel; + dstPtrRow = dstPtrChannel; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp32f *dstPtrTemp; + dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + __m256 pRow; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + rpp_simd_load(rpp_load8_f32_to_f32_avx, (srcPtrChannel + loc), &pRow); + rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp, &pRow); + dstPtrTemp += 3; + } +#endif + dstPtrRow += dstDescPtr->strides.hStride; + } + } + // Jitter with fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32f *dstPtrRow; + dstPtrRow = dstPtrChannel; + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp32f *dstPtrTemp; + dstPtrTemp = dstPtrRow; + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f *srcPtrTempChn, *dstPtrTempChn; + srcPtrTempChn = srcPtrChannel; + dstPtrTempChn = dstPtrTemp; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + + for (int c = 0; c < dstDescPtr->c; c++) + { + __m256 pxRow; + rpp_simd_load(rpp_resize_nn_load_f32pln1_avx, srcPtrTempChn, srcLocArray, pxRow); + rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTempChn, &pxRow); + srcPtrTempChn += srcDescPtr->strides.cStride; + dstPtrTempChn += dstDescPtr->strides.cStride; + } + dstPtrTemp += vectorIncrementPerChannel; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (;vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32f *dstPtrTempChn = dstPtrTemp; + Rpp32f *srcPtrTempChn = srcPtrChannel; + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + for(int c = 0; c < srcDescPtr->c; c++) + { + *dstPtrTempChn = (Rpp32f)*(srcPtrTempChn + loc); + srcPtrTempChn += srcDescPtr->strides.cStride; + dstPtrTempChn += dstDescPtr->strides.cStride; + } + dstPtrTemp++; + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + } + + return RPP_SUCCESS; +} + +RppStatus jitter_f16_f16_host_tensor(Rpp16f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp16f *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32u *kernelSizeTensor, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp 
parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32u kernelSize = kernelSizeTensor[batchCount]; + Rpp32u bound = (kernelSize - 1) / 2; + Rpp32u heightLimit = roi.xywhROI.roiHeight - bound; + Rpp32u offset = batchCount * srcDescPtr->strides.nStride; + + Rpp16f *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp16f *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + Rpp32u alignedLength = roi.xywhROI.roiWidth & ~7; // Align dst width to process 4 dst pixels per iteration + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + RpptXorwowStateBoxMuller xorwowState; + Rpp32s srcLocArray[8] = {0}; + + __m256i pxXorwowStateX[5], pxXorwowStateCounter; + rpp_host_rng_xorwow_state_offsetted_avx(xorwowInitialStatePtr, xorwowState, offset, pxXorwowStateX, &pxXorwowStateCounter); + __m256 pKernelSize = _mm256_set1_ps(kernelSize); + __m256 pChannel = _mm256_set1_ps(layoutParams.bufferMultiplier); + __m256 pHStride = _mm256_set1_ps(srcDescPtr->strides.hStride); + __m256 pHeightLimit = _mm256_set1_ps(heightLimit); + __m256 pWidthLimit = _mm256_set1_ps(roi.xywhROI.roiWidth-1); + __m256 pBound = _mm256_set1_ps(bound); + + + // Jitter with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp16f *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp16f *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f dstPtrTempR_ps[8], dstPtrTempG_ps[8], dstPtrTempB_ps[8]; + __m256 pxRow[3]; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_simd_load(rpp_resize_nn_load_f16pkd3_to_f32pln3_avx, srcPtrChannel, srcLocArray, pxRow); + rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR_ps, dstPtrTempG_ps, dstPtrTempB_ps, pxRow); + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + dstPtrTempR[cnt] = (Rpp16f) dstPtrTempR_ps[cnt]; + dstPtrTempG[cnt] = (Rpp16f) dstPtrTempG_ps[cnt]; + dstPtrTempB[cnt] = (Rpp16f) dstPtrTempB_ps[cnt]; + } + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + 
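// loc indexes the packed NHWC source row, so loc, loc + 1 and loc + 2 are the R, G and B values of the jittered source pixel
+                    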
*dstPtrTempR++ = *(srcPtrChannel + loc);
+                    *dstPtrTempG++ = *(srcPtrChannel + 1 + loc);
+                    *dstPtrTempB++ = *(srcPtrChannel + 2 + loc);
+                }
+                dstPtrRowR += dstDescPtr->strides.hStride;
+                dstPtrRowG += dstDescPtr->strides.hStride;
+                dstPtrRowB += dstDescPtr->strides.hStride;
+            }
+        }
+
+        // Jitter with fused output-layout toggle (NCHW -> NHWC)
+        else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+        {
+            Rpp16f *dstPtrRow;
+            dstPtrRow = dstPtrChannel;
+            Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+            srcPtrRowR = srcPtrChannel;
+            srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+            srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+
+            for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+            {
+                Rpp16f *dstPtrTemp;
+                dstPtrTemp = dstPtrRow;
+                __m256 pRow = _mm256_set1_ps(dstLocRow);
+                __m256 pCol = avx_pDstLocInit;
+                int vectorLoopCount = 0;
+#if __AVX2__
+                for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+                {
+                    Rpp32f dstPtrTemp_ps[25];
+                    __m256 pxRow[4];
+                    compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+                    rpp_simd_load(rpp_resize_nn_load_f16pln1_avx, srcPtrRowR, srcLocArray, pxRow[0]);
+                    rpp_simd_load(rpp_resize_nn_load_f16pln1_avx, srcPtrRowG, srcLocArray, pxRow[1]);
+                    rpp_simd_load(rpp_resize_nn_load_f16pln1_avx, srcPtrRowB, srcLocArray, pxRow[2]);
+                    rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp_ps, pxRow);
+                    for(int cnt = 0; cnt < vectorIncrement; cnt++)
+                        dstPtrTemp[cnt] = (Rpp16f) dstPtrTemp_ps[cnt];
+                    dstPtrTemp += vectorIncrement;
+                    pCol = _mm256_add_ps(avx_p8, pCol);
+                }
+#endif
+                for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+                {
+                    Rpp32s loc;
+                    compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+                    *dstPtrTemp++ = *(srcPtrRowR + loc);
+                    *dstPtrTemp++ = *(srcPtrRowG + loc);
+                    *dstPtrTemp++ = *(srcPtrRowB + loc);
+                }
+                dstPtrRow += dstDescPtr->strides.hStride;
+            }
+        }
+
+        // Jitter without fused output-layout toggle (NHWC -> NHWC)
+        else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+        {
+            Rpp16f *srcPtrRow, *dstPtrRow;
+            srcPtrRow = srcPtrChannel;
+            dstPtrRow = dstPtrChannel;
+
+            for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+            {
+                Rpp16f *dstPtrTemp;
+                dstPtrTemp = dstPtrRow;
+                int vectorLoopCount = 0;
+#if __AVX2__
+                for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+                {
+                    Rpp32f srcPtrTemp_ps[8], dstPtrTemp_ps[8];
+                    Rpp32s loc;
+                    __m256 pRow;
+
+                    // Compute the jittered source location before the gather below, so loc is initialized when it is read
+                    compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+
+                    for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++)
+                    {
+                        srcPtrTemp_ps[cnt] = (Rpp32f)srcPtrChannel[loc + cnt];
+                    }
+
+                    rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp_ps, &pRow);
+                    rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp_ps, &pRow);
+
+                    for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++)
+                    {
+                        dstPtrTemp[cnt] = (Rpp16f) dstPtrTemp_ps[cnt];
+                    }
+                    dstPtrTemp += 3;
+                }
+#endif
+                dstPtrRow += dstDescPtr->strides.hStride;
+            }
+        }
+        // Jitter without fused output-layout toggle (NCHW -> NCHW)
+        else if ((srcDescPtr->layout == 
RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp16f *dstPtrRow; + dstPtrRow = dstPtrChannel; + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp16f *dstPtrTemp; + dstPtrTemp = dstPtrRow; + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp16f *srcPtrTempChn, *dstPtrTempChn; + srcPtrTempChn = srcPtrChannel; + dstPtrTempChn = dstPtrTemp; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + + for (int c = 0; c < dstDescPtr->c; c++) + { + Rpp32f dstPtrTemp_ps[8]; + __m256 pxRow; + rpp_simd_load(rpp_resize_nn_load_f16pln1_avx, srcPtrTempChn, srcLocArray, pxRow); + rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp_ps, &pxRow); + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + dstPtrTempChn[cnt] = (Rpp16f) dstPtrTemp_ps[cnt]; + } + srcPtrTempChn += srcDescPtr->strides.cStride; + dstPtrTempChn += dstDescPtr->strides.cStride; + } + dstPtrTemp += vectorIncrementPerChannel; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (;vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp16f *dstPtrTempChn = dstPtrTemp; + Rpp16f *srcPtrTempChn = srcPtrChannel; + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + for(int c = 0; c < srcDescPtr->c; c++) + { + *dstPtrTempChn = (Rpp16f)*(srcPtrTempChn + loc); + srcPtrTempChn += srcDescPtr->strides.cStride; + dstPtrTempChn += dstDescPtr->strides.cStride; + } + dstPtrTemp++; + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + } + + return RPP_SUCCESS; +} + +RppStatus jitter_i8_i8_host_tensor(Rpp8s *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8s *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32u *kernelSizeTensor, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32u kernelSize = kernelSizeTensor[batchCount]; + Rpp32u bound = (kernelSize - 1) / 2; + Rpp32u heightLimit = roi.xywhROI.roiHeight - bound; + Rpp32u offset = batchCount * srcDescPtr->strides.nStride; + + Rpp8s *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp8s *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + Rpp32u alignedLength = roi.xywhROI.roiWidth & ~7; // Align dst width to process 4 dst pixels per iteration + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + RpptXorwowStateBoxMuller xorwowState; + Rpp32s srcLocArray[8] = {0}; + + __m256i pxXorwowStateX[5], pxXorwowStateCounter; + 
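+        // Seed the xorwow RNG state for this image with a per-image offset (batchCount * nStride),
+        // presumably so each image in the batch draws an independent jitter sequence; the kernel size,
+        // channel multiplier, row stride and ROI limits are then broadcast into AVX registers for the
+        // vectorized paths below.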
rpp_host_rng_xorwow_state_offsetted_avx(xorwowInitialStatePtr, xorwowState, offset, pxXorwowStateX, &pxXorwowStateCounter); + __m256 pKernelSize = _mm256_set1_ps(kernelSize); + __m256 pChannel = _mm256_set1_ps(layoutParams.bufferMultiplier); + __m256 pHStride = _mm256_set1_ps(srcDescPtr->strides.hStride); + __m256 pHeightLimit = _mm256_set1_ps(heightLimit); + __m256 pWidthLimit = _mm256_set1_ps(roi.xywhROI.roiWidth-1); + __m256 pBound = _mm256_set1_ps(bound); + + // Jitter with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8s *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8s *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i pxRow; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_resize_nn_extract_pkd3_avx(srcPtrChannel, srcLocArray, pxRow); + rpp_simd_store(rpp_store24_i8pkd3_to_i8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, pxRow); + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + *dstPtrTempR++ = *(srcPtrChannel + loc); + *dstPtrTempG++ = *(srcPtrChannel + 1 + loc); + *dstPtrTempB++ = *(srcPtrChannel + 2 + loc); + } + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Jitter with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8s *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8s *dstPtrTemp; + dstPtrTemp = dstPtrRow; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i pxRow[3]; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_resize_nn_extract_pln1_avx(srcPtrRowR, srcLocArray, pxRow[0]); + rpp_resize_nn_extract_pln1_avx(srcPtrRowG, srcLocArray, pxRow[1]); + rpp_resize_nn_extract_pln1_avx(srcPtrRowB, srcLocArray, pxRow[2]); + rpp_simd_store(rpp_store24_i8pln3_to_i8pkd3_avx, 
dstPtrTemp, pxRow); + dstPtrTemp += vectorIncrement; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + *dstPtrTemp++ = *(srcPtrRowR + loc); + *dstPtrTemp++ = *(srcPtrRowG + loc); + *dstPtrTemp++ = *(srcPtrRowB + loc); + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Jitter without fused output-layout toggle (NHWC -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8s *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrChannel; + dstPtrRow = dstPtrChannel; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8s *dstPtrTemp; + dstPtrTemp = dstPtrRow; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i pxRow; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_resize_nn_extract_pkd3_avx(srcPtrRow, srcLocArray, pxRow); + rpp_simd_store(rpp_store24_i8_to_i8_avx, dstPtrTemp, pxRow); + dstPtrTemp += vectorIncrement; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + *dstPtrTemp++ = (Rpp8s)*(srcPtrRow + loc); + *dstPtrTemp++ = (Rpp8s)*(srcPtrRow + 1 + loc); + *dstPtrTemp++ = (Rpp8s)*(srcPtrRow + 2 + loc); + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + // Jitter with fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8s *dstPtrRow; + dstPtrRow = dstPtrChannel; + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8s *dstPtrTemp; + dstPtrTemp = dstPtrRow; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp8s *dstPtrTempChn, *srcPtrTempChn; + srcPtrTempChn = srcPtrChannel; + dstPtrTempChn = dstPtrTemp; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + for(int c = 0; c < srcDescPtr->c; c++) + { + __m256i pxRow; + rpp_resize_nn_extract_pln1_avx(srcPtrTempChn, srcLocArray, pxRow); + rpp_storeu_si64((__m128i *)(dstPtrTempChn), _mm256_castsi256_si128(pxRow)); + srcPtrTempChn += srcDescPtr->strides.cStride; + dstPtrTempChn += dstDescPtr->strides.cStride; + } + dstPtrTemp += vectorIncrementPerChannel; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (;vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp8s *dstPtrTempChn = dstPtrTemp; + Rpp8s *srcPtrTempChn = srcPtrChannel; + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, 
layoutParams.bufferMultiplier, loc); + for(int c = 0; c < srcDescPtr->c; c++) + { + *dstPtrTempChn = (Rpp8s)*(srcPtrTempChn + loc); + srcPtrTempChn += srcDescPtr->strides.cStride; + dstPtrTempChn += dstDescPtr->strides.cStride; + } + dstPtrTemp++; + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/cpu/kernel/lens_correction.hpp b/src/modules/cpu/kernel/lens_correction.hpp new file mode 100644 index 000000000..1632568a5 --- /dev/null +++ b/src/modules/cpu/kernel/lens_correction.hpp @@ -0,0 +1,178 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" +#include + +// Compute Inverse matrix (3x3) +inline void get_inverse(float *mat, float *invMat) +{ + float det = mat[0] * (mat[4] * mat[8] - mat[7] * mat[5]) - mat[1] * (mat[3] * mat[8] - mat[5] * mat[6]) + mat[2] * (mat[3] * mat[7] - mat[4] * mat[6]); + if(det != 0) + { + float invDet = 1 / det; + invMat[0] = (mat[4] * mat[8] - mat[7] * mat[5]) * invDet; + invMat[1] = (mat[2] * mat[7] - mat[1] * mat[8]) * invDet; + invMat[2] = (mat[1] * mat[5] - mat[2] * mat[4]) * invDet; + invMat[3] = (mat[5] * mat[6] - mat[3] * mat[8]) * invDet; + invMat[4] = (mat[0] * mat[8] - mat[2] * mat[6]) * invDet; + invMat[5] = (mat[3] * mat[2] - mat[0] * mat[5]) * invDet; + invMat[6] = (mat[3] * mat[7] - mat[6] * mat[4]) * invDet; + invMat[7] = (mat[6] * mat[1] - mat[0] * mat[7]) * invDet; + invMat[8] = (mat[0] * mat[4] - mat[3] * mat[1]) * invDet; + } +} + +inline void compute_lens_correction_remap_tables_host_tensor(RpptDescPtr srcDescPtr, + Rpp32f *rowRemapTable, + Rpp32f *colRemapTable, + RpptDescPtr tableDescPtr, + Rpp32f *cameraMatrixTensor, + Rpp32f *distortionCoeffsTensor, + RpptROIPtr roiTensorPtrSrc, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + Rpp32f *rowRemapTableTemp, *colRemapTableTemp; + rowRemapTableTemp = rowRemapTable + batchCount * tableDescPtr->strides.nStride; + colRemapTableTemp = colRemapTable + batchCount * tableDescPtr->strides.nStride; + + // cameraMatrix is a 3x3 matrix thus increment by 9 to iterate from one tensor in a batch to another + Rpp32f *cameraMatrix = cameraMatrixTensor + batchCount * 9; + Rpp32f *distortionCoeffs = distortionCoeffsTensor + batchCount * 8; + Rpp32s 
height = roiTensorPtrSrc[batchCount].xywhROI.roiHeight; + Rpp32s width = roiTensorPtrSrc[batchCount].xywhROI.roiWidth; + Rpp32u alignedLength = width & ~7; + Rpp32s vectorIncrement = 8; + + Rpp32f invCameraMatrix[9]; + std::fill(invCameraMatrix, invCameraMatrix + 9, 0.0f); // initialize all values in invCameraMatrix to zero + get_inverse(cameraMatrix, invCameraMatrix); + Rpp32f *invMat = &invCameraMatrix[0]; + + // Get radial and tangential distortion coefficients + Rpp32f rCoeff[6] = { distortionCoeffs[0], distortionCoeffs[1], distortionCoeffs[4], distortionCoeffs[5], distortionCoeffs[6], distortionCoeffs[7] }; + Rpp32f tCoeff[2] = { distortionCoeffs[2], distortionCoeffs[3] }; + + __m256 pRCoeff[6], pTCoeff[2]; + pRCoeff[0] = _mm256_set1_ps(rCoeff[0]); + pRCoeff[1] = _mm256_set1_ps(rCoeff[1]); + pRCoeff[2] = _mm256_set1_ps(rCoeff[2]); + pRCoeff[3] = _mm256_set1_ps(rCoeff[3]); + pRCoeff[4] = _mm256_set1_ps(rCoeff[4]); + pRCoeff[5] = _mm256_set1_ps(rCoeff[5]); + pTCoeff[0] = _mm256_set1_ps(tCoeff[0]); + pTCoeff[1] = _mm256_set1_ps(tCoeff[1]); + + Rpp32f u0 = cameraMatrix[2], v0 = cameraMatrix[5]; + Rpp32f fx = cameraMatrix[0], fy = cameraMatrix[4]; + __m256 pFx, pFy, pU0, pV0; + pFx = _mm256_set1_ps(fx); + pFy = _mm256_set1_ps(fy); + pU0 = _mm256_set1_ps(u0); + pV0 = _mm256_set1_ps(v0); + + __m256 pInvMat0, pInvMat3, pInvMat6; + pInvMat0 = _mm256_set1_ps(invMat[0]); + pInvMat3 = _mm256_set1_ps(invMat[3]); + pInvMat6 = _mm256_set1_ps(invMat[6]); + + __m256 pXCameraInit, pYCameraInit, pZCameraInit; + __m256 pXCameraIncrement, pYCameraIncrement, pZCameraIncrement; + pXCameraInit = _mm256_mul_ps(avx_pDstLocInit, pInvMat0); + pYCameraInit = _mm256_mul_ps(avx_pDstLocInit, pInvMat3); + pZCameraInit = _mm256_mul_ps(avx_pDstLocInit, pInvMat6); + pXCameraIncrement = _mm256_mul_ps(pInvMat0, avx_p8); + pYCameraIncrement = _mm256_mul_ps(pInvMat3, avx_p8); + pZCameraIncrement = _mm256_mul_ps(pInvMat6, avx_p8); + for(int i = 0; i < height; i++) + { + Rpp32f *rowRemapTableRow = rowRemapTableTemp + i * tableDescPtr->strides.hStride; + Rpp32f *colRemapTableRow = colRemapTableTemp + i * tableDescPtr->strides.hStride; + Rpp32f xCamera = i * invMat[1] + invMat[2]; + Rpp32f yCamera = i * invMat[4] + invMat[5]; + Rpp32f zCamera = i * invMat[7] + invMat[8]; + __m256 pXCamera = _mm256_add_ps(_mm256_set1_ps(xCamera), pXCameraInit); + __m256 pYCamera = _mm256_add_ps(_mm256_set1_ps(yCamera), pYCameraInit); + __m256 pZCamera = _mm256_add_ps(_mm256_set1_ps(zCamera), pZCameraInit); + int vectorLoopCount = 0; + for(; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + // float z = 1./zCamera, x = xCamera*z, y = yCamera*z; + __m256 pZ = _mm256_div_ps(avx_p1, pZCamera); + __m256 pX = _mm256_mul_ps(pXCamera, pZ); + __m256 pY = _mm256_mul_ps(pYCamera, pZ); + + // float xSquare = x*x, ySquare = y*y, r2 = xSquare + ySquare; + __m256 pXSquare = _mm256_mul_ps(pX, pX); + __m256 pYSquare = _mm256_mul_ps(pY, pY); + __m256 pR2 = _mm256_add_ps(pXSquare, pYSquare); + + // float xyMul2 = 2*x*y; + __m256 p2xy = _mm256_mul_ps(avx_p2, _mm256_mul_ps(pX, pY)); + + // float kr = std::fmaf(std::fmaf(std::fmaf(rCoeff[2], r2, rCoeff[1]), r2, rCoeff[0]), r2, 1) / std::fmaf(std::fmaf(std::fmaf(rCoeff[5], r2, rCoeff[4]), r2, rCoeff[3]), r2, 1); + __m256 pNum = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(pRCoeff[2], pR2, pRCoeff[1]), pR2, pRCoeff[0]), pR2, avx_p1); + __m256 pDen = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(pRCoeff[5], pR2, pRCoeff[4]), pR2, pRCoeff[3]), pR2, avx_p1); + __m256 pKR = _mm256_div_ps(pNum, 
pDen); + + // float colLoc = std::fmaf(fx, (std::fmaf(tCoeff[1], (std::fmaf(2, xSquare, r2)), std::fmaf(x, kr, (tCoeff[0] * xyMul2)))), u0); + __m256 pColLoc = _mm256_fmadd_ps(pFx, _mm256_fmadd_ps(pTCoeff[1], _mm256_fmadd_ps(avx_p2, pXSquare, pR2), _mm256_fmadd_ps(pX, pKR, _mm256_mul_ps(pTCoeff[0], p2xy))), pU0); + + // float rowLoc = std::fmaf(fy, (std::fmaf(tCoeff[0], (std::fmaf(2, ySquare, r2)), std::fmaf(y, kr, (tCoeff[1] * xyMul2)))), v0); + __m256 pRowLoc = _mm256_fmadd_ps(pFy, _mm256_fmadd_ps(pTCoeff[0], _mm256_fmadd_ps(avx_p2, pYSquare, pR2), _mm256_fmadd_ps(pY, pKR, _mm256_mul_ps(pTCoeff[1], p2xy))), pV0); + + _mm256_storeu_ps(rowRemapTableRow, pRowLoc); + _mm256_storeu_ps(colRemapTableRow, pColLoc); + rowRemapTableRow += vectorIncrement; + colRemapTableRow += vectorIncrement; + + // xCamera += invMat[0], yCamera += invMat[3], zCamera += invMat[6] + pXCamera = _mm256_add_ps(pXCamera, pXCameraIncrement); + pYCamera = _mm256_add_ps(pYCamera, pYCameraIncrement); + pZCamera = _mm256_add_ps(pZCamera, pZCameraIncrement); + } + for(; vectorLoopCount < width; vectorLoopCount++) + { + Rpp32f z = 1./zCamera, x = xCamera * z, y = yCamera * z; + Rpp32f xSquare = x * x, ySquare = y * y, r2 = xSquare + ySquare; + Rpp32f xyMul2 = 2 * x * y; + Rpp32f kr = std::fmaf(std::fmaf(std::fmaf(rCoeff[2], r2, rCoeff[1]), r2, rCoeff[0]), r2, 1) / std::fmaf(std::fmaf(std::fmaf(rCoeff[5], r2, rCoeff[4]), r2, rCoeff[3]), r2, 1); + Rpp32f colLoc = std::fmaf(fx, (std::fmaf(tCoeff[1], (std::fmaf(2, xSquare, r2)), std::fmaf(x, kr, (tCoeff[0] * xyMul2)))), u0); + Rpp32f rowLoc = std::fmaf(fy, (std::fmaf(tCoeff[0], (std::fmaf(2, ySquare, r2)), std::fmaf(y, kr, (tCoeff[1] * xyMul2)))), v0); + *rowRemapTableRow++ = rowLoc; + *colRemapTableRow++ = colLoc; + xCamera += invMat[0]; + yCamera += invMat[3]; + zCamera += invMat[6]; + } + } + } +} \ No newline at end of file diff --git a/src/modules/cpu/kernel/log.hpp b/src/modules/cpu/kernel/log.hpp new file mode 100644 index 000000000..5ec79b21c --- /dev/null +++ b/src/modules/cpu/kernel/log.hpp @@ -0,0 +1,563 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "rppdefs.h" +#include "rpp_cpu_common.hpp" + +// 1 pixel log helper functions +// NOTE: log(0) leads to undefined thus using nextafter() to avoid this result +// Also negative values are converted to positive by taking absolute of inputs +inline void compute_log(Rpp8u *src, Rpp32f *dst) { *dst = (!*src) ? 
std::log(std::nextafter(0.0f, 1.0f)) : std::log(*src); } +inline void compute_log(Rpp8s *src, Rpp32f *dst) { *dst = (!*src) ? std::log(std::nextafter(0.0f, 1.0f)) : std::log(*src + 128); } +inline void compute_log(Rpp16f *src, Rpp16f *dst) { *dst = (!*src) ? log(std::nextafter(0.0f, 1.0f)) : log(abs(*src)); } +inline void compute_log(Rpp32f *src, Rpp32f *dst) { *dst = (!*src) ? std::log(std::nextafter(0.0f, 1.0f)) : std::log(abs(*src)); } + +// Computes ND log recursively +template +void log_recursive(T1 *src, Rpp32u *srcStrides, T2 *dst, Rpp32u *dstStrides, Rpp32u *dstShape, Rpp32u nDim) +{ + if (!nDim) + compute_log(src, dst); + else + { + for (int i = 0; i < *dstShape; i++) + { + log_recursive(src, srcStrides + 1, dst, dstStrides + 1, dstShape + 1, nDim - 1); + dst += *dstStrides; + src += *srcStrides; + } + } +} + +RppStatus log_generic_host_tensor(Rpp8u *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *roiTensor, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u nDim = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension. + Rpp32u batchSize = dstGenericDescPtr->dims[0]; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + Rpp32u *roi = roiTensor + batchCount * nDim * 2; + Rpp32u *begin = roi; + Rpp32u *length = &roi[nDim]; + + Rpp8u *srcPtr1 = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + Rpp32f *dstPtr1 = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + + for(int i = 0; i < nDim; i++) + srcPtr1 += begin[i] * srcGenericDescPtr->strides[i + 1]; + Rpp32u alignedLength; + Rpp32u vectorIncrement = 16; + if (nDim == 1) + { + alignedLength = length[0] & ~15; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + + rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtr1, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtr1, p); // simd stores + srcPtr1 += vectorIncrement; + dstPtr1 += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[0]; vectorLoopCount++) + { + compute_log(srcPtr1, dstPtr1); + srcPtr1++; + dstPtr1++; + } + } + else if(nDim == 2) + { + alignedLength = length[1] & ~15; + for(int i = 0; i < length[0]; i++) + { + Rpp8u *srcPtrTemp = srcPtr1; + Rpp32f *dstPtrTemp = dstPtr1; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + + rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtrTemp, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[1]; vectorLoopCount++) + { + compute_log(srcPtrTemp, dstPtrTemp); + srcPtrTemp++; + dstPtrTemp++; + } + srcPtr1 += srcGenericDescPtr->strides[1]; + dstPtr1 += dstGenericDescPtr->strides[1]; + } + } + else if(nDim == 3) + { + alignedLength = length[2] & ~15; + for(int i = 0; i < length[0]; i++) + { + Rpp8u *srcPtrRow = srcPtr1; + Rpp32f *dstPtrRow = dstPtr1; + + for(int j = 0; j < length[1]; j++) + { + Rpp8u *srcPtrTemp = srcPtrRow; + Rpp32f *dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 
p[2]; + + rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtrTemp, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[2]; vectorLoopCount++) + { + compute_log(srcPtrTemp, dstPtrTemp); + srcPtrTemp++; + dstPtrTemp++; + } + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; + } + srcPtr1 += srcGenericDescPtr->strides[1]; + dstPtr1 += dstGenericDescPtr->strides[1]; + } + } + else + log_recursive(srcPtr1, srcGenericDescPtr->strides, dstPtr1, dstGenericDescPtr->strides, length, nDim); + } + + return RPP_SUCCESS; +} + +RppStatus log_generic_host_tensor(Rpp8s *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *roiTensor, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u nDim = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension. + Rpp32u batchSize = dstGenericDescPtr->dims[0]; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + Rpp32u *roi = roiTensor + batchCount * nDim * 2; + Rpp32u *begin = roi; + Rpp32u *length = &roi[nDim]; + + Rpp8s *srcPtr1 = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + Rpp32f *dstPtr1 = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + + for(int i = 0; i < nDim; i++) + srcPtr1 += begin[i] * srcGenericDescPtr->strides[i + 1]; + Rpp32u alignedLength; + Rpp32u vectorIncrement = 16; + if (nDim == 1) + { + alignedLength = length[0] & ~15; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + + rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtr1, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtr1, p); // simd stores + srcPtr1 += vectorIncrement; + dstPtr1 += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[0]; vectorLoopCount++) + { + compute_log(srcPtr1, dstPtr1); + srcPtr1++; + dstPtr1++; + } + } + else if(nDim == 2) + { + alignedLength = length[1] & ~15; + for(int i = 0; i < length[0]; i++) + { + Rpp8s *srcPtrTemp = srcPtr1; + Rpp32f *dstPtrTemp = dstPtr1; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + + rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtrTemp, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[1]; vectorLoopCount++) + { + compute_log(srcPtrTemp, dstPtrTemp); + srcPtrTemp++; + dstPtrTemp++; + } + srcPtr1 += srcGenericDescPtr->strides[1]; + dstPtr1 += dstGenericDescPtr->strides[1]; + } + } + else if(nDim == 3) + { + alignedLength = length[2] & ~15; + for(int i = 0; i < length[0]; i++) + { + Rpp8s *srcPtrRow = srcPtr1; + Rpp32f *dstPtrRow = dstPtr1; + + for(int j = 0; j < length[1]; j++) + { + Rpp8s *srcPtrTemp = srcPtrRow; + Rpp32f *dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + + rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtrTemp, p); // simd loads + 
compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[2]; vectorLoopCount++) + { + compute_log(srcPtrTemp, dstPtrTemp); + srcPtrTemp++; + dstPtrTemp++; + } + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; + } + srcPtr1 += srcGenericDescPtr->strides[1]; + dstPtr1 += dstGenericDescPtr->strides[1]; + } + } + else + log_recursive(srcPtr1, srcGenericDescPtr->strides, dstPtr1, dstGenericDescPtr->strides, length, nDim); + } + + return RPP_SUCCESS; +} + +RppStatus log_generic_host_tensor(Rpp32f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *roiTensor, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u nDim = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension. + Rpp32u batchSize = dstGenericDescPtr->dims[0]; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + Rpp32u *roi = roiTensor + batchCount * nDim * 2; + Rpp32u *begin = roi; + Rpp32u *length = &roi[nDim]; + + Rpp32f *srcPtr1 = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + Rpp32f *dstPtr1 = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + + for(int i = 0; i < nDim; i++) + srcPtr1 += begin[i] * srcGenericDescPtr->strides[i + 1]; + Rpp32u alignedLength; + Rpp32u vectorIncrement = 16; + if (nDim == 1) + { + alignedLength = length[0] & ~15; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtr1, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtr1, p); // simd stores + srcPtr1 += vectorIncrement; + dstPtr1 += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[0]; vectorLoopCount++) + { + compute_log(srcPtr1, dstPtr1); + srcPtr1++; + dstPtr1++; + } + } + else if(nDim == 2) + { + alignedLength = length[1] & ~15; + for(int i = 0; i < length[0]; i++) + { + Rpp32f *srcPtrTemp = srcPtr1; + Rpp32f *dstPtrTemp = dstPtr1; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[1]; vectorLoopCount++) + { + compute_log(srcPtrTemp, dstPtrTemp); + srcPtrTemp++; + dstPtrTemp++; + } + srcPtr1 += srcGenericDescPtr->strides[1]; + dstPtr1 += dstGenericDescPtr->strides[1]; + } + } + else if(nDim == 3) + { + alignedLength = length[2] & ~15; + for(int i = 0; i < length[0]; i++) + { + Rpp32f *srcPtrRow = srcPtr1; + Rpp32f *dstPtrRow = dstPtr1; + + for(int j = 0; j < length[1]; j++) + { + Rpp32f *srcPtrTemp = srcPtrRow; + Rpp32f *dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads + compute_log_16_host(p); // log compute + 
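+                        // p[0] and p[1] each hold 8 of the 16 loaded lanes; compute_log_16_host appears
+                        // to replace them in place with their natural logarithms (matching the scalar
+                        // compute_log fallback below) before the 16-wide f32 store.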
rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[2]; vectorLoopCount++) + { + compute_log(srcPtrTemp, dstPtrTemp); + srcPtrTemp++; + dstPtrTemp++; + } + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; + } + srcPtr1 += srcGenericDescPtr->strides[1]; + dstPtr1 += dstGenericDescPtr->strides[1]; + } + } + else + log_recursive(srcPtr1, srcGenericDescPtr->strides, dstPtr1, dstGenericDescPtr->strides, length, nDim); + } + + return RPP_SUCCESS; +} + +RppStatus log_generic_host_tensor(Rpp16f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp16f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *roiTensor, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u nDim = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension. + Rpp32u batchSize = dstGenericDescPtr->dims[0]; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + Rpp32u *roi = roiTensor + batchCount * nDim * 2; + Rpp32u *begin = roi; + Rpp32u *length = &roi[nDim]; + + Rpp16f *srcPtr1 = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + Rpp16f *dstPtr1 = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + + for(int i = 0; i < nDim; i++) + srcPtr1 += begin[i] * srcGenericDescPtr->strides[i + 1]; + Rpp32u alignedLength; + Rpp32u vectorIncrement = 16; + if (nDim == 1) + { + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[16]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + srcPtrTemp_ps[cnt] = static_cast(srcPtr1[cnt]); + + __m256 p[2]; + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp_ps, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f16_avx, dstPtr1, p); // simd stores + srcPtr1 += vectorIncrement; + dstPtr1 += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[0]; vectorLoopCount++) + { + compute_log(srcPtr1, dstPtr1); + srcPtr1++; + dstPtr1++; + } + } + else if(nDim == 2) + { + alignedLength = length[1] & ~15; + for(int i = 0; i < length[0]; i++) + { + Rpp16f *srcPtrTemp = srcPtr1; + Rpp16f *dstPtrTemp = dstPtr1; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[16]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + srcPtrTemp_ps[cnt] = static_cast(srcPtrTemp[cnt]); + + __m256 p[2]; + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp_ps, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f16_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[1]; vectorLoopCount++) + { + compute_log(srcPtrTemp, dstPtrTemp); + srcPtrTemp++; + dstPtrTemp++; + } + srcPtr1 += srcGenericDescPtr->strides[1]; + dstPtr1 += dstGenericDescPtr->strides[1]; + } + } + else if(nDim == 3) + { + alignedLength = length[2] & ~15; + for(int i = 0; i < length[0]; i++) + { + Rpp16f *srcPtrRow = srcPtr1; + Rpp16f *dstPtrRow = dstPtr1; + + for(int j = 0; j < length[1]; j++) + { + Rpp16f *srcPtrTemp = srcPtrRow; + Rpp16f *dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += 
vectorIncrement) + { + Rpp32f srcPtrTemp_ps[16]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + srcPtrTemp_ps[cnt] = static_cast(srcPtrTemp[cnt]); + + __m256 p[2]; + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp_ps, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f16_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[2]; vectorLoopCount++) + { + compute_log(srcPtrTemp, dstPtrTemp); + srcPtrTemp++; + dstPtrTemp++; + } + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; + } + srcPtr1 += srcGenericDescPtr->strides[1]; + dstPtr1 += dstGenericDescPtr->strides[1]; + } + } + else + log_recursive(srcPtr1, srcGenericDescPtr->strides, dstPtr1, dstGenericDescPtr->strides, length, nDim); + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/cpu/kernel/normalize.hpp b/src/modules/cpu/kernel/normalize.hpp index dbe746d1a..94a1fd9fa 100644 --- a/src/modules/cpu/kernel/normalize.hpp +++ b/src/modules/cpu/kernel/normalize.hpp @@ -26,21 +26,6 @@ SOFTWARE. #include "rpp_cpu_simd.hpp" #include "rpp_cpu_common.hpp" -// Computes strides -void compute_strides(Rpp32u *strides, Rpp32u *shape, Rpp32u tensorDim) -{ - if (tensorDim > 0) - { - Rpp32u v = 1; - for (Rpp32u i = tensorDim - 1; i > 0; i--) - { - strides[i] = v; - v *= shape[i]; - } - strides[0] = v; - } -} - // Recursive reduction helper function to compute difference of input with mean and squares them up template void compute_diff_square_sum(Rpp32f &output, T *input, Rpp32s inputStride, Rpp32s numElements, Rpp32f mean) diff --git a/src/modules/cpu/kernel/transpose.hpp b/src/modules/cpu/kernel/transpose.hpp new file mode 100644 index 000000000..233db1044 --- /dev/null +++ b/src/modules/cpu/kernel/transpose.hpp @@ -0,0 +1,434 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" +using namespace std; + +inline void increment_ndim_ptr(Rpp32f **dstPtr, Rpp32u tensorDims, Rpp32u increment) +{ + for(int i = 0; i < tensorDims; i++) + dstPtr[i] += increment; +} + +inline void rpp_store16_f32_f32_channelwise(Rpp32f **dstPtr, __m128 *p) +{ + _mm_storeu_ps(dstPtr[0], p[0]); + _mm_storeu_ps(dstPtr[1], p[1]); + _mm_storeu_ps(dstPtr[2], p[2]); + _mm_storeu_ps(dstPtr[3], p[3]); +} + +inline void compute_2d_pln1_transpose(Rpp32f *srcPtrTemp, Rpp32f *dstPtrTemp, Rpp32u height, Rpp32u width, Rpp32u srcRowStride, Rpp32u dstRowStride) +{ + Rpp32u alignedRows = height & ~3; + Rpp32u alignedCols = width & ~7; + Rpp32u vectorIncrement = 8; + Rpp32u dstRowVectorStride = vectorIncrement * dstRowStride; + + Rpp32s i = 0; + for(Rpp32s k = 0; i < alignedRows; i += 4, k++) + { + Rpp32f *srcPtrRow[4], *dstPtrRow[8]; + for(int j = 0; j < 4; j++) + srcPtrRow[j] = srcPtrTemp + (i + j) * srcRowStride; + for(int j = 0; j < 8; j++) + dstPtrRow[j] = dstPtrTemp + j * dstRowStride + i; + + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for(; vectorLoopCount < alignedCols; vectorLoopCount += vectorIncrement) + { + __m256 pSrc[4]; + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow[0], &pSrc[0]); + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow[1], &pSrc[1]); + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow[2], &pSrc[2]); + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow[3], &pSrc[3]); + + __m128 pDst[8]; + compute_transpose4x8_avx(pSrc, pDst); + rpp_simd_store(rpp_store4_f32_to_f32, dstPtrRow[0], &pDst[0]); + rpp_simd_store(rpp_store4_f32_to_f32, dstPtrRow[1], &pDst[1]); + rpp_simd_store(rpp_store4_f32_to_f32, dstPtrRow[2], &pDst[2]); + rpp_simd_store(rpp_store4_f32_to_f32, dstPtrRow[3], &pDst[3]); + rpp_simd_store(rpp_store4_f32_to_f32, dstPtrRow[4], &pDst[4]); + rpp_simd_store(rpp_store4_f32_to_f32, dstPtrRow[5], &pDst[5]); + rpp_simd_store(rpp_store4_f32_to_f32, dstPtrRow[6], &pDst[6]); + rpp_simd_store(rpp_store4_f32_to_f32, dstPtrRow[7], &pDst[7]); + + srcPtrRow[0] += vectorIncrement; + srcPtrRow[1] += vectorIncrement; + srcPtrRow[2] += vectorIncrement; + srcPtrRow[3] += vectorIncrement; + dstPtrRow[0] += dstRowVectorStride; + dstPtrRow[1] += dstRowVectorStride; + dstPtrRow[2] += dstRowVectorStride; + dstPtrRow[3] += dstRowVectorStride; + dstPtrRow[4] += dstRowVectorStride; + dstPtrRow[5] += dstRowVectorStride; + dstPtrRow[6] += dstRowVectorStride; + dstPtrRow[7] += dstRowVectorStride; + } +#endif + } + + // handle remaining columns + for(Rpp32s k = 0; k < alignedRows; k++) + { + Rpp32f *srcPtrRowTemp = srcPtrTemp + k * srcRowStride + alignedCols; + Rpp32f *dstPtrRowTemp = dstPtrTemp + alignedCols * dstRowStride + k; + for(Rpp32s j = alignedCols; j < width; j++) + { + *dstPtrRowTemp = *srcPtrRowTemp++; + dstPtrRowTemp += dstRowStride; + } + } + + // handle remaining rows + for( ; i < height; i++) + { + Rpp32f *srcPtrRowTemp = srcPtrTemp + i * srcRowStride; + Rpp32f *dstPtrRowTemp = dstPtrTemp + i; + for(Rpp32s j = 0; j < width; j++) + { + *dstPtrRowTemp = *srcPtrRowTemp; + srcPtrRowTemp++; + dstPtrRowTemp += dstRowStride; + } + } +} + +template +void transpose_generic_nd_recursive(T *dst, Rpp32u *dstStrides, T *src, Rpp32u *srcStrides, Rpp32u *dstShape, Rpp32u tensorDims) +{ + // exit case for recursion + if (tensorDims == 0) + { + *dst = *src; + } + else + { + for (int i = 0; i < *dstShape; i++) + { + transpose_generic_nd_recursive(dst, dstStrides + 1, src, srcStrides + 1, dstShape + 1, tensorDims - 1); 
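+            // After the inner (tensorDims - 1)-dimensional block has been written, advance both
+            // pointers by their stride for the current dimension. dstStrides is the contiguous stride
+            // set of the permuted output shape, while srcStrides was reordered by perm in the caller;
+            // that reordering is what realizes the transpose. For example, a 3D input of shape
+            // {L0, L1, L2} with perm = {2, 0, 1} walks the output shape {L2, L0, L1} using
+            // srcStrides = {1, L1*L2, L2}.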
+ dst += *dstStrides; + src += *srcStrides; + } + } +} + +template +void transpose_generic_setup_and_run(T *srcPtrTemp, T *dstPtrTemp, Rpp32u *length, Rpp32u *perm, Rpp32u tensorDims) +{ + Rpp32u dstShape[RPPT_MAX_DIMS]; + Rpp32u srcStrides[RPPT_MAX_DIMS]; + Rpp32u dstStrides[RPPT_MAX_DIMS]; + + // compute output shape + for(Rpp32u i = 0; i < tensorDims; i++) + dstShape[i] = length[perm[i]]; + + // compute output strides + compute_strides(dstStrides, dstShape, tensorDims); + + // compute input strides and update as per the permute order + Rpp32u tempStrides[RPPT_MAX_DIMS]; + compute_strides(tempStrides, length, tensorDims); + for(int i = 0; i < tensorDims; i++) + srcStrides[i] = tempStrides[perm[i]]; + + // perform transpose as per the permute order + transpose_generic_nd_recursive(dstPtrTemp, dstStrides, srcPtrTemp, srcStrides, dstShape, tensorDims); +} + +RppStatus transpose_f32_f32_host_tensor(Rpp32f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *permTensor, + Rpp32u *roiTensor, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u tensorDims = dstGenericDescPtr->numDims - 1; // exclude batchsize from input dims + Rpp32u batchSize = dstGenericDescPtr->dims[0]; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + Rpp32f *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + dstPtrTemp = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + + // get the starting address of begin and length values from roiTensor + Rpp32u *roi = roiTensor + batchCount * tensorDims * 2; + Rpp32u *begin = roi; + Rpp32u *length = &roi[tensorDims]; + Rpp32u *perm = permTensor; + + bool copyInput = true; + for(int i = 0; i < tensorDims; i++) + copyInput *= (perm[i] == i); + + // do memcpy of input to output since output order is same as input order + if(copyInput) + { + memcpy(dstPtrTemp, srcPtrTemp, (size_t)(srcGenericDescPtr->strides[0] * sizeof(Rpp32f))); + } + else + { + for(int i = 1; i < tensorDims; i++) + srcPtrTemp += begin[i - 1] * srcGenericDescPtr->strides[i]; + + if (tensorDims == 2 && perm[0] == 1 && perm[1] == 0) + { + // Optimized AVX version for 2D PLN1 inputs + compute_2d_pln1_transpose(srcPtrTemp, dstPtrTemp, length[0], length[1], srcGenericDescPtr->strides[1], dstGenericDescPtr->strides[1]); + } + else if (tensorDims == 3) + { + // Optimized AVX version for 3D inputs of shape(x, y, 16) and permutation order (2, 0, 1) (usecases : Deepcam training) + if(perm[0] == 2 && perm[1] == 0 && perm[2] == 1 && length[2] == 16) + { + Rpp32u height = length[0]; + Rpp32u width = length[1]; + Rpp32u channels = 16; + Rpp32u bufferLength = width * channels; + Rpp32u alignedLength = bufferLength & ~63; + Rpp32u vectorIncrement = 64; + Rpp32u vectorIncrementPerChannel = 4; + + // initialize pointers for 16 channel + Rpp32f *dstPtrChannel[16]; + for(int i = 0; i < 16; i++) + dstPtrChannel[i] = dstPtrTemp + i * dstGenericDescPtr->strides[1]; + + // loop over rows + for(int i = 0; i < height; i++) + { + Rpp32f *srcPtrRow = srcPtrTemp; + + // update temporary pointers for 16 channel + Rpp32f *dstPtrTempChannel[16]; + for(int k = 0; k < 16; k++) + dstPtrTempChannel[k] = dstPtrChannel[k]; + + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for( ; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 pSrc[8]; + // load 64 values for source + 
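+                            // The 64 floats are 4 packed 16-channel pixels, loaded interleaved: offsets
+                            // 0, 16, 32, 48 pick up the first 8 channels of pixels 0-3 and offsets
+                            // 8, 24, 40, 56 the remaining 8 channels, so each 4x8 transpose below yields,
+                            // per channel, 4 consecutive values that rpp_store16_f32_f32_channelwise
+                            // writes out channel-wise.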
rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow, &pSrc[0]); + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow + 16, &pSrc[1]); + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow + 32, &pSrc[2]); + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow + 48, &pSrc[3]); + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow + 8, &pSrc[4]); + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow + 24, &pSrc[5]); + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow + 40, &pSrc[6]); + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrRow + 56, &pSrc[7]); + + __m128 pDst[16]; + compute_transpose4x8_avx(&pSrc[0], &pDst[0]); + compute_transpose4x8_avx(&pSrc[4], &pDst[8]); + + // store 4 values in output per channel + rpp_store16_f32_f32_channelwise(&dstPtrTempChannel[0], &pDst[0]); + rpp_store16_f32_f32_channelwise(&dstPtrTempChannel[4], &pDst[4]); + rpp_store16_f32_f32_channelwise(&dstPtrTempChannel[8], &pDst[8]); + rpp_store16_f32_f32_channelwise(&dstPtrTempChannel[12], &pDst[12]); + + srcPtrRow += vectorIncrement; + increment_ndim_ptr(dstPtrTempChannel, 16, vectorIncrementPerChannel); + } +#endif + for( ; vectorLoopCount < bufferLength; vectorLoopCount += 16) + { + for(int k = 0; k < 16; k++) + *dstPtrTempChannel[k] = srcPtrRow[k]; + + srcPtrRow += 16; + increment_ndim_ptr(dstPtrTempChannel, 16, 1); + } + srcPtrTemp += srcGenericDescPtr->strides[1]; + increment_ndim_ptr(dstPtrChannel, 16, dstGenericDescPtr->dims[3]); + } + } + // Optimized AVX version for 3D inputs and permutation order (1, 0, 2) + else if(perm[0] == 1 && perm[1] == 0 && perm[2] == 2) + { + Rpp32f *srcPtrRow = srcPtrTemp; + Rpp32f *dstPtrRow = dstPtrTemp; + Rpp32u height = length[0]; + Rpp32u width = length[1]; + Rpp32u channels = length[2]; + Rpp32u copySizeInBytes = channels * sizeof(Rpp32f); + for(int i = 0; i < height; i++) + { + Rpp32f *srcPtrRowTemp = srcPtrRow; + Rpp32f *dstPtrRowTemp = dstPtrRow; + for(int j = 0; j < width; j++) + { + memcpy(dstPtrRowTemp, srcPtrRowTemp, copySizeInBytes); + srcPtrRowTemp += srcGenericDescPtr->strides[2]; + dstPtrRowTemp += dstGenericDescPtr->strides[1]; + } + srcPtrRow += srcGenericDescPtr->strides[1]; + dstPtrRow += dstGenericDescPtr->strides[2]; + } + } + // Optimized AVX version for 3D inputs and permutation order (0, 2, 1) + else if(perm[0] == 0 && perm[1] == 2 && perm[2] == 1) + { + Rpp32f *srcPtrRow = srcPtrTemp; + Rpp32f *dstPtrRow = dstPtrTemp; + for(int i = 0; i < length[0]; i++) + { + compute_2d_pln1_transpose(srcPtrTemp, dstPtrTemp, length[1], length[2], srcGenericDescPtr->strides[2], dstGenericDescPtr->strides[2]); + + // increment src and dst pointers + srcPtrTemp += srcGenericDescPtr->strides[1]; + dstPtrTemp += dstGenericDescPtr->strides[1]; + } + } + else + { + transpose_generic_setup_and_run(srcPtrTemp, dstPtrTemp, length, perm, tensorDims); + } + } + else if (tensorDims == 4) + { + // Optimized AVX version for 4D inputs and permutation order (1, 2, 3, 0) + Rpp32u vectorIncrement = 8; + if(perm[0] == 1 && perm[1] == 2 && perm[2] == 3 && perm[3] == 0) + { + Rpp32u bufferLength = length[perm[3]]; + Rpp32u alignedLength = bufferLength & ~7; + Rpp32f *srcPtr0 = srcPtrTemp; + Rpp32f *dstPtr0 = dstPtrTemp; + Rpp32u stridesIncrement[8] = {0, srcGenericDescPtr->strides[1], 2 * srcGenericDescPtr->strides[1], 3 * srcGenericDescPtr->strides[1], + 4 * srcGenericDescPtr->strides[1], 5 * srcGenericDescPtr->strides[1], 6 * srcGenericDescPtr->strides[1], 7 * srcGenericDescPtr->strides[1]}; + Rpp32u srcIncrement = vectorIncrement * srcGenericDescPtr->strides[1]; + for(int i = 0; i < 
length[perm[0]]; i++) + { + Rpp32f *srcPtr1 = srcPtr0; + Rpp32f *dstPtr1 = dstPtr0; + for(int j = 0; j < length[perm[1]]; j++) + { + Rpp32f *srcPtr2 = srcPtr1; + Rpp32f *dstPtr2 = dstPtr1; + for(int k = 0; k < length[perm[2]]; k++) + { + Rpp32f *srcPtr3 = srcPtr2; + Rpp32f *dstPtr3 = dstPtr2; + + Rpp32u vectorLoopCount = 0; +#if __AVX2__ + for( ; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 pSrc = _mm256_setr_ps(srcPtr3[stridesIncrement[0]], srcPtr3[stridesIncrement[1]], srcPtr3[stridesIncrement[2]], srcPtr3[stridesIncrement[3]], + srcPtr3[stridesIncrement[4]], srcPtr3[stridesIncrement[5]], srcPtr3[stridesIncrement[6]], srcPtr3[stridesIncrement[7]]); + rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtr3, &pSrc); + srcPtr3 += srcIncrement; + dstPtr3 += vectorIncrement; + } +#endif + for( ; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtr3++ = *srcPtr3; + srcPtr3 += srcGenericDescPtr->strides[1]; + } + srcPtr2 += 1; + dstPtr2 += dstGenericDescPtr->strides[3]; + } + srcPtr1 += srcGenericDescPtr->strides[3]; + dstPtr1 += dstGenericDescPtr->strides[2]; + } + srcPtr0 += srcGenericDescPtr->strides[2]; + dstPtr0 += dstGenericDescPtr->strides[1]; + } + } + else + { + transpose_generic_setup_and_run(srcPtrTemp, dstPtrTemp, length, perm, tensorDims); + } + } + else + { + transpose_generic_setup_and_run(srcPtrTemp, dstPtrTemp, length, perm, tensorDims); + } + } + } + + return RPP_SUCCESS; +} + +template +RppStatus transpose_generic_host_tensor(T *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + T *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *permTensor, + Rpp32u *roiTensor, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u tensorDims = dstGenericDescPtr->numDims - 1; // exclude batchsize from input dims + Rpp32u batchSize = dstGenericDescPtr->dims[0]; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + T *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + dstPtrTemp = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + + // get the starting address of begin and length values from roiTensor + Rpp32u *roi = roiTensor + batchCount * tensorDims * 2; + Rpp32u *begin = roi; + Rpp32u *length = &roi[tensorDims]; + Rpp32u *perm = permTensor; + + bool copyInput = true; + for(int i = 0; i < tensorDims; i++) + copyInput *= (perm[i] == i); + + // do memcpy of input to output since output order is same as input order + if(copyInput) + { + memcpy(dstPtrTemp, srcPtrTemp, (size_t)(srcGenericDescPtr->strides[0] * sizeof(T))); + } + else + { + for(int i = 1; i < tensorDims; i++) + srcPtrTemp += begin[i - 1] * srcGenericDescPtr->strides[i]; + transpose_generic_setup_and_run(srcPtrTemp, dstPtrTemp, length, perm, tensorDims); + } + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/handlehip.cpp b/src/modules/hip/handlehip.cpp index 42e72db98..08eb93674 100644 --- a/src/modules/hip/handlehip.cpp +++ b/src/modules/hip/handlehip.cpp @@ -239,7 +239,12 @@ struct HandleImpl } hipMalloc(&(this->initHandle->mem.mgpu.rgbArr.rgbmem), sizeof(RpptRGB) * this->nBatchSize); - hipMalloc(&(this->initHandle->mem.mgpu.scratchBufferHip.floatmem), sizeof(Rpp32f) * 8294400); // 3840 x 2160 + + /* (600000 + 293 + 128) * 128 - Maximum scratch memory required for Non Silent Region Detection HIP kernel used in RNNT training (uses a batchsize 128) + - 600000 is the maximum size that 
will be required for MMS buffer based on Librispeech dataset + - 293 is the size required for storing reduction outputs for 600000 size sample + - 128 is the size required for storing cutOffDB values for batch size 128 */ + hipMalloc(&(this->initHandle->mem.mgpu.scratchBufferHip.floatmem), sizeof(Rpp32f) * 76853888); } }; diff --git a/src/modules/hip/hip_tensor_arithmetic_operations.hpp b/src/modules/hip/hip_tensor_arithmetic_operations.hpp index 37d2220b2..59e4ba3f9 100644 --- a/src/modules/hip/hip_tensor_arithmetic_operations.hpp +++ b/src/modules/hip/hip_tensor_arithmetic_operations.hpp @@ -30,5 +30,6 @@ SOFTWARE. #include "kernel/subtract_scalar.hpp" #include "kernel/multiply_scalar.hpp" #include "kernel/magnitude.hpp" +#include "kernel/log.hpp" #endif // HIP_TENSOR_ARITHMETIC_OPERATIONS_HPP diff --git a/src/modules/hip/hip_tensor_audio_augmentations.hpp b/src/modules/hip/hip_tensor_audio_augmentations.hpp new file mode 100644 index 000000000..6db11e222 --- /dev/null +++ b/src/modules/hip/hip_tensor_audio_augmentations.hpp @@ -0,0 +1,32 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#ifndef HIP_TENSOR_AUDIO_AUGMENTATIONS_HPP +#define HIP_TENSOR_AUDIO_AUGMENTATIONS_HPP + +#include "kernel/non_silent_region_detection.hpp" +#include "kernel/down_mixing.hpp" +#include "kernel/to_decibels.hpp" + +#endif // HIP_TENSOR_AUDIO_AUGMENTATIONS_HPP diff --git a/src/modules/hip/hip_tensor_effects_augmentations.hpp b/src/modules/hip/hip_tensor_effects_augmentations.hpp index abdfd30ab..12e80a1f4 100644 --- a/src/modules/hip/hip_tensor_effects_augmentations.hpp +++ b/src/modules/hip/hip_tensor_effects_augmentations.hpp @@ -31,6 +31,8 @@ SOFTWARE. #include "kernel/noise_shot.hpp" #include "kernel/noise_gaussian.hpp" #include "kernel/non_linear_blend.hpp" +#include "kernel/jitter.hpp" +#include "kernel/glitch.hpp" #include "kernel/water.hpp" #include "kernel/ricap.hpp" #include "kernel/vignette.hpp" diff --git a/src/modules/hip/hip_tensor_geometric_augmentations.hpp b/src/modules/hip/hip_tensor_geometric_augmentations.hpp index dcd890139..102e7d686 100644 --- a/src/modules/hip/hip_tensor_geometric_augmentations.hpp +++ b/src/modules/hip/hip_tensor_geometric_augmentations.hpp @@ -35,6 +35,8 @@ SOFTWARE. 
#include "kernel/resize_crop_mirror.hpp" #include "kernel/phase.hpp" #include "kernel/slice.hpp" +#include "kernel/lens_correction.hpp" +#include "kernel/transpose.hpp" #include "kernel/crop_and_patch.hpp" #include "kernel/flip_voxel.hpp" diff --git a/src/modules/hip/kernel/down_mixing.hpp b/src/modules/hip/kernel/down_mixing.hpp new file mode 100644 index 000000000..041780e32 --- /dev/null +++ b/src/modules/hip/kernel/down_mixing.hpp @@ -0,0 +1,72 @@ +#include +#include "rpp_hip_common.hpp" + +__global__ void down_mixing_hip_tensor(float *srcPtr, + uint srcStride, + float *dstPtr, + uint dstStride, + int2 *srcDimsTensor) + +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + int srcLength = srcDimsTensor[id_z].x; + int channels = srcDimsTensor[id_z].y; + + if (id_x >= srcLength) + return; + + float outVal = 0.0f; + uint srcIdx = id_z * srcStride + id_x * channels; + int i = 0; + int alignedChannels = (channels / 8) * 8; + + // do 8 pixel load till alignedChannels value + if (alignedChannels) + { + d_float8 outVal_f8; + outVal_f8.f4[0] = static_cast(0.0f); + outVal_f8.f4[1] = outVal_f8.f4[0]; + for(; i < alignedChannels; i += 8, srcIdx += 8) + { + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + rpp_hip_math_add8(&outVal_f8, &src_f8, &outVal_f8); + } + outVal_f8.f4[0] += outVal_f8.f4[1]; + outVal += (outVal_f8.f1[0] + outVal_f8.f1[1] + outVal_f8.f1[2] + outVal_f8.f1[3]); + } + // process remaining channels + for(; i < channels; i++, srcIdx++) + outVal += srcPtr[srcIdx]; + outVal *= (1.f / channels); + + uint dstIdx = id_z * dstStride + id_x; + dstPtr[dstIdx] = outVal; +} + +RppStatus hip_exec_down_mixing_tensor(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32s *srcDimsTensor, + bool normalizeWeights, + rpp::Handle& handle) +{ + Rpp32s globalThreads_x = dstDescPtr->strides.nStride; + Rpp32s globalThreads_y = 1; + Rpp32s globalThreads_z = dstDescPtr->n; + + hipLaunchKernelGGL(down_mixing_hip_tensor, + dim3(ceil((Rpp32f)globalThreads_x/LOCAL_THREADS_X_1DIM), ceil((Rpp32f)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((Rpp32f)globalThreads_z/LOCAL_THREADS_Z_1DIM)), + dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), + 0, + handle.GetStream(), + srcPtr, + srcDescPtr->strides.nStride, + dstPtr, + dstDescPtr->strides.nStride, + reinterpret_cast(srcDimsTensor)); + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/kernel/erase.hpp b/src/modules/hip/kernel/erase.hpp index 2591b53f0..f18306a9d 100644 --- a/src/modules/hip/kernel/erase.hpp +++ b/src/modules/hip/kernel/erase.hpp @@ -117,12 +117,34 @@ RppStatus hip_exec_erase_tensor(T *srcPtr, int globalThreads_y = dstDescPtr->h; int globalThreads_z = handle.GetBatchSize(); - if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + if (dstDescPtr->layout == RpptLayout::NHWC) { - if (srcDescPtr->dataType == RpptDataType::U8) + // if src layout is NHWC, copy src to dst + if (srcDescPtr->layout == RpptLayout::NHWC) { - hipMemcpyAsync(dstPtr, srcPtr, static_cast(srcDescPtr->n * srcDescPtr->strides.nStride * sizeof(Rpp8u)), hipMemcpyDeviceToDevice, handle.GetStream()); + hipMemcpyAsync(dstPtr, srcPtr, static_cast(srcDescPtr->n * srcDescPtr->strides.nStride * sizeof(T)), hipMemcpyDeviceToDevice, handle.GetStream()); hipStreamSynchronize(handle.GetStream()); + } + // if src layout is NCHW, convert src from NCHW to NHWC + else 
if (srcDescPtr->layout == RpptLayout::NCHW) + { + globalThreads_x = (dstDescPtr->w + 7) >> 3; + hipLaunchKernelGGL(convert_pln3_pkd3_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + roiTensorPtrSrc); + globalThreads_x = dstDescPtr->w; + hipStreamSynchronize(handle.GetStream()); + } + + if (srcDescPtr->dataType == RpptDataType::U8) + { hipLaunchKernelGGL(erase_pkd_hip_tensor, dim3(ceil((float)globalThreads_x / LOCAL_THREADS_X), ceil((float)globalThreads_y / LOCAL_THREADS_Y), ceil((float)globalThreads_z / LOCAL_THREADS_Z)), dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), @@ -137,8 +159,6 @@ RppStatus hip_exec_erase_tensor(T *srcPtr, } else if (srcDescPtr->dataType == RpptDataType::F16) { - hipMemcpyAsync(dstPtr, srcPtr, static_cast(srcDescPtr->n * srcDescPtr->strides.nStride * sizeof(Rpp16f)), hipMemcpyDeviceToDevice, handle.GetStream()); - hipStreamSynchronize(handle.GetStream()); hipLaunchKernelGGL(erase_pkd_hip_tensor, dim3(ceil((float)globalThreads_x / LOCAL_THREADS_X), ceil((float)globalThreads_y / LOCAL_THREADS_Y), ceil((float)globalThreads_z / LOCAL_THREADS_Z)), dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), @@ -153,8 +173,6 @@ RppStatus hip_exec_erase_tensor(T *srcPtr, } else if (srcDescPtr->dataType == RpptDataType::F32) { - hipMemcpyAsync(dstPtr, srcPtr, static_cast(srcDescPtr->n * srcDescPtr->strides.nStride * sizeof(Rpp32f)), hipMemcpyDeviceToDevice, handle.GetStream()); - hipStreamSynchronize(handle.GetStream()); hipLaunchKernelGGL(erase_pkd_hip_tensor, dim3(ceil((float)globalThreads_x / LOCAL_THREADS_X), ceil((float)globalThreads_y / LOCAL_THREADS_Y), ceil((float)globalThreads_z / LOCAL_THREADS_Z)), dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), @@ -169,8 +187,6 @@ RppStatus hip_exec_erase_tensor(T *srcPtr, } else if (srcDescPtr->dataType == RpptDataType::I8) { - hipMemcpyAsync(dstPtr, srcPtr, static_cast(srcDescPtr->n * srcDescPtr->strides.nStride * sizeof(Rpp8s)), hipMemcpyDeviceToDevice, handle.GetStream()); - hipStreamSynchronize(handle.GetStream()); hipLaunchKernelGGL(erase_pkd_hip_tensor, dim3(ceil((float)globalThreads_x / LOCAL_THREADS_X), ceil((float)globalThreads_y / LOCAL_THREADS_Y), ceil((float)globalThreads_z / LOCAL_THREADS_Z)), dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), @@ -245,33 +261,6 @@ RppStatus hip_exec_erase_tensor(T *srcPtr, numBoxesTensor, roiTensorPtrSrc); } - else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) - { - globalThreads_x = (dstDescPtr->w + 7) >> 3; - hipLaunchKernelGGL(convert_pln3_pkd3_hip_tensor, - dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), - dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), - 0, - handle.GetStream(), - srcPtr, - make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), - dstPtr, - make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), - roiTensorPtrSrc); - hipStreamSynchronize(handle.GetStream()); - globalThreads_x = dstDescPtr->w; - hipLaunchKernelGGL(erase_pkd_hip_tensor, - 
dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), - dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), - 0, - handle.GetStream(), - dstPtr, - make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), - anchorBoxInfoTensor, - colorsTensor, - numBoxesTensor, - roiTensorPtrSrc); - } } return RPP_SUCCESS; diff --git a/src/modules/hip/kernel/glitch.hpp b/src/modules/hip/kernel/glitch.hpp new file mode 100644 index 000000000..81c7013c0 --- /dev/null +++ b/src/modules/hip/kernel/glitch.hpp @@ -0,0 +1,278 @@ +#include +#include "rpp_hip_common.hpp" + +template +__device__ __forceinline__ void rpp_hip_load1_glitch(T *srcPtr, uint2 srcStrideCH, float &locSrcX, float &locSrcY, float *dst, int channels) +{ + int srcIdx = locSrcY * srcStrideCH.y + locSrcX * srcStrideCH.x + channels; + rpp_hip_interpolate1_nearest_neighbor_load_pln1(srcPtr + srcIdx, dst); +} + +template +__device__ __forceinline__ void rpp_hip_load8_glitch(T *srcPtr, uint2 srcStrideCH, d_float8 *srcX_f8, d_float8 *srcY_f8, d_float8 *dst_f8, int channels) +{ + rpp_hip_load1_glitch(srcPtr, srcStrideCH, srcX_f8->f1[0], srcY_f8->f1[0], &(dst_f8->f1[0]), channels); + rpp_hip_load1_glitch(srcPtr, srcStrideCH, srcX_f8->f1[1], srcY_f8->f1[1], &(dst_f8->f1[1]), channels); + rpp_hip_load1_glitch(srcPtr, srcStrideCH, srcX_f8->f1[2], srcY_f8->f1[2], &(dst_f8->f1[2]), channels); + rpp_hip_load1_glitch(srcPtr, srcStrideCH, srcX_f8->f1[3], srcY_f8->f1[3], &(dst_f8->f1[3]), channels); + rpp_hip_load1_glitch(srcPtr, srcStrideCH, srcX_f8->f1[4], srcY_f8->f1[4], &(dst_f8->f1[4]), channels); + rpp_hip_load1_glitch(srcPtr, srcStrideCH, srcX_f8->f1[5], srcY_f8->f1[5], &(dst_f8->f1[5]), channels); + rpp_hip_load1_glitch(srcPtr, srcStrideCH, srcX_f8->f1[6], srcY_f8->f1[6], &(dst_f8->f1[6]), channels); + rpp_hip_load1_glitch(srcPtr, srcStrideCH, srcX_f8->f1[7], srcY_f8->f1[7], &(dst_f8->f1[7]), channels); +} + +__device__ void check_locs(d_float8 &xLocVals, d_float8 &yLocVals, RppiPoint offset, RpptROI roiTensorPtrSrc) +{ + for(int i = 0; i < 8; i++) + { + if (xLocVals.f1[i] >= roiTensorPtrSrc.ltrbROI.rb.x || xLocVals.f1[i] < roiTensorPtrSrc.ltrbROI.lt.x || yLocVals.f1[i] >= roiTensorPtrSrc.ltrbROI.rb.y || yLocVals.f1[i] < roiTensorPtrSrc.ltrbROI.lt.y) + { + xLocVals.f1[i] -= offset.x; + yLocVals.f1[i] -= offset.y; + } + } +} + +__device__ void compute_glitch_locs_hip(int id_x, int id_y, RpptChannelOffsets rgbOffsets, RpptROI roiTensorPtrSrc, d_float24 *srcLocsX_f24, d_float24 *srcLocsY_f24) +{ + float4 increment_f4; + increment_f4 = make_float4(0.0f, 1.0f, 2.0f, 3.0f); // 8 element vectorized kernel needs 8 increments - creating uint4 for increments 0, 1, 2, 3 here, and adding (float4)4 later to get 4, 5, 6, 7 incremented srcLocs + + srcLocsX_f24->f4[0] = static_cast(id_x + rgbOffsets.r.x) + increment_f4; // find R channel srcLocsX 0, 1, 2, 3 + srcLocsX_f24->f4[1] = srcLocsX_f24->f4[0] + (float4) 4; // find R channel srcLocsX 4, 5, 6, 7 + srcLocsY_f24->f4[0] = srcLocsY_f24->f4[1] = static_cast(id_y + rgbOffsets.r.y); // find R channel srcLocsY 0, 1, 2, 3 and 4, 5, 6, 7 + check_locs(srcLocsX_f24->f8[0], srcLocsY_f24->f8[0], rgbOffsets.r, roiTensorPtrSrc); // check if all srcLocs in roi bounds + + srcLocsX_f24->f4[2] = static_cast(id_x + rgbOffsets.g.x) + increment_f4; // find G channel srcLocsX 0, 1, 2, 3 + srcLocsX_f24->f4[3] = srcLocsX_f24->f4[2] +(float4) 4; // find G channel srcLocsX 4, 5, 6, 7 + srcLocsY_f24->f4[2] = srcLocsY_f24->f4[3] = 
static_cast(id_y + rgbOffsets.g.y); // find G channel srcLocsY 0, 1, 2, 3 and 4, 5, 6, 7 + check_locs(srcLocsX_f24->f8[1], srcLocsY_f24->f8[1], rgbOffsets.g, roiTensorPtrSrc); // check if all srcLocs in roi bounds + + srcLocsX_f24->f4[4] = static_cast(id_x + rgbOffsets.b.x) + increment_f4; // find B channel srcLocsX 0, 1, 2, 3 + srcLocsX_f24->f4[5] = srcLocsX_f24->f4[4] + (float4) 4; // find B channel srcLocsX 4, 5, 6, 7 + srcLocsY_f24->f4[4] = srcLocsY_f24->f4[5] = static_cast(id_y + rgbOffsets.b.y); // find B channel srcLocsY 0, 1, 2, 3 and 4, 5, 6, 7 + check_locs(srcLocsX_f24->f8[2], srcLocsY_f24->f8[2], rgbOffsets.b, roiTensorPtrSrc); // check if all srcLocs in roi bounds +} + +template +__global__ void glitch_pkd_hip_tensor(T *srcPtr, + uint2 srcStridesNH, + T *dstPtr, + uint2 dstStridesNH, + RpptChannelOffsets *rgbOffsetsPtr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x); + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3; + + RpptChannelOffsets rgbOffsets = rgbOffsetsPtr[id_z]; + uint2 srcStrideCH = make_uint2(3, srcStridesNH.y); + d_float24 dst_f24, srcLocsX_f24, srcLocsY_f24; + + compute_glitch_locs_hip(id_x, id_y, rgbOffsets, roiTensorPtrSrc[id_z], &srcLocsX_f24, &srcLocsY_f24); + rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[0], &srcLocsY_f24.f8[0], &(dst_f24.f8[0]), 0); + rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[1], &srcLocsY_f24.f8[1], &(dst_f24.f8[1]), 1); + rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[2], &srcLocsY_f24.f8[2], &(dst_f24.f8[2]), 2); + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24); +} + +template +__global__ void glitch_pln_hip_tensor(T *srcPtr, + uint3 srcStridesNCH, + T *dstPtr, + uint3 dstStridesNCH, + RpptChannelOffsets *rgbOffsetsPtr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + + RpptChannelOffsets rgbOffsets = rgbOffsetsPtr[id_z]; + uint2 srcStrideCH = make_uint2(1, srcStridesNCH.z); + + d_float24 srcLocsX_f24, srcLocsY_f24; + d_float8 dst_f8; + + compute_glitch_locs_hip(id_x, id_y, rgbOffsets, roiTensorPtrSrc[id_z], &srcLocsX_f24, &srcLocsY_f24); + rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[0], &srcLocsY_f24.f8[0], &dst_f8, 0); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + srcIdx += srcStridesNCH.y; + dstIdx += dstStridesNCH.y; + rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[1], &srcLocsY_f24.f8[1], &dst_f8, 0); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + srcIdx += srcStridesNCH.y; + dstIdx += dstStridesNCH.y; + rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[2], &srcLocsY_f24.f8[2], &dst_f8, 0); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); +} + +template 
+__global__ void glitch_pkd3_pln3_hip_tensor(T *srcPtr, + uint2 srcStridesNH, + T *dstPtr, + uint3 dstStridesNCH, + RpptChannelOffsets *rgbOffsetsPtr, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + + RpptChannelOffsets rgbOffsets = rgbOffsetsPtr[id_z]; + uint2 srcStrideCH = make_uint2(3, srcStridesNH.y); + + d_float24 srcLocsX_f24, srcLocsY_f24; + d_float8 dst_f8; + + compute_glitch_locs_hip(id_x, id_y, rgbOffsets, roiTensorPtrSrc[id_z], &srcLocsX_f24, &srcLocsY_f24); + rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[0], &srcLocsY_f24.f8[0], &dst_f8, 0); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + dstIdx += dstStridesNCH.y; + rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[1], &srcLocsY_f24.f8[1], &dst_f8, 1); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + dstIdx += dstStridesNCH.y; + rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[2], &srcLocsY_f24.f8[2], &dst_f8, 2); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); +} + +template +__global__ void glitch_pln3_pkd3_hip_tensor(T *srcPtr, + uint3 srcStridesNCH, + T *dstPtr, + uint2 dstStridesNH, + RpptChannelOffsets *rgbOffsetsPtr, + RpptROIPtr roiTensorPtrSrc) +{ + + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x); + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3; + + RpptChannelOffsets rgbOffsets = rgbOffsetsPtr[id_z]; + uint2 srcStrideCH = make_uint2(1, srcStridesNCH.z); + + d_float24 dst_f24, srcLocsX_f24, srcLocsY_f24; + compute_glitch_locs_hip(id_x, id_y, rgbOffsets, roiTensorPtrSrc[id_z], &srcLocsX_f24, &srcLocsY_f24); + rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[0], &srcLocsY_f24.f8[0], &(dst_f24.f8[0]), 0); + + srcIdx += srcStridesNCH.y; + rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[1], &srcLocsY_f24.f8[1], &(dst_f24.f8[1]), 0); + + srcIdx += srcStridesNCH.y; + rpp_hip_load8_glitch(srcPtr + srcIdx, srcStrideCH, &srcLocsX_f24.f8[2], &srcLocsY_f24.f8[2], &(dst_f24.f8[2]), 0); + + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24); +} + +template +RppStatus hip_exec_glitch_tensor(T *srcPtr, + RpptDescPtr srcDescPtr, + T *dstPtr, + RpptDescPtr dstDescPtr, + RpptChannelOffsets *rgbOffsets, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle& handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + int globalThreads_x = (dstDescPtr->strides.hStride + 7) >> 3; + int globalThreads_y = dstDescPtr->h; + int globalThreads_z = dstDescPtr->n; + + if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(glitch_pln_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), 
ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + rgbOffsets, + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + hipLaunchKernelGGL(glitch_pln3_pkd3_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + rgbOffsets, + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(glitch_pkd3_pln3_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + rgbOffsets, + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + hipLaunchKernelGGL(glitch_pkd_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + rgbOffsets, + roiTensorPtrSrc); + } + return RPP_SUCCESS; +} diff --git a/src/modules/hip/kernel/jitter.hpp b/src/modules/hip/kernel/jitter.hpp new file mode 100644 index 000000000..bbc407cda --- /dev/null +++ b/src/modules/hip/kernel/jitter.hpp @@ -0,0 +1,314 @@ +#include +#include "rpp_hip_common.hpp" +#include "rng_seed_stream.hpp" + +__device__ __forceinline__ void jitter_roi_and_srclocs_hip_compute(int4 *srcRoiPtr_i4, RpptXorwowStateBoxMuller *xorwowState, uint kernelSize, uint bound, int id_x, int id_y, d_float16 *locSrc_f16) +{ + d_float8 widthIncrement_f8, heightIncrement_f8; + rpp_hip_rng_8_xorwow_f32(xorwowState, &widthIncrement_f8); + rpp_hip_math_multiply8_const(&widthIncrement_f8, &widthIncrement_f8, static_cast(kernelSize)); + rpp_hip_rng_8_xorwow_f32(xorwowState, &heightIncrement_f8); + rpp_hip_math_multiply8_const(&heightIncrement_f8, &heightIncrement_f8, static_cast(kernelSize)); + + d_float8 increment_f8, locDst_f8x, locDst_f8y; + increment_f8.f4[0] = make_float4(0.0f, 1.0f, 2.0f, 3.0f); // 8 element vectorized kernel needs 8 increments - creating uint4 for increments 0, 1, 2, 3 here, and adding (float4)4 later to get 4, 5, 6, 7 incremented srcLocs + increment_f8.f4[1] = make_float4(4.0f, 5.0f, 6.0f, 7.0f); + locDst_f8x.f4[0] = static_cast(id_x) + increment_f8.f4[0]; + locDst_f8x.f4[1] = static_cast(id_x) 
+ increment_f8.f4[1]; + locDst_f8y.f4[0] = locDst_f8y.f4[1] = (float4)id_y; + + locSrc_f16->f8[0].f4[0] = static_cast(srcRoiPtr_i4->x) + locDst_f8x.f4[0] + widthIncrement_f8.f4[0] - static_cast(bound); + locSrc_f16->f8[0].f4[1] = static_cast(srcRoiPtr_i4->x) + locDst_f8x.f4[1] + widthIncrement_f8.f4[1] - static_cast(bound); + locSrc_f16->f8[1].f4[0] = static_cast(srcRoiPtr_i4->y) + locDst_f8y.f4[0] + heightIncrement_f8.f4[0] - static_cast(bound); + locSrc_f16->f8[1].f4[1] = static_cast(srcRoiPtr_i4->y) + locDst_f8y.f4[1] + heightIncrement_f8.f4[1] - static_cast(bound); + + // Apply boundary checks and adjustments + for(int i = 0; i < 8; ++i) + { + locSrc_f16->f1[i] = fmaxf(fminf(floorf(locSrc_f16->f1[i]), static_cast(srcRoiPtr_i4->z - 1)), 0.0f); + locSrc_f16->f1[i + 8] = fmaxf(fminf(floorf(locSrc_f16->f1[i + 8]), static_cast(srcRoiPtr_i4->w - bound)), 0.0f); + } +} + +template +__global__ void jitter_pkd_tensor(T *srcPtr, + uint2 srcStridesNH, + T *dstPtr, + uint2 dstStridesNH, + uint *kernelsize, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + uint *xorwowSeedStream, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x); + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + (id_x * 3); + uint seedStreamIdx = (id_y * dstStridesNH.y) + (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; + uint kernelSize = kernelsize[id_z]; + uint bound = (kernelSize - 1) / 2; + + RpptXorwowStateBoxMuller xorwowState; + uint xorwowSeed = xorwowSeedStream[seedStreamIdx % SEED_STREAM_MAX_SIZE]; + xorwowState.x[0] = xorwowInitialStatePtr->x[0] + xorwowSeed; + xorwowState.x[1] = xorwowInitialStatePtr->x[1] + xorwowSeed; + xorwowState.x[2] = xorwowInitialStatePtr->x[2] + xorwowSeed; + xorwowState.x[3] = xorwowInitialStatePtr->x[3] + xorwowSeed; + xorwowState.x[4] = xorwowInitialStatePtr->x[4] + xorwowSeed; + xorwowState.counter = xorwowInitialStatePtr->counter + xorwowSeed; + + int4 srcRoi_i4 = *(int4 *)&roiTensorPtrSrc[id_z]; + d_float16 locSrc_f16; + jitter_roi_and_srclocs_hip_compute(&srcRoi_i4, &xorwowState, kernelSize, bound, id_x, id_y, &locSrc_f16); + + d_float24 dst_f24; + rpp_hip_interpolate24_nearest_neighbor_pkd3(srcPtr + srcIdx, srcStridesNH.y, &locSrc_f16, &srcRoi_i4, &dst_f24); + rpp_hip_pack_float24_pkd3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24); +} + +template +__global__ void jitter_pln_tensor(T *srcPtr, + uint3 srcStridesNCH, + T *dstPtr, + uint3 dstStridesNCH, + int channelsDst, + uint *kernelsize, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + uint *xorwowSeedStream, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + uint seedStreamIdx = (id_y * dstStridesNCH.z) + (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; + uint kernelSize = kernelsize[id_z]; + uint bound = (kernelSize - 1) / 2; + + RpptXorwowStateBoxMuller 
xorwowState; + uint xorwowSeed = xorwowSeedStream[seedStreamIdx % SEED_STREAM_MAX_SIZE]; + xorwowState.x[0] = xorwowInitialStatePtr->x[0] + xorwowSeed; + xorwowState.x[1] = xorwowInitialStatePtr->x[1] + xorwowSeed; + xorwowState.x[2] = xorwowInitialStatePtr->x[2] + xorwowSeed; + xorwowState.x[3] = xorwowInitialStatePtr->x[3] + xorwowSeed; + xorwowState.x[4] = xorwowInitialStatePtr->x[4] + xorwowSeed; + xorwowState.counter = xorwowInitialStatePtr->counter + xorwowSeed; + + int4 srcRoi_i4 = *(int4 *)&roiTensorPtrSrc[id_z]; + d_float16 locSrc_f16; + jitter_roi_and_srclocs_hip_compute(&srcRoi_i4, &xorwowState, kernelSize, bound, id_x, id_y, &locSrc_f16); + + d_float8 dst_f8; + rpp_hip_interpolate8_nearest_neighbor_pln1(srcPtr + srcIdx, srcStridesNCH.z, &locSrc_f16, &srcRoi_i4, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + if (channelsDst == 3) + { + srcIdx += srcStridesNCH.y; + dstIdx += dstStridesNCH.y; + + rpp_hip_interpolate8_nearest_neighbor_pln1(srcPtr + srcIdx, srcStridesNCH.z, &locSrc_f16, &srcRoi_i4, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + srcIdx += srcStridesNCH.y; + dstIdx += dstStridesNCH.y; + + rpp_hip_interpolate8_nearest_neighbor_pln1(srcPtr + srcIdx, srcStridesNCH.z, &locSrc_f16, &srcRoi_i4, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + } +} + +template +__global__ void jitter_pkd3_pln3_tensor(T *srcPtr, + uint2 srcStridesNH, + T *dstPtr, + uint3 dstStridesNCH, + uint *kernelsize, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + uint *xorwowSeedStream, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + uint seedStreamIdx = (id_y * dstStridesNCH.z) + (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; + uint kernelSize = kernelsize[id_z]; + uint bound = (kernelSize - 1) / 2; + + RpptXorwowStateBoxMuller xorwowState; + uint xorwowSeed = xorwowSeedStream[seedStreamIdx % SEED_STREAM_MAX_SIZE]; + xorwowState.x[0] = xorwowInitialStatePtr->x[0] + xorwowSeed; + xorwowState.x[1] = xorwowInitialStatePtr->x[1] + xorwowSeed; + xorwowState.x[2] = xorwowInitialStatePtr->x[2] + xorwowSeed; + xorwowState.x[3] = xorwowInitialStatePtr->x[3] + xorwowSeed; + xorwowState.x[4] = xorwowInitialStatePtr->x[4] + xorwowSeed; + xorwowState.counter = xorwowInitialStatePtr->counter + xorwowSeed; + + int4 srcRoi_i4 = *(int4 *)&roiTensorPtrSrc[id_z]; + d_float16 locSrc_f16; + jitter_roi_and_srclocs_hip_compute(&srcRoi_i4, &xorwowState, kernelSize, bound, id_x, id_y, &locSrc_f16); + + d_float24 dst_f24; + rpp_hip_interpolate24_nearest_neighbor_pkd3(srcPtr + srcIdx, srcStridesNH.y, &locSrc_f16, &srcRoi_i4, &dst_f24); + rpp_hip_pack_float24_pkd3_and_store24_pln3(dstPtr + dstIdx, dstStridesNCH.y, &dst_f24); +} + +template +__global__ void jitter_pln3_pkd3_tensor(T *srcPtr, + uint3 srcStridesNCH, + T *dstPtr, + uint2 dstStridesNH, + uint *kernelsize, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + uint *xorwowSeedStream, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * 
hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x); + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + (id_x * 3); + uint seedStreamIdx = (id_y * dstStridesNH.y) + (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; + uint kernelSize = kernelsize[id_z]; + uint bound = (kernelSize - 1) / 2; + + RpptXorwowStateBoxMuller xorwowState; + uint xorwowSeed = xorwowSeedStream[seedStreamIdx % SEED_STREAM_MAX_SIZE]; + xorwowState.x[0] = xorwowInitialStatePtr->x[0] + xorwowSeed; + xorwowState.x[1] = xorwowInitialStatePtr->x[1] + xorwowSeed; + xorwowState.x[2] = xorwowInitialStatePtr->x[2] + xorwowSeed; + xorwowState.x[3] = xorwowInitialStatePtr->x[3] + xorwowSeed; + xorwowState.x[4] = xorwowInitialStatePtr->x[4] + xorwowSeed; + xorwowState.counter = xorwowInitialStatePtr->counter + xorwowSeed; + + int4 srcRoi_i4 = *(int4 *)&roiTensorPtrSrc[id_z]; + d_float16 locSrc_f16; + jitter_roi_and_srclocs_hip_compute(&srcRoi_i4, &xorwowState, kernelSize, bound, id_x, id_y, &locSrc_f16); + + d_float24 dst_f24; + rpp_hip_interpolate24_nearest_neighbor_pln3(srcPtr + srcIdx, &srcStridesNCH, &locSrc_f16, &srcRoi_i4, &dst_f24); + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24); +} + +template +RppStatus hip_exec_jitter_tensor(T *srcPtr, + RpptDescPtr srcDescPtr, + T *dstPtr, + RpptDescPtr dstDescPtr, + uint *kernelSizeTensor, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle& handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + + int globalThreads_x = (dstDescPtr->strides.hStride + 7) >> 3; + int globalThreads_y = dstDescPtr->h; + int globalThreads_z = dstDescPtr->n; + + Rpp32u *xorwowSeedStream; + xorwowSeedStream = (Rpp32u *)&xorwowInitialStatePtr[1]; + CHECK_RETURN_STATUS(hipMemcpyAsync(xorwowSeedStream, rngSeedStream4050, SEED_STREAM_MAX_SIZE * sizeof(Rpp32u), hipMemcpyHostToDevice, handle.GetStream())); + + if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + hipLaunchKernelGGL(jitter_pkd_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + kernelSizeTensor, + xorwowInitialStatePtr, + xorwowSeedStream, + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(jitter_pln_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + dstDescPtr->c, + kernelSizeTensor, + xorwowInitialStatePtr, + xorwowSeedStream, + roiTensorPtrSrc); + } + else if ((srcDescPtr->c == 3) && (dstDescPtr->c == 3)) + { + if ((srcDescPtr->layout == 
RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(jitter_pkd3_pln3_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + kernelSizeTensor, + xorwowInitialStatePtr, + xorwowSeedStream, + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + globalThreads_x = (srcDescPtr->strides.hStride + 7) >> 3; + hipLaunchKernelGGL(jitter_pln3_pkd3_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + kernelSizeTensor, + xorwowInitialStatePtr, + xorwowSeedStream, + roiTensorPtrSrc); + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/hip/kernel/lens_correction.hpp b/src/modules/hip/kernel/lens_correction.hpp new file mode 100644 index 000000000..0d53db7e1 --- /dev/null +++ b/src/modules/hip/kernel/lens_correction.hpp @@ -0,0 +1,183 @@ +#include +#include "rpp_hip_common.hpp" + +// -------------------- Set 0 - lens_correction device helpers -------------------- + +__device__ __forceinline__ void camera_coordinates_hip_compute(d_float8 *cameraCoords_f8, int id_y, d_float8 *locDst_f8x, float3 *inverseMatrix) +{ + float4 inverseCoord1_f4 = static_cast(id_y * inverseMatrix->y + inverseMatrix->z); + float4 inverseCoord2_f4 = static_cast(inverseMatrix->x); + cameraCoords_f8->f4[0] = inverseCoord1_f4 + locDst_f8x->f4[0] * inverseCoord2_f4; + cameraCoords_f8->f4[1] = inverseCoord1_f4 + locDst_f8x->f4[1] * inverseCoord2_f4; +} + +// -------------------- Set 1 - lens_correction kernels -------------------- + +// compute inverse of 3x3 camera matrix +__global__ void compute_inverse_matrix_hip_tensor(d_float9 *matTensor, d_float9 *invMatTensor) +{ + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + d_float9 *mat_f9 = &matTensor[id_z]; + d_float9 *invMat_f9 = &invMatTensor[id_z]; + + // initialize all values in invMat_f9 to zero + invMat_f9->f3[0] = static_cast(0.0f); + invMat_f9->f3[1] = invMat_f9->f3[0]; + invMat_f9->f3[2] = invMat_f9->f3[0]; + + // compute determinant mat_f9 + float det = (mat_f9->f1[0] * ((mat_f9->f1[4] * mat_f9->f1[8]) - (mat_f9->f1[7] * mat_f9->f1[5]))) + - (mat_f9->f1[1] * ((mat_f9->f1[3] * mat_f9->f1[8]) - (mat_f9->f1[5] * mat_f9->f1[6]))) + + (mat_f9->f1[2] * ((mat_f9->f1[3] * mat_f9->f1[7]) - (mat_f9->f1[4] * mat_f9->f1[6]))); + if(det != 0) + { + float invDet = 1 / det; + invMat_f9->f1[0] = (mat_f9->f1[4] * mat_f9->f1[8] - mat_f9->f1[7] * mat_f9->f1[5]) * invDet; + invMat_f9->f1[1] = (mat_f9->f1[2] * mat_f9->f1[7] - mat_f9->f1[1] * mat_f9->f1[8]) * invDet; + invMat_f9->f1[2] = (mat_f9->f1[1] * mat_f9->f1[5] - mat_f9->f1[2] * mat_f9->f1[4]) * invDet; + invMat_f9->f1[3] = (mat_f9->f1[5] * mat_f9->f1[6] - mat_f9->f1[3] * mat_f9->f1[8]) * invDet; + invMat_f9->f1[4] = (mat_f9->f1[0] * mat_f9->f1[8] - mat_f9->f1[2] * 
mat_f9->f1[6]) * invDet; + invMat_f9->f1[5] = (mat_f9->f1[3] * mat_f9->f1[2] - mat_f9->f1[0] * mat_f9->f1[5]) * invDet; + invMat_f9->f1[6] = (mat_f9->f1[3] * mat_f9->f1[7] - mat_f9->f1[6] * mat_f9->f1[4]) * invDet; + invMat_f9->f1[7] = (mat_f9->f1[6] * mat_f9->f1[1] - mat_f9->f1[0] * mat_f9->f1[7]) * invDet; + invMat_f9->f1[8] = (mat_f9->f1[0] * mat_f9->f1[4] - mat_f9->f1[3] * mat_f9->f1[1]) * invDet; + } +} + +// compute remap tables from the camera matrix and distortion coefficients +__global__ void compute_remap_tables_hip_tensor(float *rowRemapTable, + float *colRemapTable, + d_float9 *cameraMatrixTensor, + d_float9 *inverseMatrixTensor, + d_float8 *distortionCoeffsTensor, + uint2 remapTableStridesNH, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + return; + + d_float9 cameraMatrix_f9 = cameraMatrixTensor[id_z]; + d_float9 inverseMatrix_f9 = inverseMatrixTensor[id_z]; + d_float8 distortionCoeffs_f8 = distortionCoeffsTensor[id_z]; + + // Get radial and tangential distortion coefficients + float radialCoeff[6] = {distortionCoeffs_f8.f1[0], distortionCoeffs_f8.f1[1], distortionCoeffs_f8.f1[4], distortionCoeffs_f8.f1[5], distortionCoeffs_f8.f1[6], distortionCoeffs_f8.f1[7]}; + float tangentialCoeff[2] = {distortionCoeffs_f8.f1[2], distortionCoeffs_f8.f1[3]}; + + uint dstIdx = id_z * remapTableStridesNH.x + id_y * remapTableStridesNH.y + id_x; + d_float8 locDst_f8x; + locDst_f8x.f4[0] = static_cast(id_x) + make_float4(0, 1, 2, 3); + locDst_f8x.f4[1] = static_cast(id_x) + make_float4(4, 5, 6, 7); + + float4 one_f4 = static_cast(1.0f); + float4 two_f4 = static_cast(2.0f); + d_float8 z_f8, y_f8, x_f8; + camera_coordinates_hip_compute(&z_f8, id_y, &locDst_f8x, &inverseMatrix_f9.f3[2]); // float zCamera = id_y * inverseMatrix.f1[7] + inverseMatrix.f1[8] + id_x * inverseMatrix.f1[6] + camera_coordinates_hip_compute(&y_f8, id_y, &locDst_f8x, &inverseMatrix_f9.f3[1]); // float yCamera = id_y * inverseMatrix.f1[4] + inverseMatrix.f1[5] + id_x * inverseMatrix.f1[3] + camera_coordinates_hip_compute(&x_f8, id_y, &locDst_f8x, &inverseMatrix_f9.f3[0]); // float xCamera = id_y * inverseMatrix.f1[1] + inverseMatrix.f1[2] + id_x * inverseMatrix.f1[0] + rpp_hip_math_divide8_const(&z_f8, &z_f8, one_f4); // float z = 1./zCamera + rpp_hip_math_multiply8(&y_f8, &z_f8, &y_f8); // float y = yCamera * z; + rpp_hip_math_multiply8(&x_f8, &z_f8, &x_f8); // float x = xCamera * z; + + d_float8 ySquare_f8, xSquare_f8; + rpp_hip_math_multiply8(&y_f8, &y_f8, &ySquare_f8); // float ySquare = x * x + rpp_hip_math_multiply8(&x_f8, &x_f8, &xSquare_f8); // float xSquare = x * x + + d_float8 r2_f8, kr_f8, kr1_f8, kr2_f8; + rpp_hip_math_add8(&xSquare_f8, &ySquare_f8, &r2_f8); // float r2 = xSquare + ySquare + + d_float8 r2Cube_f8, r2Square_f8; + rpp_hip_math_multiply8(&r2_f8, &r2_f8, &r2Square_f8); // float r2Square = r2 * r2; + rpp_hip_math_multiply8(&r2Square_f8, &r2_f8, &r2Cube_f8); // float r2Cube = r2Square * r2; + + d_float24 radialCoeff_f24; + radialCoeff_f24.f4[0] = static_cast(radialCoeff[0]); + radialCoeff_f24.f4[1] = static_cast(radialCoeff[1]); + radialCoeff_f24.f4[2] = static_cast(radialCoeff[2]); + radialCoeff_f24.f4[3] = static_cast(radialCoeff[3]); + radialCoeff_f24.f4[4] = static_cast(radialCoeff[4]); + radialCoeff_f24.f4[5] = 
static_cast(radialCoeff[5]); + + // float kr = (1 + (radialCoeff[2] * r2Cube) + (radialCoeff[1] * r2Square) + (radialCoeff[0]) * r2)) / (1 + (radialCoeff[5] * r2Cube) + (radialCoeff[4] * r2Square) + (radialCoeff[3]) *r2)) + kr1_f8.f4[0] = (one_f4 + (radialCoeff_f24.f4[2] * r2Cube_f8.f4[0]) + (radialCoeff_f24.f4[1] * r2Square_f8.f4[0]) + (radialCoeff_f24.f4[0] * r2_f8.f4[0])); + kr1_f8.f4[1] = (one_f4 + (radialCoeff_f24.f4[2] * r2Cube_f8.f4[1]) + (radialCoeff_f24.f4[1] * r2Square_f8.f4[1]) + (radialCoeff_f24.f4[0] * r2_f8.f4[1])); + kr2_f8.f4[0] = (one_f4 + (radialCoeff_f24.f4[5] * r2Cube_f8.f4[0]) + (radialCoeff_f24.f4[4] * r2Square_f8.f4[0]) + (radialCoeff_f24.f4[3] * r2_f8.f4[0])); + kr2_f8.f4[1] = (one_f4 + (radialCoeff_f24.f4[5] * r2Cube_f8.f4[1]) + (radialCoeff_f24.f4[4] * r2Square_f8.f4[1]) + (radialCoeff_f24.f4[3] * r2_f8.f4[1])); + rpp_hip_math_divide8(&kr1_f8, &kr2_f8, &kr_f8); + + d_float8 xyMul2_f8; + rpp_hip_math_multiply8(&x_f8, &y_f8, &xyMul2_f8); + rpp_hip_math_multiply8_const(&xyMul2_f8, &xyMul2_f8, two_f4); // float xyMul2 = 2 * x * y + + d_float8 colLoc_f8, rowLoc_f8; + rpp_hip_math_multiply8_const(&xSquare_f8, &xSquare_f8, two_f4); // xSquare = xSquare * 2; + rpp_hip_math_multiply8_const(&ySquare_f8, &ySquare_f8, two_f4); // ySquare = ySquare * 2; + + d_float16 cameraMatrix_f16; + cameraMatrix_f16.f4[0] = static_cast(cameraMatrix_f9.f1[0]); + cameraMatrix_f16.f4[1] = static_cast(cameraMatrix_f9.f1[2]); + cameraMatrix_f16.f4[2] = static_cast(cameraMatrix_f9.f1[4]); + cameraMatrix_f16.f4[3] = static_cast(cameraMatrix_f9.f1[5]); + + d_float8 tangentialCoeff_f8; + tangentialCoeff_f8.f4[0] = static_cast(tangentialCoeff[0]); + tangentialCoeff_f8.f4[1] = static_cast(tangentialCoeff[1]); + + // float colLoc = cameraMatrix[0] * (x * kr + tangentialCoeff[0] * xyMul2 + tangentialCoeff[1] * (r2 + 2 * xSquare)) + cameraMatrix[2]; + colLoc_f8.f4[0] = cameraMatrix_f16.f4[0] * ((x_f8.f4[0] * kr_f8.f4[0]) + (tangentialCoeff_f8.f4[0] * xyMul2_f8.f4[0]) + (tangentialCoeff_f8.f4[1] * (r2_f8.f4[0] + xSquare_f8.f4[0]))) + cameraMatrix_f16.f4[1]; + colLoc_f8.f4[1] = cameraMatrix_f16.f4[0] * ((x_f8.f4[1] * kr_f8.f4[1]) + (tangentialCoeff_f8.f4[0] * xyMul2_f8.f4[1]) + (tangentialCoeff_f8.f4[1] * (r2_f8.f4[1] + xSquare_f8.f4[1]))) + cameraMatrix_f16.f4[1]; + + // float rowLoc = cameraMatrix[4] * (y * kr + tangentialCoeff[1] * xyMul2 + tangentialCoeff[0] * (r2 + 2 * ySquare)) + cameraMatrix[4]; + rowLoc_f8.f4[0] = cameraMatrix_f16.f4[2] * ((y_f8.f4[0] * kr_f8.f4[0]) + (tangentialCoeff_f8.f4[1] * xyMul2_f8.f4[0]) + (tangentialCoeff_f8.f4[0] * (r2_f8.f4[0] + ySquare_f8.f4[0]))) + cameraMatrix_f16.f4[3]; + rowLoc_f8.f4[1] = cameraMatrix_f16.f4[2] * ((y_f8.f4[1] * kr_f8.f4[1]) + (tangentialCoeff_f8.f4[1] * xyMul2_f8.f4[1]) + (tangentialCoeff_f8.f4[0] * (r2_f8.f4[1] + ySquare_f8.f4[1]))) + cameraMatrix_f16.f4[3]; + + rpp_hip_pack_float8_and_store8(colRemapTable + dstIdx, &colLoc_f8); + rpp_hip_pack_float8_and_store8(rowRemapTable + dstIdx, &rowLoc_f8); +} + +// -------------------- Set 2 - Kernel Executors -------------------- + +RppStatus hip_exec_lens_correction_tensor(RpptDescPtr dstDescPtr, + Rpp32f *rowRemapTable, + Rpp32f *colRemapTable, + RpptDescPtr remapTableDescPtr, + Rpp32f *cameraMatrix, + Rpp32f *distanceCoeffs, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle& handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + + int globalThreads_x = (dstDescPtr->w + 7) >> 3; + int globalThreads_y = dstDescPtr->h; + int 
globalThreads_z = dstDescPtr->n; + + float *inverseMatrix = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem; + hipLaunchKernelGGL(compute_inverse_matrix_hip_tensor, + dim3(1, 1, ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(1, 1, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + reinterpret_cast(cameraMatrix), + reinterpret_cast(inverseMatrix)); + hipLaunchKernelGGL(compute_remap_tables_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + rowRemapTable, + colRemapTable, + reinterpret_cast(cameraMatrix), + reinterpret_cast(inverseMatrix), + reinterpret_cast(distanceCoeffs), + make_uint2(remapTableDescPtr->strides.nStride, remapTableDescPtr->strides.hStride), + roiTensorPtrSrc); + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/kernel/log.hpp b/src/modules/hip/kernel/log.hpp new file mode 100644 index 000000000..a481a1e07 --- /dev/null +++ b/src/modules/hip/kernel/log.hpp @@ -0,0 +1,232 @@ +#include +#include "rpp_hip_common.hpp" + +// -------------------- Set 1 - helper kernels -------------------- +template +__device__ void log_hip_compute(T *srcPtr, d_float8 *src_f8, d_float8 *dst_f8) +{ + if constexpr (std::is_same::value) + rpp_hip_math_add8_const(src_f8, src_f8, (float4)128); + + rpp_hip_math_log(src_f8, dst_f8); +} + +// -------------------- Set 2 - log kernels -------------------- +template +__global__ void log_1d_hip_tensor(T *srcPtr, + uint srcStrides, + U *dstPtr, + uint dstStrides, + uint *roiTensor) +{ + uint id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // width + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // batchsize + + uint *roi = &roiTensor[id_z * 2]; + uint beginX = roi[0]; + uint width = roi[1]; + + if (id_x >= width) + return; + + uint srcIdx = (id_z * srcStrides) + id_x + beginX; + uint dstIdx = (id_z * dstStrides) + id_x; + + d_float8 src_f8, dst_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + log_hip_compute(srcPtr, &src_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); +} + +template +__global__ void log_2d_hip_tensor(T *srcPtr, + uint2 srcStridesNH, + U *dstPtr, + uint2 dstStridesNH, + uint *roiTensor) +{ + uint id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // width + uint id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // height + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // batchsize + + uint *roi = &roiTensor[id_z * 4]; + uint beginY = roi[0]; + uint beginX = roi[1]; + uint height = roi[2]; + uint width = roi[3]; + + if (id_x >= width || id_y >= height) + return; + + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + beginY) * srcStridesNH.y) + id_x + beginX; + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x; + + d_float8 src_f8, dst_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + log_hip_compute(srcPtr, &src_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); +} + +template +__global__ void log_3d_hip_tensor(T *srcPtr, + uint2 srcStridesDH, + U *dstPtr, + uint2 dstStridesDH, + uint *roiTensor) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; // lengthX + uint id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // lengthY + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // lengthZ + + uint *roi = roiTensor; + 
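    /* Per-sample ROI layout assumed here, consistent with the unpacking just below and with the
       executor, which passes &roiTensor[batchCount * 6] for the 3D case:
       roi[0..2] = beginZ, beginY, beginX and roi[3..5] = lengthZ, lengthY, lengthX.
       For example, a full-volume ROI on a D x H x W input would be {0, 0, 0, D, H, W}. */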
uint beginZ = roi[0]; + uint beginY = roi[1]; + uint beginX = roi[2]; + uint lengthZ = roi[3]; + uint lengthY = roi[4]; + uint lengthX = roi[5]; + + if (id_x >= lengthX || id_y >= lengthY || id_z >= lengthZ) + return; + + uint srcIdx = ((id_z + beginZ) * srcStridesDH.x) + ((id_y + beginY) * srcStridesDH.y) + id_x + beginX; + uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x; + + d_float8 src_f8, dst_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + log_hip_compute(srcPtr, &src_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); +} + +template +__global__ void log_nd_hip_tensor(T *srcPtr, + uint *srcStrides, + uint *srcDims, + uint numDims, + U *dstPtr, + uint *dstStrides, + Rpp32u *roiTensor) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // batchsize + + if(id_x >= srcStrides[0]) + return; + + uint *roi = roiTensor + id_z * numDims * 2; + uint *begin = roi; + uint *length = &roi[numDims]; + uint dstIdx = (id_z * *dstStrides++); + uint srcIdx = (id_z * *srcStrides++); + uint coords[RPPT_MAX_DIMS]; + + for (int i = 0; i < numDims; i++) + { + coords[i] = (id_x / srcStrides[i]) % srcDims[i]; + if(coords[i] >= length[i]) + return; + } + + for (int i = 0; i < numDims; i++) + { + dstIdx += (coords[i] * dstStrides[i]); + srcIdx += (begin[i] + (coords[i] * srcStrides[i])); + } + + d_float8 src_f8, dst_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + log_hip_compute(srcPtr, &src_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); +} + +// -------------------- Set 3 - executor kernels -------------------- +template +RppStatus hip_exec_log_generic_tensor(T *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + U *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + uint *roiTensor, + rpp::Handle& handle) +{ + Rpp32u numDims = srcGenericDescPtr->numDims - 1; // exclude batchsize from input dims + // based on number of dimensions call the corresponding kernel + if (numDims == 1) + { + // NW + int globalThreads_x = dstGenericDescPtr->dims[1]; + int globalThreads_y = 1; + int globalThreads_z = dstGenericDescPtr->dims[0]; + + hipLaunchKernelGGL(log_1d_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + srcGenericDescPtr->strides[0], + dstPtr, + dstGenericDescPtr->strides[0], + roiTensor); + } + else if (numDims == 2) + { + // NHW + int globalThreads_x = dstGenericDescPtr->dims[2]; + int globalThreads_y = dstGenericDescPtr->dims[1]; + int globalThreads_z = dstGenericDescPtr->dims[0]; + + hipLaunchKernelGGL(log_2d_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1]), + dstPtr, + make_uint2(dstGenericDescPtr->strides[0], dstGenericDescPtr->strides[1]), + roiTensor); + } + else if (numDims == 3) + { + // NDHW + int globalThreads_x = dstGenericDescPtr->dims[3]; + int globalThreads_y = dstGenericDescPtr->dims[2]; + int globalThreads_z = dstGenericDescPtr->dims[1]; + + for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + 
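            /* Sketch of the 3D dispatch as read from the launch below: one grid is launched per
               batch sample, with srcPtr/dstPtr advanced by strides[0] (the per-sample stride) and
               the kernel given only the depth and row strides plus that sample's 6-element ROI,
               so the kernel itself never needs the batch index. */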
hipLaunchKernelGGL(log_3d_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr + (batchCount * srcGenericDescPtr->strides[0]), + make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]), + &roiTensor[batchCount * 6]); + } + } + else + { + // interpret the input as 1D tensor + int globalThreads_x = (dstGenericDescPtr->strides[0] + 7) >> 3; + int globalThreads_y = 1; + int globalThreads_z = dstGenericDescPtr->dims[0]; + + hipLaunchKernelGGL(log_nd_hip_tensor, + dim3(ceil((float)globalThreads_x/1024), ceil((float)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((float)globalThreads_z/LOCAL_THREADS_Z_1DIM)), + dim3(1024, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), + 0, + handle.GetStream(), + srcPtr, + srcGenericDescPtr->strides, + srcGenericDescPtr->dims + 1, + srcGenericDescPtr->numDims - 1, + dstPtr, + dstGenericDescPtr->strides, + roiTensor); + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/kernel/non_silent_region_detection.hpp b/src/modules/hip/kernel/non_silent_region_detection.hpp new file mode 100644 index 000000000..80511464b --- /dev/null +++ b/src/modules/hip/kernel/non_silent_region_detection.hpp @@ -0,0 +1,426 @@ +#include +#include "rpp_hip_common.hpp" + +// -------------------- Set 0 - moving mean square kernel device helpers -------------------- + +// calculate the position in shared memory to avoid bank conflicts +__host__ __device__ __forceinline__ int compute_pos_in_smem(int pos) +{ + return pos + (pos >> 5); // since shared memory banks considered is 32 +} + +/* compute prefix sum on the input buffer passed + prefix sum of an array is an array where each element is the sum of all previous elements in the input array, inclusive of the current element */ +__device__ __forceinline__ void compute_prefix_sum(float *input, uint bufferLength) +{ + int offset = 1; + int2 offset_i2 = static_cast(offset); + int2 offsetAB_i2 = make_int2(offset - 1, 2 * offset - 1); + int threadIdxMul2 = 2 * hipThreadIdx_x; + int blockDimMul2 = 2 * hipBlockDim_x; + + /* compute intermediate prefix sums in a up sweep manner + (each level in the hierarchy doubles the distance between the pairs of elements being added) */ + for (int d = bufferLength >> 1; d > 0; d >>= 1) + { + // syncthreads before proceeding to next iteration + __syncthreads(); + int dMul2 = 2 * d; + for (int idxMul2 = threadIdxMul2; idxMul2 < dMul2; idxMul2 += blockDimMul2) + { + int2 pos_i2 = (offset_i2 * static_cast(idxMul2)) + offsetAB_i2; + input[compute_pos_in_smem(pos_i2.y)] += input[compute_pos_in_smem(pos_i2.x)]; + } + offset <<= 1; + offset_i2 = static_cast(offset); + offsetAB_i2 = make_int2(offset - 1, 2 * offset - 1); + } + + if (hipThreadIdx_x == 0) + { + int last = bufferLength - 1; + input[compute_pos_in_smem(last)] = 0; + } + + /* compute final prefix sums in a down sweep manner + (each level in the hierarchy halves the distance between the pairs of elements being added) */ + for (int d = 1; d < bufferLength; d <<= 1) + { + offset >>= 1; + offset_i2 = static_cast(offset); + offsetAB_i2 = make_int2(offset - 1, 2 * offset - 1); + __syncthreads(); + // syncthreads before proceeding to next iteration + + int dMul2 = 2 * d; + for (int idxMul2 = 
threadIdxMul2; idxMul2 < dMul2; idxMul2 += blockDimMul2) + { + int2 pos_i2 = offset_i2 * static_cast(idxMul2) + offsetAB_i2; + int posA = compute_pos_in_smem(pos_i2.x); + int posB = compute_pos_in_smem(pos_i2.y); + float t = input[posA]; + input[posA] = input[posB]; + input[posB] += t; + } + } + __syncthreads(); +} + +// -------------------- Set 1 - moving mean square compute kernel -------------------- + +__global__ void moving_mean_square_hip_tensor(float *srcPtr, + uint nStride, + float *mmsArr, + int *srcLengthTensor, + int outputTileLength, + int windowLength, + float windowFactor, + int inputTileLength) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + uint srcLength = srcLengthTensor[id_z]; + uint batchStride = id_z * nStride; + int blockStart = hipBlockIdx_x * outputTileLength; + + if (blockStart >= srcLength) + return; + + float *input = srcPtr + batchStride; + extern __shared__ float squaredPrefixSum_smem[]; + + float *inBlockPtr = srcPtr + batchStride + blockStart; + float *outBlockPtr = mmsArr + batchStride + blockStart; + + // find the valid output tile length values needed for given block + int validOutputTileLength = std::min(outputTileLength, srcLength - blockStart); + + // assign pointers that points to block begin and block end locations + float *extendedBlockStart = inBlockPtr - windowLength; + float *extendedBlockEnd = inBlockPtr + validOutputTileLength; + + // load input data to shared memory + for(int pos = hipThreadIdx_x; pos < inputTileLength; pos += hipBlockDim_x) + { + float val = 0.0f; + auto extendedBlockPtr = extendedBlockStart + pos; + + /* check if extendedBlockPtr is within the valid region of input + and load the value from extendedBlockPtr if it is within valid region */ + if (extendedBlockPtr >= input && extendedBlockPtr < extendedBlockEnd) + val = *extendedBlockPtr; + squaredPrefixSum_smem[compute_pos_in_smem(pos)] = val * val; + } + + // compute prefix sum + compute_prefix_sum(squaredPrefixSum_smem, inputTileLength); + + // compute the mms value here + for(int pos = hipThreadIdx_x; pos < validOutputTileLength; pos += hipBlockDim_x) + outBlockPtr[pos] = windowFactor * ((inBlockPtr[pos] * inBlockPtr[pos]) + squaredPrefixSum_smem[compute_pos_in_smem(windowLength + pos)] - squaredPrefixSum_smem[compute_pos_in_smem(pos + 1)]); +} + +// -------------------- Set 2 - kernels for finding cutoffmag value -------------------- + +__global__ void max_reduction_hip_tensor(float *srcPtr, + uint nStride, + float *maxArr, + int *srcLengthTensor) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + uint srcLength = srcLengthTensor[id_z]; + + uint srcIdx = id_z * nStride; + __shared__ float max_smem[256]; // 256 values of src in a 256 x 1 thread block + max_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 256 threads + + if (id_x >= srcLength) + return; + + if (id_x + 8 > srcLength) + id_x -= (id_x + 8 - srcLength); + + srcIdx += id_x; + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_max8(&src_f8, &max_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after max compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + max_smem[hipThreadIdx_x] = 
fmaxf(max_smem[hipThreadIdx_x], max_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + { + int dstIdx = id_z * hipGridDim_x + hipBlockIdx_x; + maxArr[dstIdx] = max_smem[0]; + } +} + +__global__ void cutoffmag_hip_tensor(float *srcPtr, + int maxLength, + float *cutOffMagPtr, + float cutOff, + float referencePower, + bool referenceMax) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + // if referenceMax is set to true, perform final max reduction on srcPtr and compute cutOffMag + if(referenceMax) + { + uint srcIdx = id_z * maxLength; + __shared__ float max_smem[256]; // 256 values of src in a 256 x 1 thread block + max_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 256 threads + + if (id_x >= maxLength) + return; + + srcIdx += id_x; + float maxVal = srcPtr[srcIdx]; + while (id_x < maxLength) + { + maxVal = fmaxf(maxVal, srcPtr[srcIdx]); + id_x += hipBlockDim_x; + srcIdx += hipBlockDim_x; + } + max_smem[hipThreadIdx_x] = maxVal; + __syncthreads(); // syncthreads after max compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + max_smem[hipThreadIdx_x] = max(max_smem[hipThreadIdx_x], max_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + cutOffMagPtr[id_z] = max_smem[0] * cutOff; + } + else + { + if (hipThreadIdx_x == 0) + cutOffMagPtr[id_z] = referencePower * cutOff; + } +} + +// -------------------- Set 3 - kernels for finding begin and length of NSR in inputs -------------------- + +__global__ void find_region_hip_tensor(float *srcPtr, + uint nStride, + int *beginTensor, + int *lengthTensor, + float *cutOffMagPtr, + int *srcLengthTensor, + float windowLength) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + uint srcLength = srcLengthTensor[id_z]; + float cutOffMag = cutOffMagPtr[id_z]; + + __shared__ int beginResult; + __shared__ int endResult; + beginResult = srcLength; + endResult = 0; + __syncthreads(); + + int beginIdx = srcLength; + int endIdx = 0; + uint stridePerSample = id_z * nStride; + + // Find the begin index in src whose value is >= cutOffMag + for (int i = id_x; i < srcLength; i += hipBlockDim_x) + { + uint srcIdx = stridePerSample + i; + if (srcPtr[srcIdx] >= cutOffMag) + { + beginIdx = i; + atomicMin(&beginResult, beginIdx); + if(beginResult != srcLength) + break; + } + } + + // Find the end index in src whose value is >= cutOffMag + for (int i = id_x; i < srcLength; i += hipBlockDim_x) + { + uint srcIdx = stridePerSample + srcLength - 1 - i; + if (srcPtr[srcIdx] >= cutOffMag) + { + endIdx = srcLength - 1 - i; + atomicMax(&endResult, endIdx); + if(endResult != 0) + break; + } + } + + // Final store to dst + if(hipThreadIdx_x == 0) + { + if(beginResult == srcLength || endResult == 0) + { + beginTensor[id_z] = 0; + lengthTensor[id_z] = 0; + } + else + { + int detectBegin = beginResult; + int detectEnd = endResult - beginResult + 1; + + // if both starting index and length of nonsilent region is not 0 + // adjust the values as per the windowLength + if(detectBegin != 0 && detectEnd != 0) + { + int newBegin = max(detectBegin - (windowLength - 1), 0); + detectEnd += detectBegin - newBegin; + detectBegin = newBegin; + } + beginTensor[id_z] = 
detectBegin; + lengthTensor[id_z] = detectEnd; + } + } +} + +// -------------------- Set 4 - host helpers for kernel executor -------------------- + +// return the nearest previous power of 2 for the given number +inline Rpp32s prev_pow2(Rpp32s n) +{ + Rpp32s pow2 = 1; + while (n - pow2 > pow2) + pow2 += pow2; + + return pow2; +} + +// return the nearest next power of 2 for the given number +inline Rpp32s next_pow2(Rpp32s n) +{ + Rpp32s pow2 = 1; + while (n > pow2) + pow2 += pow2; + + return pow2; +} + +// -------------------- Set 5 - non silent region kernels executor -------------------- + +RppStatus hip_exec_non_silent_region_detection_tensor(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32s *srcLengthTensor, + Rpp32s *detectedIndexTensor, + Rpp32s *detectionLengthTensor, + Rpp32f cutOffDB, + Rpp32s windowLength, + Rpp32f referencePower, + Rpp32s resetInterval, + rpp::Handle& handle) +{ + // check if scratch memory size required for moving mean square is within the limits + if ((srcDescPtr->n * srcDescPtr->strides.nStride) > MMS_MAX_SCRATCH_MEMORY) + return RPP_ERROR_OUT_OF_BOUND_SCRATCH_MEMORY_SIZE; + + Rpp32f *mmsArr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem; + Rpp32s maxSharedMemoryInBytes = handle.GetLocalMemorySize(); + Rpp32s maxSharedMemoryElements = maxSharedMemoryInBytes / sizeof(Rpp32f); + Rpp32s kSharedMemBanks = 32; + Rpp32s inputTileLength = prev_pow2(maxSharedMemoryElements * kSharedMemBanks / (kSharedMemBanks + 1)); + + if (resetInterval > 0 && resetInterval < inputTileLength) + { + Rpp32s p = prev_pow2(resetInterval); + Rpp32s n = next_pow2(resetInterval); + if (p > windowLength) + inputTileLength = p; + else if (n < inputTileLength) + inputTileLength = n; + } + + Rpp32s sharedMemorySizeInBytes = compute_pos_in_smem(inputTileLength) * sizeof(Rpp32f); + Rpp32s outputTileLength = inputTileLength - windowLength; + Rpp32f windowFactor = 1.0f / windowLength; + + if (outputTileLength <= 0) + return RPP_ERROR_INVALID_OUTPUT_TILE_LENGTH; + + if (sharedMemorySizeInBytes > maxSharedMemoryInBytes) + return RPP_ERROR_OUT_OF_BOUND_SHARED_MEMORY_SIZE; + + // launch kernel to compute the values needed for MMS Array + Rpp32s globalThreads_x = ceil(static_cast(srcDescPtr->strides.nStride) / outputTileLength); + Rpp32s globalThreads_y = 1; + Rpp32s globalThreads_z = srcDescPtr->n; + + hipLaunchKernelGGL(moving_mean_square_hip_tensor, + dim3(globalThreads_x, globalThreads_y, globalThreads_z), + dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), + sharedMemorySizeInBytes, + handle.GetStream(), + srcPtr, + srcDescPtr->strides.nStride, + mmsArr, + srcLengthTensor, + outputTileLength, + windowLength, + windowFactor, + inputTileLength); + + const Rpp32f cutOff = std::pow(10.0f, cutOffDB * 0.1f); + bool referenceMax = (!referencePower); + Rpp32f *partialMaxArr = mmsArr + srcDescPtr->n * srcDescPtr->strides.nStride; + + Rpp32s numBlocksPerSample = ceil(static_cast(srcDescPtr->strides.nStride) / (LOCAL_THREADS_X_1DIM * 8)); + Rpp32s cutOffMagKernelBlockSize = 1; + if (referenceMax) + { + // compute max value in MMS buffer + hipLaunchKernelGGL(max_reduction_hip_tensor, + dim3(numBlocksPerSample, 1, globalThreads_z), + dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), + 0, + handle.GetStream(), + mmsArr, + srcDescPtr->strides.nStride, + partialMaxArr, + srcLengthTensor); + cutOffMagKernelBlockSize = 256; + } + // find the cutoff value in magnitude + Rpp32f *cutOffMagPtr = partialMaxArr + globalThreads_z * numBlocksPerSample; + 
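    /* Hedged summary of the scratch-buffer layout implied by the pointer arithmetic above:
       mmsArr        : n * nStride floats        (moving mean square per sample)
       partialMaxArr : n * numBlocksPerSample floats (block-wise maxima of mmsArr)
       cutOffMagPtr  : n floats                  (one cutoff magnitude per sample)
       Worked example of the cutoff math: with cutOffDB = -60.0f, cutOff = powf(10.0f, -6.0f) = 1e-6f,
       so each sample's threshold becomes 1e-6f * max(mms) when referencePower is 0 (referenceMax true),
       or 1e-6f * referencePower otherwise. */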
hipLaunchKernelGGL(cutoffmag_hip_tensor, + dim3(1, 1, globalThreads_z), + dim3(cutOffMagKernelBlockSize, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), + 0, + handle.GetStream(), + partialMaxArr, + numBlocksPerSample, + cutOffMagPtr, + cutOff, + referencePower, + referenceMax); + + // find the begin and length values of NSR in inputs + hipLaunchKernelGGL(find_region_hip_tensor, + dim3(1, 1, globalThreads_z), + dim3(1024, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), + 0, + handle.GetStream(), + mmsArr, + srcDescPtr->strides.nStride, + detectedIndexTensor, + detectionLengthTensor, + cutOffMagPtr, + srcLengthTensor, + windowLength); + return RPP_SUCCESS; +} diff --git a/src/modules/hip/kernel/to_decibels.hpp b/src/modules/hip/kernel/to_decibels.hpp new file mode 100644 index 000000000..e1d45d098 --- /dev/null +++ b/src/modules/hip/kernel/to_decibels.hpp @@ -0,0 +1,312 @@ +#include +#include "rpp_hip_common.hpp" + +// -------------------- Set 0 - to_decibels device helpers -------------------- + +__device__ __forceinline__ void to_decibels_hip_compute(d_float8 *src_f8, d_float8 *dst_f8, double minRatio, float multiplier, float inverseMagnitude) +{ + dst_f8->f1[0] = multiplier * log2(max(minRatio, (static_cast(src_f8->f1[0]) * inverseMagnitude))); + dst_f8->f1[1] = multiplier * log2(max(minRatio, (static_cast(src_f8->f1[1]) * inverseMagnitude))); + dst_f8->f1[2] = multiplier * log2(max(minRatio, (static_cast(src_f8->f1[2]) * inverseMagnitude))); + dst_f8->f1[3] = multiplier * log2(max(minRatio, (static_cast(src_f8->f1[3]) * inverseMagnitude))); + dst_f8->f1[4] = multiplier * log2(max(minRatio, (static_cast(src_f8->f1[4]) * inverseMagnitude))); + dst_f8->f1[5] = multiplier * log2(max(minRatio, (static_cast(src_f8->f1[5]) * inverseMagnitude))); + dst_f8->f1[6] = multiplier * log2(max(minRatio, (static_cast(src_f8->f1[6]) * inverseMagnitude))); + dst_f8->f1[7] = multiplier * log2(max(minRatio, (static_cast(src_f8->f1[7]) * inverseMagnitude))); +} + +// -------------------- Set 1 - kernels for finding inverse magnitude value -------------------- + +__global__ void inverse_magnitude_hip_tensor(float *srcPtr, + int maxLength, + bool computeMax, + float *inverseMagnitudeTensor) + +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + // Do final reduction on block wise max + if (computeMax) + { + uint srcIdx = id_z * maxLength; + __shared__ float max_smem[256]; // 256 values of src in a 256 x 1 thread block + max_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 256 threads + + if (id_x >= maxLength) + return; + + srcIdx += id_x; + float maxVal = srcPtr[srcIdx]; + while (id_x < maxLength) + { + maxVal = fmaxf(maxVal, srcPtr[srcIdx]); + id_x += hipBlockDim_x; + srcIdx += hipBlockDim_x; + } + max_smem[hipThreadIdx_x] = maxVal; + __syncthreads(); // syncthreads after max compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + max_smem[hipThreadIdx_x] = max(max_smem[hipThreadIdx_x], max_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + inverseMagnitudeTensor[id_z] = 1.f / max_smem[0]; + } + else + { + inverseMagnitudeTensor[id_z] = 1.0f; + } +} + +__global__ void max_reduction_1d_hip_tensor(float *srcPtr, + uint2 srcStridesNH, + RpptImagePatchPtr srcDims, + float *maxArr) +{ + int id_y = hipBlockIdx_y * 
hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + + uint srcLength = srcDims[id_z].height; + uint srcIdx = id_z * srcStridesNH.x; + __shared__ float max_smem[256]; // 256 values of src in a 256 x 1 thread block + max_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 256 threads + + if (id_x >= srcLength) + return; + + srcIdx += id_x; + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_max8(&src_f8, &max_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after max compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + max_smem[hipThreadIdx_x] = fmaxf(max_smem[hipThreadIdx_x], max_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + maxArr[id_z * hipGridDim_x + hipBlockIdx_x] = max_smem[0]; +} + +__global__ void max_reduction_2d_hip_tensor(float *srcPtr, + uint2 srcStridesNH, + RpptImagePatchPtr srcDims, + float *maxArr) +{ + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + + __shared__ float partialMax_smem[16][16]; // 16 rows of src, 16 reduced cols of src in a 16 x 16 thread block + uint srcIdx = (id_z * srcStridesNH.x); + float *partialMaxRowPtr_smem = &partialMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS + partialMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 16 x 16 threads + + if ((id_y >= srcDims[id_z].height) || (id_x >= srcDims[id_z].width)) + return; + + srcIdx += ((id_y * srcStridesNH.y) + id_x); + partialMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; + __syncthreads(); // syncthreads + + // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension) + for (int threadMax = 8; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + partialMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialMaxRowPtr_smem[hipThreadIdx_x], partialMaxRowPtr_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + if (hipThreadIdx_x == 0) + { + // Reduction of 16 floats on 16 threads per block in y dimension + for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2) + { + if (hipThreadIdx_y < threadMax) + partialMaxRowPtr_smem[0] = fmaxf(partialMaxRowPtr_smem[0], partialMaxRowPtr_smem[increment]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_y == 0) + maxArr[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialMaxRowPtr_smem[0]; + } +} + +// -------------------- Set 2 - to decibels kernels -------------------- + +__global__ void to_decibels_1d_hip_tensor(float *srcPtr, + uint srcStride, + float *dstPtr, + uint dstStride, + RpptImagePatchPtr srcDims, + double minRatio, + float multiplier, + float *inverseMagnitudeTensor) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if (id_x >= srcDims[id_z].height) + return; + + uint srcIdx = (id_z * srcStride) + id_x; + float inverseMagnitude = inverseMagnitudeTensor[id_z]; + + d_float8 src_f8, dst_f8; + 
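    // [annotation, not part of this patch] The next steps load 8 contiguous floats,
    // convert each one with to_decibels_hip_compute as
    // multiplier * log2(max(minRatio, x * inverseMagnitude)), and store the 8 results
    // with a single vectorized write. The host executor pre-scales multiplier by
    // log10(2) (the 0.30103 constant), so the log2 here effectively yields the
    // requested multiplier times a base-10 logarithm.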
rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + to_decibels_hip_compute(&src_f8, &dst_f8, minRatio, multiplier, inverseMagnitude); + + uint dstIdx = (id_z * dstStride) + id_x; + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); +} + +__global__ void to_decibels_2d_hip_tensor(float *srcPtr, + uint2 srcStridesNH, + float *dstPtr, + uint2 dstStridesNH, + RpptImagePatchPtr srcDims, + double minRatio, + float multiplier, + float *inverseMagnitudeTensor) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if (id_x >= srcDims[id_z].width || id_y >= srcDims[id_z].height) + return; + + uint srcIdx = (id_z * srcStridesNH.x) + (id_y * srcStridesNH.y) + id_x; + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x; + float inverseMagnitude = inverseMagnitudeTensor[id_z]; + dstPtr[dstIdx] = multiplier * log2(max(minRatio, (static_cast(srcPtr[srcIdx]) * inverseMagnitude))); +} + +// -------------------- Set 3 - to decibels kernels executor -------------------- + +RppStatus hip_exec_to_decibels_tensor(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *dstPtr, + RpptDescPtr dstDescPtr, + RpptImagePatchPtr srcDims, + Rpp32f cutOffDB, + Rpp32f multiplier, + Rpp32f referenceMagnitude, + rpp::Handle& handle) +{ + Rpp32u numDims = srcDescPtr->numDims - 1; // exclude batchSize from input dims + + // Calculate the intermediate values needed for DB conversion + Rpp32f minRatio = std::pow(10, cutOffDB / multiplier); + if(!minRatio) + minRatio = std::nextafter(0.0f, 1.0f); + const Rpp32f log10Factor = 0.3010299956639812; //1 / std::log(10); + multiplier *= log10Factor; + + // calculate max in input if referenceMagnitude = 0 + Rpp32f *partialMaxArr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem; + Rpp32s numBlocksPerSample = 0; + Rpp32s globalThreads_z = dstDescPtr->n; + + // find the invReferenceMagnitude value + bool computeMax = (!referenceMagnitude); + if(computeMax) + { + if (numDims == 1) + { + numBlocksPerSample = ceil(static_cast((srcDescPtr->strides.nStride + 7) >> 3) / LOCAL_THREADS_X_1DIM); + hipLaunchKernelGGL(max_reduction_1d_hip_tensor, + dim3(numBlocksPerSample, 1, globalThreads_z), + dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, 1), + srcDims, + partialMaxArr); + } + else if (numDims == 2) + { + Rpp32s gridDim_x = ceil(static_cast((srcDescPtr->strides.hStride)/LOCAL_THREADS_X)); + Rpp32s gridDim_y = ceil(static_cast(srcDescPtr->h)/LOCAL_THREADS_Y); + Rpp32s gridDim_z = ceil(static_cast(globalThreads_z)/LOCAL_THREADS_Z); + numBlocksPerSample = gridDim_x * gridDim_y * gridDim_z; + hipLaunchKernelGGL(max_reduction_2d_hip_tensor, + dim3(gridDim_x, gridDim_y, gridDim_z), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + srcDims, + partialMaxArr); + } + hipStreamSynchronize(handle.GetStream()); + } + Rpp32u blockSize = (computeMax) ? 
256: 1; + Rpp32f *inverseMagnitudeTensor = partialMaxArr + globalThreads_z * numBlocksPerSample; + hipLaunchKernelGGL(inverse_magnitude_hip_tensor, + dim3(1, 1, globalThreads_z), + dim3(blockSize, 1, 1), + 0, + handle.GetStream(), + partialMaxArr, + numBlocksPerSample, + computeMax, + inverseMagnitudeTensor); + hipStreamSynchronize(handle.GetStream()); + + // launch kernel for todecibels + if (numDims == 1) + { + Rpp32s globalThreads_x = (srcDescPtr->strides.nStride + 7) >> 3; + Rpp32s globalThreads_y = 1; + hipLaunchKernelGGL(to_decibels_1d_hip_tensor, + dim3(ceil((Rpp32f)globalThreads_x/LOCAL_THREADS_X_1DIM), ceil((Rpp32f)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((Rpp32f)globalThreads_z/LOCAL_THREADS_Z_1DIM)), + dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), + 0, + handle.GetStream(), + srcPtr, + srcDescPtr->strides.nStride, + dstPtr, + dstDescPtr->strides.nStride, + srcDims, + static_cast(minRatio), + multiplier, + inverseMagnitudeTensor); + } + else if (numDims == 2) + { + Rpp32s globalThreads_x = srcDescPtr->strides.hStride; + Rpp32s globalThreads_y = srcDescPtr->h; + hipLaunchKernelGGL(to_decibels_2d_hip_tensor, + dim3(ceil((Rpp32f)globalThreads_x/LOCAL_THREADS_X), ceil((Rpp32f)globalThreads_y/LOCAL_THREADS_Y), ceil((Rpp32f)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + srcDims, + static_cast(minRatio), + multiplier, + inverseMagnitudeTensor); + } + + return RPP_SUCCESS; +} diff --git a/src/modules/hip/kernel/transpose.hpp b/src/modules/hip/kernel/transpose.hpp new file mode 100644 index 000000000..83f2ba700 --- /dev/null +++ b/src/modules/hip/kernel/transpose.hpp @@ -0,0 +1,105 @@ +#include +#include "rpp_hip_common.hpp" + +// Vectorized dst->src mapping +template +__global__ void transpose_generic_hip_tensor(T *srcPtr, + uint *srcStrides, + T *dstPtr, + uint *dstStrides, + uint *dstDims, + uint tensorDims, + uint *permTensor) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + + if(id_x >= dstStrides[0]) + return; + + int maxLength = dstStrides[0]; + int xDiff = maxLength - (maxLength & ~7); // difference between maxLength and alignedLength. 
(alignedLength = maxLength & ~7) + + // Point dstIdx and srcIdx to be at the start of given input tensor in batch + uint dstIdx = (id_y * *dstStrides++); // post-increment dstStrides pointer by 1 to exclude outermost batch-dimension stride (for example exclude nStride in an NCDHW tensor) + uint srcIdx = (id_y * *srcStrides++); // post-increment srcStrides pointer by 1 to exclude outermost batch-dimension stride (for example exclude nStride in an NCDHW tensor) + + d_uint8 dstCoords[RPPT_MAX_DIMS], srcIdxs; + uint4 idx0123 = make_uint4(id_x, id_x + 1, id_x + 2, id_x + 3); // get idx for elements 0, 1, 2, 3 in the 8-element vectorized kernel + uint4 idx4567 = make_uint4(id_x + 4, id_x + 5, id_x + 6, id_x + 7); // get idx for elements 4, 5, 6, 7 in the 8-element vectorized kernel + srcIdxs.ui4[0] = srcIdxs.ui4[1] = make_uint4(srcIdx, srcIdx, srcIdx, srcIdx); // create 8-element vectorized srcIdxs + + // Compute 8 dstCoords given idx0123 and idx4567, corresponding to the 8 srcCoords processed in a thread + for (int i = 0; i < tensorDims; i++) + { + dstCoords[i].ui4[0] = (idx0123 / dstStrides[i]) % dstDims[i]; // transpose 4 srcCoords using idx0123 to 4 dstCoords in dstCoords[i].ui4[0] for the ith tensor dimension + dstCoords[i].ui4[1] = (idx4567 / dstStrides[i]) % dstDims[i]; // transpose 4 srcCoords using idx4567 to 4 dstCoords in dstCoords[i].ui4[1] for the ith tensor dimension + } + + // Compute corresponding 8 srcIdxs given id_x + for (int i = 0; i < tensorDims; i++) + { + uint4 srcStrides_ui4 = static_cast(srcStrides[permTensor[permTensor[i]]]); + srcIdxs.ui4[0] += (dstCoords[permTensor[i]].ui4[0] * srcStrides_ui4); // incrementally adding respective (coordinate value * stride) to get srcIdxs for 0, 1, 2, 3 elements + srcIdxs.ui4[1] += (dstCoords[permTensor[i]].ui4[1] * srcStrides_ui4); // incrementally adding respective (coordinate value * stride) to get srcIdxs for 4, 5, 6, 7 elements + dstIdx += (dstCoords[i].ui1[0] * dstStrides[i]); + } + + // Move srcIdx to access next input tensor once id_x goes beyond present tensor + if((id_x + 8) > maxLength) + for(int i = xDiff; i < 8; i++) + srcIdxs.ui1[i] += maxLength; + + // Load corresponding 8 src pixels from computed src idx values + d_float8 dst_f8; + dst_f8.f1[0] = static_cast(srcPtr[srcIdxs.ui1[0]]); + dst_f8.f1[1] = static_cast(srcPtr[srcIdxs.ui1[1]]); + dst_f8.f1[2] = static_cast(srcPtr[srcIdxs.ui1[2]]); + dst_f8.f1[3] = static_cast(srcPtr[srcIdxs.ui1[3]]); + dst_f8.f1[4] = static_cast(srcPtr[srcIdxs.ui1[4]]); + dst_f8.f1[5] = static_cast(srcPtr[srcIdxs.ui1[5]]); + dst_f8.f1[6] = static_cast(srcPtr[srcIdxs.ui1[6]]); + dst_f8.f1[7] = static_cast(srcPtr[srcIdxs.ui1[7]]); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); +} + +template +RppStatus hip_exec_transpose_tensor(T *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + T *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *permTensor, + Rpp32u *roiTensor, + rpp::Handle& handle) +{ + // Check for feasibility of direct copy from input to output if no permutation detected + bool copyInput = true; + for(int i = 0; i < dstGenericDescPtr->numDims - 1; i++) + copyInput *= (permTensor[i] == i); + + if (copyInput) + { + CHECK_RETURN_STATUS(hipMemcpyAsync(dstPtr, srcPtr, dstGenericDescPtr->dims[0] * dstGenericDescPtr->strides[0] * sizeof(T), hipMemcpyDeviceToDevice, handle.GetStream())); + } + else + { + int globalThreads_x = (dstGenericDescPtr->strides[0] + 7) >> 3; + int globalThreads_y = dstGenericDescPtr->dims[0]; + int globalThreads_z = 1; + + 
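        // [annotation, not part of this patch] Scalar sketch of the dst->src index
        // mapping that the kernel above applies 8 elements at a time: each flat
        // destination index is decomposed into per-dimension coordinates using the
        // destination strides and dims, and the source index is rebuilt by placing
        // each coordinate on its permuted source axis. Illustrative only, assuming
        // the same stride/dim/permTensor conventions as transpose_generic_hip_tensor.
        //
        //     uint src_index_of(uint dstIdxFlat, const uint *dstStrides, const uint *dstDims,
        //                       const uint *srcStrides, const uint *perm, uint nDims)
        //     {
        //         uint srcIdx = 0;
        //         for (uint i = 0; i < nDims; i++)
        //         {
        //             uint coord = (dstIdxFlat / dstStrides[i]) % dstDims[i];  // coordinate along dst dim i
        //             srcIdx += coord * srcStrides[perm[i]];                   // same coordinate on src axis perm[i]
        //         }
        //         return srcIdx;
        //     }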
hipLaunchKernelGGL(transpose_generic_hip_tensor, + dim3(ceil((float)globalThreads_x/1024), ceil((float)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((float)globalThreads_z/LOCAL_THREADS_Z_1DIM)), + dim3(1024, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), + 0, + handle.GetStream(), + srcPtr, + srcGenericDescPtr->strides, + dstPtr, + dstGenericDescPtr->strides, + dstGenericDescPtr->dims + 1, + dstGenericDescPtr->numDims - 1, + permTensor); + } + + return RPP_SUCCESS; +} diff --git a/src/modules/rppt_tensor_arithmetic_operations.cpp b/src/modules/rppt_tensor_arithmetic_operations.cpp index 8f88ba90f..bac68a4a1 100644 --- a/src/modules/rppt_tensor_arithmetic_operations.cpp +++ b/src/modules/rppt_tensor_arithmetic_operations.cpp @@ -255,6 +255,57 @@ RppStatus rppt_magnitude_host(RppPtr_t srcPtr1, return RPP_SUCCESS; } +/******************** log ********************/ + +RppStatus rppt_log_host(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *roiTensor, + rppHandle_t rppHandle) +{ + if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE; + else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8)) return RPP_ERROR_INVALID_DST_DATATYPE; + else if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + log_generic_host_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16)) + { + log_generic_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + log_generic_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + log_generic_host_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiTensor, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + /********************************************************************************************************************/ /*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/ /********************************************************************************************************************/ @@ -454,4 +505,59 @@ RppStatus rppt_magnitude_gpu(RppPtr_t srcPtr1, #endif // backend } +/******************** log ********************/ + +RppStatus rppt_log_gpu(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *roiTensor, + 
rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE; + else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8)) return RPP_ERROR_INVALID_DST_DATATYPE; + else if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_log_generic_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16)) + { + hip_exec_log_generic_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_log_generic_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_log_generic_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiTensor, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + #endif // GPU_SUPPORT diff --git a/src/modules/rppt_tensor_audio_augmentations.cpp b/src/modules/rppt_tensor_audio_augmentations.cpp index 0267985e5..c98832f3c 100644 --- a/src/modules/rppt_tensor_audio_augmentations.cpp +++ b/src/modules/rppt_tensor_audio_augmentations.cpp @@ -22,11 +22,17 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +#ifdef AUDIO_SUPPORT + #include "rppdefs.h" #include "rppi_validate.hpp" #include "rppt_tensor_audio_augmentations.h" #include "cpu/host_tensor_audio_augmentations.hpp" +#ifdef HIP_COMPILE + #include "hip/hip_tensor_audio_augmentations.hpp" +#endif // HIP_COMPILE + /******************** non_silent_region_detection ********************/ RppStatus rppt_non_silent_region_detection_host(RppPtr_t srcPtr, @@ -268,3 +274,129 @@ RppStatus rppt_resample_host(RppPtr_t srcPtr, return RPP_ERROR_NOT_IMPLEMENTED; } } + +/********************************************************************************************************************/ +/*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/ +/********************************************************************************************************************/ + +#ifdef GPU_SUPPORT + +/******************** non_silent_region_detection ********************/ + +RppStatus rppt_non_silent_region_detection_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + Rpp32s *srcLengthTensor, + Rpp32s *detectedIndexTensor, + Rpp32s *detectionLengthTensor, + Rpp32f cutOffDB, + Rpp32s windowLength, + Rpp32f referencePower, + Rpp32s resetInterval, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if (srcDescPtr->dataType == RpptDataType::F32) + { + + return hip_exec_non_silent_region_detection_tensor(static_cast(srcPtr), + srcDescPtr, + srcLengthTensor, + detectedIndexTensor, + detectionLengthTensor, + cutOffDB, + windowLength, + referencePower, + resetInterval, + rpp::deref(rppHandle)); + } + else + { + return RPP_ERROR_NOT_IMPLEMENTED; + } + +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + +/******************** to_decibels ********************/ + +RppStatus rppt_to_decibels_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + RpptImagePatchPtr srcDims, + Rpp32f cutOffDB, + Rpp32f multiplier, + Rpp32f referenceMagnitude, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + Rpp32u tensorDims = srcDescPtr->numDims - 1; // exclude batchsize from input dims + if (tensorDims != 1 && tensorDims != 2) + return RPP_ERROR_INVALID_SRC_DIMS; + + if (!multiplier) + return RPP_ERROR_ZERO_DIVISION; + + if (srcDescPtr->dataType == RpptDataType::F32) + { + hip_exec_to_decibels_tensor(static_cast(srcPtr), + srcDescPtr, + static_cast(dstPtr), + dstDescPtr, + srcDims, + cutOffDB, + multiplier, + referenceMagnitude, + rpp::deref(rppHandle)); + } + else + { + return RPP_ERROR_NOT_IMPLEMENTED; + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + +/******************** down_mixing ********************/ + +RppStatus rppt_down_mixing_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp32s *srcDimsTensor, + bool normalizeWeights, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + Rpp32u tensorDims = srcDescPtr->numDims - 1; // exclude batchsize from input dims + if (tensorDims != 1 && tensorDims != 2) + return RPP_ERROR_INVALID_SRC_DIMS; + + if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_down_mixing_tensor(static_cast(srcPtr), + srcDescPtr, + static_cast(dstPtr), + dstDescPtr, + srcDimsTensor, + normalizeWeights, + rpp::deref(rppHandle)); + } + else + { + return RPP_ERROR_NOT_IMPLEMENTED; + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // 
backend +} + +#endif // GPU_SUPPORT +#endif // AUDIO_SUPPORT \ No newline at end of file diff --git a/src/modules/rppt_tensor_color_augmentations.cpp b/src/modules/rppt_tensor_color_augmentations.cpp index 3023973fc..e866fe949 100644 --- a/src/modules/rppt_tensor_color_augmentations.cpp +++ b/src/modules/rppt_tensor_color_augmentations.cpp @@ -677,7 +677,7 @@ RppStatus rppt_color_temperature_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, - Rpp8s *adjustmentValueTensor, + Rpp32s *adjustmentValueTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle) diff --git a/src/modules/rppt_tensor_effects_augmentations.cpp b/src/modules/rppt_tensor_effects_augmentations.cpp index 8829a4ee0..8fc2d00ee 100644 --- a/src/modules/rppt_tensor_effects_augmentations.cpp +++ b/src/modules/rppt_tensor_effects_augmentations.cpp @@ -868,6 +868,142 @@ RppStatus rppt_ricap_host(RppPtr_t srcPtr, return RPP_SUCCESS; } +/******************** glitch ********************/ + +RppStatus rppt_glitch_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + RpptChannelOffsets *rgbOffsets, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + glitch_u8_u8_host_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + rgbOffsets, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + glitch_f16_f16_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + rgbOffsets, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + glitch_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + rgbOffsets, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + glitch_i8_i8_host_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + rgbOffsets, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + +/******************** jitter ********************/ + +RppStatus rppt_jitter_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp32u *kernelSizeTensor, + Rpp32u seed, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); + RpptXorwowStateBoxMuller xorwowInitialState[SIMD_FLOAT_VECTOR_LENGTH]; + rpp_host_rng_xorwow_f32_initialize_multiseed_stream_boxmuller(xorwowInitialState, seed); + + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + jitter_u8_u8_host_tensor(static_cast(srcPtr) + 
srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + kernelSizeTensor, + xorwowInitialState, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + jitter_f16_f16_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + kernelSizeTensor, + xorwowInitialState, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + jitter_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + kernelSizeTensor, + xorwowInitialState, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + jitter_i8_i8_host_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + kernelSizeTensor, + xorwowInitialState, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + /********************************************************************************************************************/ /*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/ /********************************************************************************************************************/ @@ -1441,6 +1577,8 @@ RppStatus rppt_non_linear_blend_gpu(RppPtr_t srcPtr1, #endif // backend } +/******************** water ********************/ + RppStatus rppt_water_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, @@ -1511,80 +1649,6 @@ RppStatus rppt_water_gpu(RppPtr_t srcPtr, #endif // backend } -/******************** ricap ********************/ - -RppStatus rppt_ricap_gpu(RppPtr_t srcPtr, - RpptDescPtr srcDescPtr, - RppPtr_t dstPtr, - RpptDescPtr dstDescPtr, - Rpp32u *permutationTensor, - RpptROIPtr roiPtrInputCropRegion, - RpptRoiType roiType, - rppHandle_t rppHandle) -{ -#ifdef HIP_COMPILE - if(srcDescPtr->n == 1) // BatchSize should always be greater than 1 - return RPP_ERROR; - Rpp32u *permutationHipTensor = reinterpret_cast(rpp::deref(rppHandle).GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem); - CHECK_RETURN_STATUS(hipMemcpy(permutationHipTensor, permutationTensor, sizeof(Rpp32u)* 4 * dstDescPtr->n, hipMemcpyHostToDevice)); - - if ((check_roi_out_of_bounds(&roiPtrInputCropRegion[0],srcDescPtr,roiType) == -1) - || (check_roi_out_of_bounds(&roiPtrInputCropRegion[1],srcDescPtr,roiType) == -1) - || (check_roi_out_of_bounds(&roiPtrInputCropRegion[2],srcDescPtr,roiType) == -1) - || (check_roi_out_of_bounds(&roiPtrInputCropRegion[3],srcDescPtr,roiType) == -1)) - return RPP_ERROR_OUT_OF_BOUND_SRC_ROI; - - if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) - { - hip_exec_ricap_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, - srcDescPtr, - static_cast(dstPtr) + dstDescPtr->offsetInBytes, - dstDescPtr, - permutationHipTensor, - roiPtrInputCropRegion, - roiType, - rpp::deref(rppHandle)); - } - else if ((srcDescPtr->dataType == 
RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) - { - hip_exec_ricap_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), - srcDescPtr, - (half*) (static_cast(dstPtr) + dstDescPtr->offsetInBytes), - dstDescPtr, - permutationHipTensor, - roiPtrInputCropRegion, - roiType, - rpp::deref(rppHandle)); - } - else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) - { - hip_exec_ricap_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), - srcDescPtr, - (Rpp32f*) (static_cast(dstPtr) + dstDescPtr->offsetInBytes), - dstDescPtr, - permutationHipTensor, - roiPtrInputCropRegion, - roiType, - rpp::deref(rppHandle)); - } - else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) - { - hip_exec_ricap_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, - srcDescPtr, - static_cast(dstPtr) + dstDescPtr->offsetInBytes, - dstDescPtr, - permutationHipTensor, - roiPtrInputCropRegion, - roiType, - rpp::deref(rppHandle)); - } - - return RPP_SUCCESS; -#elif defined(OCL_COMPILE) - return RPP_ERROR_NOT_IMPLEMENTED; -#endif // backend -} - /******************** vignette ********************/ RppStatus rppt_vignette_gpu(RppPtr_t srcPtr, @@ -1649,6 +1713,8 @@ RppStatus rppt_vignette_gpu(RppPtr_t srcPtr, #endif // backend } +/******************** erase ********************/ + RppStatus rppt_erase_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, @@ -1721,4 +1787,224 @@ RppStatus rppt_erase_gpu(RppPtr_t srcPtr, #endif // backend } +/******************** ricap ********************/ + +RppStatus rppt_ricap_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp32u *permutationTensor, + RpptROIPtr roiPtrInputCropRegion, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if(srcDescPtr->n == 1) // BatchSize should always be greater than 1 + return RPP_ERROR; + Rpp32u *permutationHipTensor = reinterpret_cast(rpp::deref(rppHandle).GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem); + CHECK_RETURN_STATUS(hipMemcpy(permutationHipTensor, permutationTensor, sizeof(Rpp32u)* 4 * dstDescPtr->n, hipMemcpyHostToDevice)); + + if ((check_roi_out_of_bounds(&roiPtrInputCropRegion[0],srcDescPtr,roiType) == -1) + || (check_roi_out_of_bounds(&roiPtrInputCropRegion[1],srcDescPtr,roiType) == -1) + || (check_roi_out_of_bounds(&roiPtrInputCropRegion[2],srcDescPtr,roiType) == -1) + || (check_roi_out_of_bounds(&roiPtrInputCropRegion[3],srcDescPtr,roiType) == -1)) + return RPP_ERROR_OUT_OF_BOUND_SRC_ROI; + + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + hip_exec_ricap_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + permutationHipTensor, + roiPtrInputCropRegion, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + hip_exec_ricap_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + (half*) (static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + permutationHipTensor, + roiPtrInputCropRegion, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_ricap_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + 
(Rpp32f*) (static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + permutationHipTensor, + roiPtrInputCropRegion, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + hip_exec_ricap_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + permutationHipTensor, + roiPtrInputCropRegion, + roiType, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + +/******************** glitch ********************/ + +RppStatus rppt_glitch_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + RpptChannelOffsets *rgbOffsets, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + hip_exec_glitch_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + rgbOffsets, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + hip_exec_glitch_tensor(reinterpret_cast((static_cast(srcPtr) + srcDescPtr->offsetInBytes)), + srcDescPtr, + reinterpret_cast((static_cast(dstPtr) + dstDescPtr->offsetInBytes)), + dstDescPtr, + rgbOffsets, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_glitch_tensor(reinterpret_cast((static_cast(srcPtr) + srcDescPtr->offsetInBytes)), + srcDescPtr, + reinterpret_cast((static_cast(dstPtr) + dstDescPtr->offsetInBytes)), + dstDescPtr, + rgbOffsets, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + hip_exec_glitch_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + rgbOffsets, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + +/******************** jitter ********************/ + +RppStatus rppt_jitter_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp32u *kernelSizeTensor, + Rpp32u seed, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + + RpptXorwowStateBoxMuller xorwowInitialState; + xorwowInitialState.x[0] = 0x75BCD15 + seed; + xorwowInitialState.x[1] = 0x159A55E5 + seed; + xorwowInitialState.x[2] = 0x1F123BB5 + seed; + xorwowInitialState.x[3] = 0x5491333 + seed; + xorwowInitialState.x[4] = 0x583F19 + seed; + xorwowInitialState.counter = 0x64F0C9 + seed; + xorwowInitialState.boxMullerFlag = 0; + xorwowInitialState.boxMullerExtra = 0.0f; + + RpptXorwowStateBoxMuller *d_xorwowInitialStatePtr; + d_xorwowInitialStatePtr = reinterpret_cast(rpp::deref(rppHandle).GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem); + CHECK_RETURN_STATUS(hipMemcpy(d_xorwowInitialStatePtr, &xorwowInitialState, sizeof(RpptXorwowStateBoxMuller), hipMemcpyHostToDevice)); + + if ((srcDescPtr->dataType == RpptDataType::U8) && 
(dstDescPtr->dataType == RpptDataType::U8)) + { + hip_exec_jitter_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + kernelSizeTensor, + d_xorwowInitialStatePtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + hip_exec_jitter_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + (half*) (static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + kernelSizeTensor, + d_xorwowInitialStatePtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_jitter_tensor((Rpp32f*) (static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + (Rpp32f*) (static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + kernelSizeTensor, + d_xorwowInitialStatePtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + hip_exec_jitter_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + kernelSizeTensor, + d_xorwowInitialStatePtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + #endif // GPU_SUPPORT diff --git a/src/modules/rppt_tensor_geometric_augmentations.cpp b/src/modules/rppt_tensor_geometric_augmentations.cpp index 6d573ffcc..325881c54 100644 --- a/src/modules/rppt_tensor_geometric_augmentations.cpp +++ b/src/modules/rppt_tensor_geometric_augmentations.cpp @@ -1099,7 +1099,6 @@ RppStatus rppt_slice_host(RppPtr_t srcPtr, layoutParams, rpp::deref(rppHandle)); } - return RPP_SUCCESS; } @@ -1301,6 +1300,145 @@ RppStatus rppt_remap_host(RppPtr_t srcPtr, return RPP_SUCCESS; } +/******************** lens_correction ********************/ + +RppStatus rppt_lens_correction_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp32f *rowRemapTable, + Rpp32f *colRemapTable, + RpptDescPtr tableDescPtr, + Rpp32f *cameraMatrixTensor, + Rpp32f *distortionCoeffsTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); + compute_lens_correction_remap_tables_host_tensor(srcDescPtr, + rowRemapTable, + colRemapTable, + tableDescPtr, + cameraMatrixTensor, + distortionCoeffsTensor, + roiTensorPtrSrc, + rpp::deref(rppHandle)); + + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + remap_bilinear_u8_u8_host_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + rowRemapTable, + colRemapTable, + tableDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + remap_bilinear_f16_f16_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + rowRemapTable, + colRemapTable, + tableDescPtr, + roiTensorPtrSrc, + 
roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + remap_bilinear_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + rowRemapTable, + colRemapTable, + tableDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + remap_bilinear_i8_i8_host_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + rowRemapTable, + colRemapTable, + tableDescPtr, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + +/******************** transpose ********************/ + +RppStatus rppt_transpose_host(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *permTensor, + Rpp32u *roiTensor, + rppHandle_t rppHandle) +{ + if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8)) + { + transpose_generic_host_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + permTensor, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16)) + { + transpose_generic_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + permTensor, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + transpose_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + permTensor, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8)) + { + transpose_generic_host_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + permTensor, + roiTensor, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + /********************************************************************************************************************/ /*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/ /********************************************************************************************************************/ @@ -2170,6 +2308,7 @@ RppStatus rppt_flip_voxel_gpu(RppPtr_t srcPtr, } /******************** remap ********************/ + RppStatus rppt_remap_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, @@ -2249,4 +2388,150 @@ RppStatus rppt_remap_gpu(RppPtr_t srcPtr, #endif // backend } +/******************** lens_correction ********************/ + +RppStatus rppt_lens_correction_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + 
RpptDescPtr dstDescPtr, + Rpp32f *rowRemapTable, + Rpp32f *colRemapTable, + RpptDescPtr tableDescPtr, + Rpp32f *cameraMatrixTensor, + Rpp32f *distortionCoeffsTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + hip_exec_lens_correction_tensor(dstDescPtr, + rowRemapTable, + colRemapTable, + tableDescPtr, + cameraMatrixTensor, + distortionCoeffsTensor, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + hip_exec_remap_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + rowRemapTable, + colRemapTable, + tableDescPtr, + RpptInterpolationType::BILINEAR, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + hip_exec_remap_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + rowRemapTable, + colRemapTable, + tableDescPtr, + RpptInterpolationType::BILINEAR, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_remap_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + rowRemapTable, + colRemapTable, + tableDescPtr, + RpptInterpolationType::BILINEAR, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + hip_exec_remap_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + rowRemapTable, + colRemapTable, + tableDescPtr, + RpptInterpolationType::BILINEAR, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + +/******************** transpose ********************/ + +RppStatus rppt_transpose_gpu(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *permTensor, + Rpp32u *roiTensor, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8)) + { + hip_exec_transpose_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + permTensor, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16)) + { + hip_exec_transpose_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + permTensor, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_transpose_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + 
reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + permTensor, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8)) + { + hip_exec_transpose_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes, + dstGenericDescPtr, + permTensor, + roiTensor, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + #endif // GPU_SUPPORT \ No newline at end of file diff --git a/utilities/test_suite/CMakeLists.txt b/utilities/test_suite/CMakeLists.txt index 77052cabe..23515798b 100644 --- a/utilities/test_suite/CMakeLists.txt +++ b/utilities/test_suite/CMakeLists.txt @@ -51,6 +51,7 @@ endif() if(NOT RPP_FOUND) message("-- ${Yellow}${PROJECT_NAME} requires RPP. Install RPP before running CTests") else() + # RPP installation - Backend check set(RPP_BACKEND_HIP_FOUND 0) if(EXISTS ${RPP_INCLUDE_DIR}/rpp_backend.h) file(READ ${RPP_INCLUDE_DIR}/rpp_backend.h RPP_BACKEND_FILE) @@ -62,6 +63,14 @@ else() elseif(NOT DEFINED BACKEND) set(BACKEND "CPU") endif() + + # RPP installation - Audio support check + set(RPP_AUDIO_AUGMENTATIONS_SUPPORT_FOUND 0) + if(EXISTS ${RPP_INCLUDE_DIR}/rpp_audio_augmentations_support.h) + file(READ ${RPP_INCLUDE_DIR}/rpp_audio_augmentations_support.h RPP_AUDIO_AUGMENTATIONS_SUPPORT_FILE) + string(REGEX MATCH "RPP_AUDIO_AUGMENTATIONS_SUPPORT ([0-9]*)" _ ${RPP_AUDIO_AUGMENTATIONS_SUPPORT_FILE}) + set(RPP_AUDIO_AUGMENTATIONS_SUPPORT_FOUND ${CMAKE_MATCH_1}) + endif() endif(NOT RPP_FOUND) # find required libraries @@ -87,13 +96,15 @@ if(Python3_FOUND) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) endif(NIFTI_FOUND) - if(libsnd_LIBS) - add_test( - NAME rpp_qa_tests_tensor_audio_host_all - COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HOST/runAudioTests.py --qa_mode 1 --batch_size 3 - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - ) - endif(libsnd_LIBS) + if(RPP_AUDIO_AUGMENTATIONS_SUPPORT_FOUND) + if(libsnd_LIBS) + add_test( + NAME rpp_qa_tests_tensor_audio_host_all + COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HOST/runAudioTests.py --qa_mode 1 --batch_size 3 + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) + endif(libsnd_LIBS) + endif(RPP_AUDIO_AUGMENTATIONS_SUPPORT_FOUND) if( "${BACKEND}" STREQUAL "HIP") # Running all HIP tests @@ -109,6 +120,15 @@ if(Python3_FOUND) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) endif(NIFTI_FOUND) + if(RPP_AUDIO_AUGMENTATIONS_SUPPORT_FOUND) + if(libsnd_LIBS) + add_test( + NAME rpp_qa_tests_tensor_audio_hip_all + COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HIP/runAudioTests.py --qa_mode 1 --batch_size 3 + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) + endif(libsnd_LIBS) + endif(RPP_AUDIO_AUGMENTATIONS_SUPPORT_FOUND) elseif( "${BACKEND}" STREQUAL "OCL") # TBD: Add OCL Tests diff --git a/utilities/test_suite/HIP/CMakeLists.txt b/utilities/test_suite/HIP/CMakeLists.txt index a0bd42fa0..814b006fb 100644 --- a/utilities/test_suite/HIP/CMakeLists.txt +++ b/utilities/test_suite/HIP/CMakeLists.txt @@ -58,12 +58,25 @@ find_package(hip QUIET) find_package(OpenCV QUIET) find_package(TurboJpeg QUIET) find_package(NIFTI QUIET) +find_library(libsnd_LIBS + NAMES sndfile libsndfile + PATHS ${CMAKE_SYSTEM_PREFIX_PATH} ${LIBSND_ROOT_DIR} "/usr/local" + PATH_SUFFIXES lib lib64) # OpenMP find_package(OpenMP 
REQUIRED) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +# Audio Support +option(RPP_AUDIO_SUPPORT "Build RPP test suite with audio support" ON) +if(RPP_AUDIO_SUPPORT) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DAUDIO_SUPPORT=1") + message("-- ${Green}RPP_AUDIO_SUPPORT set to ON ${ColourReset}") +else() + message("-- ${Yellow}RPP_AUDIO_SUPPORT (default=ON) was overrided. Skipping audio tests.${ColourReset}") +endif() + if(TurboJpeg_FOUND) message("-- ${Green}${PROJECT_NAME} set to build with rpp and TurboJpeg${ColourReset}") include_directories(${TurboJpeg_INCLUDE_DIRS}) @@ -102,4 +115,25 @@ if(NIFTI_FOUND AND OpenCV_FOUND) target_link_libraries(Tensor_voxel_hip ${OpenCV_LIBS} -lturbojpeg -lrpp ${hip_LIBRARIES} pthread ${LINK_LIBRARY_LIST} hip::device ${NIFTI_PACKAGE_PREFIX}NIFTI::${NIFTI_PACKAGE_PREFIX}niftiio) else() message("-- ${Yellow}Warning: libniftiio must be installed to install ${PROJECT_NAME}/Tensor_voxel_hip successfully!${ColourReset}") -endif() \ No newline at end of file +endif() + +if(RPP_AUDIO_SUPPORT) + if(NOT libsnd_LIBS) + message("-- ${Yellow}Warning: libsndfile must be installed to install ${PROJECT_NAME}/Tensor_audio_hip successfully!${ColourReset}") + else() + message("-- ${Green}${PROJECT_NAME} set to build with rpp and libsndfile ${ColourReset}") + set(COMPILER_FOR_HIP ${ROCM_PATH}/bin/hipcc) + set(CMAKE_CXX_COMPILER ${COMPILER_FOR_HIP}) + include_directories(${ROCM_PATH}/include ${ROCM_PATH}/include/rpp /usr/local/include) + link_directories(${ROCM_PATH}/lib /usr/local/lib) + include_directories(${SndFile_INCLUDE_DIRS}) + link_directories(${SndFile_LIBRARIES_DIR} /usr/local/lib/) + + add_executable(Tensor_audio_hip Tensor_audio_hip.cpp) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++17") + if(NOT APPLE) + set(LINK_LIBRARY_LIST ${LINK_LIBRARY_LIST} stdc++fs) + endif() + target_link_libraries(Tensor_audio_hip ${libsnd_LIBS} -lsndfile -lrpp pthread ${LINK_LIBRARY_LIST}) + endif() +endif() diff --git a/utilities/test_suite/HIP/Tensor_audio_hip.cpp b/utilities/test_suite/HIP/Tensor_audio_hip.cpp new file mode 100644 index 000000000..9d47d8a2c --- /dev/null +++ b/utilities/test_suite/HIP/Tensor_audio_hip.cpp @@ -0,0 +1,299 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/utilities/test_suite/HIP/Tensor_audio_hip.cpp b/utilities/test_suite/HIP/Tensor_audio_hip.cpp
new file mode 100644
index 000000000..9d47d8a2c
--- /dev/null
+++ b/utilities/test_suite/HIP/Tensor_audio_hip.cpp
@@ -0,0 +1,299 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "../rpp_test_suite_audio.h"
+
+int main(int argc, char **argv)
+{
+    // handle inputs
+    const int MIN_ARG_COUNT = 8;
+    if (argc < MIN_ARG_COUNT)
+    {
+        printf("\nImproper Usage! Needs all arguments!\n");
+        printf("\nUsage: ./Tensor_audio_hip <src folder> <case number> <test type 0/1> <number of runs> <batch size> <dst folder> <script path>\n");
+        return -1;
+    }
+
+    char *src = argv[1];
+    int testCase = atoi(argv[2]);
+    int testType = atoi(argv[3]);
+    int numRuns = atoi(argv[4]);
+    int batchSize = atoi(argv[5]);
+    char *dst = argv[6];
+    string scriptPath = argv[7];
+
+    // validation checks
+    if (testType == 0 && batchSize != 3)
+    {
+        cout << "Error! QA Mode only runs with batchsize 3" << endl;
+        return -1;
+    }
+
+    // set case names
+    string funcName = audioAugmentationMap[testCase];
+    if (funcName.empty())
+    {
+        if (testType == 0)
+            printf("\ncase %d is not supported\n", testCase);
+
+        return -1;
+    }
+
+    // initialize tensor descriptors
+    RpptDesc srcDesc, dstDesc;
+    RpptDescPtr srcDescPtr, dstDescPtr;
+    srcDescPtr = &srcDesc;
+    dstDescPtr = &dstDesc;
+
+    // set src/dst data types in tensor descriptors
+    srcDescPtr->dataType = RpptDataType::F32;
+    dstDescPtr->dataType = RpptDataType::F32;
+
+    // other initializations
+    int missingFuncFlag = 0;
+    int maxSrcChannels = 0;
+    int maxSrcWidth = 0, maxSrcHeight = 0;
+    int maxDstWidth = 0, maxDstHeight = 0;
+    Rpp64u iBufferSize = 0;
+    Rpp64u oBufferSize = 0;
+    static int noOfAudioFiles = 0;
+
+    // string ops on function name
+    char src1[1000];
+    strcpy(src1, src);
+    strcat(src1, "/");
+    string func = funcName;
+
+    // get number of audio files
+    vector<string> audioNames, audioFilesPath;
+    search_files_recursive(src, audioNames, audioFilesPath, ".wav");
+    noOfAudioFiles = audioNames.size();
+    if (noOfAudioFiles < batchSize || ((noOfAudioFiles % batchSize) != 0))
+    {
+        replicate_last_file_to_fill_batch(audioFilesPath[noOfAudioFiles - 1], audioFilesPath, audioNames, audioNames[noOfAudioFiles - 1], noOfAudioFiles, batchSize);
+        noOfAudioFiles = audioNames.size();
+    }
+
+    // find max audio dimensions in the input dataset
+    maxSrcHeight = 1;
+    maxDstHeight = 1;
+    set_audio_max_dimensions(audioFilesPath, maxSrcWidth, maxSrcChannels);
+    maxDstWidth = maxSrcWidth;
+
+    // set numDims, offset, n/c/h/w values for src/dst
+    Rpp32u offsetInBytes = 0;
+    set_audio_descriptor_dims_and_strides(srcDescPtr, batchSize, maxSrcHeight, maxSrcWidth, maxSrcChannels, offsetInBytes);
+    int maxDstChannels = maxSrcChannels;
+    if(testCase == 3)
+    {
+        srcDescPtr->numDims = 3;
+        maxDstChannels = 1;
+    }
+    set_audio_descriptor_dims_and_strides(dstDescPtr, batchSize, maxDstHeight, maxDstWidth, maxDstChannels, offsetInBytes);
+
+    // set buffer sizes for src/dst
+    iBufferSize = (Rpp64u)srcDescPtr->h * (Rpp64u)srcDescPtr->w * (Rpp64u)srcDescPtr->c * (Rpp64u)srcDescPtr->n;
+    oBufferSize = (Rpp64u)dstDescPtr->h * (Rpp64u)dstDescPtr->w * (Rpp64u)dstDescPtr->c * (Rpp64u)dstDescPtr->n;
+
+    // allocate hip buffers for input & output
+    Rpp32f *inputf32 = (Rpp32f *)calloc(iBufferSize, sizeof(Rpp32f));
+    Rpp32f *outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f));
+
+    void *d_inputf32, *d_outputf32;
+    CHECK_RETURN_STATUS(hipMalloc(&d_inputf32, iBufferSize * sizeof(Rpp32f)));
+    CHECK_RETURN_STATUS(hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f)));
+
+    // allocate the buffers for audio length and channels
+    Rpp32s *srcLengthTensor, *channelsTensor;
+    CHECK_RETURN_STATUS(hipHostMalloc(&srcLengthTensor, batchSize * sizeof(Rpp32s)));
+    CHECK_RETURN_STATUS(hipHostMalloc(&channelsTensor, batchSize * sizeof(Rpp32s)));
+
+    // allocate the buffers for src/dst dimensions for each
element in batch + RpptImagePatch *srcDims, *dstDims; + CHECK_RETURN_STATUS(hipHostMalloc(&srcDims, batchSize * sizeof(RpptImagePatch))); + CHECK_RETURN_STATUS(hipHostMalloc(&dstDims, batchSize * sizeof(RpptImagePatch))); + + // allocate the buffer for srcDimsTensor + Rpp32s *srcDimsTensor; + if(testCase == 3) + CHECK_RETURN_STATUS(hipHostMalloc(&srcDimsTensor, batchSize * 2 * sizeof(Rpp32s))); + + Rpp32s *detectedIndex = nullptr, *detectionLength = nullptr; + if(testCase == 0) + { + CHECK_RETURN_STATUS(hipHostMalloc(&detectedIndex, batchSize * sizeof(Rpp32f))); + CHECK_RETURN_STATUS(hipHostMalloc(&detectionLength, batchSize * sizeof(Rpp32f))); + } + + // run case-wise RPP API and measure time + rppHandle_t handle; + hipStream_t stream; + CHECK_RETURN_STATUS(hipStreamCreate(&stream)); + rppCreateWithStreamAndBatchSize(&handle, stream, batchSize); + + int noOfIterations = (int)audioNames.size() / batchSize; + double maxWallTime = 0, minWallTime = 500, avgWallTime = 0; + string testCaseName; + printf("\nRunning %s %d times (each time with a batch size of %d images) and computing mean statistics...", func.c_str(), numRuns, batchSize); + for (int iterCount = 0; iterCount < noOfIterations; iterCount++) + { + // read and decode audio and fill the audio dim values + read_audio_batch_and_fill_dims(srcDescPtr, inputf32, audioFilesPath, iterCount, srcLengthTensor, channelsTensor); + CHECK_RETURN_STATUS(hipMemcpy(d_inputf32, inputf32, iBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice)); + for (int perfRunCount = 0; perfRunCount < numRuns; perfRunCount++) + { + double startWallTime, endWallTime; + double wallTime; + switch (testCase) + { + case 0: + { + testCaseName = "non_silent_region_detection"; + Rpp32f cutOffDB = -60.0; + Rpp32s windowLength = 2048; + Rpp32f referencePower = 0.0f; + Rpp32s resetInterval = 8192; + + startWallTime = omp_get_wtime(); + rppt_non_silent_region_detection_gpu(d_inputf32, srcDescPtr, srcLengthTensor, detectedIndex, detectionLength, cutOffDB, windowLength, referencePower, resetInterval, handle); + + break; + } + case 1: + { + testCaseName = "to_decibels"; + Rpp32f cutOffDB = std::log(1e-20); + Rpp32f multiplier = std::log(10); + Rpp32f referenceMagnitude = 1.0f; + + for (int i = 0; i < batchSize; i++) + { + srcDims[i].height = dstDims[i].height = srcLengthTensor[i]; + srcDims[i].width = dstDims[i].width = 1; + } + + startWallTime = omp_get_wtime(); + rppt_to_decibels_gpu(d_inputf32, srcDescPtr, d_outputf32, dstDescPtr, srcDims, cutOffDB, multiplier, referenceMagnitude, handle); + + break; + } + case 3: + { + testCaseName = "down_mixing"; + bool normalizeWeights = false; + + for (int i = 0, j = 0; i < batchSize; i++, j += 2) + { + srcDimsTensor[j] = srcLengthTensor[i]; + srcDimsTensor[j + 1] = channelsTensor[i]; + dstDims[i].height = srcLengthTensor[i]; + dstDims[i].width = 1; + } + + startWallTime = omp_get_wtime(); + rppt_down_mixing_gpu(d_inputf32, srcDescPtr, d_outputf32, dstDescPtr, srcDimsTensor, normalizeWeights, handle); + + break; + } + default: + { + missingFuncFlag = 1; + break; + } + } + CHECK_RETURN_STATUS(hipDeviceSynchronize()); + + endWallTime = omp_get_wtime(); + if (missingFuncFlag == 1) + { + printf("\nThe functionality %s doesn't yet exist in RPP\n", func.c_str()); + return -1; + } + + wallTime = endWallTime - startWallTime; + maxWallTime = std::max(maxWallTime, wallTime); + minWallTime = std::min(minWallTime, wallTime); + avgWallTime += wallTime; + } + + // QA mode - verify outputs with golden outputs. 
Below code doesn’t run for performance tests + if (testType == 0) + { + CHECK_RETURN_STATUS(hipMemcpy(outputf32, d_outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyDeviceToHost)); + CHECK_RETURN_STATUS(hipDeviceSynchronize()); + + /* Run only if testCase is not 0 + For testCase 0 verify_non_silent_region_detection function is used for QA testing */ + if (testCase != 0) + verify_output(outputf32, dstDescPtr, dstDims, testCaseName, dst, scriptPath, "HIP"); + else + verify_non_silent_region_detection(detectedIndex, detectionLength, testCaseName, batchSize, audioNames, dst); + + /* Dump the outputs to csv files for debugging + Runs only if + 1. DEBUG_MODE is enabled + 2. Current iteration is 1st iteration + 3. Test case is not 0 */ + if (DEBUG_MODE && iterCount == 0 && testCase != 0) + { + std::ofstream refFile; + refFile.open(func + ".csv"); + for (int i = 0; i < oBufferSize; i++) + refFile << *(outputf32 + i) << "\n"; + refFile.close(); + } + } + } + rppDestroyGPU(handle); + + // performance test mode + if (testType == 1) + { + // display measured times + maxWallTime *= 1000; + minWallTime *= 1000; + avgWallTime *= 1000; + avgWallTime /= (numRuns * noOfIterations); + cout << fixed << "\nmax,min,avg wall times in ms/batch = " << maxWallTime << "," << minWallTime << "," << avgWallTime; + } + + cout << endl; + + // free memory + free(inputf32); + free(outputf32); + CHECK_RETURN_STATUS(hipFree(d_inputf32)); + CHECK_RETURN_STATUS(hipFree(d_outputf32)); + CHECK_RETURN_STATUS(hipHostFree(srcLengthTensor)); + CHECK_RETURN_STATUS(hipHostFree(channelsTensor)); + CHECK_RETURN_STATUS(hipHostFree(srcDims)); + CHECK_RETURN_STATUS(hipHostFree(dstDims)); + if(testCase == 3) + CHECK_RETURN_STATUS(hipHostFree(srcDimsTensor)); + if (detectedIndex != nullptr) + CHECK_RETURN_STATUS(hipHostFree(detectedIndex)); + if (detectionLength != nullptr) + CHECK_RETURN_STATUS(hipHostFree(detectionLength)); + return 0; +} diff --git a/utilities/test_suite/HIP/Tensor_hip.cpp b/utilities/test_suite/HIP/Tensor_hip.cpp index 0d8b7fd7c..ec1b47d9b 100644 --- a/utilities/test_suite/HIP/Tensor_hip.cpp +++ b/utilities/test_suite/HIP/Tensor_hip.cpp @@ -66,7 +66,8 @@ int main(int argc, char **argv) bool additionalParamCase = (testCase == 8 || testCase == 21 || testCase == 23|| testCase == 24 || testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54 || testCase == 79); bool kernelSizeCase = (testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54); bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 68); - bool randomOutputCase = (testCase == 8 || testCase == 84 || testCase == 49 || testCase == 54); + bool randomOutputCase = (testCase == 6 || testCase == 8 || testCase == 84 || testCase == 49 || testCase == 54); + bool nonQACase = (testCase == 24); bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24 || testCase == 79); bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89 || testCase == 90 || testCase == 91); bool noiseTypeCase = (testCase == 8); @@ -104,7 +105,7 @@ int main(int argc, char **argv) if (layoutType == 2) { - if(testCase == 36 || testCase == 31 || testCase == 45 || testCase == 86) + if(testCase == 36 || testCase == 31 || testCase == 35 || testCase == 45 || testCase == 86) { printf("\ncase %d does not exist for PLN1 layout\n", testCase); return -1; @@ -366,10 +367,19 @@ int main(int argc, char **argv) CHECK_RETURN_STATUS(hipHostMalloc(&roiPtrInputCropRegion, 
4 * sizeof(RpptROI))); void *d_rowRemapTable, *d_colRemapTable; - if(testCase == 79) + if(testCase == 26 || testCase == 79) { CHECK_RETURN_STATUS(hipMalloc(&d_rowRemapTable, ioBufferSize * sizeof(Rpp32u))); CHECK_RETURN_STATUS(hipMalloc(&d_colRemapTable, ioBufferSize * sizeof(Rpp32u))); + CHECK_RETURN_STATUS(hipMemset(d_rowRemapTable, 0, ioBufferSize * sizeof(Rpp32u))); + CHECK_RETURN_STATUS(hipMemset(d_colRemapTable, 0, ioBufferSize * sizeof(Rpp32u))); + } + + Rpp32f *cameraMatrix, *distortionCoeffs; + if(testCase == 26) + { + CHECK_RETURN_STATUS(hipHostMalloc(&cameraMatrix, batchSize * 9 * sizeof(Rpp32f))); + CHECK_RETURN_STATUS(hipHostMalloc(&distortionCoeffs, batchSize * 8 * sizeof(Rpp32f))); } Rpp32u boxesInEachImage = 3; @@ -397,6 +407,14 @@ int main(int argc, char **argv) if(testCase == 46) CHECK_RETURN_STATUS(hipHostMalloc(&intensity, batchSize * sizeof(Rpp32f))); + Rpp32u *kernelSizeTensor; + if(testCase == 6) + CHECK_RETURN_STATUS(hipHostMalloc(&kernelSizeTensor, batchSize * sizeof(Rpp32u))); + + RpptChannelOffsets *rgbOffsets; + if(testCase == 35) + CHECK_RETURN_STATUS(hipHostMalloc(&rgbOffsets, batchSize * sizeof(RpptChannelOffsets))); + // case-wise RPP API and measure time script for Unit and Performance test printf("\nRunning %s %d times (each time with a batch size of %d images) and computing mean statistics...", func.c_str(), numRuns, batchSize); for(int iterCount = 0; iterCount < noOfIterations; iterCount++) @@ -548,6 +566,22 @@ int main(int argc, char **argv) break; } + case 6: + { + testCaseName = "jitter"; + + Rpp32u seed = 1255459; + for (i = 0; i < batchSize; i++) + kernelSizeTensor[i] = 5; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_jitter_gpu(d_input, srcDescPtr, d_output, dstDescPtr, kernelSizeTensor, seed, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 8: { testCaseName = "noise"; @@ -696,6 +730,52 @@ int main(int argc, char **argv) break; } + case 24: + { + testCaseName = "warp_affine"; + + if ((interpolationType != RpptInterpolationType::BILINEAR) && (interpolationType != RpptInterpolationType::NEAREST_NEIGHBOR)) + { + missingFuncFlag = 1; + break; + } + + Rpp32f6 affineTensor_f6[batchSize]; + Rpp32f *affineTensor = (Rpp32f *)affineTensor_f6; + for (i = 0; i < batchSize; i++) + { + affineTensor_f6[i].data[0] = 1.23; + affineTensor_f6[i].data[1] = 0.5; + affineTensor_f6[i].data[2] = 0; + affineTensor_f6[i].data[3] = -0.8; + affineTensor_f6[i].data[4] = 0.83; + affineTensor_f6[i].data[5] = 0; + } + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_warp_affine_gpu(d_input, srcDescPtr, d_output, dstDescPtr, affineTensor, interpolationType, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } + case 26: + { + testCaseName = "lens_correction"; + + RpptDesc tableDesc = srcDesc; + RpptDescPtr tableDescPtr = &tableDesc; + init_lens_correction(batchSize, srcDescPtr, cameraMatrix, distortionCoeffs, tableDescPtr); + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_lens_correction_gpu(d_input, srcDescPtr, d_output, dstDescPtr, static_cast(d_rowRemapTable), static_cast(d_colRemapTable), tableDescPtr, cameraMatrix, distortionCoeffs, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 29: { testCaseName = "water"; 
@@ -836,6 +916,28 @@ int main(int argc, char **argv) CHECK_RETURN_STATUS(hipHostFree(lutBuffer)); } + case 35: + { + testCaseName = "glitch"; + + for (i = 0; i < batchSize; i++) + { + rgbOffsets[i].r.x = 10; + rgbOffsets[i].r.y = 10; + rgbOffsets[i].g.x = 0; + rgbOffsets[i].g.y = 0; + rgbOffsets[i].b.x = 5; + rgbOffsets[i].b.y = 5; + } + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_glitch_gpu(d_input, srcDescPtr, d_output, dstDescPtr, rgbOffsets, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 36: { testCaseName = "color_twist"; @@ -1397,7 +1499,7 @@ int main(int argc, char **argv) 1.QA Flag is set 2.input bit depth 0 (U8) 3.source and destination layout are the same*/ - if(qaFlag && inputBitDepth == 0 && (srcDescPtr->layout == dstDescPtr->layout) && !(randomOutputCase)) + if(qaFlag && inputBitDepth == 0 && (srcDescPtr->layout == dstDescPtr->layout) && !(randomOutputCase) && !(nonQACase)) { if (testCase == 87) compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); @@ -1465,7 +1567,7 @@ int main(int argc, char **argv) 2.input bit depth 0 (Input U8 && Output U8) 3.source and destination layout are the same 4.augmentation case does not generate random output*/ - if(qaFlag && inputBitDepth == 0 && ((srcDescPtr->layout == dstDescPtr->layout) || pln1OutTypeCase) && !(randomOutputCase)) + if(qaFlag && inputBitDepth == 0 && ((srcDescPtr->layout == dstDescPtr->layout) || pln1OutTypeCase) && !(randomOutputCase) && !(nonQACase)) compare_output(outputu8, testCaseName, srcDescPtr, dstDescPtr, dstImgSizes, batchSize, interpolationTypeName, noiseTypeName, testCase, dst, scriptPath); // Calculate exact dstROI in XYWH format for OpenCV dump @@ -1520,6 +1622,20 @@ int main(int argc, char **argv) CHECK_RETURN_STATUS(hipHostFree(cropRoi)); CHECK_RETURN_STATUS(hipHostFree(patchRoi)); } + if(testCase == 26) + { + CHECK_RETURN_STATUS(hipHostFree(cameraMatrix)); + CHECK_RETURN_STATUS(hipHostFree(distortionCoeffs)); + } + if(testCase == 79) + { + free(rowRemapTable); + free(colRemapTable); + CHECK_RETURN_STATUS(hipFree(d_rowRemapTable)); + CHECK_RETURN_STATUS(hipFree(d_colRemapTable)); + } + if(testCase == 35) + CHECK_RETURN_STATUS(hipHostFree(rgbOffsets)); if (reductionTypeCase) { CHECK_RETURN_STATUS(hipHostFree(reductionFuncResultArr)); @@ -1538,19 +1654,14 @@ int main(int argc, char **argv) CHECK_RETURN_STATUS(hipHostFree(shapeTensor)); if(roiTensor != NULL) CHECK_RETURN_STATUS(hipHostFree(roiTensor)); + if(testCase == 6) + CHECK_RETURN_STATUS(hipHostFree(kernelSizeTensor)); free(input); free(input_second); free(output); free(inputu8); free(inputu8Second); free(outputu8); - if(testCase == 79) - { - free(rowRemapTable); - free(colRemapTable); - CHECK_RETURN_STATUS(hipFree(d_rowRemapTable)); - CHECK_RETURN_STATUS(hipFree(d_colRemapTable)); - } CHECK_RETURN_STATUS(hipFree(d_input)); if(dualInputCase) CHECK_RETURN_STATUS(hipFree(d_input_second)); diff --git a/utilities/test_suite/HIP/Tensor_misc_hip.cpp b/utilities/test_suite/HIP/Tensor_misc_hip.cpp index 96197f432..cb0d53b34 100644 --- a/utilities/test_suite/HIP/Tensor_misc_hip.cpp +++ b/utilities/test_suite/HIP/Tensor_misc_hip.cpp @@ -31,7 +31,7 @@ int main(int argc, char **argv) if (argc < MIN_ARG_COUNT) { printf("\nImproper Usage! Needs all arguments!\n"); - printf("\nUsage: ./Tensor_misc_hip