diff --git a/.Doxyfile b/.Doxyfile
index 066a53c02..dac8a3acc 100644
--- a/.Doxyfile
+++ b/.Doxyfile
@@ -960,16 +960,16 @@ INPUT = README.md \
include/rppi_logical_operations.h \
include/rppi_morphological_transforms.h \
include/rppi_statistical_operations.h \
+ include/rppt_tensor_arithmetic_operations.h \
+ include/rppt_tensor_audio_augmentations.h \
include/rppt_tensor_color_augmentations.h \
include/rppt_tensor_data_exchange_operations.h \
include/rppt_tensor_effects_augmentations.h \
include/rppt_tensor_filter_augmentations.h \
include/rppt_tensor_geometric_augmentations.h \
+ include/rppt_tensor_logical_operations.h \
include/rppt_tensor_morphological_operations.h \
- include/rppt_tensor_statistical_operations.h \
- include/rppt_tensor_arithmetic_operations.h \
- include/rppt_tensor_audio_augmentations.h \
- include/rppt_tensor_logical_operations.h
+ include/rppt_tensor_statistical_operations.h
# This tag can be used to specify the character encoding of the source files
@@ -2381,7 +2381,7 @@ INCLUDE_FILE_PATTERNS =
# recursively expanded use the := operator instead of the = operator.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-PREDEFINED = GPU_SUPPORT RPP_BACKEND_HIP HIP_COMPILE
+PREDEFINED = GPU_SUPPORT RPP_BACKEND_HIP HIP_COMPILE AUDIO_SUPPORT
# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
# tag can be used to specify a list of macro names that should be expanded. The
diff --git a/docs/data/doxygenOutputs/effects_augmentations_jitter_150x150.png b/docs/data/doxygenOutputs/effects_augmentations_jitter_150x150.png
new file mode 100644
index 000000000..8aef1cbe6
Binary files /dev/null and b/docs/data/doxygenOutputs/effects_augmentations_jitter_150x150.png differ
diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile
index 18d9a73bc..9773637df 100644
--- a/docs/doxygen/Doxyfile
+++ b/docs/doxygen/Doxyfile
@@ -962,14 +962,16 @@ INPUT = ../../README.md \
../../include/rppi_logical_operations.h \
../../include/rppi_morphological_transforms.h \
../../include/rppi_statistical_operations.h \
+ ../../include/rppt_tensor_arithmetic_operations.h \
+ ../../include/rppt_tensor_audio_augmentations.h \
../../include/rppt_tensor_color_augmentations.h \
../../include/rppt_tensor_data_exchange_operations.h \
../../include/rppt_tensor_effects_augmentations.h \
../../include/rppt_tensor_filter_augmentations.h \
../../include/rppt_tensor_geometric_augmentations.h \
+ ../../include/rppt_tensor_logical_operations.h \
../../include/rppt_tensor_morphological_operations.h \
- ../../include/rppt_tensor_statistical_operations.h \
- ../../include/rppt_tensor_logical_operations.h
+ ../../include/rppt_tensor_statistical_operations.h
# This tag can be used to specify the character encoding of the source files
@@ -2381,7 +2383,7 @@ INCLUDE_FILE_PATTERNS =
# recursively expanded use the := operator instead of the = operator.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-PREDEFINED = GPU_SUPPORT RPP_BACKEND_HIP HIP_COMPILE
+PREDEFINED = GPU_SUPPORT RPP_BACKEND_HIP HIP_COMPILE AUDIO_SUPPORT
# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
# tag can be used to specify a list of macro names that should be expanded. The
diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in
index a88668ba5..c316de276 100644
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1 +1 @@
-rocm-docs-core[api_reference]==1.5.0
+rocm-docs-core[api_reference]==1.5.1
diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt
index 54fbfde32..2c9286b18 100644
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -110,7 +110,7 @@ requests==2.28.2
# via
# pygithub
# sphinx
-rocm-docs-core[api-reference]==1.5.0
+rocm-docs-core[api-reference]==1.5.1
# via -r requirements.in
smmap==5.0.0
# via gitdb
diff --git a/include/rppdefs.h b/include/rppdefs.h
index e863ed6ad..6eb025665 100644
--- a/include/rppdefs.h
+++ b/include/rppdefs.h
@@ -64,6 +64,7 @@ SOFTWARE.
const float ONE_OVER_6 = 1.0f / 6;
const float ONE_OVER_3 = 1.0f / 3;
const float ONE_OVER_255 = 1.0f / 255;
+const uint MMS_MAX_SCRATCH_MEMORY = 76800000; // maximum scratch memory size (number of floats) needed for MMS buffer in RNNT training
/******************** RPP typedefs ********************/
@@ -137,8 +138,14 @@ typedef enum
RPP_ERROR_LAYOUT_MISMATCH = -18,
/*! \brief Number of channels is invalid. (Needs to adhere to function specification.) \ingroup group_rppdefs */
RPP_ERROR_INVALID_CHANNELS = -19,
+    /*! \brief Output tile length is invalid. (Needs to adhere to function specification.) \ingroup group_rppdefs */
+    RPP_ERROR_INVALID_OUTPUT_TILE_LENGTH = -20,
+    /*! \brief Shared memory size needed is beyond the bounds. (Needs to adhere to function specification.) \ingroup group_rppdefs */
+    RPP_ERROR_OUT_OF_BOUND_SHARED_MEMORY_SIZE = -21,
+    /*! \brief Scratch memory size needed is beyond the bounds. (Needs to adhere to function specification.) \ingroup group_rppdefs */
+ RPP_ERROR_OUT_OF_BOUND_SCRATCH_MEMORY_SIZE = -22,
/*! \brief Number of src dims is invalid. (Needs to adhere to function specification.) \ingroup group_rppdefs */
- RPP_ERROR_INVALID_SRC_DIMS = -20
+ RPP_ERROR_INVALID_SRC_DIMS = -23
} RppStatus;
/*! \brief RPP rppStatus_t type enums
diff --git a/include/rppt_tensor_arithmetic_operations.h b/include/rppt_tensor_arithmetic_operations.h
index 4ffd24156..88a8d76a2 100644
--- a/include/rppt_tensor_arithmetic_operations.h
+++ b/include/rppt_tensor_arithmetic_operations.h
@@ -190,7 +190,7 @@ RppStatus rppt_subtract_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGeneri
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
*/
-RppStatus rppt_multiply_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *subtractTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
+RppStatus rppt_multiply_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle);
#ifdef GPU_SUPPORT
/*! \brief Multiply scalar augmentation on HIP backend
@@ -226,7 +226,7 @@ RppStatus rppt_multiply_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGeneri
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -248,7 +248,7 @@ RppStatus rppt_magnitude_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr sr
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
diff --git a/include/rppt_tensor_audio_augmentations.h b/include/rppt_tensor_audio_augmentations.h
index 71b7282e4..0a693baa9 100644
--- a/include/rppt_tensor_audio_augmentations.h
+++ b/include/rppt_tensor_audio_augmentations.h
@@ -64,6 +64,28 @@ extern "C" {
*/
RppStatus rppt_non_silent_region_detection_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcLengthTensor, Rpp32s *detectedIndexTensor, Rpp32s *detectionLengthTensor, Rpp32f cutOffDB, Rpp32s windowLength, Rpp32f referencePower, Rpp32s resetInterval, rppHandle_t rppHandle);
+#ifdef GPU_SUPPORT
+/*! \brief Non Silent Region Detection augmentation on HIP backend
+ * \details Non Silent Region Detection augmentation for 1D audio buffer
+ \n Finds the starting index and length of non silent region in the audio buffer by comparing the
+ calculated short-term power with cutoff value passed
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
+ * \param [in] srcLengthTensor source audio buffer length (1D tensor in Pinned/HIP memory, of size batchSize)
+ * \param [out] detectedIndexTensor beginning index of non silent region (1D tensor in Pinned/HIP memory, of size batchSize)
+ * \param [out] detectionLengthTensor length of non silent region (1D tensor in Pinned/HIP memory, of size batchSize)
+ * \param [in] cutOffDB cutOff in dB below which the signal is considered silent
+ * \param [in] windowLength window length used for computing short-term power of the signal
+ * \param [in] referencePower reference power that is used to convert the signal to dB
+ * \param [in] resetInterval number of samples after which the moving mean average is recalculated to avoid precision loss
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_non_silent_region_detection_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcLengthTensor, Rpp32s *detectedIndexTensor, Rpp32s *detectionLengthTensor, Rpp32f cutOffDB, Rpp32s windowLength, Rpp32f referencePower, Rpp32s resetInterval, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
+
/*! \brief To Decibels augmentation on HOST backend
* \details To Decibels augmentation for 1D/2D audio buffer converts magnitude values to decibel values
* \param [in] srcPtr source tensor in HOST memory
@@ -174,15 +196,15 @@ RppStatus rppt_mel_filter_bank_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp
/*! \brief Resample augmentation on HOST backend
* \details Resample augmentation for audio data
-* \param[in] srcPtr source tensor in HOST memory
-* \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
-* \param[out] dstPtr destination tensor in HOST memory
-* \param[in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
-* \param[in] inRate Input sampling rate (1D tensor in HOST memory, of size batchSize)
-* \param[in] outRate Output sampling rate (1D tensor in HOST memory, of size batchSize)
-* \param[in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HOST memory, of size batchSize * 2)
-* \param[in] window Resampling window (struct of type RpptRpptResamplingWindow)
-* \param[in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+* \param [in] srcPtr source tensor in HOST memory
+* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
+* \param [out] dstPtr destination tensor in HOST memory
+* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
+* \param [in] inRate Input sampling rate (1D tensor in HOST memory, of size batchSize)
+* \param [in] outRate Output sampling rate (1D tensor in HOST memory, of size batchSize)
+* \param [in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HOST memory, of size batchSize * 2)
+* \param [in] window Resampling window (struct of type RpptResamplingWindow)
+* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
diff --git a/include/rppt_tensor_color_augmentations.h b/include/rppt_tensor_color_augmentations.h
index 3b39448eb..62ef13715 100644
--- a/include/rppt_tensor_color_augmentations.h
+++ b/include/rppt_tensor_color_augmentations.h
@@ -54,7 +54,7 @@ extern "C" {
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] alphaTensor alpha values for brightness calculation (1D tensor in HOST memory, of size batchSize, with 0 <= alpha <= 20 for each image in batch)
* \param [in] betaTensor beta values for brightness calculation (1D tensor in HOST memory, of size batchSize, with 0 <= beta <= 255 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -76,7 +76,7 @@ RppStatus rppt_brightness_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] alphaTensor alpha values for brightness calculation (1D tensor in pinned/HOST memory, of size batchSize, with 0 <= alpha <= 20 for each image in batch)
* \param [in] betaTensor beta values for brightness calculation (1D tensor in pinned/HOST memory, of size batchSize, with 0 <= beta <= 255 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -97,7 +97,7 @@ RppStatus rppt_brightness_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] gammaTensor gamma values for gamma correction calculation (1D tensor in HOST memory, of size batchSize with gamma >= 0 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -118,7 +118,7 @@ RppStatus rppt_gamma_correction_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rp
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] gammaTensor gamma values for gamma correction calculation (1D tensor in pinned/HOST memory, of size batchSize with gamma >= 0 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -141,7 +141,7 @@ RppStatus rppt_gamma_correction_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] alphaTensor alpha values for alpha-blending (1D tensor in HOST memory, of size batchSize with the transparency factor transparency factor 0 <= alpha <= 1 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -164,7 +164,7 @@ RppStatus rppt_blend_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDes
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] alphaTensor alpha values for alpha-blending (1D tensor in pinned/HOST memory, of size batchSize with the transparency factor transparency factor 0 <= alpha <= 1 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -188,7 +188,7 @@ RppStatus rppt_blend_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDesc
* \param [in] contrastTensor contrast modification parameter for color_twist calculation (1D tensor in HOST memory, of size batchSize with 0 < contrastTensor[i] <= 255 for each image in batch)
* \param [in] hueTensor hue modification parameter for color_twist calculation (1D tensor in HOST memory, of size batchSize with 0 <= hueTensor[i] <= 359 for each image in batch)
* \param [in] saturationTensor saturation modification parameter for color_twist calculation (1D tensor in HOST memory, of size batchSize with saturationTensor[i] >= 0 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -212,7 +212,7 @@ RppStatus rppt_color_twist_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_
* \param [in] contrastTensor contrast modification parameter for color_twist calculation (1D tensor in pinned/HOST memory, of size batchSize with 0 < contrastTensor[i] <= 255 for each image in batch)
* \param [in] hueTensor hue modification parameter for color_twist calculation (1D tensor in pinned/HOST memory, of size batchSize with 0 <= hueTensor[i] <= 359 for each image in batch)
* \param [in] saturationTensor saturation modification parameter for color_twist calculation (1D tensor in pinned/HOST memory, of size batchSize with saturationTensor[i] >= 0 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -236,7 +236,7 @@ RppStatus rppt_color_twist_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] contrastTensor contrast modification parameter for color_jitter calculation (1D tensor in HOST memory, of size batchSize with 0 < contrastTensor[i] <= 255 for each image in batch)
* \param [in] hueTensor hue modification parameter for color_jitter calculation (1D tensor in HOST memory, of size batchSize with 0 <= hueTensor[i] <= 359 for each image in batch)
* \param [in] saturationTensor saturation modification parameter for color_jitter calculation (1D tensor in HOST memory, of size batchSize with saturationTensor[i] >= 0 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -257,7 +257,7 @@ RppStatus rppt_color_jitter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] rgbTensor R/G/B values for color casting calculation (2D tensor in HOST memory, of size sizeof(RpptRGB) * batchSize with 0 <= rgbTensor[n]. <= 255 for each image in batch)
* \param [in] alphaTensor alpha values for color casting calculation (1D tensor in HOST memory, of size sizeof(Rpp32f) * batchSize with alphaTensor[i] >= 0 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -279,7 +279,7 @@ RppStatus rppt_color_cast_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] rgbTensor R/G/B values for color casting calculation (2D tensor in pinned/HOST memory, of size sizeof(RpptRGB) * batchSize with 0 <= rgbTensor[n]. <= 255 for each image in batch)
* \param [in] alphaTensor alpha values for color casting calculation (1D tensor in pinned/HOST memory, of size sizeof(Rpp32f) * batchSize with alphaTensor[i] >= 0 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -300,7 +300,7 @@ RppStatus rppt_color_cast_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] exposureFactorTensor exposure factor values for exposure adjustment (1D tensor in HOST memory, of size batchSize, with exposureFactorTensor[n] >= 0 for each image in the batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -321,7 +321,7 @@ RppStatus rppt_exposure_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] exposureFactorTensor exposure factor values for exposure adjustment (1D tensor in pinned/HOST memory, of size batchSize, with exposureFactorTensor[n] >= 0 for each image in the batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -343,7 +343,7 @@ RppStatus rppt_exposure_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t ds
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] contrastFactorTensor contrast factor values for contrast calculation (1D tensor in HOST memory, of size batchSize with contrastFactorTensor[n] > 0 for each image in a batch))
* \param [in] contrastCenterTensor contrast center values for contrast calculation (1D tensor in HOST memory, of size batchSize)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -365,7 +365,7 @@ RppStatus rppt_contrast_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] contrastFactorTensor contrast factor values for contrast calculation (1D tensor in pinned/HOST memory, of size batchSize with contrastFactorTensor[n] > 0 for each image in a batch))
* \param [in] contrastCenterTensor contrast center values for contrast calculation (1D tensor in pinned/HOST memory, of size batchSize)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -386,7 +386,7 @@ RppStatus rppt_contrast_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t ds
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] lutPtr lut Array in HOST memory, containing a single integer look up table of length 65536, to be used for all images in the batch
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -407,7 +407,7 @@ RppStatus rppt_lut_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] lutPtr lut Array in pinned/HOST memory, containing a single integer look up table of length 65536, to be used for all images in the batch
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -428,7 +428,7 @@ RppStatus rppt_lut_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr,
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] adjustmentValueTensor adjustment values for color temperature calculation (1D tensor of size batchSize with -100 <= adjustmentValueTensor[i] >= 100 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -449,7 +449,7 @@ RppStatus rppt_color_temperature_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, R
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] adjustmentValueTensor adjustment values for color temperature calculation (1D tensor of size batchSize with -100 <= adjustmentValueTensor[i] >= 100 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
diff --git a/include/rppt_tensor_effects_augmentations.h b/include/rppt_tensor_effects_augmentations.h
index 8b62d61f5..b185e0081 100644
--- a/include/rppt_tensor_effects_augmentations.h
+++ b/include/rppt_tensor_effects_augmentations.h
@@ -56,7 +56,7 @@ extern "C" {
* \param [in] gridRatio gridRatio value for gridmask calculation = black square width / tileWidth (a single Rpp32f number with 0 <= gridRatio <= 1 that applies to all images in the batch)
* \param [in] gridAngle gridAngle value for gridmask calculation = grid rotation angle in radians (a single Rpp32f number that applies to all images in the batch)
* \param [in] translateVector translateVector for gridmask calculation = grid X and Y translation lengths in pixels (a single RpptUintVector2D x,y value pair that applies to all images in the batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -80,7 +80,7 @@ RppStatus rppt_gridmask_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d
* \param [in] gridRatio gridRatio value for gridmask calculation = black square width / tileWidth (a single Rpp32f number with 0 <= gridRatio <= 1 that applies to all images in the batch)
* \param [in] gridAngle gridAngle value for gridmask calculation = grid rotation angle in radians (a single Rpp32f number that applies to all images in the batch)
* \param [in] translateVector translateVector for gridmask calculation = grid X and Y translation lengths in pixels (a single RpptUintVector2D x,y value pair that applies to all images in the batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -103,7 +103,7 @@ RppStatus rppt_gridmask_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t ds
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] spatterColor RGB values to use for the spatter augmentation (A single set of 3 Rpp8u values as RpptRGB that applies to all images in the batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 1920 and roiTensorSrc[i].xywhROI.roiHeight <= 1080)
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 1920 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 1080)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -126,7 +126,7 @@ RppStatus rppt_spatter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t ds
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] spatterColor RGB values to use for the spatter augmentation (A single set of 3 Rpp8u values as RpptRGB that applies to all images in the batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 1920 and roiTensorSrc[i].xywhROI.roiHeight <= 1080)
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 1920 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 1080)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -151,7 +151,7 @@ RppStatus rppt_spatter_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dst
* \param [in] saltValueTensor A user-defined salt noise value (1D tensor in HOST memory, of size batchSize with 0 <= saltValueTensor[i] <= 1 for each image in batch)
* \param [in] pepperValueTensor A user-defined pepper noise value (1D tensor in HOST memory, of size batchSize with 0 <= pepperValueTensor[i] <= 1 for each image in batch)
* \param [in] seed A user-defined seed value (single Rpp32u value)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -176,7 +176,7 @@ RppStatus rppt_salt_and_pepper_noise_host(RppPtr_t srcPtr, RpptDescPtr srcDescPt
* \param [in] saltValueTensor A user-defined salt noise value (1D tensor in pinned/HOST memory, of size batchSize with 0 <= saltValueTensor[i] <= 1 for each image in batch)
* \param [in] pepperValueTensor A user-defined pepper noise value (1D tensor in pinned/HOST memory, of size batchSize with 0 <= pepperValueTensor[i] <= 1 for each image in batch)
* \param [in] seed A user-defined seed value (single Rpp32u value)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -198,7 +198,7 @@ RppStatus rppt_salt_and_pepper_noise_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] shotNoiseFactorTensor shotNoiseFactor values for each image, which are used to compute the lambda values in a poisson distribution (1D tensor in HOST memory, of size batchSize with shotNoiseFactorTensor[i] >= 0 for each image in batch)
* \param [in] seed A user-defined seed value (single Rpp32u value)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -220,7 +220,7 @@ RppStatus rppt_shot_noise_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] shotNoiseFactorTensor shotNoiseFactor values for each image, which are used to compute the lambda values in a poisson distribution (1D tensor in pinned/HOST memory, of size batchSize with shotNoiseFactorTensor[i] >= 0 for each image in batch)
* \param [in] seed A user-defined seed value (single Rpp32u value)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -243,7 +243,7 @@ RppStatus rppt_shot_noise_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] meanTensor mean values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in HOST memory, of size batchSize with meanTensor[i] >= 0 for each image in batch)
* \param [in] stdDevTensor stdDev values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in HOST memory, of size batchSize with stdDevTensor[i] >= 0 for each image in batch)
* \param [in] seed A user-defined seed value (single Rpp32u value)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -266,7 +266,7 @@ RppStatus rppt_gaussian_noise_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppP
* \param [in] meanTensor mean values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in pinned/HOST memory, of size batchSize with meanTensor[i] >= 0 for each image in batch)
* \param [in] stdDevTensor stdDev values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in pinned/HOST memory, of size batchSize with stdDevTensor[i] >= 0 for each image in batch)
* \param [in] seed A user-defined seed value (single Rpp32u value)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -289,7 +289,7 @@ RppStatus rppt_gaussian_noise_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPt
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] stdDevTensor stdDev values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in HOST memory, of size batchSize with stdDevTensor[i] >= 0 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -312,7 +312,7 @@ RppStatus rppt_non_linear_blend_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDes
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] stdDevTensor stdDev values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in pinned/HOST memory, of size batchSize with stdDevTensor[i] >= 0 for each image in batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -338,7 +338,7 @@ RppStatus rppt_non_linear_blend_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDesc
* \param[in] freqYTensor freqY values for water effect (1D tensor in HOST memory, of size batchSize)
* \param[in] phaseXTensor amplitudeY values for water effect (1D tensor in HOST memory, of size batchSize)
* \param[in] phaseYTensor amplitudeY values for water effect (1D tensor in HOST memory, of size batchSize)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -364,7 +364,7 @@ RppStatus rppt_water_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP
* \param[in] freqYTensor freqY values for water effect (1D tensor in pinned/HOST memory, of size batchSize)
* \param[in] phaseXTensor amplitudeY values for water effect (1D tensor in pinned/HOST memory, of size batchSize)
* \param[in] phaseYTensor amplitudeY values for water effect (1D tensor in pinned/HOST memory, of size batchSize)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -433,7 +433,7 @@ RppStatus rppt_ricap_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param[in] vignetteIntensityTensor intensity values to quantify vignette effect (1D tensor of size batchSize with 0 < vignetteIntensityTensor[n] for each image in batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -455,7 +455,7 @@ RppStatus rppt_vignette_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param[in] vignetteIntensityTensor intensity values to quantify vignette effect (1D tensor of size batchSize with 0 < vignetteIntensityTensor[n] for each image in batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -465,6 +465,50 @@ RppStatus rppt_vignette_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d
RppStatus rppt_vignette_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *vignetteIntensityTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT
+/******************** jitter ********************/
+
+/*! \brief Jitter augmentation on HOST backend for a NCHW/NHWC layout tensor
+ * \details The jitter augmentation adds a jitter effect for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * - dstPtr depth ranges - Will be same depth as srcPtr.
+ * \image html img150x150.png Sample Input
+ * \image html effects_augmentations_jitter_150x150.png Sample Output
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HOST memory
+ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
+ * \param [in] kernelSizeTensor kernel size values for jitter calculation (1D tensor in HOST memory, of size batchSize, with kernelSizeTensor[i] = 3/5/7 for optimal use)
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_jitter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32u *kernelSizeTensor, Rpp32u seed, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+
+#ifdef GPU_SUPPORT
+/*! \brief Jitter augmentation on HIP backend for a NCHW/NHWC layout tensor
+ * \details The jitter augmentation adds a jitter effect for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * - dstPtr depth ranges - Will be same depth as srcPtr.
+ * \image html img150x150.png Sample Input
+ * \image html effects_augmentations_jitter_150x150.png Sample Output
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HIP memory
+ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
+ * \param [in] kernelSizeTensor kernel size values for jitter calculation (1D tensor in pinned/HOST memory, of size batchSize, with kernelSizeTensor[i] = 3/5/7 for optimal use)
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_jitter_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32u *kernelSizeTensor, Rpp32u seed, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
+
/*! \brief Gaussian noise augmentation on HOST backend
* \details This function adds gaussian noise to a batch of 4D tensors.
* Support added for u8 -> u8, f32 -> f32 datatypes.
@@ -524,7 +568,7 @@ RppStatus rppt_gaussian_noise_voxel_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcD
- Erase-region anchor boxes on each image given by the user must not overlap
* \param [in] colorsTensor RGB values to use for each erase-region inside each image in the batch. (colors[i] will have range equivalent of srcPtr)
* \param [in] numBoxesTensor number of erase-regions per image, for each image in the batch. (numBoxesTensor[n] >= 0)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -549,7 +593,7 @@ RppStatus rppt_erase_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP
- Erase-region anchor boxes on each image given by the user must not overlap
* \param [in] colorsTensor RGB values to use for each erase-region inside each image in the batch. (colors[i] will have range equivalent of srcPtr)
* \param [in] numBoxesTensor number of erase-regions per image, for each image in the batch. (numBoxesTensor[n] >= 0)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -559,13 +603,6 @@ RppStatus rppt_erase_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP
RppStatus rppt_erase_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptRoiLtrb *anchorBoxInfoTensor, RppPtr_t colorsTensor, Rpp32u *numBoxesTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT
-/*! @}
- */
-
-#ifdef __cplusplus
-}
-#endif
-
/*! \brief Glitch augmentation on HOST backend for a NCHW/NHWC layout tensor
* \details The glitch augmentation adds a glitch effect for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
* - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
@@ -578,7 +615,7 @@ RppStatus rppt_erase_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] rgbOffsets RGB offset values to use for the glitch augmentation (A single set of 3 Rppi point values that applies to all images in the batch.
* For each point and for each image in the batch: 0 < point.x < width, 0 < point.y < height)
- * \param [in] roiTensorSrc ROI data for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -600,7 +637,7 @@ RppStatus rppt_glitch_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dst
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] rgbOffsets RGB offset values to use for the glitch augmentation (A 1D tensor in pinned/HOST memory contains single set of 3 Rppi point values that applies to all images in the batch.
* For each point and for each image in the batch: 0 < point.x < width, 0 < point.y < height)
- * \param [in] roiTensorSrc ROI data for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -609,4 +646,11 @@ RppStatus rppt_glitch_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dst
*/
RppStatus rppt_glitch_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptChannelOffsets *rgbOffsets, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT
+
+/*! @}
+ */
+
+#ifdef __cplusplus
+}
+#endif
#endif // RPPT_TENSOR_EFFECTS_AUGMENTATIONS_H
diff --git a/include/rppt_tensor_filter_augmentations.h b/include/rppt_tensor_filter_augmentations.h
index 7ea8d00c6..992631c49 100644
--- a/include/rppt_tensor_filter_augmentations.h
+++ b/include/rppt_tensor_filter_augmentations.h
@@ -57,7 +57,7 @@ extern "C" {
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] kernelSize kernel size for box filter (a single Rpp32u odd number with kernelSize = 3/5/7/9 that applies to all images in the batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -83,7 +83,7 @@ RppStatus rppt_box_filter_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] stdDevTensor stdDev values for gaussian calculation (1D tensor in pinned/HOST memory, of size batchSize, for each image in batch)
* \param [in] kernelSize kernel size for gaussian filter (a single Rpp32u odd number with kernelSize = 3/5/7/9 that applies to all images in the batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
diff --git a/include/rppt_tensor_geometric_augmentations.h b/include/rppt_tensor_geometric_augmentations.h
index 884127a71..6da067844 100644
--- a/include/rppt_tensor_geometric_augmentations.h
+++ b/include/rppt_tensor_geometric_augmentations.h
@@ -52,7 +52,7 @@ extern "C" {
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -72,7 +72,7 @@ RppStatus rppt_crop_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -95,7 +95,7 @@ RppStatus rppt_crop_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr
* \param [in] offsetTensor offset values for normalization (1D tensor in HOST memory, of size batchSize, with offsetTensor[n] <= 0)
* \param [in] multiplierTensor multiplier values for normalization (1D tensor in HOST memory, of size batchSize, with multiplierTensor[n] > 0)
* \param [in] mirrorTensor mirror flag values to set mirroring on/off (1D tensor in HOST memory, of size batchSize, with mirrorTensor[n] = 0/1)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -118,7 +118,7 @@ RppStatus rppt_crop_mirror_normalize_host(RppPtr_t srcPtr, RpptDescPtr srcDescPt
* \param [in] offsetTensor offset values for normalization (1D tensor in pinned/HOST memory, of size batchSize, with offsetTensor[n] <= 0)
* \param [in] multiplierTensor multiplier values for normalization (1D tensor in pinned/HOST memory, of size batchSize, with multiplierTensor[n] > 0)
* \param [in] mirrorTensor mirror flag values to set mirroring on/off (1D tensor in pinned/HOST memory, of size batchSize, with mirrorTensor[n] = 0/1)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -140,7 +140,7 @@ RppStatus rppt_crop_mirror_normalize_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] affineTensor affine matrix values for transformation calculation (2D tensor in HOST memory, of size batchSize * 6 for each image in batch)
 * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -162,7 +162,7 @@ RppStatus rppt_warp_affine_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] affineTensor affine matrix values for transformation calculation (2D tensor in pinned/HOST memory, of size batchSize * 6 for each image in batch)
 * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -184,7 +184,7 @@ RppStatus rppt_warp_affine_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] horizontalTensor horizontal flag values to set horizontal flip on/off (1D tensor in HOST memory, of size batchSize, with horizontalTensor[i] = 0/1)
* \param [in] verticalTensor vertical flag values to set vertical flip on/off (1D tensor in HOST memory, of size batchSize, with verticalTensor[i] = 0/1)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -206,7 +206,7 @@ RppStatus rppt_flip_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] horizontalTensor horizontal flag values to set horizontal flip on/off (1D tensor in pinned/HOST memory, of size batchSize, with horizontalTensor[i] = 0/1)
* \param [in] verticalTensor vertical flag values to set vertical flip on/off (1D tensor in pinned/HOST memory, of size batchSize, with verticalTensor[i] = 0/1)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -228,7 +228,7 @@ RppStatus rppt_flip_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] dstImgSizes destination image sizes ( \ref RpptImagePatchPtr type pointer to array, in HOST memory, of size batchSize)
* \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -250,7 +250,7 @@ RppStatus rppt_resize_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dst
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] dstImgSizes destination image sizes ( \ref RpptImagePatchPtr type pointer to array, in pinned/HOST memory, of size batchSize)
* \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -275,7 +275,7 @@ RppStatus rppt_resize_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP
 * \param [in] meanTensor mean value for each image in the batch (meanTensor[n] >= 0, 1D tensor in HOST memory, of size = batchSize for greyscale images, size = batchSize * 3 for RGB images)
* \param [in] stdDevTensor standard deviation value for each image in the batch (stdDevTensor[n] >= 0, 1D tensor in HOST memory, of size = batchSize for greyscale images, size = batchSize * 3 for RGB images)
* \param [in] mirrorTensor mirror flag value to set mirroring on/off (1D tensor in HOST memory, of size batchSize, with mirrorTensor[n] = 0/1)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -300,7 +300,7 @@ RppStatus rppt_resize_mirror_normalize_host(RppPtr_t srcPtr, RpptDescPtr srcDesc
 * \param [in] meanTensor mean value for each image in the batch (meanTensor[n] >= 0, 1D tensor in pinned/HOST memory, of size = batchSize for greyscale images, size = batchSize * 3 for RGB images)
* \param [in] stdDevTensor standard deviation value for each image in the batch (stdDevTensor[n] >= 0, 1D tensor in pinned/HOST memory, of size = batchSize for greyscale images, size = batchSize * 3 for RGB images)
* \param [in] mirrorTensor mirror flag value to set mirroring on/off (1D tensor in pinned/HOST memory, of size batchSize, with mirrorTensor[n] = 0/1)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -323,7 +323,7 @@ RppStatus rppt_resize_mirror_normalize_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescP
* \param [in] dstImgSizes destination image sizes ( \ref RpptImagePatchPtr type pointer to array, in HOST memory, of size batchSize)
* \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType
* \param [in] mirrorTensor mirror flag value to set mirroring on/off (1D tensor in HOST memory, of size batchSize, with mirrorTensor[n] = 0/1)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -346,7 +346,7 @@ RppStatus rppt_resize_crop_mirror_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr,
* \param [in] dstImgSizes destination image sizes ( \ref RpptImagePatchPtr type pointer to array, in pinned/HOST memory, of size batchSize)
* \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType
* \param [in] mirrorTensor mirror flag value to set mirroring on/off (1D tensor in pinned/HOST memory, of size batchSize, with mirrorTensor[n] = 0/1)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -368,7 +368,7 @@ RppStatus rppt_resize_crop_mirror_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, R
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] angle image rotation angle in degrees - positive deg-anticlockwise/negative deg-clockwise (1D tensor in HOST memory, of size batchSize)
 * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -390,7 +390,7 @@ RppStatus rppt_rotate_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dst
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] angle image rotation angle in degrees - positive deg-anticlockwise/negative deg-clockwise (1D tensor in pinned/HOST memory, of size batchSize)
 * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -412,7 +412,7 @@ RppStatus rppt_rotate_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -434,7 +434,7 @@ RppStatus rppt_phase_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDes
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -500,7 +500,7 @@ RppStatus rppt_slice_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr,
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] cropRoiTensor crop co-ordinates in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] patchRoiTensor patch co-ordinates in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
@@ -526,7 +526,7 @@ RppStatus rppt_crop_and_patch_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescP
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] cropRoiTensor crop co-ordinates in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] patchRoiTensor patch co-ordinates in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
@@ -598,7 +598,7 @@ RppStatus rppt_flip_voxel_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDesc
 * \param [in] colRemapTable Rpp32f column numbers in HOST memory for every pixel in the input batch of images (Restrictions - rois in the colRemapTable data for each image in batch must match roiTensorPtrSrc)
* \param [in] tableDescPtr rowRemapTable and colRemapTable common tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = F32, layout = NHWC, c = 1)
* \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType (Restrictions - Supports only NEAREST_NEIGHBOR and BILINEAR)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -623,7 +623,7 @@ RppStatus rppt_remap_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP
 * \param [in] colRemapTable Rpp32f column numbers in HIP memory for every pixel in the input batch of images (Restrictions - rois in the colRemapTable data for each image in batch must match roiTensorPtrSrc)
* \param [in] tableDescPtr rowRemapTable and colRemapTable common tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = F32, layout = NHWC, c = 1)
* \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType (Restrictions - Supports only NEAREST_NEIGHBOR and BILINEAR)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -650,7 +650,7 @@ RppStatus rppt_remap_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt
* \param [in] tableDescPtr table tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = F32, layout = NHWC, c = 1)
* \param [in] cameraMatrixTensor contains camera intrinsic parameters required to compute lens corrected image. (1D tensor of size 9 * batchSize)
* \param [in] distortionCoeffsTensor contains distortion coefficients required to compute lens corrected image. (1D tensor of size 8 * batchSize)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -677,7 +677,7 @@ RppStatus rppt_lens_correction_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp
* \param [in] tableDescPtr table tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = F32, layout = NHWC, c = 1)
* \param [in] cameraMatrixTensor contains camera intrinsic parameters required to compute lens corrected image. (1D tensor of size 9 * batchSize)
* \param [in] distortionCoeffsTensor contains distortion coefficients required to compute lens corrected image. (1D tensor of size 8 * batchSize)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -728,4 +728,4 @@ RppStatus rppt_transpose_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescP
#ifdef __cplusplus
}
#endif
-#endif // RPPT_TENSOR_GEOMETRIC_AUGMENTATIONS_H
\ No newline at end of file
+#endif // RPPT_TENSOR_GEOMETRIC_AUGMENTATIONS_H
diff --git a/include/rppt_tensor_logical_operations.h b/include/rppt_tensor_logical_operations.h
index 3a4685167..28dff69ce 100644
--- a/include/rppt_tensor_logical_operations.h
+++ b/include/rppt_tensor_logical_operations.h
@@ -54,7 +54,7 @@ extern "C" {
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -76,7 +76,7 @@ RppStatus rppt_bitwise_and_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -98,7 +98,7 @@ RppStatus rppt_bitwise_and_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr s
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -120,7 +120,7 @@ RppStatus rppt_bitwise_or_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr s
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -136,4 +136,4 @@ RppStatus rppt_bitwise_or_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr sr
#ifdef __cplusplus
}
#endif
-#endif // RPPT_TENSOR_LOGICAL_OPERATIONS_H
\ No newline at end of file
+#endif // RPPT_TENSOR_LOGICAL_OPERATIONS_H
diff --git a/include/rppt_tensor_morphological_operations.h b/include/rppt_tensor_morphological_operations.h
index eb879af5c..126c4757a 100644
--- a/include/rppt_tensor_morphological_operations.h
+++ b/include/rppt_tensor_morphological_operations.h
@@ -57,7 +57,7 @@ extern "C" {
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] kernelSize kernel size for box filter (a single Rpp32u odd number with kernelSize = 3/5/7/9 that applies to all images in the batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -82,7 +82,7 @@ RppStatus rppt_erode_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
* \param [in] kernelSize kernel size for box filter (a single Rpp32u odd number with kernelSize = 3/5/7/9 that applies to all images in the batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -98,4 +98,4 @@ RppStatus rppt_dilate_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP
#ifdef __cplusplus
}
#endif
-#endif // RPPT_TENSOR_MORPHOLOGICAL_OPERATIONS_H
\ No newline at end of file
+#endif // RPPT_TENSOR_MORPHOLOGICAL_OPERATIONS_H
diff --git a/include/rppt_tensor_statistical_operations.h b/include/rppt_tensor_statistical_operations.h
index 441816ea3..ca464340b 100644
--- a/include/rppt_tensor_statistical_operations.h
+++ b/include/rppt_tensor_statistical_operations.h
@@ -50,7 +50,7 @@ extern "C" {
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] tensorSumArr destination array in HOST memory
* \param [in] tensorSumArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -68,7 +68,7 @@ RppStatus rppt_tensor_sum_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] tensorSumArr destination array in HIP memory
* \param [in] tensorSumArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -86,7 +86,7 @@ RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] minArr destination array in HOST memory
* \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -104,7 +104,7 @@ RppStatus rppt_tensor_min_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] minArr destination array in HIP memory
* \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -122,7 +122,7 @@ RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] maxArr destination array in HOST memory
* \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -140,7 +140,7 @@ RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] maxArr destination array in HIP memory
* \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -201,7 +201,7 @@ RppStatus rppt_normalize_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescP
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] tensorMeanArr destination array in HOST memory
* \param [in] tensorMeanArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorMeanArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorMeanArrLength = srcDescPtr->n * 4)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -219,7 +219,7 @@ RppStatus rppt_tensor_mean_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] tensorMeanArr destination array in HIP memory
* \param [in] tensorMeanArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorMeanArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorMeanArrLength = srcDescPtr->n * 4)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -238,7 +238,7 @@ RppStatus rppt_tensor_mean_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
* \param [out] tensorStddevArr destination array in HOST memory
* \param [in] tensorStddevArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorStddevArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorStddevArrLength = srcDescPtr->n * 4)
* \param [in] meanTensor mean values for stddev calculation (1D tensor of size batchSize * 4 in format (MeanR, MeanG, MeanB, MeanImage) for each image in batch)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
* \return A \ref RppStatus enumeration.
@@ -257,7 +257,7 @@ RppStatus rppt_tensor_stddev_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPt
* \param [out] tensorStddevArr destination array in HIP memory
* \param [in] tensorStddevArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorStddevArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorStddevArrLength = srcDescPtr->n * 4)
* \param [in] meanTensor mean values for stddev calculation (1D tensor of size batchSize * 4 in format (MeanR, MeanG, MeanB, MeanImage) for each image in batch)
- * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
@@ -273,4 +273,4 @@ RppStatus rppt_tensor_stddev_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr
#ifdef __cplusplus
}
#endif
-#endif // RPPT_TENSOR_STATISTICAL_OPERATIONS_H
\ No newline at end of file
+#endif // RPPT_TENSOR_STATISTICAL_OPERATIONS_H
diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp
index 779f6f2d1..be8eaeeaa 100644
--- a/src/include/cpu/rpp_cpu_common.hpp
+++ b/src/include/cpu/rpp_cpu_common.hpp
@@ -6111,6 +6111,25 @@ inline void compute_separable_horizontal_resample(Rpp32f *inputPtr, T *outputPtr
}
}
+inline void compute_jitter_src_loc_avx(__m256i *pxXorwowStateX, __m256i *pxXorwowStateCounter, __m256 &pRow, __m256 &pCol, __m256 &pKernelSize, __m256 &pBound, __m256 &pHeightLimit, __m256 &pWidthLimit, __m256 &pStride, __m256 &pChannel, Rpp32s *srcLoc)
+{
+ __m256 pRngX = rpp_host_rng_xorwow_8_f32_avx(pxXorwowStateX, pxXorwowStateCounter);
+ __m256 pRngY = rpp_host_rng_xorwow_8_f32_avx(pxXorwowStateX, pxXorwowStateCounter);
+ __m256 pX = _mm256_mul_ps(pRngX, pKernelSize);
+ __m256 pY = _mm256_mul_ps(pRngY, pKernelSize);
+ pX = _mm256_max_ps(_mm256_min_ps(_mm256_floor_ps(_mm256_add_ps(pRow, _mm256_sub_ps(pX, pBound))), pHeightLimit), avx_p0);
+ pY = _mm256_max_ps(_mm256_min_ps(_mm256_floor_ps(_mm256_add_ps(pCol, _mm256_sub_ps(pY, pBound))), pWidthLimit), avx_p0);
+ __m256i pxSrcLoc = _mm256_cvtps_epi32(_mm256_fmadd_ps(pX, pStride, _mm256_mul_ps(pY, pChannel)));
+ _mm256_storeu_si256((__m256i*) srcLoc, pxSrcLoc);
+}
+
+inline void compute_jitter_src_loc(RpptXorwowStateBoxMuller *xorwowState, Rpp32s row, Rpp32s col, Rpp32s kSize, Rpp32s heightLimit, Rpp32s widthLimit, Rpp32s stride, Rpp32s bound, Rpp32s channels, Rpp32s &loc)
+{
+ Rpp32u heightIncrement = rpp_host_rng_xorwow_f32(xorwowState) * kSize;
+ Rpp32u widthIncrement = rpp_host_rng_xorwow_f32(xorwowState) * kSize;
+ loc = std::max(std::min(static_cast<Rpp32s>(row + heightIncrement - bound), heightLimit), 0) * stride;
+ loc += std::max(std::min(static_cast<Rpp32s>(col + widthIncrement - bound), (widthLimit - 1)), 0) * channels;
+}
inline void compute_sum_16_host(__m256i *p, __m256i *pSum)
{
pSum[0] = _mm256_add_epi32(_mm256_add_epi32(p[0], p[1]), pSum[0]); //add 16 values to 8
diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp
index bd7da2a5d..b9e79c146 100644
--- a/src/include/cpu/rpp_cpu_simd.hpp
+++ b/src/include/cpu/rpp_cpu_simd.hpp
@@ -3859,6 +3859,20 @@ inline void rpp_resize_nn_load_u8pkd3(Rpp8u *srcRowPtrsForInterp, Rpp32s *loc, _
p = _mm_shuffle_epi8(px[0], xmm_pkd_mask); // Shuffle to obtain 4 RGB [R01|G01|B01|R11|G11|B11|R21|G21|B21|R31|G31|B31|00|00|00|00]
}
+template <typename T>
+inline void rpp_resize_nn_extract_pkd3_avx(T *srcRowPtrsForInterp, Rpp32s *loc, __m256i &p)
+{
+ p = _mm256_setr_epi8(*(srcRowPtrsForInterp + loc[0]), *(srcRowPtrsForInterp + loc[0] + 1), *(srcRowPtrsForInterp + loc[0] + 2),
+ *(srcRowPtrsForInterp + loc[1]), *(srcRowPtrsForInterp + loc[1] + 1), *(srcRowPtrsForInterp + loc[1] + 2),
+ *(srcRowPtrsForInterp + loc[2]), *(srcRowPtrsForInterp + loc[2] + 1), *(srcRowPtrsForInterp + loc[2] + 2),
+ *(srcRowPtrsForInterp + loc[3]), *(srcRowPtrsForInterp + loc[3] + 1), *(srcRowPtrsForInterp + loc[3] + 2),
+ *(srcRowPtrsForInterp + loc[4]), *(srcRowPtrsForInterp + loc[4] + 1), *(srcRowPtrsForInterp + loc[4] + 2),
+ *(srcRowPtrsForInterp + loc[5]), *(srcRowPtrsForInterp + loc[5] + 1), *(srcRowPtrsForInterp + loc[5] + 2),
+ *(srcRowPtrsForInterp + loc[6]), *(srcRowPtrsForInterp + loc[6] + 1), *(srcRowPtrsForInterp + loc[6] + 2),
+ *(srcRowPtrsForInterp + loc[7]), *(srcRowPtrsForInterp + loc[7] + 1), *(srcRowPtrsForInterp + loc[7] + 2),
+ 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
inline void rpp_resize_nn_load_u8pln1(Rpp8u *srcRowPtrsForInterp, Rpp32s *loc, __m128i &p)
{
__m128i px[4];
@@ -3871,6 +3885,16 @@ inline void rpp_resize_nn_load_u8pln1(Rpp8u *srcRowPtrsForInterp, Rpp32s *loc, _
p = _mm_unpacklo_epi8(px[0], px[1]); // unpack to obtain [R01|R11|R21|R31|00|00|00|00|00|00|00|00|00|00|00|00]
}
+template <typename T>
+inline void rpp_resize_nn_extract_pln1_avx(T *srcRowPtrsForInterp, Rpp32s *loc, __m256i &p)
+{
+ p = _mm256_setr_epi8(*(srcRowPtrsForInterp + loc[0]), *(srcRowPtrsForInterp + loc[1]),
+ *(srcRowPtrsForInterp + loc[2]), *(srcRowPtrsForInterp + loc[3]),
+ *(srcRowPtrsForInterp + loc[4]), *(srcRowPtrsForInterp + loc[5]),
+ *(srcRowPtrsForInterp + loc[6]), *(srcRowPtrsForInterp + loc[7]),
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
inline void rpp_resize_nn_load_f32pkd3_to_f32pln3(Rpp32f *srcRowPtrsForInterp, Rpp32s *loc, __m128 *p)
{
p[0] = _mm_loadu_ps(srcRowPtrsForInterp + loc[0]); // LOC0 load [R01|G01|B01|R02] - Need RGB 01
@@ -3880,6 +3904,42 @@ inline void rpp_resize_nn_load_f32pkd3_to_f32pln3(Rpp32f *srcRowPtrsForInterp, R
_MM_TRANSPOSE4_PS(p[0], p[1], p[2], pTemp); // Transpose to obtain RGB in each vector
}
+inline void rpp_resize_nn_load_f32pkd3_to_f32pln3_avx(Rpp32f *srcRowPtrsForInterp, Rpp32s *loc, __m256 *p)
+{
+ __m128 p128[8];
+ p128[0] = _mm_loadu_ps(srcRowPtrsForInterp + loc[0]);
+ p128[1] = _mm_loadu_ps(srcRowPtrsForInterp + loc[1]);
+ p128[2] = _mm_loadu_ps(srcRowPtrsForInterp + loc[2]);
+ p128[3] = _mm_loadu_ps(srcRowPtrsForInterp + loc[3]);
+ _MM_TRANSPOSE4_PS(p128[0], p128[1], p128[2], p128[3]);
+ p128[4] = _mm_loadu_ps(srcRowPtrsForInterp + loc[4]);
+ p128[5] = _mm_loadu_ps(srcRowPtrsForInterp + loc[5]);
+ p128[6] = _mm_loadu_ps(srcRowPtrsForInterp + loc[6]);
+ p128[7] = _mm_loadu_ps(srcRowPtrsForInterp + loc[7]);
+ _MM_TRANSPOSE4_PS(p128[4], p128[5], p128[6], p128[7]);
+ p[0] = _mm256_setr_m128(p128[0], p128[4]);
+ p[1] = _mm256_setr_m128(p128[1], p128[5]);
+ p[2] = _mm256_setr_m128(p128[2], p128[6]);
+}
+
+inline void rpp_resize_nn_load_f16pkd3_to_f32pln3_avx(Rpp16f *srcRowPtrsForInterp, Rpp32s *loc, __m256 *p)
+{
+ p[0] = _mm256_setr_ps((Rpp32f)*(srcRowPtrsForInterp + loc[0]), (Rpp32f)*(srcRowPtrsForInterp + loc[1]),
+ (Rpp32f)*(srcRowPtrsForInterp + loc[2]), (Rpp32f)*(srcRowPtrsForInterp + loc[3]),
+ (Rpp32f)*(srcRowPtrsForInterp + loc[4]), (Rpp32f)*(srcRowPtrsForInterp + loc[5]),
+ (Rpp32f)*(srcRowPtrsForInterp + loc[6]), (Rpp32f)*(srcRowPtrsForInterp + loc[7]));
+
+ p[1] = _mm256_setr_ps((Rpp32f)*(srcRowPtrsForInterp + loc[0] + 1), (Rpp32f)*(srcRowPtrsForInterp + loc[1] + 1),
+ (Rpp32f)*(srcRowPtrsForInterp + loc[2] + 1), (Rpp32f)*(srcRowPtrsForInterp + loc[3] + 1),
+ (Rpp32f)*(srcRowPtrsForInterp + loc[4] + 1), (Rpp32f)*(srcRowPtrsForInterp + loc[5] + 1),
+ (Rpp32f)*(srcRowPtrsForInterp + loc[6] + 1), (Rpp32f)*(srcRowPtrsForInterp + loc[7] + 1));
+
+ p[2] = _mm256_setr_ps((Rpp32f)*(srcRowPtrsForInterp + loc[0] + 2), (Rpp32f)*(srcRowPtrsForInterp + loc[1] + 2),
+ (Rpp32f)*(srcRowPtrsForInterp + loc[2] + 2), (Rpp32f)*(srcRowPtrsForInterp + loc[3] + 2),
+ (Rpp32f)*(srcRowPtrsForInterp + loc[4] + 2), (Rpp32f)*(srcRowPtrsForInterp + loc[5] + 2),
+ (Rpp32f)*(srcRowPtrsForInterp + loc[6] + 2), (Rpp32f)*(srcRowPtrsForInterp + loc[7] + 2));
+}
+
inline void rpp_resize_nn_load_f32pln1(Rpp32f *srcRowPtrsForInterp, Rpp32s *loc, __m128 &p)
{
__m128 pTemp[4];
@@ -3892,6 +3952,22 @@ inline void rpp_resize_nn_load_f32pln1(Rpp32f *srcRowPtrsForInterp, Rpp32s *loc,
p = _mm_unpacklo_ps(pTemp[0], pTemp[1]); // Unpack to obtain [R01|R11|R21|R31]
}
+inline void rpp_resize_nn_load_f32pln1_avx(Rpp32f *srcRowPtrsForInterp, Rpp32s *loc, __m256 &p)
+{
+ p = _mm256_setr_ps(*(srcRowPtrsForInterp + loc[0]), *(srcRowPtrsForInterp + loc[1]),
+ *(srcRowPtrsForInterp + loc[2]), *(srcRowPtrsForInterp + loc[3]),
+ *(srcRowPtrsForInterp + loc[4]), *(srcRowPtrsForInterp + loc[5]),
+ *(srcRowPtrsForInterp + loc[6]), *(srcRowPtrsForInterp + loc[7]));
+}
+
+inline void rpp_resize_nn_load_f16pln1_avx(Rpp16f *srcRowPtrsForInterp, Rpp32s *loc, __m256 &p)
+{
+ p = _mm256_setr_ps((Rpp32f)*(srcRowPtrsForInterp + loc[0]), (Rpp32f)*(srcRowPtrsForInterp + loc[1]),
+ (Rpp32f)*(srcRowPtrsForInterp + loc[2]), (Rpp32f)*(srcRowPtrsForInterp + loc[3]),
+ (Rpp32f)*(srcRowPtrsForInterp + loc[4]), (Rpp32f)*(srcRowPtrsForInterp + loc[5]),
+ (Rpp32f)*(srcRowPtrsForInterp + loc[6]), (Rpp32f)*(srcRowPtrsForInterp + loc[7]));
+}
+
inline void rpp_resize_nn_load_i8pkd3(Rpp8s *srcRowPtrsForInterp, Rpp32s *loc, __m128i &p)
{
__m128i px[4];
diff --git a/src/include/hip/rpp_hip_common.hpp b/src/include/hip/rpp_hip_common.hpp
index 16e3a2765..721800c80 100644
--- a/src/include/hip/rpp_hip_common.hpp
+++ b/src/include/hip/rpp_hip_common.hpp
@@ -1944,7 +1944,8 @@ __device__ __forceinline__ float rpp_hip_rng_xorwow_f32(T *xorwowState)
return outFloat - 1; // return 0 <= outFloat < 1
}
-__device__ __forceinline__ void rpp_hip_rng_8_xorwow_f32(RpptXorwowState *xorwowState, d_float8 *randomNumbersPtr_f8)
+template <typename T>
+__device__ __forceinline__ void rpp_hip_rng_8_xorwow_f32(T *xorwowState, d_float8 *randomNumbersPtr_f8)
{
randomNumbersPtr_f8->f1[0] = rpp_hip_rng_xorwow_f32(xorwowState);
randomNumbersPtr_f8->f1[1] = rpp_hip_rng_xorwow_f32(xorwowState);
diff --git a/src/modules/cpu/host_tensor_effects_augmentations.hpp b/src/modules/cpu/host_tensor_effects_augmentations.hpp
index 56d5ea817..ce7450aab 100644
--- a/src/modules/cpu/host_tensor_effects_augmentations.hpp
+++ b/src/modules/cpu/host_tensor_effects_augmentations.hpp
@@ -31,6 +31,7 @@ SOFTWARE.
#include "kernel/noise_shot.hpp"
#include "kernel/noise_gaussian.hpp"
#include "kernel/non_linear_blend.hpp"
+#include "kernel/jitter.hpp"
#include "kernel/glitch.hpp"
#include "kernel/water.hpp"
#include "kernel/ricap.hpp"
diff --git a/src/modules/cpu/kernel/jitter.hpp b/src/modules/cpu/kernel/jitter.hpp
new file mode 100644
index 000000000..ec717150a
--- /dev/null
+++ b/src/modules/cpu/kernel/jitter.hpp
@@ -0,0 +1,929 @@
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+
+// Applies a jitter effect to a batch of U8 images: each destination pixel is
+// replaced by a source pixel picked pseudo-randomly (xorwow RNG) from the
+// kernelSize x kernelSize neighbourhood around it. AVX2-vectorised fast paths
+// cover all four NHWC/NCHW source/destination layout combinations, with scalar
+// tail loops for the non-multiple-of-8 remainder of each row.
+// kernelSizeTensor      - per-image jitter kernel size
+// xorwowInitialStatePtr - initial xorwow state; offset per image for reproducibility
+// roiTensorPtrSrc/roiType - per-image region of interest (validated against full image)
+RppStatus jitter_u8_u8_host_tensor(Rpp8u *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp8u *dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32u *kernelSizeTensor,
+ RpptXorwowStateBoxMuller *xorwowInitialStatePtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32u kernelSize = kernelSizeTensor[batchCount];
+ Rpp32u bound = (kernelSize - 1) / 2; // half-extent of the jitter neighbourhood
+ Rpp32u heightLimit = roi.xywhROI.roiHeight - bound; // clamp so jittered rows stay inside the ROI
+ Rpp32u offset = batchCount * srcDescPtr->strides.nStride; // per-image RNG offset for reproducible streams
+
+ Rpp8u *srcPtrImage, *dstPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp8u *srcPtrChannel, *dstPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+ Rpp32u alignedLength = roi.xywhROI.roiWidth & ~7; // Align dst width to process 8 dst pixels per iteration
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+ RpptXorwowStateBoxMuller xorwowState;
+ Rpp32s srcLocArray[8] = {0};
+
+ __m256i pxXorwowStateX[5], pxXorwowStateCounter;
+ rpp_host_rng_xorwow_state_offsetted_avx(xorwowInitialStatePtr, xorwowState, offset, pxXorwowStateX, &pxXorwowStateCounter);
+ __m256 pKernelSize = _mm256_set1_ps(kernelSize);
+ __m256 pChannel = _mm256_set1_ps(layoutParams.bufferMultiplier);
+ __m256 pHStride = _mm256_set1_ps(srcDescPtr->strides.hStride);
+ __m256 pHeightLimit = _mm256_set1_ps(heightLimit);
+ __m256 pWidthLimit = _mm256_set1_ps(roi.xywhROI.roiWidth - 1);
+ __m256 pBound = _mm256_set1_ps(bound);
+
+ // Jitter with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp8u *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp8u *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i pxRow;
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+ rpp_resize_nn_extract_pkd3_avx(srcPtrChannel, srcLocArray, pxRow);
+ rpp_simd_store(rpp_store24_u8pkd3_to_u8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, pxRow);
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ // Scalar tail for the remaining (roiWidth % 8) pixels of this row
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32s loc;
+ // NOTE(review): this path passes srcDescPtr->c as the per-pixel stride where the
+ // other paths pass layoutParams.bufferMultiplier; both are 3 for NHWC src — confirm.
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, srcDescPtr->c, loc);
+ *dstPtrTempR++ = *(srcPtrChannel + loc);
+ *dstPtrTempG++ = *(srcPtrChannel + 1 + loc);
+ *dstPtrTempB++ = *(srcPtrChannel + 2 + loc);
+ }
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Jitter with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp8u *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp8u *dstPtrTemp;
+ dstPtrTemp = dstPtrRow;
+
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i pxRow[3];
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+ rpp_resize_nn_extract_pln1_avx(srcPtrRowR, srcLocArray, pxRow[0]);
+ rpp_resize_nn_extract_pln1_avx(srcPtrRowG, srcLocArray, pxRow[1]);
+ rpp_resize_nn_extract_pln1_avx(srcPtrRowB, srcLocArray, pxRow[2]);
+ rpp_simd_store(rpp_store24_u8pln3_to_u8pkd3_avx, dstPtrTemp, pxRow);
+ dstPtrTemp += vectorIncrement;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ // Scalar tail for the remaining (roiWidth % 8) pixels of this row
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ *dstPtrTemp++ = *(srcPtrRowR + loc);
+ *dstPtrTemp++ = *(srcPtrRowG + loc);
+ *dstPtrTemp++ = *(srcPtrRowB + loc);
+ }
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Jitter without fused output-layout toggle (NHWC -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp8u *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrChannel;
+ dstPtrRow = dstPtrChannel;
+
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp8u *dstPtrTemp;
+ dstPtrTemp = dstPtrRow;
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i pxRow;
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+ rpp_resize_nn_extract_pkd3_avx(srcPtrRow, srcLocArray, pxRow);
+ rpp_simd_store(rpp_store24_u8_to_u8_avx, dstPtrTemp, pxRow);
+ dstPtrTemp += vectorIncrement;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ // Scalar tail for the remaining (roiWidth % 8) pixels of this row
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ *dstPtrTemp++ = *(srcPtrRow + loc);
+ *dstPtrTemp++ = *(srcPtrRow + 1 + loc);
+ *dstPtrTemp++ = *(srcPtrRow + 2 + loc);
+ }
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Jitter without fused output-layout toggle (NCHW -> NCHW)
+ else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp8u *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp8u *dstPtrTemp;
+ dstPtrTemp = dstPtrRow;
+
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ Rpp8u *dstPtrTempChn, *srcPtrTempChn;
+ srcPtrTempChn = srcPtrChannel;
+ dstPtrTempChn = dstPtrTemp;
+ // One location set is shared across channels so R/G/B stay spatially coherent
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+ for(int c = 0; c < srcDescPtr->c; c++)
+ {
+ __m256i pxRow;
+ rpp_resize_nn_extract_pln1_avx(srcPtrTempChn, srcLocArray, pxRow);
+ rpp_storeu_si64((__m128i *)(dstPtrTempChn), _mm256_castsi256_si128(pxRow));
+ srcPtrTempChn += srcDescPtr->strides.cStride;
+ dstPtrTempChn += dstDescPtr->strides.cStride;
+ }
+ dstPtrTemp += vectorIncrementPerChannel;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ // Scalar tail for the remaining (roiWidth % 8) pixels of this row
+ for (;vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp8u *dstPtrTempChn = dstPtrTemp;
+ Rpp8u *srcPtrTempChn = srcPtrChannel;
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ for(int c = 0; c < srcDescPtr->c; c++)
+ {
+ *dstPtrTempChn = *(srcPtrTempChn + loc);
+ srcPtrTempChn += srcDescPtr->strides.cStride;
+ dstPtrTempChn += dstDescPtr->strides.cStride;
+ }
+ dstPtrTemp++;
+ }
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+
+// Applies a jitter effect to a batch of F32 images: each destination pixel is
+// replaced by a source pixel picked pseudo-randomly (xorwow RNG) from the
+// kernelSize x kernelSize neighbourhood around it. AVX2-vectorised fast paths
+// cover all four NHWC/NCHW source/destination layout combinations.
+RppStatus jitter_f32_f32_host_tensor(Rpp32f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32u *kernelSizeTensor,
+ RpptXorwowStateBoxMuller *xorwowInitialStatePtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32u kernelSize = kernelSizeTensor[batchCount];
+ Rpp32u bound = (kernelSize - 1) / 2; // half-extent of the jitter neighbourhood
+ Rpp32u heightLimit = roi.xywhROI.roiHeight - bound; // clamp so jittered rows stay inside the ROI
+ Rpp32u offset = batchCount * srcDescPtr->strides.nStride; // per-image RNG offset for reproducible streams
+
+ Rpp32f *srcPtrImage, *dstPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32f *srcPtrChannel, *dstPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+ Rpp32u alignedLength = roi.xywhROI.roiWidth & ~7; // Align dst width to process 8 dst pixels per iteration
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+ RpptXorwowStateBoxMuller xorwowState;
+ Rpp32s srcLocArray[8] = {0};
+
+ __m256i pxXorwowStateX[5], pxXorwowStateCounter;
+ rpp_host_rng_xorwow_state_offsetted_avx(xorwowInitialStatePtr, xorwowState, offset, pxXorwowStateX, &pxXorwowStateCounter);
+ __m256 pKernelSize = _mm256_set1_ps(kernelSize);
+ __m256 pChannel = _mm256_set1_ps(layoutParams.bufferMultiplier);
+ __m256 pHStride = _mm256_set1_ps(srcDescPtr->strides.hStride);
+ __m256 pHeightLimit = _mm256_set1_ps(heightLimit);
+ __m256 pWidthLimit = _mm256_set1_ps(roi.xywhROI.roiWidth-1);
+ __m256 pBound = _mm256_set1_ps(bound);
+
+
+ // Jitter with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32f *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp32f *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 pxRow[3];
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+ rpp_simd_load(rpp_resize_nn_load_f32pkd3_to_f32pln3_avx, srcPtrChannel, srcLocArray, pxRow);
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, pxRow);
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ // Scalar tail for the remaining (roiWidth % 8) pixels of this row
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ *dstPtrTempR++ = *(srcPtrChannel + loc);
+ *dstPtrTempG++ = *(srcPtrChannel + 1 + loc);
+ *dstPtrTempB++ = *(srcPtrChannel + 2 + loc);
+ }
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Jitter with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32f *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp32f *dstPtrTemp;
+ dstPtrTemp = dstPtrRow;
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 pxRow[4];
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+ rpp_simd_load(rpp_resize_nn_load_f32pln1_avx, srcPtrRowR, srcLocArray, pxRow[0]);
+ rpp_simd_load(rpp_resize_nn_load_f32pln1_avx, srcPtrRowG, srcLocArray, pxRow[1]);
+ rpp_simd_load(rpp_resize_nn_load_f32pln1_avx, srcPtrRowB, srcLocArray, pxRow[2]);
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, pxRow);
+ dstPtrTemp += vectorIncrement;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ // Scalar tail for the remaining (roiWidth % 8) pixels of this row
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ *dstPtrTemp++ = *(srcPtrRowR + loc);
+ *dstPtrTemp++ = *(srcPtrRowG + loc);
+ *dstPtrTemp++ = *(srcPtrRowB + loc);
+ }
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Jitter without fused output-layout toggle (NHWC -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32f *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrChannel;
+ dstPtrRow = dstPtrChannel;
+
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp32f *dstPtrTemp;
+ dstPtrTemp = dstPtrRow;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ // NOTE(review): this loop loads and stores 8 floats per pixel while advancing
+ // dstPtrTemp by only 3 — the last pixels of each row write up to 5 floats past
+ // the row end, and without __AVX2__ this path copies nothing. Confirm and fix
+ // (the scalar 3-channel copy used by the other dtypes would be safe here).
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32s loc;
+ __m256 pRow;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, (srcPtrChannel + loc), &pRow);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp, &pRow);
+ dstPtrTemp += 3;
+ }
+#endif
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ // Jitter without fused output-layout toggle (NCHW -> NCHW)
+ else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32f *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp32f *dstPtrTemp;
+ dstPtrTemp = dstPtrRow;
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ Rpp32f *srcPtrTempChn, *dstPtrTempChn;
+ srcPtrTempChn = srcPtrChannel;
+ dstPtrTempChn = dstPtrTemp;
+ // One location set is shared across channels so R/G/B stay spatially coherent
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+
+ for (int c = 0; c < dstDescPtr->c; c++)
+ {
+ __m256 pxRow;
+ rpp_simd_load(rpp_resize_nn_load_f32pln1_avx, srcPtrTempChn, srcLocArray, pxRow);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTempChn, &pxRow);
+ srcPtrTempChn += srcDescPtr->strides.cStride;
+ dstPtrTempChn += dstDescPtr->strides.cStride;
+ }
+ dstPtrTemp += vectorIncrementPerChannel;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ // Scalar tail for the remaining (roiWidth % 8) pixels of this row
+ for (;vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32f *dstPtrTempChn = dstPtrTemp;
+ Rpp32f *srcPtrTempChn = srcPtrChannel;
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ for(int c = 0; c < srcDescPtr->c; c++)
+ {
+ *dstPtrTempChn = (Rpp32f)*(srcPtrTempChn + loc);
+ srcPtrTempChn += srcDescPtr->strides.cStride;
+ dstPtrTempChn += dstDescPtr->strides.cStride;
+ }
+ dstPtrTemp++;
+ }
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+
+// Applies a jitter effect to a batch of F16 images: each destination pixel is
+// replaced by a source pixel picked pseudo-randomly (xorwow RNG) from the
+// kernelSize x kernelSize neighbourhood around it. AVX2-vectorised fast paths
+// (computing in FP32, converting back to FP16 on store) cover the layout
+// combinations; the NHWC -> NHWC path is scalar (see note in that branch).
+RppStatus jitter_f16_f16_host_tensor(Rpp16f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp16f *dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32u *kernelSizeTensor,
+ RpptXorwowStateBoxMuller *xorwowInitialStatePtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32u kernelSize = kernelSizeTensor[batchCount];
+ Rpp32u bound = (kernelSize - 1) / 2; // half-extent of the jitter neighbourhood
+ Rpp32u heightLimit = roi.xywhROI.roiHeight - bound; // clamp so jittered rows stay inside the ROI
+ Rpp32u offset = batchCount * srcDescPtr->strides.nStride; // per-image RNG offset for reproducible streams
+
+ Rpp16f *srcPtrImage, *dstPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp16f *srcPtrChannel, *dstPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+ Rpp32u alignedLength = roi.xywhROI.roiWidth & ~7; // Align dst width to process 8 dst pixels per iteration
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+ RpptXorwowStateBoxMuller xorwowState;
+ Rpp32s srcLocArray[8] = {0};
+
+ __m256i pxXorwowStateX[5], pxXorwowStateCounter;
+ rpp_host_rng_xorwow_state_offsetted_avx(xorwowInitialStatePtr, xorwowState, offset, pxXorwowStateX, &pxXorwowStateCounter);
+ __m256 pKernelSize = _mm256_set1_ps(kernelSize);
+ __m256 pChannel = _mm256_set1_ps(layoutParams.bufferMultiplier);
+ __m256 pHStride = _mm256_set1_ps(srcDescPtr->strides.hStride);
+ __m256 pHeightLimit = _mm256_set1_ps(heightLimit);
+ __m256 pWidthLimit = _mm256_set1_ps(roi.xywhROI.roiWidth-1);
+ __m256 pBound = _mm256_set1_ps(bound);
+
+
+ // Jitter with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp16f *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp16f *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ Rpp32f dstPtrTempR_ps[8], dstPtrTempG_ps[8], dstPtrTempB_ps[8]; // FP32 staging before narrowing to FP16
+ __m256 pxRow[3];
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+ rpp_simd_load(rpp_resize_nn_load_f16pkd3_to_f32pln3_avx, srcPtrChannel, srcLocArray, pxRow);
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR_ps, dstPtrTempG_ps, dstPtrTempB_ps, pxRow);
+ for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++)
+ {
+ dstPtrTempR[cnt] = (Rpp16f) dstPtrTempR_ps[cnt];
+ dstPtrTempG[cnt] = (Rpp16f) dstPtrTempG_ps[cnt];
+ dstPtrTempB[cnt] = (Rpp16f) dstPtrTempB_ps[cnt];
+ }
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ // Scalar tail for the remaining (roiWidth % 8) pixels of this row
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ *dstPtrTempR++ = *(srcPtrChannel + loc);
+ *dstPtrTempG++ = *(srcPtrChannel + 1 + loc);
+ *dstPtrTempB++ = *(srcPtrChannel + 2 + loc);
+ }
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Jitter with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp16f *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp16f *dstPtrTemp;
+ dstPtrTemp = dstPtrRow;
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ Rpp32f dstPtrTemp_ps[25]; // FP32 staging; 1 lane of slack over the 24 stored — presumably for the pkd3 store, confirm against rpp_store24_f32pln3_to_f32pkd3_avx
+ __m256 pxRow[4];
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+ rpp_simd_load(rpp_resize_nn_load_f16pln1_avx, srcPtrRowR, srcLocArray, pxRow[0]);
+ rpp_simd_load(rpp_resize_nn_load_f16pln1_avx, srcPtrRowG, srcLocArray, pxRow[1]);
+ rpp_simd_load(rpp_resize_nn_load_f16pln1_avx, srcPtrRowB, srcLocArray, pxRow[2]);
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp_ps, pxRow);
+ for(int cnt = 0; cnt < vectorIncrement; cnt++)
+ dstPtrTemp[cnt] = (Rpp16f) dstPtrTemp_ps[cnt];
+ dstPtrTemp += vectorIncrement;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ // Scalar tail for the remaining (roiWidth % 8) pixels of this row
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ *dstPtrTemp++ = *(srcPtrRowR + loc);
+ *dstPtrTemp++ = *(srcPtrRowG + loc);
+ *dstPtrTemp++ = *(srcPtrRowB + loc);
+ }
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Jitter without fused output-layout toggle (NHWC -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp16f *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrChannel;
+ dstPtrRow = dstPtrChannel;
+
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp16f *dstPtrTemp;
+ dstPtrTemp = dstPtrRow;
+ // Scalar path: the previous implementation read srcPtrChannel[loc + cnt]
+ // BEFORE compute_jitter_src_loc assigned loc (uninitialized read), copied
+ // 8 values per pixel while advancing by 3 (writing past the end of each
+ // row for the last pixels), and was compiled out entirely without AVX2.
+ // Copying exactly the 3 packed channels per pixel fixes all three issues
+ // and is what the SIMD round-trip reduced to anyway.
+ for (int vectorLoopCount = 0; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ *dstPtrTemp++ = *(srcPtrChannel + loc);
+ *dstPtrTemp++ = *(srcPtrChannel + 1 + loc);
+ *dstPtrTemp++ = *(srcPtrChannel + 2 + loc);
+ }
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ // Jitter without fused output-layout toggle (NCHW -> NCHW)
+ else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp16f *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp16f *dstPtrTemp;
+ dstPtrTemp = dstPtrRow;
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ Rpp16f *srcPtrTempChn, *dstPtrTempChn;
+ srcPtrTempChn = srcPtrChannel;
+ dstPtrTempChn = dstPtrTemp;
+ // One location set is shared across channels so R/G/B stay spatially coherent
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+
+ for (int c = 0; c < dstDescPtr->c; c++)
+ {
+ Rpp32f dstPtrTemp_ps[8]; // FP32 staging before narrowing to FP16
+ __m256 pxRow;
+ rpp_simd_load(rpp_resize_nn_load_f16pln1_avx, srcPtrTempChn, srcLocArray, pxRow);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp_ps, &pxRow);
+ for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++)
+ {
+ dstPtrTempChn[cnt] = (Rpp16f) dstPtrTemp_ps[cnt];
+ }
+ srcPtrTempChn += srcDescPtr->strides.cStride;
+ dstPtrTempChn += dstDescPtr->strides.cStride;
+ }
+ dstPtrTemp += vectorIncrementPerChannel;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ // Scalar tail for the remaining (roiWidth % 8) pixels of this row
+ for (;vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp16f *dstPtrTempChn = dstPtrTemp;
+ Rpp16f *srcPtrTempChn = srcPtrChannel;
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ for(int c = 0; c < srcDescPtr->c; c++)
+ {
+ *dstPtrTempChn = (Rpp16f)*(srcPtrTempChn + loc);
+ srcPtrTempChn += srcDescPtr->strides.cStride;
+ dstPtrTempChn += dstDescPtr->strides.cStride;
+ }
+ dstPtrTemp++;
+ }
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+
+RppStatus jitter_i8_i8_host_tensor(Rpp8s *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp8s *dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32u *kernelSizeTensor,
+ RpptXorwowStateBoxMuller *xorwowInitialStatePtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32u kernelSize = kernelSizeTensor[batchCount];
+ Rpp32u bound = (kernelSize - 1) / 2;
+ Rpp32u heightLimit = roi.xywhROI.roiHeight - bound;
+ Rpp32u offset = batchCount * srcDescPtr->strides.nStride;
+
+ Rpp8s *srcPtrImage, *dstPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp8s *srcPtrChannel, *dstPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+ Rpp32u alignedLength = roi.xywhROI.roiWidth & ~7; // Align dst width to process 4 dst pixels per iteration
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+ RpptXorwowStateBoxMuller xorwowState;
+ Rpp32s srcLocArray[8] = {0};
+
+ __m256i pxXorwowStateX[5], pxXorwowStateCounter;
+ rpp_host_rng_xorwow_state_offsetted_avx(xorwowInitialStatePtr, xorwowState, offset, pxXorwowStateX, &pxXorwowStateCounter);
+ __m256 pKernelSize = _mm256_set1_ps(kernelSize);
+ __m256 pChannel = _mm256_set1_ps(layoutParams.bufferMultiplier);
+ __m256 pHStride = _mm256_set1_ps(srcDescPtr->strides.hStride);
+ __m256 pHeightLimit = _mm256_set1_ps(heightLimit);
+ __m256 pWidthLimit = _mm256_set1_ps(roi.xywhROI.roiWidth-1);
+ __m256 pBound = _mm256_set1_ps(bound);
+
+ // Jitter with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp8s *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp8s *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i pxRow;
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+ rpp_resize_nn_extract_pkd3_avx(srcPtrChannel, srcLocArray, pxRow);
+ rpp_simd_store(rpp_store24_i8pkd3_to_i8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, pxRow);
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ *dstPtrTempR++ = *(srcPtrChannel + loc);
+ *dstPtrTempG++ = *(srcPtrChannel + 1 + loc);
+ *dstPtrTempB++ = *(srcPtrChannel + 2 + loc);
+ }
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Jitter with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp8s *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp8s *dstPtrTemp;
+ dstPtrTemp = dstPtrRow;
+
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i pxRow[3];
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+ rpp_resize_nn_extract_pln1_avx(srcPtrRowR, srcLocArray, pxRow[0]);
+ rpp_resize_nn_extract_pln1_avx(srcPtrRowG, srcLocArray, pxRow[1]);
+ rpp_resize_nn_extract_pln1_avx(srcPtrRowB, srcLocArray, pxRow[2]);
+ rpp_simd_store(rpp_store24_i8pln3_to_i8pkd3_avx, dstPtrTemp, pxRow);
+ dstPtrTemp += vectorIncrement;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ *dstPtrTemp++ = *(srcPtrRowR + loc);
+ *dstPtrTemp++ = *(srcPtrRowG + loc);
+ *dstPtrTemp++ = *(srcPtrRowB + loc);
+ }
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Jitter without fused output-layout toggle (NHWC -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp8s *srcPtrRow, *dstPtrRow;
+ srcPtrRow = srcPtrChannel;
+ dstPtrRow = dstPtrChannel;
+
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp8s *dstPtrTemp;
+ dstPtrTemp = dstPtrRow;
+
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i pxRow;
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+ rpp_resize_nn_extract_pkd3_avx(srcPtrRow, srcLocArray, pxRow);
+ rpp_simd_store(rpp_store24_i8_to_i8_avx, dstPtrTemp, pxRow);
+ dstPtrTemp += vectorIncrement;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ *dstPtrTemp++ = (Rpp8s)*(srcPtrRow + loc);
+ *dstPtrTemp++ = (Rpp8s)*(srcPtrRow + 1 + loc);
+ *dstPtrTemp++ = (Rpp8s)*(srcPtrRow + 2 + loc);
+ }
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ // Jitter with fused output-layout toggle (NCHW -> NCHW)
+ else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp8s *dstPtrRow;
+ dstPtrRow = dstPtrChannel;
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++)
+ {
+ Rpp8s *dstPtrTemp;
+ dstPtrTemp = dstPtrRow;
+
+ __m256 pRow = _mm256_set1_ps(dstLocRow);
+ __m256 pCol = avx_pDstLocInit;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ Rpp8s *dstPtrTempChn, *srcPtrTempChn;
+ srcPtrTempChn = srcPtrChannel;
+ dstPtrTempChn = dstPtrTemp;
+ compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray);
+ for(int c = 0; c < srcDescPtr->c; c++)
+ {
+ __m256i pxRow;
+ rpp_resize_nn_extract_pln1_avx(srcPtrTempChn, srcLocArray, pxRow);
+ rpp_storeu_si64((__m128i *)(dstPtrTempChn), _mm256_castsi256_si128(pxRow));
+ srcPtrTempChn += srcDescPtr->strides.cStride;
+ dstPtrTempChn += dstDescPtr->strides.cStride;
+ }
+ dstPtrTemp += vectorIncrementPerChannel;
+ pCol = _mm256_add_ps(avx_p8, pCol);
+ }
+#endif
+ for (;vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++)
+ {
+ Rpp8s *dstPtrTempChn = dstPtrTemp;
+ Rpp8s *srcPtrTempChn = srcPtrChannel;
+ Rpp32s loc;
+ compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc);
+ for(int c = 0; c < srcDescPtr->c; c++)
+ {
+ *dstPtrTempChn = (Rpp8s)*(srcPtrTempChn + loc);
+ srcPtrTempChn += srcDescPtr->strides.cStride;
+ dstPtrTempChn += dstDescPtr->strides.cStride;
+ }
+ dstPtrTemp++;
+ }
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
diff --git a/src/modules/hip/handlehip.cpp b/src/modules/hip/handlehip.cpp
index 42e72db98..08eb93674 100644
--- a/src/modules/hip/handlehip.cpp
+++ b/src/modules/hip/handlehip.cpp
@@ -239,7 +239,12 @@ struct HandleImpl
}
hipMalloc(&(this->initHandle->mem.mgpu.rgbArr.rgbmem), sizeof(RpptRGB) * this->nBatchSize);
- hipMalloc(&(this->initHandle->mem.mgpu.scratchBufferHip.floatmem), sizeof(Rpp32f) * 8294400); // 3840 x 2160
+
+ /* (600000 + 293 + 128) * 128 - Maximum scratch memory required for Non Silent Region Detection HIP kernel used in RNNT training (uses a batchsize 128)
+ - 600000 is the maximum size that will be required for MMS buffer based on Librispeech dataset
+ - 293 is the size required for storing reduction outputs for 600000 size sample
+ - 128 is the size required for storing cutOffDB values for batch size 128 */
+ hipMalloc(&(this->initHandle->mem.mgpu.scratchBufferHip.floatmem), sizeof(Rpp32f) * 76853888);
}
};
diff --git a/src/modules/hip/hip_tensor_audio_augmentations.hpp b/src/modules/hip/hip_tensor_audio_augmentations.hpp
index 53b80c8ee..a5b83715b 100644
--- a/src/modules/hip/hip_tensor_audio_augmentations.hpp
+++ b/src/modules/hip/hip_tensor_audio_augmentations.hpp
@@ -25,6 +25,7 @@ SOFTWARE.
#ifndef HIP_TENSOR_AUDIO_AUGMENTATIONS_HPP
#define HIP_TENSOR_AUDIO_AUGMENTATIONS_HPP
+#include "kernel/non_silent_region_detection.hpp"
#include "kernel/to_decibels.hpp"
#endif // HIP_TENSOR_AUDIO_AUGMENTATIONS_HPP
diff --git a/src/modules/hip/hip_tensor_effects_augmentations.hpp b/src/modules/hip/hip_tensor_effects_augmentations.hpp
index f1da2cdb9..12e80a1f4 100644
--- a/src/modules/hip/hip_tensor_effects_augmentations.hpp
+++ b/src/modules/hip/hip_tensor_effects_augmentations.hpp
@@ -31,6 +31,7 @@ SOFTWARE.
#include "kernel/noise_shot.hpp"
#include "kernel/noise_gaussian.hpp"
#include "kernel/non_linear_blend.hpp"
+#include "kernel/jitter.hpp"
#include "kernel/glitch.hpp"
#include "kernel/water.hpp"
#include "kernel/ricap.hpp"
diff --git a/src/modules/hip/kernel/jitter.hpp b/src/modules/hip/kernel/jitter.hpp
new file mode 100644
index 000000000..bbc407cda
--- /dev/null
+++ b/src/modules/hip/kernel/jitter.hpp
@@ -0,0 +1,314 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+#include "rng_seed_stream.hpp"
+
+// Computes the 8 jittered source (x, y) locations for one 8-pixel vectorized destination span
+// starting at (id_x, id_y). A per-pixel random displacement in [0, kernelSize) is drawn from the
+// xorwow RNG for each axis, recentered by subtracting bound (= (kernelSize - 1) / 2), then the
+// locations are floored and clamped into the source ROI for nearest-neighbor fetching.
+__device__ __forceinline__ void jitter_roi_and_srclocs_hip_compute(int4 *srcRoiPtr_i4, RpptXorwowStateBoxMuller *xorwowState, uint kernelSize, uint bound, int id_x, int id_y, d_float16 *locSrc_f16)
+{
+ d_float8 widthIncrement_f8, heightIncrement_f8;
+ rpp_hip_rng_8_xorwow_f32(xorwowState, &widthIncrement_f8);
+ rpp_hip_math_multiply8_const(&widthIncrement_f8, &widthIncrement_f8, static_cast<float4>(kernelSize));
+ rpp_hip_rng_8_xorwow_f32(xorwowState, &heightIncrement_f8);
+ rpp_hip_math_multiply8_const(&heightIncrement_f8, &heightIncrement_f8, static_cast<float4>(kernelSize));
+
+ d_float8 increment_f8, locDst_f8x, locDst_f8y;
+ increment_f8.f4[0] = make_float4(0.0f, 1.0f, 2.0f, 3.0f); // 8 element vectorized kernel needs 8 increments - creating float4 for increments 0, 1, 2, 3 here, and 4, 5, 6, 7 below
+ increment_f8.f4[1] = make_float4(4.0f, 5.0f, 6.0f, 7.0f);
+ locDst_f8x.f4[0] = static_cast<float4>(id_x) + increment_f8.f4[0];
+ locDst_f8x.f4[1] = static_cast<float4>(id_x) + increment_f8.f4[1];
+ locDst_f8y.f4[0] = locDst_f8y.f4[1] = (float4)id_y;
+
+ // f8[0] holds the 8 jittered src x locations, f8[1] the 8 jittered src y locations
+ locSrc_f16->f8[0].f4[0] = static_cast<float4>(srcRoiPtr_i4->x) + locDst_f8x.f4[0] + widthIncrement_f8.f4[0] - static_cast<float4>(bound);
+ locSrc_f16->f8[0].f4[1] = static_cast<float4>(srcRoiPtr_i4->x) + locDst_f8x.f4[1] + widthIncrement_f8.f4[1] - static_cast<float4>(bound);
+ locSrc_f16->f8[1].f4[0] = static_cast<float4>(srcRoiPtr_i4->y) + locDst_f8y.f4[0] + heightIncrement_f8.f4[0] - static_cast<float4>(bound);
+ locSrc_f16->f8[1].f4[1] = static_cast<float4>(srcRoiPtr_i4->y) + locDst_f8y.f4[1] + heightIncrement_f8.f4[1] - static_cast<float4>(bound);
+
+ // Apply boundary checks and adjustments
+ // NOTE(review): x is clamped to roiWidth - 1 while y is clamped to roiHeight - bound - confirm the asymmetry is intended
+ for(int i = 0; i < 8; ++i)
+ {
+ locSrc_f16->f1[i] = fmaxf(fminf(floorf(locSrc_f16->f1[i]), static_cast<float>(srcRoiPtr_i4->z - 1)), 0.0f);
+ locSrc_f16->f1[i + 8] = fmaxf(fminf(floorf(locSrc_f16->f1[i + 8]), static_cast<float>(srcRoiPtr_i4->w - bound)), 0.0f);
+ }
+}
+
+// Jitter kernel for packed NHWC src -> packed NHWC dst.
+// Each thread produces 8 consecutive RGB pixels of one row of image id_z in the batch.
+template <typename T>
+__global__ void jitter_pkd_tensor(T *srcPtr,
+ uint2 srcStridesNH,
+ T *dstPtr,
+ uint2 dstStridesNH,
+ uint *kernelsize,
+ RpptXorwowStateBoxMuller *xorwowInitialStatePtr,
+ uint *xorwowSeedStream,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNH.x);
+ uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + (id_x * 3);
+ uint seedStreamIdx = (id_y * dstStridesNH.y) + (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x;
+ uint kernelSize = kernelsize[id_z];
+ uint bound = (kernelSize - 1) / 2;
+
+ // derive a decorrelated per-thread xorwow state by offsetting the initial state with a seed from the stream
+ RpptXorwowStateBoxMuller xorwowState;
+ uint xorwowSeed = xorwowSeedStream[seedStreamIdx % SEED_STREAM_MAX_SIZE];
+ xorwowState.x[0] = xorwowInitialStatePtr->x[0] + xorwowSeed;
+ xorwowState.x[1] = xorwowInitialStatePtr->x[1] + xorwowSeed;
+ xorwowState.x[2] = xorwowInitialStatePtr->x[2] + xorwowSeed;
+ xorwowState.x[3] = xorwowInitialStatePtr->x[3] + xorwowSeed;
+ xorwowState.x[4] = xorwowInitialStatePtr->x[4] + xorwowSeed;
+ xorwowState.counter = xorwowInitialStatePtr->counter + xorwowSeed;
+
+ int4 srcRoi_i4 = *(int4 *)&roiTensorPtrSrc[id_z];
+ d_float16 locSrc_f16;
+ jitter_roi_and_srclocs_hip_compute(&srcRoi_i4, &xorwowState, kernelSize, bound, id_x, id_y, &locSrc_f16);
+
+ // nearest-neighbor gather of 8 RGB pixels from the jittered locations, packed store to dst
+ d_float24 dst_f24;
+ rpp_hip_interpolate24_nearest_neighbor_pkd3(srcPtr + srcIdx, srcStridesNH.y, &locSrc_f16, &srcRoi_i4, &dst_f24);
+ rpp_hip_pack_float24_pkd3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24);
+}
+
+// Jitter kernel for planar NCHW src -> planar NCHW dst (1 or 3 channels).
+// Each thread produces 8 consecutive pixels of one row; the same jittered locations
+// are reused for all channels so the displacement is identical across R, G, B.
+template <typename T>
+__global__ void jitter_pln_tensor(T *srcPtr,
+ uint3 srcStridesNCH,
+ T *dstPtr,
+ uint3 dstStridesNCH,
+ int channelsDst,
+ uint *kernelsize,
+ RpptXorwowStateBoxMuller *xorwowInitialStatePtr,
+ uint *xorwowSeedStream,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNCH.x);
+ uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x;
+ uint seedStreamIdx = (id_y * dstStridesNCH.z) + (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x;
+ uint kernelSize = kernelsize[id_z];
+ uint bound = (kernelSize - 1) / 2;
+
+ // derive a decorrelated per-thread xorwow state by offsetting the initial state with a seed from the stream
+ RpptXorwowStateBoxMuller xorwowState;
+ uint xorwowSeed = xorwowSeedStream[seedStreamIdx % SEED_STREAM_MAX_SIZE];
+ xorwowState.x[0] = xorwowInitialStatePtr->x[0] + xorwowSeed;
+ xorwowState.x[1] = xorwowInitialStatePtr->x[1] + xorwowSeed;
+ xorwowState.x[2] = xorwowInitialStatePtr->x[2] + xorwowSeed;
+ xorwowState.x[3] = xorwowInitialStatePtr->x[3] + xorwowSeed;
+ xorwowState.x[4] = xorwowInitialStatePtr->x[4] + xorwowSeed;
+ xorwowState.counter = xorwowInitialStatePtr->counter + xorwowSeed;
+
+ int4 srcRoi_i4 = *(int4 *)&roiTensorPtrSrc[id_z];
+ d_float16 locSrc_f16;
+ jitter_roi_and_srclocs_hip_compute(&srcRoi_i4, &xorwowState, kernelSize, bound, id_x, id_y, &locSrc_f16);
+
+ d_float8 dst_f8;
+ rpp_hip_interpolate8_nearest_neighbor_pln1(srcPtr + srcIdx, srcStridesNCH.z, &locSrc_f16, &srcRoi_i4, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+
+ // repeat the gather for the remaining two planes with the same locations
+ if (channelsDst == 3)
+ {
+ srcIdx += srcStridesNCH.y;
+ dstIdx += dstStridesNCH.y;
+
+ rpp_hip_interpolate8_nearest_neighbor_pln1(srcPtr + srcIdx, srcStridesNCH.z, &locSrc_f16, &srcRoi_i4, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+
+ srcIdx += srcStridesNCH.y;
+ dstIdx += dstStridesNCH.y;
+
+ rpp_hip_interpolate8_nearest_neighbor_pln1(srcPtr + srcIdx, srcStridesNCH.z, &locSrc_f16, &srcRoi_i4, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+ }
+}
+
+// Jitter kernel with fused layout toggle: packed NHWC src -> planar NCHW dst.
+// Each thread gathers 8 RGB pixels from jittered locations and stores them plane-wise.
+template <typename T>
+__global__ void jitter_pkd3_pln3_tensor(T *srcPtr,
+ uint2 srcStridesNH,
+ T *dstPtr,
+ uint3 dstStridesNCH,
+ uint *kernelsize,
+ RpptXorwowStateBoxMuller *xorwowInitialStatePtr,
+ uint *xorwowSeedStream,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNH.x);
+ uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x;
+ uint seedStreamIdx = (id_y * dstStridesNCH.z) + (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x;
+ uint kernelSize = kernelsize[id_z];
+ uint bound = (kernelSize - 1) / 2;
+
+ // derive a decorrelated per-thread xorwow state by offsetting the initial state with a seed from the stream
+ RpptXorwowStateBoxMuller xorwowState;
+ uint xorwowSeed = xorwowSeedStream[seedStreamIdx % SEED_STREAM_MAX_SIZE];
+ xorwowState.x[0] = xorwowInitialStatePtr->x[0] + xorwowSeed;
+ xorwowState.x[1] = xorwowInitialStatePtr->x[1] + xorwowSeed;
+ xorwowState.x[2] = xorwowInitialStatePtr->x[2] + xorwowSeed;
+ xorwowState.x[3] = xorwowInitialStatePtr->x[3] + xorwowSeed;
+ xorwowState.x[4] = xorwowInitialStatePtr->x[4] + xorwowSeed;
+ xorwowState.counter = xorwowInitialStatePtr->counter + xorwowSeed;
+
+ int4 srcRoi_i4 = *(int4 *)&roiTensorPtrSrc[id_z];
+ d_float16 locSrc_f16;
+ jitter_roi_and_srclocs_hip_compute(&srcRoi_i4, &xorwowState, kernelSize, bound, id_x, id_y, &locSrc_f16);
+
+ d_float24 dst_f24;
+ rpp_hip_interpolate24_nearest_neighbor_pkd3(srcPtr + srcIdx, srcStridesNH.y, &locSrc_f16, &srcRoi_i4, &dst_f24);
+ rpp_hip_pack_float24_pkd3_and_store24_pln3(dstPtr + dstIdx, dstStridesNCH.y, &dst_f24);
+}
+
+// Jitter kernel with fused layout toggle: planar NCHW src -> packed NHWC dst.
+// Each thread gathers 8 pixels per plane from jittered locations and stores them interleaved.
+template <typename T>
+__global__ void jitter_pln3_pkd3_tensor(T *srcPtr,
+ uint3 srcStridesNCH,
+ T *dstPtr,
+ uint2 dstStridesNH,
+ uint *kernelsize,
+ RpptXorwowStateBoxMuller *xorwowInitialStatePtr,
+ uint *xorwowSeedStream,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNCH.x);
+ uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + (id_x * 3);
+ uint seedStreamIdx = (id_y * dstStridesNH.y) + (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x;
+ uint kernelSize = kernelsize[id_z];
+ uint bound = (kernelSize - 1) / 2;
+
+ // derive a decorrelated per-thread xorwow state by offsetting the initial state with a seed from the stream
+ RpptXorwowStateBoxMuller xorwowState;
+ uint xorwowSeed = xorwowSeedStream[seedStreamIdx % SEED_STREAM_MAX_SIZE];
+ xorwowState.x[0] = xorwowInitialStatePtr->x[0] + xorwowSeed;
+ xorwowState.x[1] = xorwowInitialStatePtr->x[1] + xorwowSeed;
+ xorwowState.x[2] = xorwowInitialStatePtr->x[2] + xorwowSeed;
+ xorwowState.x[3] = xorwowInitialStatePtr->x[3] + xorwowSeed;
+ xorwowState.x[4] = xorwowInitialStatePtr->x[4] + xorwowSeed;
+ xorwowState.counter = xorwowInitialStatePtr->counter + xorwowSeed;
+
+ int4 srcRoi_i4 = *(int4 *)&roiTensorPtrSrc[id_z];
+ d_float16 locSrc_f16;
+ jitter_roi_and_srclocs_hip_compute(&srcRoi_i4, &xorwowState, kernelSize, bound, id_x, id_y, &locSrc_f16);
+
+ d_float24 dst_f24;
+ rpp_hip_interpolate24_nearest_neighbor_pln3(srcPtr + srcIdx, &srcStridesNCH, &locSrc_f16, &srcRoi_i4, &dst_f24);
+ rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24);
+}
+
+// Host-side executor for the jitter tensor op.
+// Converts LTRB ROIs to XYWH if needed, uploads the RNG seed stream to the scratch area
+// following the initial xorwow state, then dispatches the layout-specific kernel
+// (NHWC->NHWC, NCHW->NCHW, and the two fused layout-toggle variants).
+// Returns RPP_SUCCESS; kernel launches are asynchronous on the handle's stream.
+template <typename T>
+RppStatus hip_exec_jitter_tensor(T *srcPtr,
+ RpptDescPtr srcDescPtr,
+ T *dstPtr,
+ RpptDescPtr dstDescPtr,
+ uint *kernelSizeTensor,
+ RpptXorwowStateBoxMuller *xorwowInitialStatePtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rpp::Handle& handle)
+{
+ if (roiType == RpptRoiType::LTRB)
+ hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle);
+
+ // grid x is sized from the row stride (elements per row); kernels bounds-check against roiWidth
+ int globalThreads_x = (dstDescPtr->strides.hStride + 7) >> 3;
+ int globalThreads_y = dstDescPtr->h;
+ int globalThreads_z = dstDescPtr->n;
+
+ // seed stream lives immediately after the initial xorwow state in device scratch memory
+ Rpp32u *xorwowSeedStream;
+ xorwowSeedStream = (Rpp32u *)&xorwowInitialStatePtr[1];
+ CHECK_RETURN_STATUS(hipMemcpyAsync(xorwowSeedStream, rngSeedStream4050, SEED_STREAM_MAX_SIZE * sizeof(Rpp32u), hipMemcpyHostToDevice, handle.GetStream()));
+
+ if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ hipLaunchKernelGGL(jitter_pkd_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride),
+ kernelSizeTensor,
+ xorwowInitialStatePtr,
+ xorwowSeedStream,
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ hipLaunchKernelGGL(jitter_pln_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride),
+ dstDescPtr->c,
+ kernelSizeTensor,
+ xorwowInitialStatePtr,
+ xorwowSeedStream,
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->c == 3) && (dstDescPtr->c == 3))
+ {
+ if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ hipLaunchKernelGGL(jitter_pkd3_pln3_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride),
+ kernelSizeTensor,
+ xorwowInitialStatePtr,
+ xorwowSeedStream,
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ // planar src rows are shorter than packed dst rows - resize the grid to the src row stride
+ globalThreads_x = (srcDescPtr->strides.hStride + 7) >> 3;
+ hipLaunchKernelGGL(jitter_pln3_pkd3_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride),
+ kernelSizeTensor,
+ xorwowInitialStatePtr,
+ xorwowSeedStream,
+ roiTensorPtrSrc);
+ }
+ }
+
+ return RPP_SUCCESS;
+}
diff --git a/src/modules/hip/kernel/non_silent_region_detection.hpp b/src/modules/hip/kernel/non_silent_region_detection.hpp
new file mode 100644
index 000000000..80511464b
--- /dev/null
+++ b/src/modules/hip/kernel/non_silent_region_detection.hpp
@@ -0,0 +1,426 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+
+// -------------------- Set 0 - moving mean square kernel device helpers --------------------
+
+// calculate the position in shared memory to avoid bank conflicts
+// (inserts one padding element after every 32 entries, matching the 32 shared-memory banks)
+__host__ __device__ __forceinline__ int compute_pos_in_smem(int pos)
+{
+ constexpr int kLog2NumBanks = 5; // shared memory banks considered is 32
+ return pos + (pos >> kLog2NumBanks);
+}
+
+/* compute an EXCLUSIVE prefix sum in place on the shared-memory buffer passed
+ (after the up sweep the last element is zeroed and the down sweep distributes partial sums,
+ so output[i] holds the sum of input[0..i-1] - the classic work-efficient two-phase scan) */
+__device__ __forceinline__ void compute_prefix_sum(float *input, uint bufferLength)
+{
+ int offset = 1;
+ int2 offset_i2 = static_cast<int2>(offset);
+ int2 offsetAB_i2 = make_int2(offset - 1, 2 * offset - 1);
+ int threadIdxMul2 = 2 * hipThreadIdx_x;
+ int blockDimMul2 = 2 * hipBlockDim_x;
+
+ /* compute intermediate prefix sums in a up sweep manner
+ (each level in the hierarchy doubles the distance between the pairs of elements being added) */
+ for (int d = bufferLength >> 1; d > 0; d >>= 1)
+ {
+ // syncthreads before proceeding to next iteration
+ __syncthreads();
+ int dMul2 = 2 * d;
+ for (int idxMul2 = threadIdxMul2; idxMul2 < dMul2; idxMul2 += blockDimMul2)
+ {
+ int2 pos_i2 = (offset_i2 * static_cast<int2>(idxMul2)) + offsetAB_i2;
+ input[compute_pos_in_smem(pos_i2.y)] += input[compute_pos_in_smem(pos_i2.x)];
+ }
+ offset <<= 1;
+ offset_i2 = static_cast<int2>(offset);
+ offsetAB_i2 = make_int2(offset - 1, 2 * offset - 1);
+ }
+
+ // zero the total so the down sweep yields an exclusive scan
+ if (hipThreadIdx_x == 0)
+ {
+ int last = bufferLength - 1;
+ input[compute_pos_in_smem(last)] = 0;
+ }
+
+ /* compute final prefix sums in a down sweep manner
+ (each level in the hierarchy halves the distance between the pairs of elements being added) */
+ for (int d = 1; d < bufferLength; d <<= 1)
+ {
+ offset >>= 1;
+ offset_i2 = static_cast<int2>(offset);
+ offsetAB_i2 = make_int2(offset - 1, 2 * offset - 1);
+ // syncthreads before proceeding to next iteration
+ __syncthreads();
+
+ int dMul2 = 2 * d;
+ for (int idxMul2 = threadIdxMul2; idxMul2 < dMul2; idxMul2 += blockDimMul2)
+ {
+ int2 pos_i2 = offset_i2 * static_cast<int2>(idxMul2) + offsetAB_i2;
+ int posA = compute_pos_in_smem(pos_i2.x);
+ int posB = compute_pos_in_smem(pos_i2.y);
+ float t = input[posA];
+ input[posA] = input[posB];
+ input[posB] += t;
+ }
+ }
+ __syncthreads();
+}
+
+// -------------------- Set 1 - moving mean square compute kernel --------------------
+
+// Computes the moving mean square (MMS) of each sample: one output tile per block.
+// Each block loads its tile plus windowLength samples of left context into shared memory
+// as squared values, runs an exclusive prefix sum on them, and emits
+// mms[pos] = windowFactor * (sum of squares of the windowLength samples ending at pos).
+__global__ void moving_mean_square_hip_tensor(float *srcPtr,
+ uint nStride,
+ float *mmsArr,
+ int *srcLengthTensor,
+ int outputTileLength,
+ int windowLength,
+ float windowFactor,
+ int inputTileLength)
+{
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+ uint srcLength = srcLengthTensor[id_z];
+ uint batchStride = id_z * nStride;
+ int blockStart = hipBlockIdx_x * outputTileLength;
+
+ if (blockStart >= srcLength)
+ return;
+
+ float *input = srcPtr + batchStride;
+ extern __shared__ float squaredPrefixSum_smem[];
+
+ float *inBlockPtr = srcPtr + batchStride + blockStart;
+ float *outBlockPtr = mmsArr + batchStride + blockStart;
+
+ // find the valid output tile length values needed for given block
+ // (explicit cast required - std::min cannot deduce between int and uint)
+ int validOutputTileLength = std::min(outputTileLength, static_cast<int>(srcLength - blockStart));
+
+ // assign pointers that points to block begin and block end locations
+ float *extendedBlockStart = inBlockPtr - windowLength;
+ float *extendedBlockEnd = inBlockPtr + validOutputTileLength;
+
+ // load input data to shared memory (zero-padded outside the valid region)
+ for(int pos = hipThreadIdx_x; pos < inputTileLength; pos += hipBlockDim_x)
+ {
+ float val = 0.0f;
+ auto extendedBlockPtr = extendedBlockStart + pos;
+
+ /* check if extendedBlockPtr is within the valid region of input
+ and load the value from extendedBlockPtr if it is within valid region */
+ if (extendedBlockPtr >= input && extendedBlockPtr < extendedBlockEnd)
+ val = *extendedBlockPtr;
+ squaredPrefixSum_smem[compute_pos_in_smem(pos)] = val * val;
+ }
+
+ // compute exclusive prefix sum of the squared values
+ compute_prefix_sum(squaredPrefixSum_smem, inputTileLength);
+
+ /* window sum for output pos = inBlock[pos]^2 + prefix[windowLength + pos] - prefix[pos + 1]
+ (inBlock[pos] is element windowLength + pos of the extended tile; the exclusive prefix
+ sum makes the three terms cover exactly the windowLength samples ending at pos) */
+ for(int pos = hipThreadIdx_x; pos < validOutputTileLength; pos += hipBlockDim_x)
+ outBlockPtr[pos] = windowFactor * ((inBlockPtr[pos] * inBlockPtr[pos]) + squaredPrefixSum_smem[compute_pos_in_smem(windowLength + pos)] - squaredPrefixSum_smem[compute_pos_in_smem(pos + 1)]);
+}
+
+// -------------------- Set 2 - kernels for finding cutoffmag value --------------------
+
+// Stage-1 max reduction over the MMS buffer: each 256-thread block reduces up to
+// 256 x 8 contiguous values of one sample and writes a single partial maximum to maxArr.
+__global__ void max_reduction_hip_tensor(float *srcPtr,
+ uint nStride,
+ float *maxArr,
+ int *srcLengthTensor)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+ uint srcLength = srcLengthTensor[id_z];
+
+ uint srcIdx = id_z * nStride;
+ __shared__ float max_smem[256]; // 256 values of src in a 256 x 1 thread block
+ max_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 256 threads
+
+ // NOTE(review): threads with id_x >= srcLength exit before the __syncthreads() below;
+ // their smem slot keeps the sample's first element, which is a valid max candidate,
+ // but confirm early-exit-before-barrier is safe on the target hardware
+ if (id_x >= srcLength)
+ return;
+
+ // re-anchor the last partial vector so the 8-element load stays inside the sample
+ if (id_x + 8 > srcLength)
+ id_x -= (id_x + 8 - srcLength);
+
+ srcIdx += id_x;
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+ rpp_hip_math_max8(&src_f8, &max_smem[hipThreadIdx_x]);
+ __syncthreads(); // syncthreads after max compute
+
+ // Reduction of 256 floats on 256 threads per block in x dimension
+ for (int threadMax = 128; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ max_smem[hipThreadIdx_x] = fmaxf(max_smem[hipThreadIdx_x], max_smem[hipThreadIdx_x + threadMax]);
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_x == 0)
+ {
+ int dstIdx = id_z * hipGridDim_x + hipBlockIdx_x;
+ maxArr[dstIdx] = max_smem[0];
+ }
+}
+
+// Stage-2: computes the per-sample cutoff magnitude (one block per sample).
+// If referenceMax is true, the per-block partial maxima produced by max_reduction_hip_tensor
+// are reduced to the sample's global max and scaled by cutOff; otherwise the caller-supplied
+// referencePower is scaled by cutOff directly.
+__global__ void cutoffmag_hip_tensor(float *srcPtr,
+ int maxLength,
+ float *cutOffMagPtr,
+ float cutOff,
+ float referencePower,
+ bool referenceMax)
+{
+ int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ // if referenceMax is set to true, perform final max reduction on srcPtr and compute cutOffMag
+ if(referenceMax)
+ {
+ uint srcIdx = id_z * maxLength;
+ __shared__ float max_smem[256]; // 256 values of src in a 256 x 1 thread block
+ max_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 256 threads
+
+ // NOTE(review): threads with id_x >= maxLength exit before the __syncthreads() below;
+ // their smem slot keeps the sample's first partial max - confirm barrier safety on target hardware
+ if (id_x >= maxLength)
+ return;
+
+ // strided accumulation over this sample's partial maxima (stride = block width)
+ srcIdx += id_x;
+ float maxVal = srcPtr[srcIdx];
+ while (id_x < maxLength)
+ {
+ maxVal = fmaxf(maxVal, srcPtr[srcIdx]);
+ id_x += hipBlockDim_x;
+ srcIdx += hipBlockDim_x;
+ }
+ max_smem[hipThreadIdx_x] = maxVal;
+ __syncthreads(); // syncthreads after max compute
+
+ // Reduction of 256 floats on 256 threads per block in x dimension
+ for (int threadMax = 128; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ max_smem[hipThreadIdx_x] = max(max_smem[hipThreadIdx_x], max_smem[hipThreadIdx_x + threadMax]);
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_x == 0)
+ cutOffMagPtr[id_z] = max_smem[0] * cutOff;
+ }
+ else
+ {
+ if (hipThreadIdx_x == 0)
+ cutOffMagPtr[id_z] = referencePower * cutOff;
+ }
+}
+
+// -------------------- Set 3 - kernels for finding begin and length of NSR in inputs --------------------
+
+// Stage-3: finds, for each sample (one block per sample), the first and last MMS index whose
+// value is >= cutOffMag, then converts them to (begin, length) of the non-silent region,
+// widening the region on the left by windowLength - 1 samples.
+__global__ void find_region_hip_tensor(float *srcPtr,
+ uint nStride,
+ int *beginTensor,
+ int *lengthTensor,
+ float *cutOffMagPtr,
+ int *srcLengthTensor,
+ float windowLength)
+{
+ int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+ uint srcLength = srcLengthTensor[id_z];
+ float cutOffMag = cutOffMagPtr[id_z];
+
+ // block-shared running results; srcLength / 0 act as "not found" sentinels
+ __shared__ int beginResult;
+ __shared__ int endResult;
+ beginResult = srcLength;
+ endResult = 0;
+ __syncthreads();
+
+ int beginIdx = srcLength;
+ int endIdx = 0;
+ uint stridePerSample = id_z * nStride;
+
+ // Find the begin index in src whose value is >= cutOffMag
+ // (each thread scans a strided subset; the early break reads beginResult without
+ // synchronization - a benign race that only shortens the scan once any thread has a hit)
+ for (int i = id_x; i < srcLength; i += hipBlockDim_x)
+ {
+ uint srcIdx = stridePerSample + i;
+ if (srcPtr[srcIdx] >= cutOffMag)
+ {
+ beginIdx = i;
+ atomicMin(&beginResult, beginIdx);
+ if(beginResult != srcLength)
+ break;
+ }
+ }
+
+ // Find the end index in src whose value is >= cutOffMag (scanning from the back)
+ for (int i = id_x; i < srcLength; i += hipBlockDim_x)
+ {
+ uint srcIdx = stridePerSample + srcLength - 1 - i;
+ if (srcPtr[srcIdx] >= cutOffMag)
+ {
+ endIdx = srcLength - 1 - i;
+ atomicMax(&endResult, endIdx);
+ if(endResult != 0)
+ break;
+ }
+ }
+
+ // Final store to dst
+ if(hipThreadIdx_x == 0)
+ {
+ if(beginResult == srcLength || endResult == 0)
+ {
+ // no sample crossed the threshold (or only index 0 did) - report an empty region
+ beginTensor[id_z] = 0;
+ lengthTensor[id_z] = 0;
+ }
+ else
+ {
+ int detectBegin = beginResult;
+ int detectEnd = endResult - beginResult + 1;
+
+ // if both starting index and length of nonsilent region is not 0
+ // adjust the values as per the windowLength
+ // (windowLength is float here, so this max() mixes float and int - presumably
+ // intentional promotion; confirm against the CPU reference implementation)
+ if(detectBegin != 0 && detectEnd != 0)
+ {
+ int newBegin = max(detectBegin - (windowLength - 1), 0);
+ detectEnd += detectBegin - newBegin;
+ detectBegin = newBegin;
+ }
+ beginTensor[id_z] = detectBegin;
+ lengthTensor[id_z] = detectEnd;
+ }
+ }
+}
+
+// -------------------- Set 4 - host helpers for kernel executor --------------------
+
+// return the power of 2 just below the given number: the result r satisfies r < n <= 2 * r
+// for n >= 2 (an exact power of 2 maps to its half); returns 1 for n <= 1
+inline Rpp32s prev_pow2(Rpp32s n)
+{
+ Rpp32s result = 1;
+ // keep doubling while it stays below n (written as n - result to avoid overflow)
+ while (result < n - result)
+ result += result;
+
+ return result;
+}
+
+// return the smallest power of 2 that is >= the given number (returns 1 for n <= 1)
+inline Rpp32s next_pow2(Rpp32s n)
+{
+ Rpp32s result = 1;
+ while (result < n)
+ result += result;
+
+ return result;
+}
+
+// -------------------- Set 5 - non silent region kernels executor --------------------
+
+// Host-side executor for non-silent region detection.
+// Pipeline: (1) moving mean square per sample into scratch, (2) optional two-stage max
+// reduction when no referencePower is given, (3) cutoff magnitude per sample,
+// (4) begin/length search of the non-silent region. All launches are asynchronous on the
+// handle's stream; scratch memory is partitioned as [MMS | partial maxima | cutoff values].
+RppStatus hip_exec_non_silent_region_detection_tensor(Rpp32f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32s *srcLengthTensor,
+ Rpp32s *detectedIndexTensor,
+ Rpp32s *detectionLengthTensor,
+ Rpp32f cutOffDB,
+ Rpp32s windowLength,
+ Rpp32f referencePower,
+ Rpp32s resetInterval,
+ rpp::Handle& handle)
+{
+ // check if scratch memory size required for moving mean square is within the limits
+ if ((srcDescPtr->n * srcDescPtr->strides.nStride) > MMS_MAX_SCRATCH_MEMORY)
+ return RPP_ERROR_OUT_OF_BOUND_SCRATCH_MEMORY_SIZE;
+
+ Rpp32f *mmsArr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem;
+ Rpp32s maxSharedMemoryInBytes = handle.GetLocalMemorySize();
+ Rpp32s maxSharedMemoryElements = maxSharedMemoryInBytes / sizeof(Rpp32f);
+ Rpp32s kSharedMemBanks = 32;
+ // largest power-of-2 tile that still fits after bank-conflict padding (1 pad per 32 elements)
+ Rpp32s inputTileLength = prev_pow2(maxSharedMemoryElements * kSharedMemBanks / (kSharedMemBanks + 1));
+
+ // shrink the tile toward resetInterval to bound floating-point error accumulation
+ if (resetInterval > 0 && resetInterval < inputTileLength)
+ {
+ Rpp32s p = prev_pow2(resetInterval);
+ Rpp32s n = next_pow2(resetInterval);
+ if (p > windowLength)
+ inputTileLength = p;
+ else if (n < inputTileLength)
+ inputTileLength = n;
+ }
+
+ Rpp32s sharedMemorySizeInBytes = compute_pos_in_smem(inputTileLength) * sizeof(Rpp32f);
+ Rpp32s outputTileLength = inputTileLength - windowLength;
+ Rpp32f windowFactor = 1.0f / windowLength;
+
+ if (outputTileLength <= 0)
+ return RPP_ERROR_INVALID_OUTPUT_TILE_LENGTH;
+
+ if (sharedMemorySizeInBytes > maxSharedMemoryInBytes)
+ return RPP_ERROR_OUT_OF_BOUND_SHARED_MEMORY_SIZE;
+
+ // launch kernel to compute the values needed for MMS Array
+ Rpp32s globalThreads_x = ceil(static_cast<Rpp32f>(srcDescPtr->strides.nStride) / outputTileLength);
+ Rpp32s globalThreads_y = 1;
+ Rpp32s globalThreads_z = srcDescPtr->n;
+
+ hipLaunchKernelGGL(moving_mean_square_hip_tensor,
+ dim3(globalThreads_x, globalThreads_y, globalThreads_z),
+ dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM),
+ sharedMemorySizeInBytes,
+ handle.GetStream(),
+ srcPtr,
+ srcDescPtr->strides.nStride,
+ mmsArr,
+ srcLengthTensor,
+ outputTileLength,
+ windowLength,
+ windowFactor,
+ inputTileLength);
+
+ // convert the dB threshold to a linear magnitude ratio: 10^(cutOffDB / 10)
+ const Rpp32f cutOff = std::pow(10.0f, cutOffDB * 0.1f);
+ bool referenceMax = (!referencePower); // referencePower == 0 means "use the signal's own max"
+ Rpp32f *partialMaxArr = mmsArr + srcDescPtr->n * srcDescPtr->strides.nStride;
+
+ Rpp32s numBlocksPerSample = ceil(static_cast<Rpp32f>(srcDescPtr->strides.nStride) / (LOCAL_THREADS_X_1DIM * 8));
+ Rpp32s cutOffMagKernelBlockSize = 1;
+ if (referenceMax)
+ {
+ // compute max value in MMS buffer
+ hipLaunchKernelGGL(max_reduction_hip_tensor,
+ dim3(numBlocksPerSample, 1, globalThreads_z),
+ dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM),
+ 0,
+ handle.GetStream(),
+ mmsArr,
+ srcDescPtr->strides.nStride,
+ partialMaxArr,
+ srcLengthTensor);
+ cutOffMagKernelBlockSize = 256;
+ }
+ // find the cutoff value in magnitude
+ Rpp32f *cutOffMagPtr = partialMaxArr + globalThreads_z * numBlocksPerSample;
+ hipLaunchKernelGGL(cutoffmag_hip_tensor,
+ dim3(1, 1, globalThreads_z),
+ dim3(cutOffMagKernelBlockSize, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM),
+ 0,
+ handle.GetStream(),
+ partialMaxArr,
+ numBlocksPerSample,
+ cutOffMagPtr,
+ cutOff,
+ referencePower,
+ referenceMax);
+
+ // find the begin and length values of NSR in inputs
+ hipLaunchKernelGGL(find_region_hip_tensor,
+ dim3(1, 1, globalThreads_z),
+ dim3(1024, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM),
+ 0,
+ handle.GetStream(),
+ mmsArr,
+ srcDescPtr->strides.nStride,
+ detectedIndexTensor,
+ detectionLengthTensor,
+ cutOffMagPtr,
+ srcLengthTensor,
+ windowLength);
+ return RPP_SUCCESS;
+}
diff --git a/src/modules/rppt_tensor_audio_augmentations.cpp b/src/modules/rppt_tensor_audio_augmentations.cpp
index c0d8c9204..e05404731 100644
--- a/src/modules/rppt_tensor_audio_augmentations.cpp
+++ b/src/modules/rppt_tensor_audio_augmentations.cpp
@@ -281,6 +281,44 @@ RppStatus rppt_resample_host(RppPtr_t srcPtr,
#ifdef GPU_SUPPORT
+/******************** non_silent_region_detection ********************/
+
+RppStatus rppt_non_silent_region_detection_gpu(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32s *srcLengthTensor,
+ Rpp32s *detectedIndexTensor,
+ Rpp32s *detectionLengthTensor,
+ Rpp32f cutOffDB,
+ Rpp32s windowLength,
+ Rpp32f referencePower,
+ Rpp32s resetInterval,
+ rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+ if (srcDescPtr->dataType == RpptDataType::F32)
+ {
+
+        return hip_exec_non_silent_region_detection_tensor(static_cast<Rpp32f*>(srcPtr),
+ srcDescPtr,
+ srcLengthTensor,
+ detectedIndexTensor,
+ detectionLengthTensor,
+ cutOffDB,
+ windowLength,
+ referencePower,
+ resetInterval,
+ rpp::deref(rppHandle));
+ }
+ else
+ {
+ return RPP_ERROR_NOT_IMPLEMENTED;
+ }
+
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
/******************** to_decibels ********************/
RppStatus rppt_to_decibels_gpu(RppPtr_t srcPtr,
@@ -325,5 +363,4 @@ RppStatus rppt_to_decibels_gpu(RppPtr_t srcPtr,
}
#endif // GPU_SUPPORT
-
-#endif // AUDIO_SUPPORT
+#endif // AUDIO_SUPPORT
\ No newline at end of file
diff --git a/src/modules/rppt_tensor_effects_augmentations.cpp b/src/modules/rppt_tensor_effects_augmentations.cpp
index 80e3d3a10..8fc2d00ee 100644
--- a/src/modules/rppt_tensor_effects_augmentations.cpp
+++ b/src/modules/rppt_tensor_effects_augmentations.cpp
@@ -932,6 +932,78 @@ RppStatus rppt_glitch_host(RppPtr_t srcPtr,
return RPP_SUCCESS;
}
+/******************** jitter ********************/
+
+RppStatus rppt_jitter_host(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32u *kernelSizeTensor,
+ Rpp32u seed,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+ RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c);
+ RpptXorwowStateBoxMuller xorwowInitialState[SIMD_FLOAT_VECTOR_LENGTH];
+ rpp_host_rng_xorwow_f32_initialize_multiseed_stream_boxmuller(xorwowInitialState, seed);
+
+ if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
+ {
+        jitter_u8_u8_host_tensor(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes,
+                                 srcDescPtr,
+                                 static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ kernelSizeTensor,
+ xorwowInitialState,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
+ {
+        jitter_f16_f16_host_tensor(reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+                                   srcDescPtr,
+                                   reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ kernelSizeTensor,
+ xorwowInitialState,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+ {
+        jitter_f32_f32_host_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+                                   srcDescPtr,
+                                   reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ kernelSizeTensor,
+ xorwowInitialState,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
+ {
+        jitter_i8_i8_host_tensor(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes,
+                                 srcDescPtr,
+                                 static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ kernelSizeTensor,
+ xorwowInitialState,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+}
+
/********************************************************************************************************************/
/*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/
/********************************************************************************************************************/
@@ -1641,6 +1713,8 @@ RppStatus rppt_vignette_gpu(RppPtr_t srcPtr,
#endif // backend
}
+/******************** erase ********************/
+
RppStatus rppt_erase_gpu(RppPtr_t srcPtr,
RpptDescPtr srcDescPtr,
RppPtr_t dstPtr,
@@ -1850,4 +1924,87 @@ RppStatus rppt_glitch_gpu(RppPtr_t srcPtr,
#endif // backend
}
+/******************** jitter ********************/
+
+RppStatus rppt_jitter_gpu(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t dstPtr,
+ RpptDescPtr dstDescPtr,
+ Rpp32u *kernelSizeTensor,
+ Rpp32u seed,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+
+ RpptXorwowStateBoxMuller xorwowInitialState;
+ xorwowInitialState.x[0] = 0x75BCD15 + seed;
+ xorwowInitialState.x[1] = 0x159A55E5 + seed;
+ xorwowInitialState.x[2] = 0x1F123BB5 + seed;
+ xorwowInitialState.x[3] = 0x5491333 + seed;
+ xorwowInitialState.x[4] = 0x583F19 + seed;
+ xorwowInitialState.counter = 0x64F0C9 + seed;
+ xorwowInitialState.boxMullerFlag = 0;
+ xorwowInitialState.boxMullerExtra = 0.0f;
+
+ RpptXorwowStateBoxMuller *d_xorwowInitialStatePtr;
+    d_xorwowInitialStatePtr = reinterpret_cast<RpptXorwowStateBoxMuller*>(rpp::deref(rppHandle).GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem);
+ CHECK_RETURN_STATUS(hipMemcpy(d_xorwowInitialStatePtr, &xorwowInitialState, sizeof(RpptXorwowStateBoxMuller), hipMemcpyHostToDevice));
+
+ if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
+ {
+        hip_exec_jitter_tensor(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes,
+                               srcDescPtr,
+                               static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ kernelSizeTensor,
+ d_xorwowInitialStatePtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
+ {
+        hip_exec_jitter_tensor(reinterpret_cast<half*>(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+                               srcDescPtr,
+                               (half*) (static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ kernelSizeTensor,
+ d_xorwowInitialStatePtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+ {
+        hip_exec_jitter_tensor((Rpp32f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+                               srcDescPtr,
+                               (Rpp32f*) (static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ kernelSizeTensor,
+ d_xorwowInitialStatePtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
+ {
+        hip_exec_jitter_tensor(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes,
+                               srcDescPtr,
+                               static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ kernelSizeTensor,
+ d_xorwowInitialStatePtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
#endif // GPU_SUPPORT
diff --git a/utilities/test_suite/CMakeLists.txt b/utilities/test_suite/CMakeLists.txt
index bb5987779..23515798b 100644
--- a/utilities/test_suite/CMakeLists.txt
+++ b/utilities/test_suite/CMakeLists.txt
@@ -120,6 +120,15 @@ if(Python3_FOUND)
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)
endif(NIFTI_FOUND)
+ if(RPP_AUDIO_AUGMENTATIONS_SUPPORT_FOUND)
+ if(libsnd_LIBS)
+ add_test(
+ NAME rpp_qa_tests_tensor_audio_hip_all
+ COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HIP/runAudioTests.py --qa_mode 1 --batch_size 3
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+ )
+ endif(libsnd_LIBS)
+ endif(RPP_AUDIO_AUGMENTATIONS_SUPPORT_FOUND)
elseif( "${BACKEND}" STREQUAL "OCL")
# TBD: Add OCL Tests
diff --git a/utilities/test_suite/HIP/CMakeLists.txt b/utilities/test_suite/HIP/CMakeLists.txt
index 891a76353..814b006fb 100644
--- a/utilities/test_suite/HIP/CMakeLists.txt
+++ b/utilities/test_suite/HIP/CMakeLists.txt
@@ -117,21 +117,23 @@ else()
message("-- ${Yellow}Warning: libniftiio must be installed to install ${PROJECT_NAME}/Tensor_voxel_hip successfully!${ColourReset}")
endif()
-if(NOT libsnd_LIBS)
- message("-- ${Yellow}Warning: libsndfile must be installed to install ${PROJECT_NAME}/Tensor_audio_host successfully!${ColourReset}")
-else()
- message("-- ${Green}${PROJECT_NAME} set to build with rpp and libsndfile ${ColourReset}")
- set(COMPILER_FOR_HIP ${ROCM_PATH}/bin/hipcc)
- set(CMAKE_CXX_COMPILER ${COMPILER_FOR_HIP})
- include_directories(${ROCM_PATH}/include ${ROCM_PATH}/include/rpp /usr/local/include)
- link_directories(${ROCM_PATH}/lib /usr/local/lib)
- include_directories(${SndFile_INCLUDE_DIRS})
- link_directories(${SndFile_LIBRARIES_DIR} /usr/local/lib/)
-
- add_executable(Tensor_audio_hip Tensor_audio_hip.cpp)
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++17")
- if(NOT APPLE)
- set(LINK_LIBRARY_LIST ${LINK_LIBRARY_LIST} stdc++fs)
+if(RPP_AUDIO_SUPPORT)
+ if(NOT libsnd_LIBS)
+ message("-- ${Yellow}Warning: libsndfile must be installed to install ${PROJECT_NAME}/Tensor_audio_hip successfully!${ColourReset}")
+ else()
+ message("-- ${Green}${PROJECT_NAME} set to build with rpp and libsndfile ${ColourReset}")
+ set(COMPILER_FOR_HIP ${ROCM_PATH}/bin/hipcc)
+ set(CMAKE_CXX_COMPILER ${COMPILER_FOR_HIP})
+ include_directories(${ROCM_PATH}/include ${ROCM_PATH}/include/rpp /usr/local/include)
+ link_directories(${ROCM_PATH}/lib /usr/local/lib)
+ include_directories(${SndFile_INCLUDE_DIRS})
+ link_directories(${SndFile_LIBRARIES_DIR} /usr/local/lib/)
+
+ add_executable(Tensor_audio_hip Tensor_audio_hip.cpp)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++17")
+ if(NOT APPLE)
+ set(LINK_LIBRARY_LIST ${LINK_LIBRARY_LIST} stdc++fs)
+ endif()
+ target_link_libraries(Tensor_audio_hip ${libsnd_LIBS} -lsndfile -lrpp pthread ${LINK_LIBRARY_LIST})
endif()
- target_link_libraries(Tensor_audio_hip ${libsnd_LIBS} -lsndfile -lrpp pthread ${LINK_LIBRARY_LIST})
endif()
diff --git a/utilities/test_suite/HIP/Tensor_audio_hip.cpp b/utilities/test_suite/HIP/Tensor_audio_hip.cpp
index 1b6014493..ba7a3d46a 100644
--- a/utilities/test_suite/HIP/Tensor_audio_hip.cpp
+++ b/utilities/test_suite/HIP/Tensor_audio_hip.cpp
@@ -108,8 +108,7 @@ int main(int argc, char **argv)
if(testCase == 3)
maxDstChannels = 1;
set_audio_descriptor_dims_and_strides(dstDescPtr, batchSize, maxDstHeight, maxDstWidth, maxDstChannels, offsetInBytes);
- srcDescPtr->numDims = 2;
- dstDescPtr->numDims = 2;
+
// set buffer sizes for src/dst
iBufferSize = (Rpp64u)srcDescPtr->h * (Rpp64u)srcDescPtr->w * (Rpp64u)srcDescPtr->c * (Rpp64u)srcDescPtr->n;
oBufferSize = (Rpp64u)dstDescPtr->h * (Rpp64u)dstDescPtr->w * (Rpp64u)dstDescPtr->c * (Rpp64u)dstDescPtr->n;
@@ -132,7 +131,7 @@ int main(int argc, char **argv)
CHECK_RETURN_STATUS(hipHostMalloc(&srcDims, batchSize * sizeof(RpptImagePatch)));
CHECK_RETURN_STATUS(hipHostMalloc(&dstDims, batchSize * sizeof(RpptImagePatch)));
- Rpp32f *detectedIndex = nullptr, *detectionLength = nullptr;
+ Rpp32s *detectedIndex = nullptr, *detectionLength = nullptr;
if(testCase == 0)
{
CHECK_RETURN_STATUS(hipHostMalloc(&detectedIndex, batchSize * sizeof(Rpp32f)));
@@ -160,6 +159,19 @@ int main(int argc, char **argv)
double wallTime;
switch (testCase)
{
+ case 0:
+ {
+ testCaseName = "non_silent_region_detection";
+ Rpp32f cutOffDB = -60.0;
+ Rpp32s windowLength = 2048;
+ Rpp32f referencePower = 0.0f;
+ Rpp32s resetInterval = 8192;
+
+ startWallTime = omp_get_wtime();
+ rppt_non_silent_region_detection_gpu(d_inputf32, srcDescPtr, srcLengthTensor, detectedIndex, detectionLength, cutOffDB, windowLength, referencePower, resetInterval, handle);
+
+ break;
+ }
case 1:
{
testCaseName = "to_decibels";
@@ -203,11 +215,14 @@ int main(int argc, char **argv)
if (testType == 0)
{
CHECK_RETURN_STATUS(hipMemcpy(outputf32, d_outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyDeviceToHost));
+ CHECK_RETURN_STATUS(hipDeviceSynchronize());
/* Run only if testCase is not 0
For testCase 0 verify_non_silent_region_detection function is used for QA testing */
if (testCase != 0)
verify_output(outputf32, dstDescPtr, dstDims, testCaseName, dst, scriptPath, "HIP");
+ else
+ verify_non_silent_region_detection(detectedIndex, detectionLength, testCaseName, batchSize, audioNames, dst);
/* Dump the outputs to csv files for debugging
Runs only if
@@ -240,14 +255,14 @@ int main(int argc, char **argv)
cout << endl;
// free memory
- free(srcDims);
- free(dstDims);
free(inputf32);
free(outputf32);
CHECK_RETURN_STATUS(hipFree(d_inputf32));
CHECK_RETURN_STATUS(hipFree(d_outputf32));
CHECK_RETURN_STATUS(hipHostFree(srcLengthTensor));
CHECK_RETURN_STATUS(hipHostFree(channelsTensor));
+ CHECK_RETURN_STATUS(hipHostFree(srcDims));
+ CHECK_RETURN_STATUS(hipHostFree(dstDims));
if (detectedIndex != nullptr)
CHECK_RETURN_STATUS(hipHostFree(detectedIndex));
if (detectionLength != nullptr)
diff --git a/utilities/test_suite/HIP/Tensor_hip.cpp b/utilities/test_suite/HIP/Tensor_hip.cpp
index aad78241e..ec1b47d9b 100644
--- a/utilities/test_suite/HIP/Tensor_hip.cpp
+++ b/utilities/test_suite/HIP/Tensor_hip.cpp
@@ -66,7 +66,8 @@ int main(int argc, char **argv)
bool additionalParamCase = (testCase == 8 || testCase == 21 || testCase == 23|| testCase == 24 || testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54 || testCase == 79);
bool kernelSizeCase = (testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54);
bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 68);
- bool randomOutputCase = (testCase == 8 || testCase == 84 || testCase == 49 || testCase == 54);
+ bool randomOutputCase = (testCase == 6 || testCase == 8 || testCase == 84 || testCase == 49 || testCase == 54);
+ bool nonQACase = (testCase == 24);
bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24 || testCase == 79);
bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89 || testCase == 90 || testCase == 91);
bool noiseTypeCase = (testCase == 8);
@@ -406,6 +407,10 @@ int main(int argc, char **argv)
if(testCase == 46)
CHECK_RETURN_STATUS(hipHostMalloc(&intensity, batchSize * sizeof(Rpp32f)));
+ Rpp32u *kernelSizeTensor;
+ if(testCase == 6)
+ CHECK_RETURN_STATUS(hipHostMalloc(&kernelSizeTensor, batchSize * sizeof(Rpp32u)));
+
RpptChannelOffsets *rgbOffsets;
if(testCase == 35)
CHECK_RETURN_STATUS(hipHostMalloc(&rgbOffsets, batchSize * sizeof(RpptChannelOffsets)));
@@ -561,6 +566,22 @@ int main(int argc, char **argv)
break;
}
+ case 6:
+ {
+ testCaseName = "jitter";
+
+ Rpp32u seed = 1255459;
+ for (i = 0; i < batchSize; i++)
+ kernelSizeTensor[i] = 5;
+
+ startWallTime = omp_get_wtime();
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_jitter_gpu(d_input, srcDescPtr, d_output, dstDescPtr, kernelSizeTensor, seed, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
case 8:
{
testCaseName = "noise";
@@ -709,6 +730,36 @@ int main(int argc, char **argv)
break;
}
+ case 24:
+ {
+ testCaseName = "warp_affine";
+
+ if ((interpolationType != RpptInterpolationType::BILINEAR) && (interpolationType != RpptInterpolationType::NEAREST_NEIGHBOR))
+ {
+ missingFuncFlag = 1;
+ break;
+ }
+
+ Rpp32f6 affineTensor_f6[batchSize];
+ Rpp32f *affineTensor = (Rpp32f *)affineTensor_f6;
+ for (i = 0; i < batchSize; i++)
+ {
+ affineTensor_f6[i].data[0] = 1.23;
+ affineTensor_f6[i].data[1] = 0.5;
+ affineTensor_f6[i].data[2] = 0;
+ affineTensor_f6[i].data[3] = -0.8;
+ affineTensor_f6[i].data[4] = 0.83;
+ affineTensor_f6[i].data[5] = 0;
+ }
+
+ startWallTime = omp_get_wtime();
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_warp_affine_gpu(d_input, srcDescPtr, d_output, dstDescPtr, affineTensor, interpolationType, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
case 26:
{
testCaseName = "lens_correction";
@@ -1448,7 +1499,7 @@ int main(int argc, char **argv)
1.QA Flag is set
2.input bit depth 0 (U8)
3.source and destination layout are the same*/
- if(qaFlag && inputBitDepth == 0 && (srcDescPtr->layout == dstDescPtr->layout) && !(randomOutputCase))
+ if(qaFlag && inputBitDepth == 0 && (srcDescPtr->layout == dstDescPtr->layout) && !(randomOutputCase) && !(nonQACase))
{
if (testCase == 87)
compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath);
@@ -1516,7 +1567,7 @@ int main(int argc, char **argv)
2.input bit depth 0 (Input U8 && Output U8)
3.source and destination layout are the same
4.augmentation case does not generate random output*/
- if(qaFlag && inputBitDepth == 0 && ((srcDescPtr->layout == dstDescPtr->layout) || pln1OutTypeCase) && !(randomOutputCase))
+ if(qaFlag && inputBitDepth == 0 && ((srcDescPtr->layout == dstDescPtr->layout) || pln1OutTypeCase) && !(randomOutputCase) && !(nonQACase))
compare_output(outputu8, testCaseName, srcDescPtr, dstDescPtr, dstImgSizes, batchSize, interpolationTypeName, noiseTypeName, testCase, dst, scriptPath);
// Calculate exact dstROI in XYWH format for OpenCV dump
@@ -1603,6 +1654,8 @@ int main(int argc, char **argv)
CHECK_RETURN_STATUS(hipHostFree(shapeTensor));
if(roiTensor != NULL)
CHECK_RETURN_STATUS(hipHostFree(roiTensor));
+ if(testCase == 6)
+ CHECK_RETURN_STATUS(hipHostFree(kernelSizeTensor));
free(input);
free(input_second);
free(output);
diff --git a/utilities/test_suite/HIP/runAudioTests.py b/utilities/test_suite/HIP/runAudioTests.py
index e55010b38..f500b7f0e 100644
--- a/utilities/test_suite/HIP/runAudioTests.py
+++ b/utilities/test_suite/HIP/runAudioTests.py
@@ -35,7 +35,7 @@
inFilePath = scriptPath + "/../TEST_AUDIO_FILES/three_samples_single_channel_src1"
outFolderPath = os.getcwd()
buildFolderPath = os.getcwd()
-caseMin = 1
+caseMin = 0
caseMax = 1
@@ -224,7 +224,7 @@ def rpp_test_suite_parser_and_validator():
subprocess.call(["make", "-j16"], cwd=".") # nosec
# List of cases supported
-supportedCaseList = ['1']
+supportedCaseList = ['0', '1']
if qaMode and batchSize != 3:
print("QA tests can only run with a batch size of 3.")
exit(0)
diff --git a/utilities/test_suite/HIP/runMiscTests.py b/utilities/test_suite/HIP/runMiscTests.py
index f4adbde28..ee97f4547 100644
--- a/utilities/test_suite/HIP/runMiscTests.py
+++ b/utilities/test_suite/HIP/runMiscTests.py
@@ -74,24 +74,25 @@ def generate_performance_reports(RESULTS_DIR):
print(dfPrint_noIndices)
def run_unit_test_cmd(numDims, case, numRuns, testType, toggle, batchSize, outFilePath, additionalArg):
- print(f"./Tensor_misc_hip {case} {testType} {toggle} {numDims} {batchSize} {numRuns} {additionalArg}")
- result = subprocess.run([buildFolderPath + "/build/Tensor_misc_hip", str(case), str(testType), str(toggle), str(numDims), str(batchSize), str(numRuns), str(additionalArg), outFilePath, scriptPath], stdout=subprocess.PIPE) # nosec
- print(result.stdout.decode())
+ print("./Tensor_misc_hip " + str(case) + " " + str(testType) + " " + str(toggle) + " " + str(numDims) + " " + str(batchSize) + " " + str(numRuns) + " " + str(additionalArg))
+ result = subprocess.Popen([buildFolderPath + "/build/Tensor_misc_hip", str(case), str(testType), str(toggle), str(numDims), str(batchSize), str(numRuns), str(additionalArg), outFilePath, scriptPath], stdout=subprocess.PIPE) # nosec
+ stdout_data, stderr_data = result.communicate()
+ print(stdout_data.decode())
print("------------------------------------------------------------------------------------------")
def run_performance_test_cmd(loggingFolder, numDims, case, numRuns, testType, toggle, batchSize, outFilePath, additionalArg):
- with open("{}/Tensor_misc_hip_raw_performance_log.txt".format(loggingFolder), "a") as logFile:
- print(f"./Tensor_misc_hip {case} {testType} {toggle} {numDims} {batchSize} {numRuns} {additionalArg}")
- process = subprocess.Popen([buildFolderPath + "/build/Tensor_misc_hip", str(case), str(testType), str(toggle), str(numDims), str(batchSize), str(numRuns), str(additionalArg), outFilePath, scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) # nosec
+ with open(loggingFolder + "/Tensor_misc_hip_raw_performance_log.txt", "a") as logFile:
+ logFile.write("./Tensor_misc_hip " + str(case) + " " + str(testType) + " " + str(toggle) + " " + str(numDims) + " " + str(batchSize) + " " + str(numRuns) + " " + str(additionalArg) + "\n")
+ process = subprocess.Popen([buildFolderPath + "/build/Tensor_misc_hip", str(case), str(testType), str(toggle), str(numDims), str(batchSize), str(numRuns), str(additionalArg), outFilePath, scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec
read_from_subprocess_and_write_to_log(process, logFile)
def run_performance_test_with_profiler_cmd(loggingFolder, numDims, case, numRuns, testType, toggle, batchSize, outFilePath, additionalArg):
- if not os.path.exists(f"{outFilePath}/case_{case}"):
- os.mkdir(f"{outFilePath}/case_{case}")
+ if not os.path.exists(outFilePath + "/case_" + str(case)):
+ os.mkdir(outFilePath + "/case_" + str(case))
- with open("{}/Tensor_misc_hip_raw_performance_log.txt".format(loggingFolder), "a") as logFile:
- print(f"\nrocprof --basenames on --timestamp on --stats -o {outFilePath}/case_{case}/output_case{case}.csv ./Tensor_misc_hip {case} {testType} {toggle} {numDims} {batchSize} {numRuns} {additionalArg}")
- process = subprocess.Popen([ 'rocprof', '--basenames', 'on', '--timestamp', 'on', '--stats', '-o', f"{outFilePath}/case_{case}/output_case{case}.csv", "./Tensor_misc_hip", str(case), str(testType), str(toggle), str(numDims), str(batchSize), str(numRuns), str(additionalArg), outFilePath, scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) # nosec
+ with open(loggingFolder + "/Tensor_misc_hip_raw_performance_log.txt", "a") as logFile:
+ logFile.write("\nrocprof --basenames on --timestamp on --stats -o " + outFilePath + "/case_" + str(case) + "/output_case" + str(case) + ".csv ./Tensor_misc_hip " + str(case) + " " + str(testType) + " " + str(toggle) + " " + str(numDims) + " " + str(batchSize) + " " + str(numRuns) + " " + str(additionalArg) + "\n")
+ process = subprocess.Popen(['rocprof', '--basenames', 'on', '--timestamp', 'on', '--stats', '-o', outFilePath + "/case_" + str(case) + "/output_case" + str(case) + ".csv", "./Tensor_misc_hip", str(case), str(testType), str(toggle), str(numDims), str(batchSize), str(numRuns), str(additionalArg), outFilePath, scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec
read_from_subprocess_and_write_to_log(process, logFile)
print("------------------------------------------------------------------------------------------")
@@ -206,8 +207,8 @@ def rpp_test_suite_parser_and_validator():
os.chdir(buildFolderPath + "/build")
# Run cmake and make commands
-subprocess.run(["cmake", scriptPath], cwd=".") # nosec
-subprocess.run(["make", "-j16"], cwd=".") # nosec
+subprocess.call(["cmake", scriptPath], cwd=".") # nosec
+subprocess.call(["make", "-j16"], cwd=".") # nosec
supportedCaseList = ['0', '1', '2']
for case in caseList:
@@ -253,7 +254,7 @@ def rpp_test_suite_parser_and_validator():
continue
new_file.close()
- subprocess.call(['chown', '{}:{}'.format(os.getuid(), os.getgid()), CONSOLIDATED_FILE]) # nosec
+ subprocess.call(['chown', str(os.getuid()) + ':' + str(os.getgid()), CONSOLIDATED_FILE]) # nosec
try:
generate_performance_reports(RESULTS_DIR)
except ImportError:
diff --git a/utilities/test_suite/HIP/runTests.py b/utilities/test_suite/HIP/runTests.py
index 01da79c8d..cb4bc8bda 100644
--- a/utilities/test_suite/HIP/runTests.py
+++ b/utilities/test_suite/HIP/runTests.py
@@ -70,35 +70,39 @@ def run_unit_test(srcPath1, srcPath2, dstPathTemp, case, numRuns, testType, layo
if case == "40" or case == "41" or case == "49" or case == "54":
for kernelSize in range(3, 10, 2):
- print(f"./Tensor_hip {srcPath1} {srcPath2} {dstPath} {bitDepth} {outputFormatToggle} {case} {kernelSize}")
- result = subprocess.run([buildFolderPath + "/build/Tensor_hip", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), str(kernelSize), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE) # nosec
- print(result.stdout.decode())
+ print("./Tensor_hip " + srcPath1 + " " + srcPath2 + " " + dstPath + " " + str(bitDepth) + " " + str(outputFormatToggle) + " " + str(case) + " " + str(kernelSize))
+ result = subprocess.Popen([buildFolderPath + "/build/Tensor_hip", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), str(kernelSize), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE) # nosec
+ stdout_data, stderr_data = result.communicate()
+ print(stdout_data.decode())
elif case == "8":
# Run all variants of noise type functions with additional argument of noiseType = gausssianNoise / shotNoise / saltandpepperNoise
for noiseType in range(3):
- print(f"./Tensor_hip {srcPath1} {srcPath2} {dstPathTemp} {bitDepth} {outputFormatToggle} {case} {noiseType} ")
- result = subprocess.run([buildFolderPath + "/build/Tensor_hip", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), str(noiseType), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE) # nosec
- print(result.stdout.decode())
+ print("./Tensor_hip " + srcPath1 + " " + srcPath2 + " " + dstPathTemp + " " + str(bitDepth) + " " + str(outputFormatToggle) + " " + str(case) + " " + str(noiseType))
+ result = subprocess.Popen([buildFolderPath + "/build/Tensor_hip", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), str(noiseType), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE) # nosec
+ stdout_data, stderr_data = result.communicate()
+ print(stdout_data.decode())
elif case == "21" or case == "23" or case == "24" or case == "79":
# Run all variants of interpolation functions with additional argument of interpolationType = bicubic / bilinear / gaussian / nearestneigbor / lanczos / triangular
interpolationRange = 6
if case =='79':
interpolationRange = 2
for interpolationType in range(interpolationRange):
- print(f"./Tensor_hip {srcPath1} {srcPath2} {dstPathTemp} {bitDepth} {outputFormatToggle} {case} {interpolationType}")
- result = subprocess.run([buildFolderPath + "/build/Tensor_hip", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), str(interpolationType), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE) # nosec
- print(result.stdout.decode())
+ print("./Tensor_hip " + srcPath1 + " " + srcPath2 + " " + dstPathTemp + " " + str(bitDepth) + " " + str(outputFormatToggle) + " " + str(case) + " " + str(interpolationType))
+ result = subprocess.Popen([buildFolderPath + "/build/Tensor_hip", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), str(interpolationType), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE) # nosec
+ stdout_data, stderr_data = result.communicate()
+ print(stdout_data.decode())
else:
- print(f"./Tensor_hip {srcPath1} {srcPath2} {dstPathTemp} {bitDepth} {outputFormatToggle} {case} 0 {numRuns} {testType} {layout}")
- result = subprocess.run([buildFolderPath + "/build/Tensor_hip", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), "0", str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE) # nosec
- print(result.stdout.decode())
+ print("./Tensor_hip " + srcPath1 + " " + srcPath2 + " " + dstPathTemp + " " + str(bitDepth) + " " + str(outputFormatToggle) + " " + str(case) + " 0 " + str(numRuns) + " " + str(testType) + " " + str(layout))
+ result = subprocess.Popen([buildFolderPath + "/build/Tensor_hip", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), "0", str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE) # nosec
+ stdout_data, stderr_data = result.communicate()
+ print(stdout_data.decode())
print("------------------------------------------------------------------------------------------")
def run_performance_test_cmd(loggingFolder, logFileLayout, srcPath1, srcPath2, dstPath, bitDepth, outputFormatToggle, case, additionalParam, numRuns, testType, layout, qaMode, decoderType, batchSize, roiList):
- with open("{}/Tensor_hip_{}_raw_performance_log.txt".format(loggingFolder, logFileLayout), "a") as logFile:
- print(f"./Tensor_hip {srcPath1} {srcPath2} {dstPath} {bitDepth} {outputFormatToggle} {case} {additionalParam} 0 ")
- process = subprocess.Popen([buildFolderPath + "/build/Tensor_hip", srcPath1, srcPath2, dstPath, str(bitDepth), str(outputFormatToggle), str(case), str(additionalParam), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) # nosec
+ with open(loggingFolder + "/Tensor_hip_" + logFileLayout + "_raw_performance_log.txt", "a") as logFile:
+ print("./Tensor_hip " + srcPath1 + " " + srcPath2 + " " + dstPath + " " + str(bitDepth) + " " + str(outputFormatToggle) + " " + str(case) + " " + str(additionalParam))
+ process = subprocess.Popen([buildFolderPath + "/build/Tensor_hip", srcPath1, srcPath2, dstPath, str(bitDepth), str(outputFormatToggle), str(case), str(additionalParam), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec
read_from_subprocess_and_write_to_log(process, logFile)
def run_performance_test(loggingFolder, logFileLayout, srcPath1, srcPath2, dstPath, case, numRuns, testType, layout, qaMode, decoderType, batchSize, roiList):
@@ -133,11 +137,11 @@ def run_performance_test(loggingFolder, logFileLayout, srcPath1, srcPath2, dstPa
def run_performance_test_with_profiler(loggingFolder, logFileLayout, srcPath1, srcPath2, dstPath, bitDepth, outputFormatToggle, case, additionalParam, additionalParamType, numRuns, testType, layout, qaMode, decoderType, batchSize, roiList):
addtionalParamString = additionalParamType + str(additionalParam)
layoutName = get_layout_name(layout)
- if not os.path.isdir(f"{dstPath}/Tensor_{layoutName}/case_{case}"):
- os.mkdir(f"{dstPath}/Tensor_{layoutName}/case_{case}")
- with open(f"{loggingFolder}/Tensor_hip_{logFileLayout}_raw_performance_log.txt", "a") as logFile:
- print(f'rocprof --basenames on --timestamp on --stats -o {dstPath}/Tensor_{layoutName}/case_{case}/output_case{case}_bitDepth{bitDepth}_oft{outputFormatToggle}{addtionalParamString}.csv ./Tensor_hip {srcPath1} {srcPath2} {bitDepth} {outputFormatToggle} {case} {additionalParam} 0')
- process = subprocess.Popen(['rocprof', '--basenames', 'on', '--timestamp', 'on', '--stats', '-o', f'{dstPath}/Tensor_{layoutName}/case_{case}/output_case{case}_bitDepth{bitDepth}_oft{outputFormatToggle}{addtionalParamString}.csv', buildFolderPath + "/build/Tensor_hip", srcPath1, srcPath2, dstPath, str(bitDepth), str(outputFormatToggle), str(case), str(additionalParam), str(numRuns), str(testType), str(layout), '0', str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec
+ if not os.path.isdir(dstPath + "/Tensor_" + layoutName + "/case_" + str(case)):
+ os.makedirs(dstPath + "/Tensor_" + layoutName + "/case_" + str(case))
+ with open(loggingFolder + "/Tensor_hip_" + logFileLayout + "_raw_performance_log.txt", "a") as logFile:
+ logFile.write("rocprof --basenames on --timestamp on --stats -o " + dstPath + "/Tensor_" + layoutName + "/case_" + str(case) + "/output_case" + str(case) + "_bitDepth" + str(bitDepth) + "_oft" + addtionalParamString + ".csv ./Tensor_hip " + srcPath1 + " " + srcPath2 + " " + str(bitDepth) + " " + str(outputFormatToggle) + " " + str(case) + " " + str(additionalParam) + " 0\n")
+ process = subprocess.Popen(['rocprof', '--basenames', 'on', '--timestamp', 'on', '--stats', '-o', dstPath + "/Tensor_" + layoutName + "/case_" + str(case) + "/output_case" + str(case) + "_bitDepth" + str(bitDepth) + "_oft" + addtionalParamString + ".csv", buildFolderPath + "/build/Tensor_hip", srcPath1, srcPath2, dstPath, str(bitDepth), str(outputFormatToggle), str(case), str(additionalParam), str(numRuns), str(testType), str(layout), '0', str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec
while True:
output = process.stdout.readline()
if not output and process.poll() is not None:
@@ -172,7 +176,7 @@ def rpp_test_suite_parser_and_validator():
# validate the parameters passed by user
if ((args.case_start < caseMin or args.case_start > caseMax) or (args.case_end < caseMin or args.case_end > caseMax)):
- print(f"Starting case# and Ending case# must be in the {caseMin}:{caseMax} range. Aborting!")
+ print("Starting case# and Ending case# must be in the " + str(caseMin) + ":" + str(caseMax) + " range. Aborting!")
exit(0)
elif args.case_end < args.case_start:
print("Ending case# must be greater than starting case#. Aborting!")
@@ -214,7 +218,7 @@ def rpp_test_suite_parser_and_validator():
else:
for case in args.case_list:
if int(case) < caseMin or int(case) > caseMax:
- print(f"Invalid case number {case}! Case number must be in the {caseMin}:{caseMax} range. Aborting!")
+ print("Invalid case number " + str(case) + "! Case number must be in the " + str(caseMin) + ":" + str(caseMax) + " range. Aborting!")
exit(0)
return args
@@ -272,17 +276,17 @@ def rpp_test_suite_parser_and_validator():
os.chdir(buildFolderPath + "/build")
# Run cmake and make commands
-subprocess.run(["cmake", scriptPath], cwd=".") # nosec
-subprocess.run(["make", "-j16"], cwd=".") # nosec
+subprocess.call(["cmake", scriptPath], cwd=".") # nosec
+subprocess.call(["make", "-j16"], cwd=".") # nosec
# List of cases supported
-supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '26', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '79', '80', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92']
+supportedCaseList = ['0', '1', '2', '4', '6', '8', '13', '20', '21', '23', '26', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '79', '80', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92']
# Create folders based on testType and profilingOption
if testType == 1 and profilingOption == "YES":
- os.makedirs(f"{dstPath}/Tensor_PKD3")
- os.makedirs(f"{dstPath}/Tensor_PLN1")
- os.makedirs(f"{dstPath}/Tensor_PLN3")
+ os.makedirs(dstPath + "/Tensor_PKD3")
+ os.makedirs(dstPath + "/Tensor_PLN1")
+ os.makedirs(dstPath + "/Tensor_PLN3")
print("\n\n\n\n\n")
print("##########################################################################################")
@@ -453,7 +457,7 @@ def rpp_test_suite_parser_and_validator():
continue
new_file.close()
- subprocess.call(['chown', '{}:{}'.format(os.getuid(), os.getgid()), RESULTS_DIR + "/consolidated_results_" + TYPE + ".stats.csv"]) # nosec
+ subprocess.call(['chown', str(os.getuid()) + ':' + str(os.getgid()), RESULTS_DIR + "/consolidated_results_" + TYPE + ".stats.csv"]) # nosec
try:
generate_performance_reports(d_counter, TYPE_LIST, RESULTS_DIR)
@@ -484,7 +488,7 @@ def rpp_test_suite_parser_and_validator():
print_performance_tests_summary(logFile, functionalityGroupList, numRuns)
# print the results of qa tests
-nonQACaseList = ['8', '24', '54', '84'] # Add cases present in supportedCaseList, but without QA support
+nonQACaseList = ['6', '8', '24', '54', '84'] # Add cases present in supportedCaseList, but without QA support
if qaMode and testType == 0:
qaFilePath = os.path.join(outFilePath, "QA_results.txt")
diff --git a/utilities/test_suite/HIP/runVoxelTests.py b/utilities/test_suite/HIP/runVoxelTests.py
index f3ad38025..31c9dd22f 100644
--- a/utilities/test_suite/HIP/runVoxelTests.py
+++ b/utilities/test_suite/HIP/runVoxelTests.py
@@ -57,20 +57,23 @@ def func_group_finder(case_number):
return "miscellaneous"
def run_unit_test_cmd(headerPath, dataPath, dstPathTemp, layout, case, numRuns, testType, qaMode, batchSize):
- print(f"./Tensor_voxel_hip {headerPath} {dataPath} {dstPathTemp} {layout} {case} {numRuns} {testType} {qaMode} {batchSize} {bitDepth}")
- result = subprocess.run([buildFolderPath + "/build/Tensor_voxel_hip", headerPath, dataPath, dstPathTemp, str(layout), str(case), str(numRuns), str(testType), str(qaMode), str(batchSize), str(bitDepth), scriptPath], stdout=subprocess.PIPE) # nosec
- print(result.stdout.decode())
+ print("./Tensor_voxel_hip " + headerPath + " " + dataPath + " " + dstPathTemp + " " + str(layout) + " " + str(case) + " " + str(numRuns) + " " + str(testType) + " " + str(qaMode) + " " + str(batchSize) + " " + str(bitDepth))
+ result = subprocess.Popen([buildFolderPath + "/build/Tensor_voxel_hip", headerPath, dataPath, dstPathTemp, str(layout), str(case), str(numRuns), str(testType), str(qaMode), str(batchSize), str(bitDepth), scriptPath], stdout=subprocess.PIPE) # nosec
+ stdout_data, stderr_data = result.communicate()
+ print(stdout_data.decode())
print("------------------------------------------------------------------------------------------")
def run_performance_test_cmd(loggingFolder, logFileLayout, headerPath, dataPath, dstPathTemp, layout, case, numRuns, testType, qaMode, batchSize):
- with open(f"{loggingFolder}/Tensor_voxel_hip_{logFileLayout}_raw_performance_log.txt", "a") as logFile:
- print(f"./Tensor_voxel_hip {headerPath} {dataPath} {dstPathTemp} {layout} {case} {numRuns} {testType} {qaMode} {batchSize} {bitDepth}")
- process = subprocess.Popen([buildFolderPath + "/build/Tensor_voxel_hip", headerPath, dataPath, dstPathTemp, str(layout), str(case), str(numRuns), str(testType), str(qaMode), str(batchSize), str(bitDepth), scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) # nosec
+ with open(loggingFolder + "/Tensor_voxel_hip_" + logFileLayout + "_raw_performance_log.txt", "a") as logFile:
+ logFile.write("./Tensor_voxel_hip " + headerPath + " " + dataPath + " " + dstPathTemp + " " + str(layout) + " " + str(case) + " " + str(numRuns) + " " + str(testType) + " " + str(qaMode) + " " + str(batchSize) + " " + str(bitDepth) + "\n")
+ process = subprocess.Popen([buildFolderPath + "/build/Tensor_voxel_hip", headerPath, dataPath, dstPathTemp, str(layout), str(case), str(numRuns), str(testType), str(qaMode), str(batchSize), str(bitDepth), scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec
while True:
output = process.stdout.readline()
if not output and process.poll() is not None:
break
- print(output.strip())
+ output = output.decode().strip() # Decode bytes to string and strip extra whitespace
+ print(output)
+ logFile.write(output + "\n")
if "Running" in output or "max,min,avg wall times" in output:
cleanedOutput = ''.join(char for char in output if 32 <= ord(char) <= 126) # Remove control characters
cleanedOutput = cleanedOutput.strip() # Remove leading/trailing whitespace
@@ -81,14 +84,15 @@ def run_performance_test_cmd(loggingFolder, logFileLayout, headerPath, dataPath,
def run_performance_test_with_profiler_cmd(loggingFolder, logFileLayout, headerPath, dataPath, dstPathTemp, layout, case, numRuns, testType, qaMode, batchSize):
layoutName = get_layout_name(layout)
- if not os.path.exists(f"{loggingFolder}/Tensor_{layoutName}/case_{case}"):
- os.mkdir(f"{loggingFolder}/Tensor_{layoutName}/case_{case}")
+ directory_path = os.path.join(loggingFolder, "Tensor_" + layoutName, "case_" + str(case))
+ if not os.path.exists(directory_path):
+ os.mkdir(directory_path)
bitDepths = [0, 2]
for bitDepth in bitDepths:
- with open(f"{loggingFolder}/Tensor_voxel_hip_{logFileLayout}_raw_performance_log.txt", "a") as logFile:
- print(f"\nrocprof --basenames on --timestamp on --stats -o {dstPathTemp}/Tensor_{layoutName}/case_{case}/output_case{case}.csv ./Tensor_voxel_hip {headerPath} {dataPath} {dstPathTemp} {layout} {case}{numRuns} {testType} {qaMode} {batchSize} {bitDepth}")
- process = subprocess.Popen([ 'rocprof', '--basenames', 'on', '--timestamp', 'on', '--stats', '-o', f"{dstPath}/Tensor_{layoutName}/case_{case}/output_case{case}.csv", buildFolderPath + "/build/Tensor_voxel_hip", headerPath, dataPath, dstPathTemp, str(layout), str(case), str(numRuns), str(testType), str(qaMode), str(batchSize), str(bitDepth), scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec
+ with open(loggingFolder + "/Tensor_voxel_hip_" + logFileLayout + "_raw_performance_log.txt", "a") as logFile:
+ logFile.write("\nrocprof --basenames on --timestamp on --stats -o " + dstPathTemp + "/Tensor_" + layoutName + "/case_" + str(case) + "/output_case" + str(case) + ".csv ./Tensor_voxel_hip " + headerPath + " " + dataPath + " " + dstPathTemp + " " + str(layout) + " " + str(case) + " " + str(numRuns) + " " + str(testType) + " " + str(qaMode) + " " + str(batchSize) + " " + str(bitDepth) + "\n")
+ process = subprocess.Popen(['rocprof', '--basenames', 'on', '--timestamp', 'on', '--stats', '-o', dstPath + "/Tensor_" + layoutName + "/case_" + str(case) + "/output_case" + str(case) + ".csv", buildFolderPath + "/build/Tensor_voxel_hip", headerPath, dataPath, dstPathTemp, str(layout), str(case), str(numRuns), str(testType), str(qaMode), str(batchSize), str(bitDepth), scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec
while True:
output = process.stdout.readline()
if not output and process.poll() is not None:
@@ -227,17 +231,17 @@ def rpp_test_suite_parser_and_validator():
os.chdir(buildFolderPath + "/build")
# Run cmake and make commands
-subprocess.run(["cmake", scriptPath], cwd=".") # nosec
-subprocess.run(["make", "-j16"], cwd=".") # nosec
+subprocess.call(["cmake", scriptPath], cwd=".") # nosec
+subprocess.call(["make", "-j16"], cwd=".") # nosec
# List of cases supported
supportedCaseList = ['0', '1', '2', '3', '4', '5', '6']
# Create folders based on testType and profilingOption
if testType == 1 and profilingOption == "YES":
- os.makedirs(f"{dstPath}/Tensor_PKD3")
- os.makedirs(f"{dstPath}/Tensor_PLN1")
- os.makedirs(f"{dstPath}/Tensor_PLN3")
+ os.makedirs(dstPath + "/Tensor_PKD3")
+ os.makedirs(dstPath + "/Tensor_PLN1")
+ os.makedirs(dstPath + "/Tensor_PLN3")
print("\n\n\n\n\n")
print("##########################################################################################")
@@ -322,7 +326,7 @@ def rpp_test_suite_parser_and_validator():
continue
new_file.close()
- subprocess.call(['chown', '{}:{}'.format(os.getuid(), os.getgid()), RESULTS_DIR + "/consolidated_results_" + TYPE + ".stats.csv"]) # nosec
+ subprocess.call(['chown', str(os.getuid()) + ':' + str(os.getgid()), RESULTS_DIR + "/consolidated_results_" + TYPE + ".stats.csv"]) # nosec
try:
generate_performance_reports(d_counter, TYPE_LIST, RESULTS_DIR)
diff --git a/utilities/test_suite/HOST/Tensor_host.cpp b/utilities/test_suite/HOST/Tensor_host.cpp
index 4c3d4f0e8..bb1312a5e 100644
--- a/utilities/test_suite/HOST/Tensor_host.cpp
+++ b/utilities/test_suite/HOST/Tensor_host.cpp
@@ -66,7 +66,8 @@ int main(int argc, char **argv)
bool additionalParamCase = (testCase == 8 || testCase == 21 || testCase == 23 || testCase == 24 || testCase == 79);
bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 68);
- bool randomOutputCase = (testCase == 8 || testCase == 84);
+ bool randomOutputCase = (testCase == 6 || testCase == 8 || testCase == 84);
+ bool nonQACase = (testCase == 24);
bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24 || testCase == 79);
bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89 || testCase == 90 || testCase == 91);
bool noiseTypeCase = (testCase == 8);
@@ -517,6 +518,24 @@ int main(int argc, char **argv)
break;
}
+ case 6:
+ {
+ testCaseName = "jitter";
+
+ Rpp32u kernelSizeTensor[batchSize];
+ Rpp32u seed = 1255459;
+ for (i = 0; i < batchSize; i++)
+ kernelSizeTensor[i] = 5;
+
+ startWallTime = omp_get_wtime();
+ startCpuTime = clock();
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_jitter_host(input, srcDescPtr, output, dstDescPtr, kernelSizeTensor, seed, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
case 8:
{
testCaseName = "noise";
@@ -672,6 +691,37 @@ int main(int argc, char **argv)
break;
}
+ case 24:
+ {
+ testCaseName = "warp_affine";
+
+ if ((interpolationType != RpptInterpolationType::BILINEAR) && (interpolationType != RpptInterpolationType::NEAREST_NEIGHBOR))
+ {
+ missingFuncFlag = 1;
+ break;
+ }
+
+ Rpp32f6 affineTensor_f6[batchSize];
+ Rpp32f *affineTensor = (Rpp32f *)affineTensor_f6;
+ for (i = 0; i < batchSize; i++)
+ {
+ affineTensor_f6[i].data[0] = 1.23;
+ affineTensor_f6[i].data[1] = 0.5;
+ affineTensor_f6[i].data[2] = 0;
+ affineTensor_f6[i].data[3] = -0.8;
+ affineTensor_f6[i].data[4] = 0.83;
+ affineTensor_f6[i].data[5] = 0;
+ }
+
+ startWallTime = omp_get_wtime();
+ startCpuTime = clock();
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_warp_affine_host(input, srcDescPtr, output, dstDescPtr, affineTensor, interpolationType, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
case 26:
{
testCaseName = "lens_correction";
@@ -1462,7 +1512,7 @@ int main(int argc, char **argv)
1.QA Flag is set
2.input bit depth 0 (U8)
3.source and destination layout are the same*/
- if(qaFlag && inputBitDepth == 0 && (srcDescPtr->layout == dstDescPtr->layout) && !(randomOutputCase))
+ if(qaFlag && inputBitDepth == 0 && (srcDescPtr->layout == dstDescPtr->layout) && !(randomOutputCase) && !(nonQACase))
{
if (testCase == 87)
compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath);
@@ -1528,7 +1578,7 @@ int main(int argc, char **argv)
2.input bit depth 0 (Input U8 && Output U8)
3.source and destination layout are the same
4.augmentation case does not generate random output*/
- if(qaFlag && inputBitDepth == 0 && ((srcDescPtr->layout == dstDescPtr->layout) || pln1OutTypeCase) && !(randomOutputCase))
+ if(qaFlag && inputBitDepth == 0 && ((srcDescPtr->layout == dstDescPtr->layout) || pln1OutTypeCase) && !(randomOutputCase) && !(nonQACase))
compare_output(outputu8, testCaseName, srcDescPtr, dstDescPtr, dstImgSizes, batchSize, interpolationTypeName, noiseTypeName, testCase, dst, scriptPath);
// Calculate exact dstROI in XYWH format for OpenCV dump
diff --git a/utilities/test_suite/HOST/runAudioTests.py b/utilities/test_suite/HOST/runAudioTests.py
index c0600c057..a1771716b 100644
--- a/utilities/test_suite/HOST/runAudioTests.py
+++ b/utilities/test_suite/HOST/runAudioTests.py
@@ -45,15 +45,16 @@ def get_log_file_list():
]
def run_unit_test_cmd(srcPath, case, numRuns, testType, batchSize, outFilePath):
- print(f"./Tensor_audio_host {srcPath} {case} {numRuns} {testType} {numRuns} {batchSize}")
- result = subprocess.run([buildFolderPath + "/build/Tensor_audio_host", srcPath, str(case), str(testType), str(numRuns), str(batchSize), outFilePath, scriptPath], stdout=subprocess.PIPE) # nosec
- print(result.stdout.decode())
+ print("./Tensor_audio_host " + srcPath + " " + str(case) + " " + str(numRuns) + " " + str(testType) + " " + str(numRuns) + " " + str(batchSize))
+ result = subprocess.Popen([buildFolderPath + "/build/Tensor_audio_host", srcPath, str(case), str(testType), str(numRuns), str(batchSize), outFilePath, scriptPath], stdout=subprocess.PIPE) # nosec
+ stdout_data, stderr_data = result.communicate()
+ print(stdout_data.decode())
print("------------------------------------------------------------------------------------------")
def run_performance_test_cmd(loggingFolder, srcPath, case, numRuns, testType, batchSize, outFilePath):
- with open("{}/Tensor_audio_host_raw_performance_log.txt".format(loggingFolder), "a") as logFile:
- print(f"./Tensor_audio_host {srcPath} {case} {numRuns} {testType} {numRuns} {batchSize} ")
- process = subprocess.Popen([buildFolderPath + "/build/Tensor_audio_host", srcPath, str(case), str(testType), str(numRuns), str(batchSize), outFilePath, scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) # nosec
+ with open(loggingFolder + "/Tensor_audio_host_raw_performance_log.txt", "a") as logFile:
+ logFile.write("./Tensor_audio_host " + srcPath + " " + str(case) + " " + str(numRuns) + " " + str(testType) + " " + str(numRuns) + " " + str(batchSize) + "\n")
+ process = subprocess.Popen([buildFolderPath + "/build/Tensor_audio_host", srcPath, str(case), str(testType), str(numRuns), str(batchSize), outFilePath, scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec
read_from_subprocess_and_write_to_log(process, logFile)
print("------------------------------------------------------------------------------------------")
@@ -87,7 +88,7 @@ def rpp_test_suite_parser_and_validator():
# validate the parameters passed by user
if ((args.case_start < caseMin or args.case_start > caseMax) or (args.case_end < caseMin or args.case_end > caseMax)):
- print(f"Starting case# and Ending case# must be in the {caseMin}:{caseMax} range. Aborting!")
+ print("Starting case# and Ending case# must be in the " + str(caseMin) + ":" + str(caseMax) + " range. Aborting!")
exit(0)
elif args.case_end < args.case_start:
print("Ending case# must be greater than starting case#. Aborting!")
@@ -120,7 +121,7 @@ def rpp_test_suite_parser_and_validator():
else:
for case in args.case_list:
if int(case) < caseMin or int(case) > caseMax:
- print(f"Invalid case number {case}! Case number must be in the {caseMin}:{caseMax} range. Aborting!")
+ print("Invalid case number " + str(case) + "! Case number must be in the " + str(caseMin) + ":" + str(caseMax) + " range. Aborting!")
exit(0)
return args
@@ -171,8 +172,8 @@ def rpp_test_suite_parser_and_validator():
os.chdir(buildFolderPath + "/build")
# Run cmake and make commands
-subprocess.run(["cmake", scriptPath], cwd=".") # nosec
-subprocess.run(["make", "-j16"], cwd=".") # nosec
+subprocess.call(["cmake", scriptPath], cwd=".") # nosec
+subprocess.call(["make", "-j16"], cwd=".") # nosec
# List of cases supported
supportedCaseList = ['0', '1', '2', '3', '4', '5', '6', '7']
diff --git a/utilities/test_suite/HOST/runMiscTests.py b/utilities/test_suite/HOST/runMiscTests.py
index 0f428fe40..931838f71 100644
--- a/utilities/test_suite/HOST/runMiscTests.py
+++ b/utilities/test_suite/HOST/runMiscTests.py
@@ -47,15 +47,16 @@ def get_log_file_list():
]
def run_unit_test_cmd(numDims, case, numRuns, testType, toggle, batchSize, outFilePath, additionalArg):
- print(f"./Tensor_misc_host {case} {testType} {toggle} {numDims} {batchSize} {numRuns} {additionalArg}")
- result = subprocess.run([buildFolderPath + "/build/Tensor_misc_host", str(case), str(testType), str(toggle), str(numDims), str(batchSize), str(numRuns), str(additionalArg), outFilePath, scriptPath], stdout=subprocess.PIPE) # nosec
- print(result.stdout.decode())
+ print("./Tensor_misc_host " + str(case) + " " + str(testType) + " " + str(toggle) + " " + str(numDims) + " " + str(batchSize) + " " + str(numRuns) + " " + str(additionalArg))
+ result = subprocess.Popen([buildFolderPath + "/build/Tensor_misc_host", str(case), str(testType), str(toggle), str(numDims), str(batchSize), str(numRuns), str(additionalArg), outFilePath, scriptPath], stdout=subprocess.PIPE) # nosec
+ stdout_data, stderr_data = result.communicate()
+ print(stdout_data.decode())
print("------------------------------------------------------------------------------------------")
def run_performance_test_cmd(loggingFolder, numDims, case, numRuns, testType, toggle, batchSize, outFilePath, additionalArg):
- with open("{}/Tensor_misc_host_raw_performance_log.txt".format(loggingFolder), "a") as logFile:
- print(f"./Tensor_misc_host {case} {testType} {toggle} {numDims} {batchSize} {numRuns} {additionalArg}")
- process = subprocess.Popen([buildFolderPath + "/build/Tensor_misc_host", str(case), str(testType), str(toggle), str(numDims), str(batchSize), str(numRuns), str(additionalArg), outFilePath, scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) # nosec
+ with open(loggingFolder + "/Tensor_misc_host_raw_performance_log.txt", "a") as logFile:
+ logFile.write("./Tensor_misc_host " + str(case) + " " + str(testType) + " " + str(toggle) + " " + str(numDims) + " " + str(batchSize) + " " + str(numRuns) + " " + str(additionalArg) + "\n")
+ process = subprocess.Popen([buildFolderPath + "/build/Tensor_misc_host", str(case), str(testType), str(toggle), str(numDims), str(batchSize), str(numRuns), str(additionalArg), outFilePath, scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec
read_from_subprocess_and_write_to_log(process, logFile)
def run_test(loggingFolder, numDims, case, numRuns, testType, toggle, batchSize, outFilePath, additionalArg = ""):
@@ -162,8 +163,8 @@ def rpp_test_suite_parser_and_validator():
os.chdir(buildFolderPath + "/build")
# Run cmake and make commands
-subprocess.run(["cmake", scriptPath], cwd=".") # nosec
-subprocess.run(["make", "-j16"], cwd=".") # nosec
+subprocess.call(["cmake", scriptPath], cwd=".") # nosec
+subprocess.call(["make", "-j16"], cwd=".") # nosec
supportedCaseList = ['0', '1', '2']
for case in caseList:
diff --git a/utilities/test_suite/HOST/runTests.py b/utilities/test_suite/HOST/runTests.py
index 93cd64713..7386b364b 100644
--- a/utilities/test_suite/HOST/runTests.py
+++ b/utilities/test_suite/HOST/runTests.py
@@ -71,34 +71,37 @@ def run_unit_test(srcPath1, srcPath2, dstPathTemp, case, numRuns, testType, layo
if case == "8":
# Run all variants of noise type functions with additional argument of noiseType = gausssianNoise / shotNoise / saltandpepperNoise
for noiseType in range(3):
- print(f"./Tensor_host {srcPath1} {srcPath2} {dstPathTemp} {bitDepth} {outputFormatToggle} {case} {noiseType} 0 ")
- result = subprocess.run([buildFolderPath + "/build/Tensor_host", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), str(noiseType), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE) # nosec
- print(result.stdout.decode())
+ print("./Tensor_host " + srcPath1 + " " + srcPath2 + " " + dstPathTemp + " " + str(bitDepth) + " " + str(outputFormatToggle) + " " + str(case) + " " + str(noiseType) + " 0")
+ result = subprocess.Popen([buildFolderPath + "/build/Tensor_host", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), str(noiseType), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE, stderr=subprocess.PIPE) # nosec
+ stdout_data, stderr_data = result.communicate()
+ print(stdout_data.decode())
elif case == "21" or case == "23" or case == "24" or case == "79":
# Run all variants of interpolation functions with additional argument of interpolationType = bicubic / bilinear / gaussian / nearestneigbor / lanczos / triangular
interpolationRange = 6
if case =='79':
interpolationRange = 2
for interpolationType in range(interpolationRange):
- print(f"./Tensor_host {srcPath1} {srcPath2} {dstPathTemp} {bitDepth} {outputFormatToggle} {case} {interpolationType} 0")
- result = subprocess.run([buildFolderPath + "/build/Tensor_host", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), str(interpolationType), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE) # nosec
- print(result.stdout.decode())
+ print("./Tensor_host " + srcPath1 + " " + srcPath2 + " " + dstPathTemp + " " + str(bitDepth) + " " + str(outputFormatToggle) + " " + str(case) + " " + str(interpolationType) + " 0")
+ result = subprocess.Popen([buildFolderPath + "/build/Tensor_host", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), str(interpolationType), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE, stderr=subprocess.PIPE) # nosec
+ stdout_data, stderr_data = result.communicate()
+ print(stdout_data.decode())
else:
- print(f"./Tensor_host {srcPath1} {srcPath2} {dstPathTemp} {bitDepth} {outputFormatToggle} {case} 0 {numRuns} {testType} {layout} 0")
- result = subprocess.run([buildFolderPath + "/build/Tensor_host", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), "0", str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE) # nosec
- print(result.stdout.decode())
+ print("./Tensor_host " + srcPath1 + " " + srcPath2 + " " + dstPathTemp + " " + str(bitDepth) + " " + str(outputFormatToggle) + " " + str(case) + " 0 " + str(numRuns) + " " + str(testType) + " " + str(layout) + " 0")
+ result = subprocess.Popen([buildFolderPath + "/build/Tensor_host", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), "0", str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE) # nosec
+ stdout_data, stderr_data = result.communicate()
+ print(stdout_data.decode())
print("------------------------------------------------------------------------------------------")
def run_performance_test_cmd(loggingFolder, logFileLayout, srcPath1, srcPath2, dstPath, bitDepth, outputFormatToggle, case, additionalParam, numRuns, testType, layout, qaMode, decoderType, batchSize, roiList):
if qaMode == 1:
- with open("{}/BatchPD_host_{}_raw_performance_log.txt".format(loggingFolder, logFileLayout), "a") as logFile:
- process = subprocess.Popen([buildFolderPath + "/build/BatchPD_host_" + logFileLayout, srcPath1, srcPath2, str(bitDepth), str(outputFormatToggle), str(case), str(additionalParam), "0"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) # nosec
+ with open(loggingFolder + "/BatchPD_host_" + logFileLayout + "_raw_performance_log.txt", "a") as logFile:
+ process = subprocess.Popen([buildFolderPath + "/build/BatchPD_host_" + logFileLayout, srcPath1, srcPath2, str(bitDepth), str(outputFormatToggle), str(case), str(additionalParam), "0"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec
read_from_subprocess_and_write_to_log(process, logFile)
- with open("{}/Tensor_host_{}_raw_performance_log.txt".format(loggingFolder, logFileLayout), "a") as logFile:
- print(f"./Tensor_host {srcPath1} {srcPath2} {dstPath} {bitDepth} {outputFormatToggle} {case} {additionalParam} 0 ")
- process = subprocess.Popen([buildFolderPath + "/build/Tensor_host", srcPath1, srcPath2, dstPath, str(bitDepth), str(outputFormatToggle), str(case), str(additionalParam), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) # nosec
+ with open(loggingFolder + "/Tensor_host_" + logFileLayout + "_raw_performance_log.txt", "a") as logFile:
+ logFile.write("./Tensor_host " + srcPath1 + " " + srcPath2 + " " + dstPath + " " + str(bitDepth) + " " + str(outputFormatToggle) + " " + str(case) + " " + str(additionalParam) + " 0\n")
+ process = subprocess.Popen([buildFolderPath + "/build/Tensor_host", srcPath1, srcPath2, dstPath, str(bitDepth), str(outputFormatToggle), str(case), str(additionalParam), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec
read_from_subprocess_and_write_to_log(process, logFile)
def run_performance_test(loggingFolder, logFileLayout, srcPath1, srcPath2, dstPath, case, numRuns, testType, layout, qaMode, decoderType, batchSize, roiList):
@@ -154,7 +157,7 @@ def rpp_test_suite_parser_and_validator():
# validate the parameters passed by user
if ((args.case_start < caseMin or args.case_start > caseMax) or (args.case_end < caseMin or args.case_end > caseMax)):
- print(f"Starting case# and Ending case# must be in the {caseMin}:{caseMax} range. Aborting!")
+ print("Starting case# and Ending case# must be in the " + str(caseMin) + ":" + str(caseMax) + " range. Aborting!")
exit(0)
elif args.case_end < args.case_start:
print("Ending case# must be greater than starting case#. Aborting!")
@@ -193,7 +196,7 @@ def rpp_test_suite_parser_and_validator():
else:
for case in args.case_list:
if int(case) < caseMin or int(case) > caseMax:
- print(f"Invalid case number {case}! Case number must be in the {caseMin}:{caseMax} range. Aborting!")
+ print("Invalid case number " + str(case) + "! Case number must be in the " + str(caseMin) + ":" + str(caseMax) + " range. Aborting!")
exit(0)
return args
@@ -254,11 +257,11 @@ def rpp_test_suite_parser_and_validator():
os.chdir(buildFolderPath + "/build")
# Run cmake and make commands
-subprocess.run(["cmake", scriptPath], cwd=".") # nosec
-subprocess.run(["make", "-j16"], cwd=".") # nosec
+subprocess.call(["cmake", scriptPath], cwd=".") # nosec
+subprocess.call(["make", "-j16"], cwd=".") # nosec
# List of cases supported
-supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '26', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92']
+supportedCaseList = ['0', '1', '2', '4', '6', '8', '13', '20', '21', '23', '26', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92']
print("\n\n\n\n\n")
print("##########################################################################################")
@@ -309,7 +312,7 @@ def rpp_test_suite_parser_and_validator():
run_performance_test(loggingFolder, logFileLayout, srcPath1, srcPath2, dstPath, case, numRuns, testType, layout, qaMode, decoderType, batchSize, roiList)
# print the results of qa tests
-nonQACaseList = ['8', '24', '54', '84'] # Add cases present in supportedCaseList, but without QA support
+nonQACaseList = ['6', '8', '24', '54', '84'] # Add cases present in supportedCaseList, but without QA support
if qaMode and testType == 0:
qaFilePath = os.path.join(outFilePath, "QA_results.txt")
@@ -443,23 +446,23 @@ def rpp_test_suite_parser_and_validator():
passedCases = df['Test_Result'].eq('PASSED').sum()
failedCases = df['Test_Result'].eq('FAILED').sum()
- summaryRow = {'BatchPD_Augmentation_Type': pd.NA,
- 'Tensor_Augmentation_Type': pd.NA,
- 'Performance Speedup (%)': pd.NA,
- 'Test_Result': f'Final Results of Tests: Passed: {passedCases}, Failed: {failedCases}'}
+ summaryRow = {'BatchPD_Augmentation_Type': None,
+ 'Tensor_Augmentation_Type': None,
+ 'Performance Speedup (%)': None,
+ 'Test_Result': 'Final Results of Tests: Passed: ' + str(passedCases) + ', Failed: ' + str(failedCases)}
- print("\n", df.to_markdown())
+ print("\n" + dataframe_to_markdown(df))
# Append the summary row to the DataFrame
# Convert the dictionary to a DataFrame
summaryRow = pd.DataFrame([summaryRow])
- df = pd.concat([df, summaryRow], ignore_index=True)
+ df = pd.concat([df, summaryRow], ignore_index=True, sort = True)
df.to_excel(excelFilePath, index=False)
print("\n-------------------------------------------------------------------" + resultsInfo + "\n\n-------------------------------------------------------------------")
print("\nIMPORTANT NOTE:")
print("- The following performance comparison shows Performance Speedup percentages between times measured on previous generation RPP-BatchPD APIs against current generation RPP-Tensor APIs.")
- print(f"- All APIs have been improved for performance ranging from {0}% (almost same) to {100}% faster.")
+ print("- All APIs have been improved for performance ranging from " + str(0) + "% (almost same) to " + str(100) + "% faster.")
print("- Random observations of negative speedups might always occur due to current test machine temperature/load variances or other CPU/GPU state-dependent conditions.")
print("\n-------------------------------------------------------------------\n")
elif (testType == 1 and qaMode == 0):
diff --git a/utilities/test_suite/HOST/runVoxelTests.py b/utilities/test_suite/HOST/runVoxelTests.py
index 3dbe0baa5..f44c05f78 100644
--- a/utilities/test_suite/HOST/runVoxelTests.py
+++ b/utilities/test_suite/HOST/runVoxelTests.py
@@ -58,20 +58,23 @@ def func_group_finder(case_number):
return "miscellaneous"
def run_unit_test_cmd(headerPath, dataPath, dstPathTemp, layout, case, numRuns, testType, qaMode, batchSize):
- print(f"./Tensor_voxel_host {headerPath} {dataPath} {dstPathTemp} {layout} {case} {numRuns} {testType} {qaMode} {batchSize} {bitDepth}")
- result = subprocess.run([buildFolderPath + "/build/Tensor_voxel_host", headerPath, dataPath, dstPathTemp, str(layout), str(case), str(numRuns), str(testType), str(qaMode), str(batchSize), str(bitDepth), scriptPath], stdout=subprocess.PIPE) # nosec
- print(result.stdout.decode())
+ print("./Tensor_voxel_host " + headerPath + " " + dataPath + " " + dstPathTemp + " " + str(layout) + " " + str(case) + " " + str(numRuns) + " " + str(testType) + " " + str(qaMode) + " " + str(batchSize) + " " + str(bitDepth))
+ result = subprocess.Popen([buildFolderPath + "/build/Tensor_voxel_host", headerPath, dataPath, dstPathTemp, str(layout), str(case), str(numRuns), str(testType), str(qaMode), str(batchSize), str(bitDepth), scriptPath], stdout=subprocess.PIPE) # nosec
+ stdout_data, stderr_data = result.communicate()
+ print(stdout_data.decode())
print("------------------------------------------------------------------------------------------")
def run_performance_test_cmd(loggingFolder, logFileLayout, headerPath, dataPath, dstPathTemp, layout, case, numRuns, testType, qaMode, batchSize):
- with open(f"{loggingFolder}/Tensor_voxel_host_{logFileLayout}_raw_performance_log.txt", "a") as logFile:
- print(f"./Tensor_voxel_host {headerPath} {dataPath} {dstPathTemp} {layout} {case} {numRuns} {testType} {qaMode} {batchSize} {bitDepth}")
- process = subprocess.Popen([buildFolderPath + "/build/Tensor_voxel_host", headerPath, dataPath, dstPathTemp, str(layout), str(case), str(numRuns), str(testType), str(qaMode), str(batchSize), str(bitDepth), scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) # nosec
+ with open(loggingFolder + "/Tensor_voxel_host_" + logFileLayout + "_raw_performance_log.txt", "a") as logFile:
+ logFile.write("./Tensor_voxel_host " + headerPath + " " + dataPath + " " + dstPathTemp + " " + str(layout) + " " + str(case) + " " + str(numRuns) + " " + str(testType) + " " + str(qaMode) + " " + str(batchSize) + " " + str(bitDepth) + "\n")
+ process = subprocess.Popen([buildFolderPath + "/build/Tensor_voxel_host", headerPath, dataPath, dstPathTemp, str(layout), str(case), str(numRuns), str(testType), str(qaMode), str(batchSize), str(bitDepth), scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec
while True:
output = process.stdout.readline()
if not output and process.poll() is not None:
break
- print(output.strip())
+ output = output.decode().strip() # Decode bytes to string and strip extra whitespace
+ print(output)
+ logFile.write(output + "\n")
if "Running" in output or "max,min,avg wall times" in output:
cleanedOutput = ''.join(char for char in output if 32 <= ord(char) <= 126) # Remove control characters
cleanedOutput = cleanedOutput.strip() # Remove leading/trailing whitespace
@@ -203,8 +206,8 @@ def rpp_test_suite_parser_and_validator():
os.chdir(buildFolderPath + "/build")
# Run cmake and make commands
-subprocess.run(["cmake", scriptPath], cwd=".") # nosec
-subprocess.run(["make", "-j16"], cwd=".") # nosec
+subprocess.call(["cmake", scriptPath], cwd=".") # nosec
+subprocess.call(["make", "-j16"], cwd=".") # nosec
# List of cases supported
supportedCaseList = ['0', '1', '2', '3', '4', '5', '6']
diff --git a/utilities/test_suite/common.py b/utilities/test_suite/common.py
index 699495b39..a0f37ffa2 100644
--- a/utilities/test_suite/common.py
+++ b/utilities/test_suite/common.py
@@ -27,6 +27,7 @@
import sys
import datetime
import shutil
+import pandas as pd
try:
from errno import FileExistsError
@@ -179,7 +180,7 @@ def case_file_check(CASE_FILE_PATH, TYPE, TENSOR_TYPE_LIST, new_file, d_counter)
def directory_name_generator(qaMode, affinity, layoutType, case, path, func_group_finder):
if qaMode == 0:
functionality_group = func_group_finder(int(case))
- dst_folder_temp = f"{path}/rpp_{affinity}_{layoutType}_{functionality_group}"
+ dst_folder_temp = path + "/rpp_" + affinity + "_" + layoutType + "_" + functionality_group
else:
dst_folder_temp = path
@@ -360,3 +361,22 @@ def func_group_finder(case_number):
if case_number in value:
return key
return "miscellaneous"
+
+def dataframe_to_markdown(df):
+ # Calculate the maximum width of each column
+ column_widths = {}
+ for col in df.columns:
+ max_length = len(col)
+ for value in df[col]:
+ max_length = max(max_length, len(str(value)))
+ column_widths[col] = max_length
+
+ # Create the header row
+ md = '| ' + ' | '.join([col.ljust(column_widths[col]) for col in df.columns]) + ' |\n'
+ md += '| ' + ' | '.join(['-' * column_widths[col] for col in df.columns]) + ' |\n'
+
+ # Create the data rows
+ for i, row in df.iterrows():
+ md += '| ' + ' | '.join([str(value).ljust(column_widths[df.columns[j]]) for j, value in enumerate(row.values)]) + ' |\n'
+
+ return md
diff --git a/utilities/test_suite/rpp_test_suite_common.h b/utilities/test_suite/rpp_test_suite_common.h
index 71ca9fb34..eddf78702 100644
--- a/utilities/test_suite/rpp_test_suite_common.h
+++ b/utilities/test_suite/rpp_test_suite_common.h
@@ -75,11 +75,13 @@ std::map augmentationMap =
{1, "gamma_correction"},
{2, "blend"},
{4, "contrast"},
+ {6, "jitter"},
{8, "noise"},
{13, "exposure"},
{20, "flip"},
{21, "resize"},
{23, "rotate"},
+ {24, "warp_affine"},
{26, "lens_correction"},
{29, "water"},
{30, "non_linear_blend"},
diff --git a/utilities/test_suite/rpp_test_suite_misc.h b/utilities/test_suite/rpp_test_suite_misc.h
index 0a4197caa..9ef118c48 100644
--- a/utilities/test_suite/rpp_test_suite_misc.h
+++ b/utilities/test_suite/rpp_test_suite_misc.h
@@ -98,6 +98,7 @@ void fill_roi_values(Rpp32u nDim, Rpp32u batchSize, Rpp32u *roiTensor, bool qaMo
case 3:
{
std::array roi = {0, 0, 0, 50, 50, 8};
+ for(int i = 0, j = 0; i < batchSize ; i++, j += 6)
std::copy(roi.begin(), roi.end(), &roiTensor[j]);
break;
exit(0);