diff --git a/.Doxyfile b/.Doxyfile index 066a53c02..dac8a3acc 100644 --- a/.Doxyfile +++ b/.Doxyfile @@ -960,16 +960,16 @@ INPUT = README.md \ include/rppi_logical_operations.h \ include/rppi_morphological_transforms.h \ include/rppi_statistical_operations.h \ + include/rppt_tensor_arithmetic_operations.h \ + include/rppt_tensor_audio_augmentations.h \ include/rppt_tensor_color_augmentations.h \ include/rppt_tensor_data_exchange_operations.h \ include/rppt_tensor_effects_augmentations.h \ include/rppt_tensor_filter_augmentations.h \ include/rppt_tensor_geometric_augmentations.h \ + include/rppt_tensor_logical_operations.h \ include/rppt_tensor_morphological_operations.h \ - include/rppt_tensor_statistical_operations.h \ - include/rppt_tensor_arithmetic_operations.h \ - include/rppt_tensor_audio_augmentations.h \ - include/rppt_tensor_logical_operations.h + include/rppt_tensor_statistical_operations.h # This tag can be used to specify the character encoding of the source files @@ -2381,7 +2381,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = GPU_SUPPORT RPP_BACKEND_HIP HIP_COMPILE +PREDEFINED = GPU_SUPPORT RPP_BACKEND_HIP HIP_COMPILE AUDIO_SUPPORT # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. 
diff --git a/docs/data/doxygenOutputs/effects_augmentations_jitter_150x150.png b/docs/data/doxygenOutputs/effects_augmentations_jitter_150x150.png new file mode 100644 index 000000000..8aef1cbe6 Binary files /dev/null and b/docs/data/doxygenOutputs/effects_augmentations_jitter_150x150.png differ diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile index 18d9a73bc..9773637df 100644 --- a/docs/doxygen/Doxyfile +++ b/docs/doxygen/Doxyfile @@ -962,14 +962,16 @@ INPUT = ../../README.md \ ../../include/rppi_logical_operations.h \ ../../include/rppi_morphological_transforms.h \ ../../include/rppi_statistical_operations.h \ + ../../include/rppt_tensor_arithmetic_operations.h \ + ../../include/rppt_tensor_audio_augmentations.h \ ../../include/rppt_tensor_color_augmentations.h \ ../../include/rppt_tensor_data_exchange_operations.h \ ../../include/rppt_tensor_effects_augmentations.h \ ../../include/rppt_tensor_filter_augmentations.h \ ../../include/rppt_tensor_geometric_augmentations.h \ + ../../include/rppt_tensor_logical_operations.h \ ../../include/rppt_tensor_morphological_operations.h \ - ../../include/rppt_tensor_statistical_operations.h \ - ../../include/rppt_tensor_logical_operations.h + ../../include/rppt_tensor_statistical_operations.h # This tag can be used to specify the character encoding of the source files @@ -2381,7 +2383,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = GPU_SUPPORT RPP_BACKEND_HIP HIP_COMPILE +PREDEFINED = GPU_SUPPORT RPP_BACKEND_HIP HIP_COMPILE AUDIO_SUPPORT # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. 
diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index a88668ba5..c316de276 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1 +1 @@ -rocm-docs-core[api_reference]==1.5.0 +rocm-docs-core[api_reference]==1.5.1 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 54fbfde32..2c9286b18 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -110,7 +110,7 @@ requests==2.28.2 # via # pygithub # sphinx -rocm-docs-core[api-reference]==1.5.0 +rocm-docs-core[api-reference]==1.5.1 # via -r requirements.in smmap==5.0.0 # via gitdb diff --git a/include/rppdefs.h b/include/rppdefs.h index e863ed6ad..6eb025665 100644 --- a/include/rppdefs.h +++ b/include/rppdefs.h @@ -64,6 +64,7 @@ SOFTWARE. const float ONE_OVER_6 = 1.0f / 6; const float ONE_OVER_3 = 1.0f / 3; const float ONE_OVER_255 = 1.0f / 255; +const uint MMS_MAX_SCRATCH_MEMORY = 76800000; // maximum scratch memory size (number of floats) needed for MMS buffer in RNNT training /******************** RPP typedefs ********************/ @@ -137,8 +138,14 @@ typedef enum RPP_ERROR_LAYOUT_MISMATCH = -18, /*! \brief Number of channels is invalid. (Needs to adhere to function specification.) \ingroup group_rppdefs */ RPP_ERROR_INVALID_CHANNELS = -19, + /*! \brief Invalid output tile length (Needs to adhere to function specification.) \ingroup group_rppdefs */ + RPP_ERROR_INVALID_OUTPUT_TILE_LENGTH = -20, + /*! \brief Shared memory size needed is beyond the bounds (Needs to adhere to function specification.) \ingroup group_rppdefs */ + RPP_ERROR_OUT_OF_BOUND_SHARED_MEMORY_SIZE = -21, + /*! \brief Scratch memory size needed is beyond the bounds (Needs to adhere to function specification.) \ingroup group_rppdefs */ + RPP_ERROR_OUT_OF_BOUND_SCRATCH_MEMORY_SIZE = -22, /*! \brief Number of src dims is invalid. (Needs to adhere to function specification.) 
\ingroup group_rppdefs */ - RPP_ERROR_INVALID_SRC_DIMS = -20 + RPP_ERROR_INVALID_SRC_DIMS = -23 } RppStatus; /*! \brief RPP rppStatus_t type enums diff --git a/include/rppt_tensor_arithmetic_operations.h b/include/rppt_tensor_arithmetic_operations.h index 4ffd24156..88a8d76a2 100644 --- a/include/rppt_tensor_arithmetic_operations.h +++ b/include/rppt_tensor_arithmetic_operations.h @@ -190,7 +190,7 @@ RppStatus rppt_subtract_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGeneri * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. */ -RppStatus rppt_multiply_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *subtractTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); +RppStatus rppt_multiply_scalar_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32f *mulTensor, RpptROI3DPtr roiGenericPtrSrc, RpptRoi3DType roiType, rppHandle_t rppHandle); #ifdef GPU_SUPPORT /*! 
\brief Multiply scalar augmentation on HIP backend @@ -226,7 +226,7 @@ RppStatus rppt_multiply_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGeneri * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -248,7 +248,7 @@ RppStatus rppt_magnitude_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr sr * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. diff --git a/include/rppt_tensor_audio_augmentations.h b/include/rppt_tensor_audio_augmentations.h index 71b7282e4..0a693baa9 100644 --- a/include/rppt_tensor_audio_augmentations.h +++ b/include/rppt_tensor_audio_augmentations.h @@ -64,6 +64,28 @@ extern "C" { */ RppStatus rppt_non_silent_region_detection_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcLengthTensor, Rpp32s *detectedIndexTensor, Rpp32s *detectionLengthTensor, Rpp32f cutOffDB, Rpp32s windowLength, Rpp32f referencePower, Rpp32s resetInterval, rppHandle_t rppHandle); +#ifdef GPU_SUPPORT +/*! 
\brief Non Silent Region Detection augmentation on HIP backend + * \details Non Silent Region Detection augmentation for 1D audio buffer + \n Finds the starting index and length of non silent region in the audio buffer by comparing the + calculated short-term power with cutoff value passed + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) + * \param [in] srcLengthTensor source audio buffer length (1D tensor in Pinned/HIP memory, of size batchSize) + * \param [out] detectedIndexTensor beginning index of non silent region (1D tensor in Pinned/HIP memory, of size batchSize) + * \param [out] detectionLengthTensor length of non silent region (1D tensor in Pinned/HIP memory, of size batchSize) + * \param [in] cutOffDB cutOff in dB below which the signal is considered silent + * \param [in] windowLength window length used for computing short-term power of the signal + * \param [in] referencePower reference power that is used to convert the signal to dB + * \param [in] resetInterval number of samples after which the moving mean average is recalculated to avoid precision loss + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_non_silent_region_detection_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp32s *srcLengthTensor, Rpp32s *detectedIndexTensor, Rpp32s *detectionLengthTensor, Rpp32f cutOffDB, Rpp32s windowLength, Rpp32f referencePower, Rpp32s resetInterval, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! 
\brief To Decibels augmentation on HOST backend * \details To Decibels augmentation for 1D/2D audio buffer converts magnitude values to decibel values * \param [in] srcPtr source tensor in HOST memory @@ -174,15 +196,15 @@ RppStatus rppt_mel_filter_bank_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp /*! \brief Resample augmentation on HOST backend * \details Resample augmentation for audio data -* \param[in] srcPtr source tensor in HOST memory -* \param[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) -* \param[out] dstPtr destination tensor in HOST memory -* \param[in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) -* \param[in] inRate Input sampling rate (1D tensor in HOST memory, of size batchSize) -* \param[in] outRate Output sampling rate (1D tensor in HOST memory, of size batchSize) -* \param[in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HOST memory, of size batchSize * 2) -* \param[in] window Resampling window (struct of type RpptRpptResamplingWindow) -* \param[in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() +* \param [in] srcPtr source tensor in HOST memory +* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) +* \param [out] dstPtr destination tensor in HOST memory +* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32) +* \param [in] inRate Input sampling rate (1D tensor in HOST memory, of size batchSize) +* \param [in] outRate Output sampling rate (1D tensor in HOST memory, of size batchSize) +* \param [in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HOST memory, of size batchSize * 2) +* \param [in] window Resampling window (struct of type RpptResamplingWindow) +* \param [in] rppHandle RPP HOST handle created with 
\ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. * \retval RPP_SUCCESS Successful completion. * \retval RPP_ERROR* Unsuccessful completion. diff --git a/include/rppt_tensor_color_augmentations.h b/include/rppt_tensor_color_augmentations.h index 3b39448eb..62ef13715 100644 --- a/include/rppt_tensor_color_augmentations.h +++ b/include/rppt_tensor_color_augmentations.h @@ -54,7 +54,7 @@ extern "C" { * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] alphaTensor alpha values for brightness calculation (1D tensor in HOST memory, of size batchSize, with 0 <= alpha <= 20 for each image in batch) * \param [in] betaTensor beta values for brightness calculation (1D tensor in HOST memory, of size batchSize, with 0 <= beta <= 255 for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -76,7 +76,7 @@ RppStatus rppt_brightness_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] alphaTensor alpha values for brightness calculation (1D tensor in pinned/HOST memory, of size batchSize, with 0 <= alpha <= 20 for each image in batch) * \param [in] betaTensor beta values for brightness calculation (1D tensor in pinned/HOST memory, of size batchSize, with 0 <= beta <= 255 for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -97,7 +97,7 @@ RppStatus rppt_brightness_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] gammaTensor gamma values for gamma correction calculation (1D tensor in HOST memory, of size batchSize with gamma >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -118,7 +118,7 @@ RppStatus rppt_gamma_correction_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rp * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] gammaTensor gamma values for gamma correction calculation (1D tensor in pinned/HOST memory, of size batchSize with gamma >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -141,7 +141,7 @@ RppStatus rppt_gamma_correction_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] alphaTensor alpha values for alpha-blending (1D tensor in HOST memory, of size batchSize with the transparency factor transparency factor 0 <= alpha <= 1 for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -164,7 +164,7 @@ RppStatus rppt_blend_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDes * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] alphaTensor alpha values for alpha-blending (1D tensor in pinned/HOST memory, of size batchSize with the transparency factor transparency factor 0 <= alpha <= 1 for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -188,7 +188,7 @@ RppStatus rppt_blend_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDesc * \param [in] contrastTensor contrast modification parameter for color_twist calculation (1D tensor in HOST memory, of size batchSize with 0 < contrastTensor[i] <= 255 for each image in batch) * \param [in] hueTensor hue modification parameter for color_twist calculation (1D tensor in HOST memory, of size batchSize with 0 <= hueTensor[i] <= 359 for each image in batch) * \param [in] saturationTensor saturation modification parameter for color_twist calculation (1D tensor in HOST memory, of size batchSize with saturationTensor[i] >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -212,7 +212,7 @@ RppStatus rppt_color_twist_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_ * \param [in] contrastTensor contrast modification parameter for color_twist calculation (1D tensor in pinned/HOST memory, of size batchSize with 0 < contrastTensor[i] <= 255 for each image in batch) * \param [in] hueTensor hue modification parameter for color_twist calculation (1D tensor in pinned/HOST memory, of size batchSize with 0 <= hueTensor[i] <= 359 for each image in batch) * \param [in] saturationTensor saturation modification parameter for color_twist calculation (1D tensor in pinned/HOST memory, of size batchSize with saturationTensor[i] >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -236,7 +236,7 @@ RppStatus rppt_color_twist_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] contrastTensor contrast modification parameter for color_jitter calculation (1D tensor in HOST memory, of size batchSize with 0 < contrastTensor[i] <= 255 for each image in batch) * \param [in] hueTensor hue modification parameter for color_jitter calculation (1D tensor in HOST memory, of size batchSize with 0 <= hueTensor[i] <= 359 for each image in batch) * \param [in] saturationTensor saturation modification parameter for color_jitter calculation (1D tensor in HOST memory, of size batchSize with saturationTensor[i] >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -257,7 +257,7 @@ RppStatus rppt_color_jitter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] rgbTensor R/G/B values for color casting calculation (2D tensor in HOST memory, of size sizeof(RpptRGB) * batchSize with 0 <= rgbTensor[n]. 
<= 255 for each image in batch) * \param [in] alphaTensor alpha values for color casting calculation (1D tensor in HOST memory, of size sizeof(Rpp32f) * batchSize with alphaTensor[i] >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -279,7 +279,7 @@ RppStatus rppt_color_cast_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] rgbTensor R/G/B values for color casting calculation (2D tensor in pinned/HOST memory, of size sizeof(RpptRGB) * batchSize with 0 <= rgbTensor[n]. 
<= 255 for each image in batch) * \param [in] alphaTensor alpha values for color casting calculation (1D tensor in pinned/HOST memory, of size sizeof(Rpp32f) * batchSize with alphaTensor[i] >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -300,7 +300,7 @@ RppStatus rppt_color_cast_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] exposureFactorTensor exposure factor values for exposure adjustment (1D tensor in HOST memory, of size batchSize, with exposureFactorTensor[n] >= 0 for each image in the batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus 
enumeration. @@ -321,7 +321,7 @@ RppStatus rppt_exposure_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] exposureFactorTensor exposure factor values for exposure adjustment (1D tensor in pinned/HOST memory, of size batchSize, with exposureFactorTensor[n] >= 0 for each image in the batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -343,7 +343,7 @@ RppStatus rppt_exposure_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t ds * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] contrastFactorTensor contrast factor values for contrast calculation (1D tensor in HOST memory, of size batchSize with contrastFactorTensor[n] > 0 for each image in a batch)) * \param [in] contrastCenterTensor contrast center values for contrast calculation (1D tensor in HOST memory, of size batchSize) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -365,7 +365,7 @@ RppStatus rppt_contrast_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] contrastFactorTensor contrast factor values for contrast calculation (1D tensor in pinned/HOST memory, of size batchSize with contrastFactorTensor[n] > 0 for each image in a batch)) * \param [in] contrastCenterTensor contrast center values for contrast calculation (1D tensor in pinned/HOST memory, of size batchSize) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -386,7 +386,7 @@ RppStatus rppt_contrast_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t ds * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] lutPtr lut Array in HOST memory, containing a single integer look up table of length 65536, to be used for all images in the batch - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -407,7 +407,7 @@ RppStatus rppt_lut_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] lutPtr lut Array in pinned/HOST memory, containing a single integer look up table of length 65536, to be used for all images in the batch - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -428,7 +428,7 @@ RppStatus rppt_lut_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] adjustmentValueTensor adjustment values for color temperature calculation (1D tensor of size batchSize with -100 <= adjustmentValueTensor[i] >= 100 for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -449,7 +449,7 @@ RppStatus rppt_color_temperature_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, R * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] adjustmentValueTensor adjustment values for color temperature calculation (1D tensor of size batchSize with -100 <= adjustmentValueTensor[i] >= 100 for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
diff --git a/include/rppt_tensor_effects_augmentations.h b/include/rppt_tensor_effects_augmentations.h index 8b62d61f5..b185e0081 100644 --- a/include/rppt_tensor_effects_augmentations.h +++ b/include/rppt_tensor_effects_augmentations.h @@ -56,7 +56,7 @@ extern "C" { * \param [in] gridRatio gridRatio value for gridmask calculation = black square width / tileWidth (a single Rpp32f number with 0 <= gridRatio <= 1 that applies to all images in the batch) * \param [in] gridAngle gridAngle value for gridmask calculation = grid rotation angle in radians (a single Rpp32f number that applies to all images in the batch) * \param [in] translateVector translateVector for gridmask calculation = grid X and Y translation lengths in pixels (a single RpptUintVector2D x,y value pair that applies to all images in the batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -80,7 +80,7 @@ RppStatus rppt_gridmask_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d * \param [in] gridRatio gridRatio value for gridmask calculation = black square width / tileWidth (a single Rpp32f number with 0 <= gridRatio <= 1 that applies to all images in the batch) * \param [in] gridAngle gridAngle value for gridmask calculation = grid rotation angle in radians (a single Rpp32f number that applies to all images in the batch) * \param [in] translateVector translateVector for gridmask calculation = grid X and Y translation lengths in pixels (a single RpptUintVector2D x,y value pair that applies to all images in the batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -103,7 +103,7 @@ RppStatus rppt_gridmask_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t ds * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] spatterColor RGB values to use for the spatter augmentation (A single set of 3 Rpp8u values as RpptRGB that applies to all images in the batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 1920 and roiTensorSrc[i].xywhROI.roiHeight <= 1080) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 1920 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 1080) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -126,7 +126,7 @@ RppStatus rppt_spatter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t ds * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] spatterColor RGB values to use for the spatter augmentation (A single set of 3 Rpp8u values as RpptRGB that applies to all images in the batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 1920 and roiTensorSrc[i].xywhROI.roiHeight <= 1080) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 1920 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 1080) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -151,7 +151,7 @@ RppStatus rppt_spatter_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dst * \param [in] saltValueTensor A user-defined salt noise value (1D tensor in HOST memory, of size batchSize with 0 <= saltValueTensor[i] <= 1 for each image in batch) * \param [in] pepperValueTensor A user-defined pepper noise value (1D tensor in HOST memory, of size batchSize with 0 <= pepperValueTensor[i] <= 1 for each image in batch) * \param [in] seed A user-defined seed value (single Rpp32u value) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -176,7 +176,7 @@ RppStatus rppt_salt_and_pepper_noise_host(RppPtr_t srcPtr, RpptDescPtr srcDescPt * \param [in] saltValueTensor A user-defined salt noise value (1D tensor in pinned/HOST memory, of size batchSize with 0 <= saltValueTensor[i] <= 1 for each image in batch) * \param [in] pepperValueTensor A user-defined pepper noise value (1D tensor in pinned/HOST memory, of size batchSize with 0 <= pepperValueTensor[i] <= 1 for each image in batch) * \param [in] seed A user-defined seed value (single Rpp32u value) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -198,7 +198,7 @@ RppStatus rppt_salt_and_pepper_noise_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] shotNoiseFactorTensor shotNoiseFactor values for each image, which are used to compute the lambda values in a poisson distribution (1D tensor in HOST memory, of size batchSize with shotNoiseFactorTensor[i] >= 0 for each image in batch) * \param [in] seed A user-defined seed value (single Rpp32u value) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -220,7 +220,7 @@ RppStatus rppt_shot_noise_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] shotNoiseFactorTensor shotNoiseFactor values for each image, which are used to compute the lambda values in a poisson distribution (1D tensor in pinned/HOST memory, of size batchSize with shotNoiseFactorTensor[i] >= 0 for each image in batch) * \param [in] seed A user-defined seed value (single Rpp32u value) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -243,7 +243,7 @@ RppStatus rppt_shot_noise_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] meanTensor mean values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in HOST memory, of size batchSize with meanTensor[i] >= 0 for each image in batch) * \param [in] stdDevTensor stdDev values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in HOST memory, of size batchSize with stdDevTensor[i] >= 0 for each image in batch) * \param [in] seed A user-defined seed value (single Rpp32u value) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -266,7 +266,7 @@ RppStatus rppt_gaussian_noise_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppP * \param [in] meanTensor mean values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in pinned/HOST memory, of size batchSize with meanTensor[i] >= 0 for each image in batch) * \param [in] stdDevTensor stdDev values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in pinned/HOST memory, of size batchSize with stdDevTensor[i] >= 0 for each image in batch) * \param [in] seed A user-defined seed value (single Rpp32u value) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -289,7 +289,7 @@ RppStatus rppt_gaussian_noise_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPt * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] stdDevTensor stdDev values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in HOST memory, of size batchSize with stdDevTensor[i] >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -312,7 +312,7 @@ RppStatus rppt_non_linear_blend_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDes * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] stdDevTensor stdDev values for each image, which are used to compute the generalized Box-Mueller transforms in a gaussian distribution (1D tensor in pinned/HOST memory, of size batchSize with stdDevTensor[i] >= 0 for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -338,7 +338,7 @@ RppStatus rppt_non_linear_blend_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDesc * \param[in] freqYTensor freqY values for water effect (1D tensor in HOST memory, of size batchSize) * \param[in] phaseXTensor amplitudeY values for water effect (1D tensor in HOST memory, of size batchSize) * \param[in] phaseYTensor amplitudeY values for water effect (1D tensor in HOST memory, of size batchSize) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -364,7 +364,7 @@ RppStatus rppt_water_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP * \param[in] freqYTensor freqY values for water effect (1D tensor in pinned/HOST memory, of size batchSize) * \param[in] phaseXTensor amplitudeY values for water effect (1D tensor in pinned/HOST memory, of size batchSize) * \param[in] phaseYTensor amplitudeY values for water effect (1D tensor in pinned/HOST memory, of size batchSize) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -433,7 +433,7 @@ RppStatus rppt_ricap_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param[in] vignetteIntensityTensor intensity values to quantify vignette effect (1D tensor of size batchSize with 0 < vignetteIntensityTensor[n] for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -455,7 +455,7 @@ RppStatus rppt_vignette_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param[in] vignetteIntensityTensor intensity values to quantify vignette effect (1D tensor of size batchSize with 0 < vignetteIntensityTensor[n] for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -465,6 +465,50 @@ RppStatus rppt_vignette_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d RppStatus rppt_vignette_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *vignetteIntensityTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/******************** jitter ********************/ + +/*! \brief Jitter augmentation on HOST backend for a NCHW/NHWC layout tensor + * \details The jitter augmentation adds a jitter effect for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be same depth as srcPtr. + * \image html img150x150.png Sample Input + * \image html effects_augmentations_jitter_150x150.png Sample Output + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] kernelSizeTensor kernel size value for jitter calculation (kernelSize = 3/5/7 for optimal use) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_jitter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32u *kernelSizeTensor, Rpp32u seed, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Jitter augmentation on HIP backend for a NCHW/NHWC layout tensor + * \details The jitter augmentation adds a jitter effect for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). + * - dstPtr depth ranges - Will be same depth as srcPtr. + * \image html img150x150.png Sample Input + * \image html effects_augmentations_jitter_150x150.png Sample Output + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) + * \param [out] dstPtr destination tensor in HIP memory + * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) + * \param [in] kernelSizeTensor kernel size value for jitter calculation (kernelSize = 3/5/7 for optimal use) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_jitter_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32u *kernelSizeTensor, Rpp32u seed, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! \brief Gaussian noise augmentation on HOST backend * \details This function adds gaussian noise to a batch of 4D tensors. * Support added for u8 -> u8, f32 -> f32 datatypes. 
@@ -524,7 +568,7 @@ RppStatus rppt_gaussian_noise_voxel_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcD - Erase-region anchor boxes on each image given by the user must not overlap * \param [in] colorsTensor RGB values to use for each erase-region inside each image in the batch. (colors[i] will have range equivalent of srcPtr) * \param [in] numBoxesTensor number of erase-regions per image, for each image in the batch. (numBoxesTensor[n] >= 0) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -549,7 +593,7 @@ RppStatus rppt_erase_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP - Erase-region anchor boxes on each image given by the user must not overlap * \param [in] colorsTensor RGB values to use for each erase-region inside each image in the batch. (colors[i] will have range equivalent of srcPtr) * \param [in] numBoxesTensor number of erase-regions per image, for each image in the batch. 
(numBoxesTensor[n] >= 0) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -559,13 +603,6 @@ RppStatus rppt_erase_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP RppStatus rppt_erase_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptRoiLtrb *anchorBoxInfoTensor, RppPtr_t colorsTensor, Rpp32u *numBoxesTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT -/*! @} - */ - -#ifdef __cplusplus -} -#endif - /*! \brief Glitch augmentation on HOST backend for a NCHW/NHWC layout tensor * \details The glitch augmentation adds a glitch effect for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
* - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). @@ -578,7 +615,7 @@ RppStatus rppt_erase_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] rgbOffsets RGB offset values to use for the glitch augmentation (A single set of 3 Rppi point values that applies to all images in the batch. * For each point and for each image in the batch: 0 < point.x < width, 0 < point.y < height) - * \param [in] roiTensorSrc ROI data for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -600,7 +637,7 @@ RppStatus rppt_glitch_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dst * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] rgbOffsets RGB offset values to use for the glitch augmentation (A 1D tensor in pinned/HOST memory contains single set of 3 Rppi point values that applies to all images in the batch. 
* For each point and for each image in the batch: 0 < point.x < width, 0 < point.y < height) - * \param [in] roiTensorSrc ROI data for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -609,4 +646,11 @@ RppStatus rppt_glitch_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dst */ RppStatus rppt_glitch_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptChannelOffsets *rgbOffsets, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT + +/*! 
@} + */ + +#ifdef __cplusplus +} +#endif #endif // RPPT_TENSOR_EFFECTS_AUGMENTATIONS_H diff --git a/include/rppt_tensor_filter_augmentations.h b/include/rppt_tensor_filter_augmentations.h index 7ea8d00c6..992631c49 100644 --- a/include/rppt_tensor_filter_augmentations.h +++ b/include/rppt_tensor_filter_augmentations.h @@ -57,7 +57,7 @@ extern "C" { * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] kernelSize kernel size for box filter (a single Rpp32u odd number with kernelSize = 3/5/7/9 that applies to all images in the batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -83,7 +83,7 @@ RppStatus rppt_box_filter_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] stdDevTensor stdDev values for gaussian calculation (1D tensor in pinned/HOST memory, of size batchSize, for each image in batch) * \param [in] kernelSize kernel size for gaussian filter (a single Rpp32u odd number with kernelSize = 3/5/7/9 that applies to all images in the batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
diff --git a/include/rppt_tensor_geometric_augmentations.h b/include/rppt_tensor_geometric_augmentations.h index 884127a71..6da067844 100644 --- a/include/rppt_tensor_geometric_augmentations.h +++ b/include/rppt_tensor_geometric_augmentations.h @@ -52,7 +52,7 @@ extern "C" { * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -72,7 +72,7 @@ RppStatus rppt_crop_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -95,7 +95,7 @@ RppStatus rppt_crop_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr * \param [in] offsetTensor offset values for normalization (1D tensor in HOST memory, of size batchSize, with offsetTensor[n] <= 0) * \param [in] multiplierTensor multiplier values for normalization (1D tensor in HOST memory, of size batchSize, with multiplierTensor[n] > 0) * \param [in] mirrorTensor mirror flag values to set mirroring on/off (1D tensor in HOST memory, of size batchSize, with mirrorTensor[n] = 0/1) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -118,7 +118,7 @@ RppStatus rppt_crop_mirror_normalize_host(RppPtr_t srcPtr, RpptDescPtr srcDescPt * \param [in] offsetTensor offset values for normalization (1D tensor in pinned/HOST memory, of size batchSize, with offsetTensor[n] <= 0) * \param [in] multiplierTensor multiplier values for normalization (1D tensor in pinned/HOST memory, of size batchSize, with multiplierTensor[n] > 0) * \param [in] mirrorTensor mirror flag values to set mirroring on/off (1D tensor in pinned/HOST memory, of size batchSize, with mirrorTensor[n] = 0/1) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -140,7 +140,7 @@ RppStatus rppt_crop_mirror_normalize_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] affineTensor affine matrix values for transformation calculation (2D tensor in HOST memory, of size batchSize * 6 for each image in batch) * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration.
@@ -162,7 +162,7 @@ RppStatus rppt_warp_affine_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] affineTensor affine matrix values for transformation calculation (2D tensor in pinned/HOST memory, of size batchSize * 6 for each image in batch) * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration.
@@ -184,7 +184,7 @@ RppStatus rppt_warp_affine_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] horizontalTensor horizontal flag values to set horizontal flip on/off (1D tensor in HOST memory, of size batchSize, with horizontalTensor[i] = 0/1) * \param [in] verticalTensor vertical flag values to set vertical flip on/off (1D tensor in HOST memory, of size batchSize, with verticalTensor[i] = 0/1) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -206,7 +206,7 @@ RppStatus rppt_flip_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] horizontalTensor horizontal flag values to set horizontal flip on/off (1D tensor in pinned/HOST memory, of size batchSize, with horizontalTensor[i] = 0/1) * \param [in] verticalTensor vertical flag values to set vertical flip on/off (1D tensor in pinned/HOST memory, of size batchSize, with verticalTensor[i] = 0/1) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -228,7 +228,7 @@ RppStatus rppt_flip_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] dstImgSizes destination image sizes ( \ref RpptImagePatchPtr type pointer to array, in HOST memory, of size batchSize) * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -250,7 +250,7 @@ RppStatus rppt_resize_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dst * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] dstImgSizes destination image sizes ( \ref RpptImagePatchPtr type pointer to array, in pinned/HOST memory, of size batchSize) * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -275,7 +275,7 @@ RppStatus rppt_resize_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP * \param [in] meanTensor mean value for each image in the batch (meanTensor[n] >= 0, 1D tensor in HOST memory, of size = batchSize for greyscale images, size = batchSize * 3 for RGB images) * \param [in] stdDevTensor standard deviation value for each image in the batch (stdDevTensor[n] >= 0, 1D tensor in HOST memory, of size = batchSize for greyscale images, size = batchSize * 3 for RGB images) * \param [in] mirrorTensor mirror flag value to set mirroring on/off (1D tensor in HOST memory, of size batchSize, with mirrorTensor[n] = 0/1) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration.
@@ -300,7 +300,7 @@ RppStatus rppt_resize_mirror_normalize_host(RppPtr_t srcPtr, RpptDescPtr srcDesc * \param [in] meanTensor mean value for each image in the batch (meanTensor[n] >= 0, 1D tensor in pinned/HOST memory, of size = batchSize for greyscale images, size = batchSize * 3 for RGB images) * \param [in] stdDevTensor standard deviation value for each image in the batch (stdDevTensor[n] >= 0, 1D tensor in pinned/HOST memory, of size = batchSize for greyscale images, size = batchSize * 3 for RGB images) * \param [in] mirrorTensor mirror flag value to set mirroring on/off (1D tensor in pinned/HOST memory, of size batchSize, with mirrorTensor[n] = 0/1) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration.
@@ -323,7 +323,7 @@ RppStatus rppt_resize_mirror_normalize_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescP * \param [in] dstImgSizes destination image sizes ( \ref RpptImagePatchPtr type pointer to array, in HOST memory, of size batchSize) * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType * \param [in] mirrorTensor mirror flag value to set mirroring on/off (1D tensor in HOST memory, of size batchSize, with mirrorTensor[n] = 0/1) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -346,7 +346,7 @@ RppStatus rppt_resize_crop_mirror_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, * \param [in] dstImgSizes destination image sizes ( \ref RpptImagePatchPtr type pointer to array, in pinned/HOST memory, of size batchSize) * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType * \param [in] mirrorTensor mirror flag value to set mirroring on/off (1D tensor in pinned/HOST memory, of size batchSize, with mirrorTensor[n] = 0/1) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -368,7 +368,7 @@ RppStatus rppt_resize_crop_mirror_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, R * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] angle image rotation angle in degrees - positive deg-anticlockwise/negative deg-clockwise (1D tensor in HOST memory, of size batchSize) * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration.
@@ -390,7 +390,7 @@ RppStatus rppt_rotate_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dst * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] angle image rotation angle in degrees - positive deg-anticlockwise/negative deg-clockwise (1D tensor in pinned/HOST memory, of size batchSize) * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration.
@@ -412,7 +412,7 @@ RppStatus rppt_rotate_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -434,7 +434,7 @@ RppStatus rppt_phase_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDes * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -500,7 +500,7 @@ RppStatus rppt_slice_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] cropRoiTensor crop co-ordinates in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] patchRoiTensor patch co-ordinates in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) @@ -526,7 +526,7 @@ RppStatus rppt_crop_and_patch_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescP * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source 
tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] cropRoiTensor crop co-ordinates in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] patchRoiTensor patch co-ordinates in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) @@ -598,7 +598,7 @@ RppStatus rppt_flip_voxel_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDesc * \param [in] colRemapTable Rpp32f column numbers in HOST memory for every pixel in the input batch of images (Restrictions - rois in the colRemapTable data for each image in batch must match roiTensorSrc) * \param [in] tableDescPtr rowRemapTable and colRemapTable common tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = F32, layout = NHWC, c = 1) * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType (Restrictions - Supports only NEAREST_NEIGHBOR and BILINEAR) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] 
rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -623,7 +623,7 @@ RppStatus rppt_remap_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP * \param [in] colRemapTable Rpp32f column numbers in HIP memory for every pixel in the input batch of images (Restrictions - rois in the colRemapTable data for each image in batch must match roiTensorSrc) * \param [in] tableDescPtr rowRemapTable and colRemapTable common tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = F32, layout = NHWC, c = 1) * \param [in] interpolationType Interpolation type used in \ref RpptInterpolationType (Restrictions - Supports only NEAREST_NEIGHBOR and BILINEAR) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. @@ -650,7 +650,7 @@ RppStatus rppt_remap_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt * \param [in] tableDescPtr table tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = F32, layout = NHWC, c = 1) * \param [in] cameraMatrixTensor contains camera intrinsic parameters required to compute lens corrected image. (1D tensor of size 9 * batchSize) * \param [in] distortionCoeffsTensor contains distortion coefficients required to compute lens corrected image. 
(1D tensor of size 8 * batchSize) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. @@ -677,7 +677,7 @@ RppStatus rppt_lens_correction_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp * \param [in] tableDescPtr table tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = F32, layout = NHWC, c = 1) * \param [in] cameraMatrixTensor contains camera intrinsic parameters required to compute lens corrected image. (1D tensor of size 9 * batchSize) * \param [in] distortionCoeffsTensor contains distortion coefficients required to compute lens corrected image. (1D tensor of size 8 * batchSize) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -728,4 +728,4 @@ RppStatus rppt_transpose_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescP #ifdef __cplusplus } #endif -#endif // RPPT_TENSOR_GEOMETRIC_AUGMENTATIONS_H \ No newline at end of file +#endif // RPPT_TENSOR_GEOMETRIC_AUGMENTATIONS_H diff --git a/include/rppt_tensor_logical_operations.h b/include/rppt_tensor_logical_operations.h index 3a4685167..28dff69ce 100644 --- a/include/rppt_tensor_logical_operations.h +++ b/include/rppt_tensor_logical_operations.h @@ -54,7 +54,7 @@ extern "C" { * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -76,7 +76,7 @@ RppStatus rppt_bitwise_and_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -98,7 +98,7 @@ RppStatus rppt_bitwise_and_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr s * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HOST memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -120,7 +120,7 @@ RppStatus rppt_bitwise_or_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr s * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -136,4 +136,4 @@ RppStatus rppt_bitwise_or_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr sr #ifdef __cplusplus } #endif -#endif // RPPT_TENSOR_LOGICAL_OPERATIONS_H \ No newline at end of file +#endif // RPPT_TENSOR_LOGICAL_OPERATIONS_H diff --git a/include/rppt_tensor_morphological_operations.h b/include/rppt_tensor_morphological_operations.h index eb879af5c..126c4757a 100644 --- a/include/rppt_tensor_morphological_operations.h +++ b/include/rppt_tensor_morphological_operations.h @@ -57,7 +57,7 @@ extern "C" { * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] kernelSize kernel size for box filter (a single Rpp32u odd number with kernelSize = 3/5/7/9 that applies to all images in the batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -82,7 +82,7 @@ RppStatus rppt_erode_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt * \param [out] dstPtr destination tensor in HIP memory * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr) * \param [in] kernelSize kernel size for box filter (a single Rpp32u odd number with kernelSize = 3/5/7/9 that applies to all images in the batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -98,4 +98,4 @@ RppStatus rppt_dilate_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP #ifdef __cplusplus } #endif -#endif // RPPT_TENSOR_MORPHOLOGICAL_OPERATIONS_H \ No newline at end of file +#endif // RPPT_TENSOR_MORPHOLOGICAL_OPERATIONS_H diff --git a/include/rppt_tensor_statistical_operations.h b/include/rppt_tensor_statistical_operations.h index 441816ea3..ca464340b 100644 --- a/include/rppt_tensor_statistical_operations.h +++ b/include/rppt_tensor_statistical_operations.h @@ -50,7 +50,7 @@ extern "C" { * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] tensorSumArr destination array in HOST memory * \param [in] tensorSumArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -68,7 +68,7 @@ RppStatus rppt_tensor_sum_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] tensorSumArr destination array in HIP memory * \param [in] tensorSumArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -86,7 +86,7 @@ RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] minArr destination array in HOST memory * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -104,7 +104,7 @@ RppStatus rppt_tensor_min_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] minArr destination array in HIP memory * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -122,7 +122,7 @@ RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] maxArr destination array in HOST memory * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -140,7 +140,7 @@ RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] maxArr destination array in HIP memory * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorSumArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then tensorSumArrLength >= srcDescPtr->n * 4) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -201,7 +201,7 @@ RppStatus rppt_normalize_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescP * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] tensorMeanArr destination array in HOST memory * \param [in] tensorMeanArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorMeanArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorMeanArrLength = srcDescPtr->n * 4) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -219,7 +219,7 @@ RppStatus rppt_tensor_mean_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] tensorMeanArr destination array in HIP memory * \param [in] tensorMeanArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorMeanArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorMeanArrLength = srcDescPtr->n * 4) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -238,7 +238,7 @@ RppStatus rppt_tensor_mean_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t * \param [out] tensorStddevArr destination array in HOST memory * \param [in] tensorStddevArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorStddevArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorStddevArrLength = srcDescPtr->n * 4) * \param [in] meanTensor mean values for stddev calculation (1D tensor of size batchSize * 4 in format (MeanR, MeanG, MeanB, MeanImage) for each image in batch) - * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() * \return A \ref RppStatus enumeration. 
@@ -257,7 +257,7 @@ RppStatus rppt_tensor_stddev_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPt * \param [out] tensorStddevArr destination array in HIP memory * \param [in] tensorStddevArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then tensorStddevArrLength = srcDescPtr->n, and if srcDescPtr->c == 3 then tensorStddevArrLength = srcDescPtr->n * 4) * \param [in] meanTensor mean values for stddev calculation (1D tensor of size batchSize * 4 in format (MeanR, MeanG, MeanB, MeanImage) for each image in batch) - * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) + * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorSrc[i].xywhROI.roiWidth <= 3840 and roiTensorSrc[i].xywhROI.roiHeight <= 2160) * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() * \return A \ref RppStatus enumeration. 
@@ -273,4 +273,4 @@ RppStatus rppt_tensor_stddev_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr
 #ifdef __cplusplus
 }
 #endif
-#endif // RPPT_TENSOR_STATISTICAL_OPERATIONS_H
\ No newline at end of file
+#endif // RPPT_TENSOR_STATISTICAL_OPERATIONS_H
diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp
index 779f6f2d1..be8eaeeaa 100644
--- a/src/include/cpu/rpp_cpu_common.hpp
+++ b/src/include/cpu/rpp_cpu_common.hpp
@@ -6111,6 +6111,25 @@ inline void compute_separable_horizontal_resample(Rpp32f *inputPtr, T *outputPtr
     }
 }
 
+inline void compute_jitter_src_loc_avx(__m256i *pxXorwowStateX, __m256i *pxXorwowStateCounter, __m256 &pRow, __m256 &pCol, __m256 &pKernelSize, __m256 &pBound, __m256 &pHeightLimit, __m256 &pWidthLimit, __m256 &pStride, __m256 &pChannel, Rpp32s *srcLoc)
+{
+    __m256 pRngX = rpp_host_rng_xorwow_8_f32_avx(pxXorwowStateX, pxXorwowStateCounter);
+    __m256 pRngY = rpp_host_rng_xorwow_8_f32_avx(pxXorwowStateX, pxXorwowStateCounter);
+    __m256 pX = _mm256_mul_ps(pRngX, pKernelSize);
+    __m256 pY = _mm256_mul_ps(pRngY, pKernelSize);
+    pX = _mm256_max_ps(_mm256_min_ps(_mm256_floor_ps(_mm256_add_ps(pRow, _mm256_sub_ps(pX, pBound))), pHeightLimit), avx_p0);
+    pY = _mm256_max_ps(_mm256_min_ps(_mm256_floor_ps(_mm256_add_ps(pCol, _mm256_sub_ps(pY, pBound))), pWidthLimit), avx_p0);
+    __m256i pxSrcLoc = _mm256_cvtps_epi32(_mm256_fmadd_ps(pX, pStride, _mm256_mul_ps(pY, pChannel)));
+    _mm256_storeu_si256((__m256i*) srcLoc, pxSrcLoc);
+}
+
+inline void compute_jitter_src_loc(RpptXorwowStateBoxMuller *xorwowState, Rpp32s row, Rpp32s col, Rpp32s kSize, Rpp32s heightLimit, Rpp32s widthLimit, Rpp32s stride, Rpp32s bound, Rpp32s channels, Rpp32s &loc)
+{
+    Rpp32u heightIncrement = rpp_host_rng_xorwow_f32(xorwowState) * kSize;
+    Rpp32u widthIncrement = rpp_host_rng_xorwow_f32(xorwowState) * kSize;
+    loc = std::max(std::min(static_cast<Rpp32s>(row + heightIncrement - bound), heightLimit), 0) * stride;
+    loc += std::max(std::min(static_cast<Rpp32s>(col + widthIncrement - bound), (widthLimit - 1)), 0) * channels;
+}
 
 inline void compute_sum_16_host(__m256i *p, __m256i *pSum)
 {
     pSum[0] = _mm256_add_epi32(_mm256_add_epi32(p[0], p[1]), pSum[0]); //add 16 values to 8
diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp
index bd7da2a5d..b9e79c146 100644
--- a/src/include/cpu/rpp_cpu_simd.hpp
+++ b/src/include/cpu/rpp_cpu_simd.hpp
@@ -3859,6 +3859,20 @@ inline void rpp_resize_nn_load_u8pkd3(Rpp8u *srcRowPtrsForInterp, Rpp32s *loc, _
     p = _mm_shuffle_epi8(px[0], xmm_pkd_mask);    // Shuffle to obtain 4 RGB [R01|G01|B01|R11|G11|B11|R21|G21|B21|R31|G31|B31|00|00|00|00]
 }
 
+template <typename T>
+inline void rpp_resize_nn_extract_pkd3_avx(T *srcRowPtrsForInterp, Rpp32s *loc, __m256i &p)
+{
+    p = _mm256_setr_epi8(*(srcRowPtrsForInterp + loc[0]), *(srcRowPtrsForInterp + loc[0] + 1), *(srcRowPtrsForInterp + loc[0] + 2),
+                         *(srcRowPtrsForInterp + loc[1]), *(srcRowPtrsForInterp + loc[1] + 1), *(srcRowPtrsForInterp + loc[1] + 2),
+                         *(srcRowPtrsForInterp + loc[2]), *(srcRowPtrsForInterp + loc[2] + 1), *(srcRowPtrsForInterp + loc[2] + 2),
+                         *(srcRowPtrsForInterp + loc[3]), *(srcRowPtrsForInterp + loc[3] + 1), *(srcRowPtrsForInterp + loc[3] + 2),
+                         *(srcRowPtrsForInterp + loc[4]), *(srcRowPtrsForInterp + loc[4] + 1), *(srcRowPtrsForInterp + loc[4] + 2),
+                         *(srcRowPtrsForInterp + loc[5]), *(srcRowPtrsForInterp + loc[5] + 1), *(srcRowPtrsForInterp + loc[5] + 2),
+                         *(srcRowPtrsForInterp + loc[6]), *(srcRowPtrsForInterp + loc[6] + 1), *(srcRowPtrsForInterp + loc[6] + 2),
+                         *(srcRowPtrsForInterp + loc[7]), *(srcRowPtrsForInterp + loc[7] + 1), *(srcRowPtrsForInterp + loc[7] + 2),
+                         0, 0, 0, 0, 0, 0, 0, 0);
+}
+
 inline void rpp_resize_nn_load_u8pln1(Rpp8u *srcRowPtrsForInterp, Rpp32s *loc, __m128i &p)
 {
     __m128i px[4];
@@ -3871,6 +3885,16 @@ inline void rpp_resize_nn_load_u8pln1(Rpp8u *srcRowPtrsForInterp, Rpp32s *loc, _
     p = _mm_unpacklo_epi8(px[0], px[1]);    // unpack to obtain [R01|R11|R21|R31|00|00|00|00|00|00|00|00|00|00|00|00]
 }
 
+template <typename T>
+inline void rpp_resize_nn_extract_pln1_avx(T *srcRowPtrsForInterp, Rpp32s *loc, __m256i &p)
+{
+    p = _mm256_setr_epi8(*(srcRowPtrsForInterp + loc[0]), *(srcRowPtrsForInterp + loc[1]),
+                         *(srcRowPtrsForInterp + loc[2]), *(srcRowPtrsForInterp + loc[3]),
+                         *(srcRowPtrsForInterp + loc[4]), *(srcRowPtrsForInterp + loc[5]),
+                         *(srcRowPtrsForInterp + loc[6]), *(srcRowPtrsForInterp + loc[7]),
+                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
 inline void rpp_resize_nn_load_f32pkd3_to_f32pln3(Rpp32f *srcRowPtrsForInterp, Rpp32s *loc, __m128 *p)
 {
     p[0] = _mm_loadu_ps(srcRowPtrsForInterp + loc[0]);    // LOC0 load [R01|G01|B01|R02] - Need RGB 01
@@ -3880,6 +3904,42 @@ inline void rpp_resize_nn_load_f32pkd3_to_f32pln3(Rpp32f *srcRowPtrsForInterp, R
     _MM_TRANSPOSE4_PS(p[0], p[1], p[2], pTemp);    // Transpose to obtain RGB in each vector
 }
 
+inline void rpp_resize_nn_load_f32pkd3_to_f32pln3_avx(Rpp32f *srcRowPtrsForInterp, Rpp32s *loc, __m256 *p)
+{
+    __m128 p128[8];
+    p128[0] = _mm_loadu_ps(srcRowPtrsForInterp + loc[0]);
+    p128[1] = _mm_loadu_ps(srcRowPtrsForInterp + loc[1]);
+    p128[2] = _mm_loadu_ps(srcRowPtrsForInterp + loc[2]);
+    p128[3] = _mm_loadu_ps(srcRowPtrsForInterp + loc[3]);
+    _MM_TRANSPOSE4_PS(p128[0], p128[1], p128[2], p128[3]);
+    p128[4] = _mm_loadu_ps(srcRowPtrsForInterp + loc[4]);
+    p128[5] = _mm_loadu_ps(srcRowPtrsForInterp + loc[5]);
+    p128[6] = _mm_loadu_ps(srcRowPtrsForInterp + loc[6]);
+    p128[7] = _mm_loadu_ps(srcRowPtrsForInterp + loc[7]);
+    _MM_TRANSPOSE4_PS(p128[4], p128[5], p128[6], p128[7]);
+    p[0] = _mm256_setr_m128(p128[0], p128[4]);
+    p[1] = _mm256_setr_m128(p128[1], p128[5]);
+    p[2] = _mm256_setr_m128(p128[2], p128[6]);
+}
+
+inline void rpp_resize_nn_load_f16pkd3_to_f32pln3_avx(Rpp16f *srcRowPtrsForInterp, Rpp32s *loc, __m256 *p)
+{
+    p[0] = _mm256_setr_ps((Rpp32f)*(srcRowPtrsForInterp + loc[0]), (Rpp32f)*(srcRowPtrsForInterp + loc[1]),
+                          (Rpp32f)*(srcRowPtrsForInterp + loc[2]), (Rpp32f)*(srcRowPtrsForInterp + 
loc[3]), + (Rpp32f)*(srcRowPtrsForInterp + loc[4]), (Rpp32f)*(srcRowPtrsForInterp + loc[5]), + (Rpp32f)*(srcRowPtrsForInterp + loc[6]), (Rpp32f)*(srcRowPtrsForInterp + loc[7])); + + p[1] = _mm256_setr_ps((Rpp32f)*(srcRowPtrsForInterp + loc[0] + 1), (Rpp32f)*(srcRowPtrsForInterp + loc[1] + 1), + (Rpp32f)*(srcRowPtrsForInterp + loc[2] + 1), (Rpp32f)*(srcRowPtrsForInterp + loc[3] + 1), + (Rpp32f)*(srcRowPtrsForInterp + loc[4] + 1), (Rpp32f)*(srcRowPtrsForInterp + loc[5] + 1), + (Rpp32f)*(srcRowPtrsForInterp + loc[6] + 1), (Rpp32f)*(srcRowPtrsForInterp + loc[7] + 1)); + + p[2] = _mm256_setr_ps((Rpp32f)*(srcRowPtrsForInterp + loc[0] + 2), (Rpp32f)*(srcRowPtrsForInterp + loc[1] + 2), + (Rpp32f)*(srcRowPtrsForInterp + loc[2] + 2), (Rpp32f)*(srcRowPtrsForInterp + loc[3] + 2), + (Rpp32f)*(srcRowPtrsForInterp + loc[4] + 2), (Rpp32f)*(srcRowPtrsForInterp + loc[5] + 2), + (Rpp32f)*(srcRowPtrsForInterp + loc[6] + 2), (Rpp32f)*(srcRowPtrsForInterp + loc[7] + 2)); +} + inline void rpp_resize_nn_load_f32pln1(Rpp32f *srcRowPtrsForInterp, Rpp32s *loc, __m128 &p) { __m128 pTemp[4]; @@ -3892,6 +3952,22 @@ inline void rpp_resize_nn_load_f32pln1(Rpp32f *srcRowPtrsForInterp, Rpp32s *loc, p = _mm_unpacklo_ps(pTemp[0], pTemp[1]); // Unpack to obtain [R01|R11|R21|R31] } +inline void rpp_resize_nn_load_f32pln1_avx(Rpp32f *srcRowPtrsForInterp, Rpp32s *loc, __m256 &p) +{ + p = _mm256_setr_ps(*(srcRowPtrsForInterp + loc[0]), *(srcRowPtrsForInterp + loc[1]), + *(srcRowPtrsForInterp + loc[2]), *(srcRowPtrsForInterp + loc[3]), + *(srcRowPtrsForInterp + loc[4]), *(srcRowPtrsForInterp + loc[5]), + *(srcRowPtrsForInterp + loc[6]), *(srcRowPtrsForInterp + loc[7])); +} + +inline void rpp_resize_nn_load_f16pln1_avx(Rpp16f *srcRowPtrsForInterp, Rpp32s *loc, __m256 &p) +{ + p = _mm256_setr_ps((Rpp32f)*(srcRowPtrsForInterp + loc[0]), (Rpp32f)*(srcRowPtrsForInterp + loc[1]), + (Rpp32f)*(srcRowPtrsForInterp + loc[2]), (Rpp32f)*(srcRowPtrsForInterp + loc[3]), + (Rpp32f)*(srcRowPtrsForInterp + loc[4]), 
(Rpp32f)*(srcRowPtrsForInterp + loc[5]),
+                       (Rpp32f)*(srcRowPtrsForInterp + loc[6]), (Rpp32f)*(srcRowPtrsForInterp + loc[7]));
+}
+
 inline void rpp_resize_nn_load_i8pkd3(Rpp8s *srcRowPtrsForInterp, Rpp32s *loc, __m128i &p)
 {
     __m128i px[4];
diff --git a/src/include/hip/rpp_hip_common.hpp b/src/include/hip/rpp_hip_common.hpp
index 16e3a2765..721800c80 100644
--- a/src/include/hip/rpp_hip_common.hpp
+++ b/src/include/hip/rpp_hip_common.hpp
@@ -1944,7 +1944,8 @@ __device__ __forceinline__ float rpp_hip_rng_xorwow_f32(T *xorwowState)
     return outFloat - 1;    // return 0 <= outFloat < 1
 }
 
-__device__ __forceinline__ void rpp_hip_rng_8_xorwow_f32(RpptXorwowState *xorwowState, d_float8 *randomNumbersPtr_f8)
+template <typename T>
+__device__ __forceinline__ void rpp_hip_rng_8_xorwow_f32(T *xorwowState, d_float8 *randomNumbersPtr_f8)
 {
     randomNumbersPtr_f8->f1[0] = rpp_hip_rng_xorwow_f32(xorwowState);
     randomNumbersPtr_f8->f1[1] = rpp_hip_rng_xorwow_f32(xorwowState);
diff --git a/src/modules/cpu/host_tensor_effects_augmentations.hpp b/src/modules/cpu/host_tensor_effects_augmentations.hpp
index 56d5ea817..ce7450aab 100644
--- a/src/modules/cpu/host_tensor_effects_augmentations.hpp
+++ b/src/modules/cpu/host_tensor_effects_augmentations.hpp
@@ -31,6 +31,7 @@ SOFTWARE.
#include "kernel/noise_shot.hpp" #include "kernel/noise_gaussian.hpp" #include "kernel/non_linear_blend.hpp" +#include "kernel/jitter.hpp" #include "kernel/glitch.hpp" #include "kernel/water.hpp" #include "kernel/ricap.hpp" diff --git a/src/modules/cpu/kernel/jitter.hpp b/src/modules/cpu/kernel/jitter.hpp new file mode 100644 index 000000000..ec717150a --- /dev/null +++ b/src/modules/cpu/kernel/jitter.hpp @@ -0,0 +1,929 @@ +#include "rppdefs.h" +#include "rpp_cpu_simd.hpp" +#include "rpp_cpu_common.hpp" + +RppStatus jitter_u8_u8_host_tensor(Rpp8u *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8u *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32u *kernelSizeTensor, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32u kernelSize = kernelSizeTensor[batchCount]; + Rpp32u bound = (kernelSize - 1) / 2; + Rpp32u heightLimit = roi.xywhROI.roiHeight - bound; + Rpp32u offset = batchCount * srcDescPtr->strides.nStride; + + Rpp8u *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp8u *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + Rpp32u alignedLength = roi.xywhROI.roiWidth & ~7; // Align dst width to process 4 dst pixels per iteration + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 
8; + RpptXorwowStateBoxMuller xorwowState; + Rpp32s srcLocArray[8] = {0}; + + __m256i pxXorwowStateX[5], pxXorwowStateCounter; + rpp_host_rng_xorwow_state_offsetted_avx(xorwowInitialStatePtr, xorwowState, offset, pxXorwowStateX, &pxXorwowStateCounter); + __m256 pKernelSize = _mm256_set1_ps(kernelSize); + __m256 pChannel = _mm256_set1_ps(layoutParams.bufferMultiplier); + __m256 pHStride = _mm256_set1_ps(srcDescPtr->strides.hStride); + __m256 pHeightLimit = _mm256_set1_ps(heightLimit); + __m256 pWidthLimit = _mm256_set1_ps(roi.xywhROI.roiWidth - 1); + __m256 pBound = _mm256_set1_ps(bound); + + // Jitter with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8u *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8u *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i pxRow; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_resize_nn_extract_pkd3_avx(srcPtrChannel, srcLocArray, pxRow); + rpp_simd_store(rpp_store24_u8pkd3_to_u8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, pxRow); + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + 
Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, srcDescPtr->c, loc); + *dstPtrTempR++ = *(srcPtrChannel + loc); + *dstPtrTempG++ = *(srcPtrChannel + 1 + loc); + *dstPtrTempB++ = *(srcPtrChannel + 2 + loc); + } + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Jitter with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8u *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8u *dstPtrTemp; + dstPtrTemp = dstPtrRow; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i pxRow[3]; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_resize_nn_extract_pln1_avx(srcPtrRowR, srcLocArray, pxRow[0]); + rpp_resize_nn_extract_pln1_avx(srcPtrRowG, srcLocArray, pxRow[1]); + rpp_resize_nn_extract_pln1_avx(srcPtrRowB, srcLocArray, pxRow[2]); + rpp_simd_store(rpp_store24_u8pln3_to_u8pkd3_avx, dstPtrTemp, pxRow); + dstPtrTemp += vectorIncrement; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, 
srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + *dstPtrTemp++ = *(srcPtrRowR + loc); + *dstPtrTemp++ = *(srcPtrRowG + loc); + *dstPtrTemp++ = *(srcPtrRowB + loc); + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Jitter without fused output-layout toggle (NHWC -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8u *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrChannel; + dstPtrRow = dstPtrChannel; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8u *dstPtrTemp; + dstPtrTemp = dstPtrRow; + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i pxRow; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_resize_nn_extract_pkd3_avx(srcPtrRow, srcLocArray, pxRow); + rpp_simd_store(rpp_store24_u8_to_u8_avx, dstPtrTemp, pxRow); + dstPtrTemp += vectorIncrement; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + *dstPtrTemp++ = *(srcPtrRow + loc); + *dstPtrTemp++ = *(srcPtrRow + 1 + loc); + *dstPtrTemp++ = *(srcPtrRow + 2 + loc); + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Jitter with fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8u *dstPtrRow; + dstPtrRow = dstPtrChannel; + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8u 
*dstPtrTemp; + dstPtrTemp = dstPtrRow; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp8u *dstPtrTempChn, *srcPtrTempChn; + srcPtrTempChn = srcPtrChannel; + dstPtrTempChn = dstPtrTemp; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + for(int c = 0; c < srcDescPtr->c; c++) + { + __m256i pxRow; + rpp_resize_nn_extract_pln1_avx(srcPtrTempChn, srcLocArray, pxRow); + rpp_storeu_si64((__m128i *)(dstPtrTempChn), _mm256_castsi256_si128(pxRow)); + srcPtrTempChn += srcDescPtr->strides.cStride; + dstPtrTempChn += dstDescPtr->strides.cStride; + } + dstPtrTemp += vectorIncrementPerChannel; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (;vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp8u *dstPtrTempChn = dstPtrTemp; + Rpp8u *srcPtrTempChn = srcPtrChannel; + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + for(int c = 0; c < srcDescPtr->c; c++) + { + *dstPtrTempChn = *(srcPtrTempChn + loc); + srcPtrTempChn += srcDescPtr->strides.cStride; + dstPtrTempChn += dstDescPtr->strides.cStride; + } + dstPtrTemp++; + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + } + + return RPP_SUCCESS; +} + +RppStatus jitter_f32_f32_host_tensor(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32u *kernelSizeTensor, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + 
+ omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32u kernelSize = kernelSizeTensor[batchCount]; + Rpp32u bound = (kernelSize - 1) / 2; + Rpp32u heightLimit = roi.xywhROI.roiHeight - bound; + Rpp32u offset = batchCount * srcDescPtr->strides.nStride; + + Rpp32f *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32f *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + Rpp32u alignedLength = roi.xywhROI.roiWidth & ~7; // Align dst width to process 4 dst pixels per iteration + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + RpptXorwowStateBoxMuller xorwowState; + Rpp32s srcLocArray[8] = {0}; + + __m256i pxXorwowStateX[5], pxXorwowStateCounter; + rpp_host_rng_xorwow_state_offsetted_avx(xorwowInitialStatePtr, xorwowState, offset, pxXorwowStateX, &pxXorwowStateCounter); + __m256 pKernelSize = _mm256_set1_ps(kernelSize); + __m256 pChannel = _mm256_set1_ps(layoutParams.bufferMultiplier); + __m256 pHStride = _mm256_set1_ps(srcDescPtr->strides.hStride); + __m256 pHeightLimit = _mm256_set1_ps(heightLimit); + __m256 pWidthLimit = _mm256_set1_ps(roi.xywhROI.roiWidth-1); + __m256 pBound = _mm256_set1_ps(bound); + + + // Jitter with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32f *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + 
dstDescPtr->strides.cStride; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp32f *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 pxRow[3]; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_simd_load(rpp_resize_nn_load_f32pkd3_to_f32pln3_avx, srcPtrChannel, srcLocArray, pxRow); + rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, pxRow); + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + *dstPtrTempR++ = *(srcPtrChannel + loc); + *dstPtrTempG++ = *(srcPtrChannel + 1 + loc); + *dstPtrTempB++ = *(srcPtrChannel + 2 + loc); + } + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Jitter with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32f *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + 
+ for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp32f *dstPtrTemp; + dstPtrTemp = dstPtrRow; + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256 pxRow[4]; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_simd_load(rpp_resize_nn_load_f32pln1_avx, srcPtrRowR, srcLocArray, pxRow[0]); + rpp_simd_load(rpp_resize_nn_load_f32pln1_avx, srcPtrRowG, srcLocArray, pxRow[1]); + rpp_simd_load(rpp_resize_nn_load_f32pln1_avx, srcPtrRowB, srcLocArray, pxRow[2]); + rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, pxRow); + dstPtrTemp += vectorIncrement; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + *dstPtrTemp++ = *(srcPtrRowR + loc); + *dstPtrTemp++ = *(srcPtrRowG + loc); + *dstPtrTemp++ = *(srcPtrRowB + loc); + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Jitter without fused output-layout toggle (NHWC -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrChannel; + dstPtrRow = dstPtrChannel; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp32f *dstPtrTemp; + dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + __m256 pRow; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, 
heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + rpp_simd_load(rpp_load8_f32_to_f32_avx, (srcPtrChannel + loc), &pRow); + rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp, &pRow); + dstPtrTemp += 3; + } +#endif + dstPtrRow += dstDescPtr->strides.hStride; + } + } + // Jitter with fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32f *dstPtrRow; + dstPtrRow = dstPtrChannel; + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp32f *dstPtrTemp; + dstPtrTemp = dstPtrRow; + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f *srcPtrTempChn, *dstPtrTempChn; + srcPtrTempChn = srcPtrChannel; + dstPtrTempChn = dstPtrTemp; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + + for (int c = 0; c < dstDescPtr->c; c++) + { + __m256 pxRow; + rpp_simd_load(rpp_resize_nn_load_f32pln1_avx, srcPtrTempChn, srcLocArray, pxRow); + rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTempChn, &pxRow); + srcPtrTempChn += srcDescPtr->strides.cStride; + dstPtrTempChn += dstDescPtr->strides.cStride; + } + dstPtrTemp += vectorIncrementPerChannel; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (;vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32f *dstPtrTempChn = dstPtrTemp; + Rpp32f *srcPtrTempChn = srcPtrChannel; + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + for(int c = 0; c < srcDescPtr->c; c++) + { + *dstPtrTempChn = (Rpp32f)*(srcPtrTempChn + loc); + 
srcPtrTempChn += srcDescPtr->strides.cStride; + dstPtrTempChn += dstDescPtr->strides.cStride; + } + dstPtrTemp++; + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + } + + return RPP_SUCCESS; +} + +RppStatus jitter_f16_f16_host_tensor(Rpp16f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp16f *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32u *kernelSizeTensor, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32u kernelSize = kernelSizeTensor[batchCount]; + Rpp32u bound = (kernelSize - 1) / 2; + Rpp32u heightLimit = roi.xywhROI.roiHeight - bound; + Rpp32u offset = batchCount * srcDescPtr->strides.nStride; + + Rpp16f *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp16f *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + Rpp32u alignedLength = roi.xywhROI.roiWidth & ~7; // Align dst width to process 4 dst pixels per iteration + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + RpptXorwowStateBoxMuller xorwowState; + Rpp32s srcLocArray[8] = {0}; + + __m256i pxXorwowStateX[5], pxXorwowStateCounter; + rpp_host_rng_xorwow_state_offsetted_avx(xorwowInitialStatePtr, xorwowState, offset, pxXorwowStateX, &pxXorwowStateCounter); + __m256 pKernelSize = 
_mm256_set1_ps(kernelSize); + __m256 pChannel = _mm256_set1_ps(layoutParams.bufferMultiplier); + __m256 pHStride = _mm256_set1_ps(srcDescPtr->strides.hStride); + __m256 pHeightLimit = _mm256_set1_ps(heightLimit); + __m256 pWidthLimit = _mm256_set1_ps(roi.xywhROI.roiWidth-1); + __m256 pBound = _mm256_set1_ps(bound); + + + // Jitter with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp16f *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp16f *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f dstPtrTempR_ps[8], dstPtrTempG_ps[8], dstPtrTempB_ps[8]; + __m256 pxRow[3]; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_simd_load(rpp_resize_nn_load_f16pkd3_to_f32pln3_avx, srcPtrChannel, srcLocArray, pxRow); + rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR_ps, dstPtrTempG_ps, dstPtrTempB_ps, pxRow); + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + dstPtrTempR[cnt] = (Rpp16f) dstPtrTempR_ps[cnt]; + dstPtrTempG[cnt] = (Rpp16f) dstPtrTempG_ps[cnt]; + dstPtrTempB[cnt] = (Rpp16f) dstPtrTempB_ps[cnt]; + } + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < 
roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + *dstPtrTempR++ = *(srcPtrChannel + loc); + *dstPtrTempG++ = *(srcPtrChannel + 1 + loc); + *dstPtrTempB++ = *(srcPtrChannel + 2 + loc); + } + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Jitter with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp16f *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp16f *dstPtrTemp; + dstPtrTemp = dstPtrRow; + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp32f dstPtrTemp_ps[25]; + __m256 pxRow[4]; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_simd_load(rpp_resize_nn_load_f16pln1_avx, srcPtrRowR, srcLocArray, pxRow[0]); + rpp_simd_load(rpp_resize_nn_load_f16pln1_avx, srcPtrRowG, srcLocArray, pxRow[1]); + rpp_simd_load(rpp_resize_nn_load_f16pln1_avx, srcPtrRowB, srcLocArray, pxRow[2]); + rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp_ps, pxRow); + for(int cnt = 0; cnt < vectorIncrement; cnt++) + dstPtrTemp[cnt] = (Rpp16f) dstPtrTemp_ps[cnt]; + dstPtrTemp += vectorIncrement; + pCol = _mm256_add_ps(avx_p8, pCol); 
+ } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + *dstPtrTemp++ = *(srcPtrRowR + loc); + *dstPtrTemp++ = *(srcPtrRowG + loc); + *dstPtrTemp++ = *(srcPtrRowB + loc); + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Jitter without fused output-layout toggle (NHWC -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp16f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrChannel; + dstPtrRow = dstPtrChannel; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp16f *dstPtrTemp; + dstPtrTemp = dstPtrRow; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32f srcPtrTemp_ps[8], dstPtrTemp_ps[8]; + Rpp32s loc; + __m256 pRow; + + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + srcPtrTemp_ps[cnt] = (Rpp16f)srcPtrChannel[loc + cnt]; + } + + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp_ps, &pRow); + rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp_ps, &pRow); + + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + dstPtrTemp[cnt] = (Rpp16f) dstPtrTemp_ps[cnt]; + } + dstPtrTemp += 3; + } +#endif + dstPtrRow += dstDescPtr->strides.hStride; + } + } + // Jitter with fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp16f *dstPtrRow; + dstPtrRow = dstPtrChannel; + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp16f 
*dstPtrTemp; + dstPtrTemp = dstPtrRow; + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp16f *srcPtrTempChn, *dstPtrTempChn; + srcPtrTempChn = srcPtrChannel; + dstPtrTempChn = dstPtrTemp; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + + for (int c = 0; c < dstDescPtr->c; c++) + { + Rpp32f dstPtrTemp_ps[8]; + __m256 pxRow; + rpp_simd_load(rpp_resize_nn_load_f16pln1_avx, srcPtrTempChn, srcLocArray, pxRow); + rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp_ps, &pxRow); + for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++) + { + dstPtrTempChn[cnt] = (Rpp16f) dstPtrTemp_ps[cnt]; + } + srcPtrTempChn += srcDescPtr->strides.cStride; + dstPtrTempChn += dstDescPtr->strides.cStride; + } + dstPtrTemp += vectorIncrementPerChannel; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (;vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp16f *dstPtrTempChn = dstPtrTemp; + Rpp16f *srcPtrTempChn = srcPtrChannel; + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + for(int c = 0; c < srcDescPtr->c; c++) + { + *dstPtrTempChn = (Rpp16f)*(srcPtrTempChn + loc); + srcPtrTempChn += srcDescPtr->strides.cStride; + dstPtrTempChn += dstDescPtr->strides.cStride; + } + dstPtrTemp++; + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + } + + return RPP_SUCCESS; +} + +RppStatus jitter_i8_i8_host_tensor(Rpp8s *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8s *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32u *kernelSizeTensor, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams 
layoutParams, + rpp::Handle& handle) +{ + RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h}; + Rpp32u numThreads = handle.GetNumThreads(); + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType); + + Rpp32u kernelSize = kernelSizeTensor[batchCount]; + Rpp32u bound = (kernelSize - 1) / 2; + Rpp32u heightLimit = roi.xywhROI.roiHeight - bound; + Rpp32u offset = batchCount * srcDescPtr->strides.nStride; + + Rpp8s *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp8s *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + Rpp32u alignedLength = roi.xywhROI.roiWidth & ~7; // Align dst width to process 4 dst pixels per iteration + Rpp32u vectorIncrement = 24; + Rpp32u vectorIncrementPerChannel = 8; + RpptXorwowStateBoxMuller xorwowState; + Rpp32s srcLocArray[8] = {0}; + + __m256i pxXorwowStateX[5], pxXorwowStateCounter; + rpp_host_rng_xorwow_state_offsetted_avx(xorwowInitialStatePtr, xorwowState, offset, pxXorwowStateX, &pxXorwowStateCounter); + __m256 pKernelSize = _mm256_set1_ps(kernelSize); + __m256 pChannel = _mm256_set1_ps(layoutParams.bufferMultiplier); + __m256 pHStride = _mm256_set1_ps(srcDescPtr->strides.hStride); + __m256 pHeightLimit = _mm256_set1_ps(heightLimit); + __m256 pWidthLimit = _mm256_set1_ps(roi.xywhROI.roiWidth-1); + __m256 pBound = _mm256_set1_ps(bound); + + // Jitter with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + 
Rpp8s *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8s *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i pxRow; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_resize_nn_extract_pkd3_avx(srcPtrChannel, srcLocArray, pxRow); + rpp_simd_store(rpp_store24_i8pkd3_to_i8pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, pxRow); + dstPtrTempR += vectorIncrementPerChannel; + dstPtrTempG += vectorIncrementPerChannel; + dstPtrTempB += vectorIncrementPerChannel; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + *dstPtrTempR++ = *(srcPtrChannel + loc); + *dstPtrTempG++ = *(srcPtrChannel + 1 + loc); + *dstPtrTempB++ = *(srcPtrChannel + 2 + loc); + } + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Jitter with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8s *dstPtrRow; + dstPtrRow = dstPtrChannel; + Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB; + srcPtrRowR = 
srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8s *dstPtrTemp; + dstPtrTemp = dstPtrRow; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i pxRow[3]; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_resize_nn_extract_pln1_avx(srcPtrRowR, srcLocArray, pxRow[0]); + rpp_resize_nn_extract_pln1_avx(srcPtrRowG, srcLocArray, pxRow[1]); + rpp_resize_nn_extract_pln1_avx(srcPtrRowB, srcLocArray, pxRow[2]); + rpp_simd_store(rpp_store24_i8pln3_to_i8pkd3_avx, dstPtrTemp, pxRow); + dstPtrTemp += vectorIncrement; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + *dstPtrTemp++ = *(srcPtrRowR + loc); + *dstPtrTemp++ = *(srcPtrRowG + loc); + *dstPtrTemp++ = *(srcPtrRowB + loc); + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Jitter without fused output-layout toggle (NHWC -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp8s *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrChannel; + dstPtrRow = dstPtrChannel; + + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8s *dstPtrTemp; + dstPtrTemp = dstPtrRow; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + int vectorLoopCount = 0; +#if __AVX2__ + for (; 
vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + __m256i pxRow; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + rpp_resize_nn_extract_pkd3_avx(srcPtrRow, srcLocArray, pxRow); + rpp_simd_store(rpp_store24_i8_to_i8_avx, dstPtrTemp, pxRow); + dstPtrTemp += vectorIncrement; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (; vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + *dstPtrTemp++ = (Rpp8s)*(srcPtrRow + loc); + *dstPtrTemp++ = (Rpp8s)*(srcPtrRow + 1 + loc); + *dstPtrTemp++ = (Rpp8s)*(srcPtrRow + 2 + loc); + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + // Jitter with fused output-layout toggle (NCHW -> NCHW) + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp8s *dstPtrRow; + dstPtrRow = dstPtrChannel; + for(int dstLocRow = 0; dstLocRow < roi.xywhROI.roiHeight; dstLocRow++) + { + Rpp8s *dstPtrTemp; + dstPtrTemp = dstPtrRow; + + __m256 pRow = _mm256_set1_ps(dstLocRow); + __m256 pCol = avx_pDstLocInit; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel) + { + Rpp8s *dstPtrTempChn, *srcPtrTempChn; + srcPtrTempChn = srcPtrChannel; + dstPtrTempChn = dstPtrTemp; + compute_jitter_src_loc_avx(pxXorwowStateX, &pxXorwowStateCounter, pRow, pCol, pKernelSize, pBound, pHeightLimit, pWidthLimit, pHStride, pChannel, srcLocArray); + for(int c = 0; c < srcDescPtr->c; c++) + { + __m256i pxRow; + rpp_resize_nn_extract_pln1_avx(srcPtrTempChn, srcLocArray, pxRow); + rpp_storeu_si64((__m128i *)(dstPtrTempChn), _mm256_castsi256_si128(pxRow)); + srcPtrTempChn += 
srcDescPtr->strides.cStride; + dstPtrTempChn += dstDescPtr->strides.cStride; + } + dstPtrTemp += vectorIncrementPerChannel; + pCol = _mm256_add_ps(avx_p8, pCol); + } +#endif + for (;vectorLoopCount < roi.xywhROI.roiWidth; vectorLoopCount++) + { + Rpp8s *dstPtrTempChn = dstPtrTemp; + Rpp8s *srcPtrTempChn = srcPtrChannel; + Rpp32s loc; + compute_jitter_src_loc(&xorwowState, dstLocRow, vectorLoopCount, kernelSize, heightLimit, roi.xywhROI.roiWidth, srcDescPtr->strides.hStride, bound, layoutParams.bufferMultiplier, loc); + for(int c = 0; c < srcDescPtr->c; c++) + { + *dstPtrTempChn = (Rpp8s)*(srcPtrTempChn + loc); + srcPtrTempChn += srcDescPtr->strides.cStride; + dstPtrTempChn += dstDescPtr->strides.cStride; + } + dstPtrTemp++; + } + dstPtrRow += dstDescPtr->strides.hStride; + } + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/hip/handlehip.cpp b/src/modules/hip/handlehip.cpp index 42e72db98..08eb93674 100644 --- a/src/modules/hip/handlehip.cpp +++ b/src/modules/hip/handlehip.cpp @@ -239,7 +239,12 @@ struct HandleImpl } hipMalloc(&(this->initHandle->mem.mgpu.rgbArr.rgbmem), sizeof(RpptRGB) * this->nBatchSize); - hipMalloc(&(this->initHandle->mem.mgpu.scratchBufferHip.floatmem), sizeof(Rpp32f) * 8294400); // 3840 x 2160 + + /* (600000 + 293 + 128) * 128 - Maximum scratch memory required for Non Silent Region Detection HIP kernel used in RNNT training (uses a batchsize 128) + - 600000 is the maximum size that will be required for MMS buffer based on Librispeech dataset + - 293 is the size required for storing reduction outputs for 600000 size sample + - 128 is the size required for storing cutOffDB values for batch size 128 */ + hipMalloc(&(this->initHandle->mem.mgpu.scratchBufferHip.floatmem), sizeof(Rpp32f) * 76853888); } }; diff --git a/src/modules/hip/hip_tensor_audio_augmentations.hpp b/src/modules/hip/hip_tensor_audio_augmentations.hpp index 53b80c8ee..a5b83715b 100644 --- a/src/modules/hip/hip_tensor_audio_augmentations.hpp +++ 
b/src/modules/hip/hip_tensor_audio_augmentations.hpp @@ -25,6 +25,7 @@ SOFTWARE. #ifndef HIP_TENSOR_AUDIO_AUGMENTATIONS_HPP #define HIP_TENSOR_AUDIO_AUGMENTATIONS_HPP +#include "kernel/non_silent_region_detection.hpp" #include "kernel/to_decibels.hpp" #endif // HIP_TENSOR_AUDIO_AUGMENTATIONS_HPP diff --git a/src/modules/hip/hip_tensor_effects_augmentations.hpp b/src/modules/hip/hip_tensor_effects_augmentations.hpp index f1da2cdb9..12e80a1f4 100644 --- a/src/modules/hip/hip_tensor_effects_augmentations.hpp +++ b/src/modules/hip/hip_tensor_effects_augmentations.hpp @@ -31,6 +31,7 @@ SOFTWARE. #include "kernel/noise_shot.hpp" #include "kernel/noise_gaussian.hpp" #include "kernel/non_linear_blend.hpp" +#include "kernel/jitter.hpp" #include "kernel/glitch.hpp" #include "kernel/water.hpp" #include "kernel/ricap.hpp" diff --git a/src/modules/hip/kernel/jitter.hpp b/src/modules/hip/kernel/jitter.hpp new file mode 100644 index 000000000..bbc407cda --- /dev/null +++ b/src/modules/hip/kernel/jitter.hpp @@ -0,0 +1,314 @@ +#include +#include "rpp_hip_common.hpp" +#include "rng_seed_stream.hpp" + +__device__ __forceinline__ void jitter_roi_and_srclocs_hip_compute(int4 *srcRoiPtr_i4, RpptXorwowStateBoxMuller *xorwowState, uint kernelSize, uint bound, int id_x, int id_y, d_float16 *locSrc_f16) +{ + d_float8 widthIncrement_f8, heightIncrement_f8; + rpp_hip_rng_8_xorwow_f32(xorwowState, &widthIncrement_f8); + rpp_hip_math_multiply8_const(&widthIncrement_f8, &widthIncrement_f8, static_cast(kernelSize)); + rpp_hip_rng_8_xorwow_f32(xorwowState, &heightIncrement_f8); + rpp_hip_math_multiply8_const(&heightIncrement_f8, &heightIncrement_f8, static_cast(kernelSize)); + + d_float8 increment_f8, locDst_f8x, locDst_f8y; + increment_f8.f4[0] = make_float4(0.0f, 1.0f, 2.0f, 3.0f); // 8 element vectorized kernel needs 8 increments - creating uint4 for increments 0, 1, 2, 3 here, and adding (float4)4 later to get 4, 5, 6, 7 incremented srcLocs + increment_f8.f4[1] = make_float4(4.0f, 5.0f, 6.0f, 
7.0f); + locDst_f8x.f4[0] = static_cast(id_x) + increment_f8.f4[0]; + locDst_f8x.f4[1] = static_cast(id_x) + increment_f8.f4[1]; + locDst_f8y.f4[0] = locDst_f8y.f4[1] = (float4)id_y; + + locSrc_f16->f8[0].f4[0] = static_cast(srcRoiPtr_i4->x) + locDst_f8x.f4[0] + widthIncrement_f8.f4[0] - static_cast(bound); + locSrc_f16->f8[0].f4[1] = static_cast(srcRoiPtr_i4->x) + locDst_f8x.f4[1] + widthIncrement_f8.f4[1] - static_cast(bound); + locSrc_f16->f8[1].f4[0] = static_cast(srcRoiPtr_i4->y) + locDst_f8y.f4[0] + heightIncrement_f8.f4[0] - static_cast(bound); + locSrc_f16->f8[1].f4[1] = static_cast(srcRoiPtr_i4->y) + locDst_f8y.f4[1] + heightIncrement_f8.f4[1] - static_cast(bound); + + // Apply boundary checks and adjustments + for(int i = 0; i < 8; ++i) + { + locSrc_f16->f1[i] = fmaxf(fminf(floorf(locSrc_f16->f1[i]), static_cast(srcRoiPtr_i4->z - 1)), 0.0f); + locSrc_f16->f1[i + 8] = fmaxf(fminf(floorf(locSrc_f16->f1[i + 8]), static_cast(srcRoiPtr_i4->w - bound)), 0.0f); + } +} + +template +__global__ void jitter_pkd_tensor(T *srcPtr, + uint2 srcStridesNH, + T *dstPtr, + uint2 dstStridesNH, + uint *kernelsize, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + uint *xorwowSeedStream, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x); + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + (id_x * 3); + uint seedStreamIdx = (id_y * dstStridesNH.y) + (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; + uint kernelSize = kernelsize[id_z]; + uint bound = (kernelSize - 1) / 2; + + RpptXorwowStateBoxMuller xorwowState; + uint xorwowSeed = xorwowSeedStream[seedStreamIdx % SEED_STREAM_MAX_SIZE]; + xorwowState.x[0] = 
xorwowInitialStatePtr->x[0] + xorwowSeed; + xorwowState.x[1] = xorwowInitialStatePtr->x[1] + xorwowSeed; + xorwowState.x[2] = xorwowInitialStatePtr->x[2] + xorwowSeed; + xorwowState.x[3] = xorwowInitialStatePtr->x[3] + xorwowSeed; + xorwowState.x[4] = xorwowInitialStatePtr->x[4] + xorwowSeed; + xorwowState.counter = xorwowInitialStatePtr->counter + xorwowSeed; + + int4 srcRoi_i4 = *(int4 *)&roiTensorPtrSrc[id_z]; + d_float16 locSrc_f16; + jitter_roi_and_srclocs_hip_compute(&srcRoi_i4, &xorwowState, kernelSize, bound, id_x, id_y, &locSrc_f16); + + d_float24 dst_f24; + rpp_hip_interpolate24_nearest_neighbor_pkd3(srcPtr + srcIdx, srcStridesNH.y, &locSrc_f16, &srcRoi_i4, &dst_f24); + rpp_hip_pack_float24_pkd3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24); +} + +template +__global__ void jitter_pln_tensor(T *srcPtr, + uint3 srcStridesNCH, + T *dstPtr, + uint3 dstStridesNCH, + int channelsDst, + uint *kernelsize, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + uint *xorwowSeedStream, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + uint seedStreamIdx = (id_y * dstStridesNCH.z) + (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; + uint kernelSize = kernelsize[id_z]; + uint bound = (kernelSize - 1) / 2; + + RpptXorwowStateBoxMuller xorwowState; + uint xorwowSeed = xorwowSeedStream[seedStreamIdx % SEED_STREAM_MAX_SIZE]; + xorwowState.x[0] = xorwowInitialStatePtr->x[0] + xorwowSeed; + xorwowState.x[1] = xorwowInitialStatePtr->x[1] + xorwowSeed; + xorwowState.x[2] = xorwowInitialStatePtr->x[2] + xorwowSeed; + xorwowState.x[3] = xorwowInitialStatePtr->x[3] + 
xorwowSeed; + xorwowState.x[4] = xorwowInitialStatePtr->x[4] + xorwowSeed; + xorwowState.counter = xorwowInitialStatePtr->counter + xorwowSeed; + + int4 srcRoi_i4 = *(int4 *)&roiTensorPtrSrc[id_z]; + d_float16 locSrc_f16; + jitter_roi_and_srclocs_hip_compute(&srcRoi_i4, &xorwowState, kernelSize, bound, id_x, id_y, &locSrc_f16); + + d_float8 dst_f8; + rpp_hip_interpolate8_nearest_neighbor_pln1(srcPtr + srcIdx, srcStridesNCH.z, &locSrc_f16, &srcRoi_i4, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + if (channelsDst == 3) + { + srcIdx += srcStridesNCH.y; + dstIdx += dstStridesNCH.y; + + rpp_hip_interpolate8_nearest_neighbor_pln1(srcPtr + srcIdx, srcStridesNCH.z, &locSrc_f16, &srcRoi_i4, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + + srcIdx += srcStridesNCH.y; + dstIdx += dstStridesNCH.y; + + rpp_hip_interpolate8_nearest_neighbor_pln1(srcPtr + srcIdx, srcStridesNCH.z, &locSrc_f16, &srcRoi_i4, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); + } +} + +template +__global__ void jitter_pkd3_pln3_tensor(T *srcPtr, + uint2 srcStridesNH, + T *dstPtr, + uint3 dstStridesNCH, + uint *kernelsize, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + uint *xorwowSeedStream, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNH.x); + uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x; + uint seedStreamIdx = (id_y * dstStridesNCH.z) + (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; + uint kernelSize = kernelsize[id_z]; + uint bound = (kernelSize - 1) / 2; + + RpptXorwowStateBoxMuller xorwowState; + uint xorwowSeed = xorwowSeedStream[seedStreamIdx % 
SEED_STREAM_MAX_SIZE]; + xorwowState.x[0] = xorwowInitialStatePtr->x[0] + xorwowSeed; + xorwowState.x[1] = xorwowInitialStatePtr->x[1] + xorwowSeed; + xorwowState.x[2] = xorwowInitialStatePtr->x[2] + xorwowSeed; + xorwowState.x[3] = xorwowInitialStatePtr->x[3] + xorwowSeed; + xorwowState.x[4] = xorwowInitialStatePtr->x[4] + xorwowSeed; + xorwowState.counter = xorwowInitialStatePtr->counter + xorwowSeed; + + int4 srcRoi_i4 = *(int4 *)&roiTensorPtrSrc[id_z]; + d_float16 locSrc_f16; + jitter_roi_and_srclocs_hip_compute(&srcRoi_i4, &xorwowState, kernelSize, bound, id_x, id_y, &locSrc_f16); + + d_float24 dst_f24; + rpp_hip_interpolate24_nearest_neighbor_pkd3(srcPtr + srcIdx, srcStridesNH.y, &locSrc_f16, &srcRoi_i4, &dst_f24); + rpp_hip_pack_float24_pkd3_and_store24_pln3(dstPtr + dstIdx, dstStridesNCH.y, &dst_f24); +} + +template +__global__ void jitter_pln3_pkd3_tensor(T *srcPtr, + uint3 srcStridesNCH, + T *dstPtr, + uint2 dstStridesNH, + uint *kernelsize, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + uint *xorwowSeedStream, + RpptROIPtr roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth)) + { + return; + } + + uint srcIdx = (id_z * srcStridesNCH.x); + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + (id_x * 3); + uint seedStreamIdx = (id_y * dstStridesNH.y) + (hipBlockIdx_x * hipBlockDim_x) + hipThreadIdx_x; + uint kernelSize = kernelsize[id_z]; + uint bound = (kernelSize - 1) / 2; + + RpptXorwowStateBoxMuller xorwowState; + uint xorwowSeed = xorwowSeedStream[seedStreamIdx % SEED_STREAM_MAX_SIZE]; + xorwowState.x[0] = xorwowInitialStatePtr->x[0] + xorwowSeed; + xorwowState.x[1] = xorwowInitialStatePtr->x[1] + xorwowSeed; + xorwowState.x[2] = xorwowInitialStatePtr->x[2] + xorwowSeed; 
+ xorwowState.x[3] = xorwowInitialStatePtr->x[3] + xorwowSeed; + xorwowState.x[4] = xorwowInitialStatePtr->x[4] + xorwowSeed; + xorwowState.counter = xorwowInitialStatePtr->counter + xorwowSeed; + + int4 srcRoi_i4 = *(int4 *)&roiTensorPtrSrc[id_z]; + d_float16 locSrc_f16; + jitter_roi_and_srclocs_hip_compute(&srcRoi_i4, &xorwowState, kernelSize, bound, id_x, id_y, &locSrc_f16); + + d_float24 dst_f24; + rpp_hip_interpolate24_nearest_neighbor_pln3(srcPtr + srcIdx, &srcStridesNCH, &locSrc_f16, &srcRoi_i4, &dst_f24); + rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24); +} + +template +RppStatus hip_exec_jitter_tensor(T *srcPtr, + RpptDescPtr srcDescPtr, + T *dstPtr, + RpptDescPtr dstDescPtr, + uint *kernelSizeTensor, + RpptXorwowStateBoxMuller *xorwowInitialStatePtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle& handle) +{ + if (roiType == RpptRoiType::LTRB) + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle); + + int globalThreads_x = (dstDescPtr->strides.hStride + 7) >> 3; + int globalThreads_y = dstDescPtr->h; + int globalThreads_z = dstDescPtr->n; + + Rpp32u *xorwowSeedStream; + xorwowSeedStream = (Rpp32u *)&xorwowInitialStatePtr[1]; + CHECK_RETURN_STATUS(hipMemcpyAsync(xorwowSeedStream, rngSeedStream4050, SEED_STREAM_MAX_SIZE * sizeof(Rpp32u), hipMemcpyHostToDevice, handle.GetStream())); + + if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + hipLaunchKernelGGL(jitter_pkd_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + kernelSizeTensor, + xorwowInitialStatePtr, + xorwowSeedStream, + roiTensorPtrSrc); + } + else if 
((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(jitter_pln_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + dstDescPtr->c, + kernelSizeTensor, + xorwowInitialStatePtr, + xorwowSeedStream, + roiTensorPtrSrc); + } + else if ((srcDescPtr->c == 3) && (dstDescPtr->c == 3)) + { + if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + hipLaunchKernelGGL(jitter_pkd3_pln3_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride), + dstPtr, + make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride), + kernelSizeTensor, + xorwowInitialStatePtr, + xorwowSeedStream, + roiTensorPtrSrc); + } + else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + globalThreads_x = (srcDescPtr->strides.hStride + 7) >> 3; + hipLaunchKernelGGL(jitter_pln3_pkd3_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride), + dstPtr, + 
make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride), + kernelSizeTensor, + xorwowInitialStatePtr, + xorwowSeedStream, + roiTensorPtrSrc); + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/hip/kernel/non_silent_region_detection.hpp b/src/modules/hip/kernel/non_silent_region_detection.hpp new file mode 100644 index 000000000..80511464b --- /dev/null +++ b/src/modules/hip/kernel/non_silent_region_detection.hpp @@ -0,0 +1,426 @@ +#include +#include "rpp_hip_common.hpp" + +// -------------------- Set 0 - moving mean square kernel device helpers -------------------- + +// calculate the position in shared memory to avoid bank conflicts +__host__ __device__ __forceinline__ int compute_pos_in_smem(int pos) +{ + return pos + (pos >> 5); // since shared memory banks considered is 32 +} + +/* compute prefix sum on the input buffer passed + prefix sum of an array is an array where each element is the sum of all previous elements in the input array, inclusive of the current element */ +__device__ __forceinline__ void compute_prefix_sum(float *input, uint bufferLength) +{ + int offset = 1; + int2 offset_i2 = static_cast(offset); + int2 offsetAB_i2 = make_int2(offset - 1, 2 * offset - 1); + int threadIdxMul2 = 2 * hipThreadIdx_x; + int blockDimMul2 = 2 * hipBlockDim_x; + + /* compute intermediate prefix sums in a up sweep manner + (each level in the hierarchy doubles the distance between the pairs of elements being added) */ + for (int d = bufferLength >> 1; d > 0; d >>= 1) + { + // syncthreads before proceeding to next iteration + __syncthreads(); + int dMul2 = 2 * d; + for (int idxMul2 = threadIdxMul2; idxMul2 < dMul2; idxMul2 += blockDimMul2) + { + int2 pos_i2 = (offset_i2 * static_cast(idxMul2)) + offsetAB_i2; + input[compute_pos_in_smem(pos_i2.y)] += input[compute_pos_in_smem(pos_i2.x)]; + } + offset <<= 1; + offset_i2 = static_cast(offset); + offsetAB_i2 = make_int2(offset - 1, 2 * offset - 1); + } + + if (hipThreadIdx_x == 0) + { + int last = 
bufferLength - 1; + input[compute_pos_in_smem(last)] = 0; + } + + /* compute final prefix sums in a down sweep manner + (each level in the hierarchy halves the distance between the pairs of elements being added) */ + for (int d = 1; d < bufferLength; d <<= 1) + { + offset >>= 1; + offset_i2 = static_cast(offset); + offsetAB_i2 = make_int2(offset - 1, 2 * offset - 1); + __syncthreads(); + // syncthreads before proceeding to next iteration + + int dMul2 = 2 * d; + for (int idxMul2 = threadIdxMul2; idxMul2 < dMul2; idxMul2 += blockDimMul2) + { + int2 pos_i2 = offset_i2 * static_cast(idxMul2) + offsetAB_i2; + int posA = compute_pos_in_smem(pos_i2.x); + int posB = compute_pos_in_smem(pos_i2.y); + float t = input[posA]; + input[posA] = input[posB]; + input[posB] += t; + } + } + __syncthreads(); +} + +// -------------------- Set 1 - moving mean square compute kernel -------------------- + +__global__ void moving_mean_square_hip_tensor(float *srcPtr, + uint nStride, + float *mmsArr, + int *srcLengthTensor, + int outputTileLength, + int windowLength, + float windowFactor, + int inputTileLength) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + uint srcLength = srcLengthTensor[id_z]; + uint batchStride = id_z * nStride; + int blockStart = hipBlockIdx_x * outputTileLength; + + if (blockStart >= srcLength) + return; + + float *input = srcPtr + batchStride; + extern __shared__ float squaredPrefixSum_smem[]; + + float *inBlockPtr = srcPtr + batchStride + blockStart; + float *outBlockPtr = mmsArr + batchStride + blockStart; + + // find the valid output tile length values needed for given block + int validOutputTileLength = std::min(outputTileLength, srcLength - blockStart); + + // assign pointers that points to block begin and block end locations + float *extendedBlockStart = inBlockPtr - windowLength; + float *extendedBlockEnd = inBlockPtr + validOutputTileLength; + + // load input data to shared memory 
+ for(int pos = hipThreadIdx_x; pos < inputTileLength; pos += hipBlockDim_x) + { + float val = 0.0f; + auto extendedBlockPtr = extendedBlockStart + pos; + + /* check if extendedBlockPtr is within the valid region of input + and load the value from extendedBlockPtr if it is within valid region */ + if (extendedBlockPtr >= input && extendedBlockPtr < extendedBlockEnd) + val = *extendedBlockPtr; + squaredPrefixSum_smem[compute_pos_in_smem(pos)] = val * val; + } + + // compute prefix sum + compute_prefix_sum(squaredPrefixSum_smem, inputTileLength); + + // compute the mms value here + for(int pos = hipThreadIdx_x; pos < validOutputTileLength; pos += hipBlockDim_x) + outBlockPtr[pos] = windowFactor * ((inBlockPtr[pos] * inBlockPtr[pos]) + squaredPrefixSum_smem[compute_pos_in_smem(windowLength + pos)] - squaredPrefixSum_smem[compute_pos_in_smem(pos + 1)]); +} + +// -------------------- Set 2 - kernels for finding cutoffmag value -------------------- + +__global__ void max_reduction_hip_tensor(float *srcPtr, + uint nStride, + float *maxArr, + int *srcLengthTensor) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + uint srcLength = srcLengthTensor[id_z]; + + uint srcIdx = id_z * nStride; + __shared__ float max_smem[256]; // 256 values of src in a 256 x 1 thread block + max_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 256 threads + + if (id_x >= srcLength) + return; + + if (id_x + 8 > srcLength) + id_x -= (id_x + 8 - srcLength); + + srcIdx += id_x; + d_float8 src_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory + rpp_hip_math_max8(&src_f8, &max_smem[hipThreadIdx_x]); + __syncthreads(); // syncthreads after max compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + 
max_smem[hipThreadIdx_x] = fmaxf(max_smem[hipThreadIdx_x], max_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + { + int dstIdx = id_z * hipGridDim_x + hipBlockIdx_x; + maxArr[dstIdx] = max_smem[0]; + } +} + +__global__ void cutoffmag_hip_tensor(float *srcPtr, + int maxLength, + float *cutOffMagPtr, + float cutOff, + float referencePower, + bool referenceMax) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + // if referenceMax is set to true, perform final max reduction on srcPtr and compute cutOffMag + if(referenceMax) + { + uint srcIdx = id_z * maxLength; + __shared__ float max_smem[256]; // 256 values of src in a 256 x 1 thread block + max_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 256 threads + + if (id_x >= maxLength) + return; + + srcIdx += id_x; + float maxVal = srcPtr[srcIdx]; + while (id_x < maxLength) + { + maxVal = fmaxf(maxVal, srcPtr[srcIdx]); + id_x += hipBlockDim_x; + srcIdx += hipBlockDim_x; + } + max_smem[hipThreadIdx_x] = maxVal; + __syncthreads(); // syncthreads after max compute + + // Reduction of 256 floats on 256 threads per block in x dimension + for (int threadMax = 128; threadMax >= 1; threadMax /= 2) + { + if (hipThreadIdx_x < threadMax) + max_smem[hipThreadIdx_x] = max(max_smem[hipThreadIdx_x], max_smem[hipThreadIdx_x + threadMax]); + __syncthreads(); + } + + // Final store to dst + if (hipThreadIdx_x == 0) + cutOffMagPtr[id_z] = max_smem[0] * cutOff; + } + else + { + if (hipThreadIdx_x == 0) + cutOffMagPtr[id_z] = referencePower * cutOff; + } +} + +// -------------------- Set 3 - kernels for finding begin and length of NSR in inputs -------------------- + +__global__ void find_region_hip_tensor(float *srcPtr, + uint nStride, + int *beginTensor, + int *lengthTensor, + float *cutOffMagPtr, + int *srcLengthTensor, + float windowLength) +{ + int id_x = 
hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + uint srcLength = srcLengthTensor[id_z]; + float cutOffMag = cutOffMagPtr[id_z]; + + __shared__ int beginResult; + __shared__ int endResult; + beginResult = srcLength; + endResult = 0; + __syncthreads(); + + int beginIdx = srcLength; + int endIdx = 0; + uint stridePerSample = id_z * nStride; + + // Find the begin index in src whose value is >= cutOffMag + for (int i = id_x; i < srcLength; i += hipBlockDim_x) + { + uint srcIdx = stridePerSample + i; + if (srcPtr[srcIdx] >= cutOffMag) + { + beginIdx = i; + atomicMin(&beginResult, beginIdx); + if(beginResult != srcLength) + break; + } + } + + // Find the end index in src whose value is >= cutOffMag + for (int i = id_x; i < srcLength; i += hipBlockDim_x) + { + uint srcIdx = stridePerSample + srcLength - 1 - i; + if (srcPtr[srcIdx] >= cutOffMag) + { + endIdx = srcLength - 1 - i; + atomicMax(&endResult, endIdx); + if(endResult != 0) + break; + } + } + + // Final store to dst + if(hipThreadIdx_x == 0) + { + if(beginResult == srcLength || endResult == 0) + { + beginTensor[id_z] = 0; + lengthTensor[id_z] = 0; + } + else + { + int detectBegin = beginResult; + int detectEnd = endResult - beginResult + 1; + + // if both starting index and length of nonsilent region is not 0 + // adjust the values as per the windowLength + if(detectBegin != 0 && detectEnd != 0) + { + int newBegin = max(detectBegin - (windowLength - 1), 0); + detectEnd += detectBegin - newBegin; + detectBegin = newBegin; + } + beginTensor[id_z] = detectBegin; + lengthTensor[id_z] = detectEnd; + } + } +} + +// -------------------- Set 4 - host helpers for kernel executor -------------------- + +// return the nearest previous power of 2 for the given number +inline Rpp32s prev_pow2(Rpp32s n) +{ + Rpp32s pow2 = 1; + while (n - pow2 > pow2) + pow2 += pow2; + + return pow2; +} + +// return the nearest next power of 2 for the given number +inline Rpp32s 
next_pow2(Rpp32s n) +{ + Rpp32s pow2 = 1; + while (n > pow2) + pow2 += pow2; + + return pow2; +} + +// -------------------- Set 5 - non silent region kernels executor -------------------- + +RppStatus hip_exec_non_silent_region_detection_tensor(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32s *srcLengthTensor, + Rpp32s *detectedIndexTensor, + Rpp32s *detectionLengthTensor, + Rpp32f cutOffDB, + Rpp32s windowLength, + Rpp32f referencePower, + Rpp32s resetInterval, + rpp::Handle& handle) +{ + // check if scratch memory size required for moving mean square is within the limits + if ((srcDescPtr->n * srcDescPtr->strides.nStride) > MMS_MAX_SCRATCH_MEMORY) + return RPP_ERROR_OUT_OF_BOUND_SCRATCH_MEMORY_SIZE; + + Rpp32f *mmsArr = handle.GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem; + Rpp32s maxSharedMemoryInBytes = handle.GetLocalMemorySize(); + Rpp32s maxSharedMemoryElements = maxSharedMemoryInBytes / sizeof(Rpp32f); + Rpp32s kSharedMemBanks = 32; + Rpp32s inputTileLength = prev_pow2(maxSharedMemoryElements * kSharedMemBanks / (kSharedMemBanks + 1)); + + if (resetInterval > 0 && resetInterval < inputTileLength) + { + Rpp32s p = prev_pow2(resetInterval); + Rpp32s n = next_pow2(resetInterval); + if (p > windowLength) + inputTileLength = p; + else if (n < inputTileLength) + inputTileLength = n; + } + + Rpp32s sharedMemorySizeInBytes = compute_pos_in_smem(inputTileLength) * sizeof(Rpp32f); + Rpp32s outputTileLength = inputTileLength - windowLength; + Rpp32f windowFactor = 1.0f / windowLength; + + if (outputTileLength <= 0) + return RPP_ERROR_INVALID_OUTPUT_TILE_LENGTH; + + if (sharedMemorySizeInBytes > maxSharedMemoryInBytes) + return RPP_ERROR_OUT_OF_BOUND_SHARED_MEMORY_SIZE; + + // launch kernel to compute the values needed for MMS Array + Rpp32s globalThreads_x = ceil(static_cast(srcDescPtr->strides.nStride) / outputTileLength); + Rpp32s globalThreads_y = 1; + Rpp32s globalThreads_z = srcDescPtr->n; + + hipLaunchKernelGGL(moving_mean_square_hip_tensor, + 
dim3(globalThreads_x, globalThreads_y, globalThreads_z), + dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), + sharedMemorySizeInBytes, + handle.GetStream(), + srcPtr, + srcDescPtr->strides.nStride, + mmsArr, + srcLengthTensor, + outputTileLength, + windowLength, + windowFactor, + inputTileLength); + + const Rpp32f cutOff = std::pow(10.0f, cutOffDB * 0.1f); + bool referenceMax = (!referencePower); + Rpp32f *partialMaxArr = mmsArr + srcDescPtr->n * srcDescPtr->strides.nStride; + + Rpp32s numBlocksPerSample = ceil(static_cast(srcDescPtr->strides.nStride) / (LOCAL_THREADS_X_1DIM * 8)); + Rpp32s cutOffMagKernelBlockSize = 1; + if (referenceMax) + { + // compute max value in MMS buffer + hipLaunchKernelGGL(max_reduction_hip_tensor, + dim3(numBlocksPerSample, 1, globalThreads_z), + dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), + 0, + handle.GetStream(), + mmsArr, + srcDescPtr->strides.nStride, + partialMaxArr, + srcLengthTensor); + cutOffMagKernelBlockSize = 256; + } + // find the cutoff value in magnitude + Rpp32f *cutOffMagPtr = partialMaxArr + globalThreads_z * numBlocksPerSample; + hipLaunchKernelGGL(cutoffmag_hip_tensor, + dim3(1, 1, globalThreads_z), + dim3(cutOffMagKernelBlockSize, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), + 0, + handle.GetStream(), + partialMaxArr, + numBlocksPerSample, + cutOffMagPtr, + cutOff, + referencePower, + referenceMax); + + // find the begin and length values of NSR in inputs + hipLaunchKernelGGL(find_region_hip_tensor, + dim3(1, 1, globalThreads_z), + dim3(1024, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), + 0, + handle.GetStream(), + mmsArr, + srcDescPtr->strides.nStride, + detectedIndexTensor, + detectionLengthTensor, + cutOffMagPtr, + srcLengthTensor, + windowLength); + return RPP_SUCCESS; +} diff --git a/src/modules/rppt_tensor_audio_augmentations.cpp b/src/modules/rppt_tensor_audio_augmentations.cpp index c0d8c9204..e05404731 100644 --- 
a/src/modules/rppt_tensor_audio_augmentations.cpp +++ b/src/modules/rppt_tensor_audio_augmentations.cpp @@ -281,6 +281,44 @@ RppStatus rppt_resample_host(RppPtr_t srcPtr, #ifdef GPU_SUPPORT +/******************** non_silent_region_detection ********************/ + +RppStatus rppt_non_silent_region_detection_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + Rpp32s *srcLengthTensor, + Rpp32s *detectedIndexTensor, + Rpp32s *detectionLengthTensor, + Rpp32f cutOffDB, + Rpp32s windowLength, + Rpp32f referencePower, + Rpp32s resetInterval, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if (srcDescPtr->dataType == RpptDataType::F32) + { + + return hip_exec_non_silent_region_detection_tensor(static_cast(srcPtr), + srcDescPtr, + srcLengthTensor, + detectedIndexTensor, + detectionLengthTensor, + cutOffDB, + windowLength, + referencePower, + resetInterval, + rpp::deref(rppHandle)); + } + else + { + return RPP_ERROR_NOT_IMPLEMENTED; + } + +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + /******************** to_decibels ********************/ RppStatus rppt_to_decibels_gpu(RppPtr_t srcPtr, @@ -325,5 +363,4 @@ RppStatus rppt_to_decibels_gpu(RppPtr_t srcPtr, } #endif // GPU_SUPPORT - -#endif // AUDIO_SUPPORT +#endif // AUDIO_SUPPORT \ No newline at end of file diff --git a/src/modules/rppt_tensor_effects_augmentations.cpp b/src/modules/rppt_tensor_effects_augmentations.cpp index 80e3d3a10..8fc2d00ee 100644 --- a/src/modules/rppt_tensor_effects_augmentations.cpp +++ b/src/modules/rppt_tensor_effects_augmentations.cpp @@ -932,6 +932,78 @@ RppStatus rppt_glitch_host(RppPtr_t srcPtr, return RPP_SUCCESS; } +/******************** jitter ********************/ + +RppStatus rppt_jitter_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp32u *kernelSizeTensor, + Rpp32u seed, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + RppLayoutParams layoutParams = 
get_layout_params(srcDescPtr->layout, srcDescPtr->c); + RpptXorwowStateBoxMuller xorwowInitialState[SIMD_FLOAT_VECTOR_LENGTH]; + rpp_host_rng_xorwow_f32_initialize_multiseed_stream_boxmuller(xorwowInitialState, seed); + + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + jitter_u8_u8_host_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + kernelSizeTensor, + xorwowInitialState, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + jitter_f16_f16_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + kernelSizeTensor, + xorwowInitialState, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + jitter_f32_f32_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + kernelSizeTensor, + xorwowInitialState, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + jitter_i8_i8_host_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + kernelSizeTensor, + xorwowInitialState, + roiTensorPtrSrc, + roiType, + layoutParams, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + /********************************************************************************************************************/ 
/*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/ /********************************************************************************************************************/ @@ -1641,6 +1713,8 @@ RppStatus rppt_vignette_gpu(RppPtr_t srcPtr, #endif // backend } +/******************** erase ********************/ + RppStatus rppt_erase_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, @@ -1850,4 +1924,87 @@ RppStatus rppt_glitch_gpu(RppPtr_t srcPtr, #endif // backend } +/******************** jitter ********************/ + +RppStatus rppt_jitter_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp32u *kernelSizeTensor, + Rpp32u seed, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + + RpptXorwowStateBoxMuller xorwowInitialState; + xorwowInitialState.x[0] = 0x75BCD15 + seed; + xorwowInitialState.x[1] = 0x159A55E5 + seed; + xorwowInitialState.x[2] = 0x1F123BB5 + seed; + xorwowInitialState.x[3] = 0x5491333 + seed; + xorwowInitialState.x[4] = 0x583F19 + seed; + xorwowInitialState.counter = 0x64F0C9 + seed; + xorwowInitialState.boxMullerFlag = 0; + xorwowInitialState.boxMullerExtra = 0.0f; + + RpptXorwowStateBoxMuller *d_xorwowInitialStatePtr; + d_xorwowInitialStatePtr = reinterpret_cast(rpp::deref(rppHandle).GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem); + CHECK_RETURN_STATUS(hipMemcpy(d_xorwowInitialStatePtr, &xorwowInitialState, sizeof(RpptXorwowStateBoxMuller), hipMemcpyHostToDevice)); + + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + hip_exec_jitter_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + kernelSizeTensor, + d_xorwowInitialStatePtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == 
RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + hip_exec_jitter_tensor(reinterpret_cast(static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + (half*) (static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + kernelSizeTensor, + d_xorwowInitialStatePtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_jitter_tensor((Rpp32f*) (static_cast(srcPtr) + srcDescPtr->offsetInBytes), + srcDescPtr, + (Rpp32f*) (static_cast(dstPtr) + dstDescPtr->offsetInBytes), + dstDescPtr, + kernelSizeTensor, + d_xorwowInitialStatePtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + hip_exec_jitter_tensor(static_cast(srcPtr) + srcDescPtr->offsetInBytes, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offsetInBytes, + dstDescPtr, + kernelSizeTensor, + d_xorwowInitialStatePtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + #endif // GPU_SUPPORT diff --git a/utilities/test_suite/CMakeLists.txt b/utilities/test_suite/CMakeLists.txt index bb5987779..23515798b 100644 --- a/utilities/test_suite/CMakeLists.txt +++ b/utilities/test_suite/CMakeLists.txt @@ -120,6 +120,15 @@ if(Python3_FOUND) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) endif(NIFTI_FOUND) + if(RPP_AUDIO_AUGMENTATIONS_SUPPORT_FOUND) + if(libsnd_LIBS) + add_test( + NAME rpp_qa_tests_tensor_audio_hip_all + COMMAND ${Python3_EXECUTABLE} ${ROCM_PATH}/share/rpp/test/HIP/runAudioTests.py --qa_mode 1 --batch_size 3 + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) + endif(libsnd_LIBS) + endif(RPP_AUDIO_AUGMENTATIONS_SUPPORT_FOUND) elseif( "${BACKEND}" STREQUAL "OCL") # TBD: Add OCL Tests diff --git 
a/utilities/test_suite/HIP/CMakeLists.txt b/utilities/test_suite/HIP/CMakeLists.txt index 891a76353..814b006fb 100644 --- a/utilities/test_suite/HIP/CMakeLists.txt +++ b/utilities/test_suite/HIP/CMakeLists.txt @@ -117,21 +117,23 @@ else() message("-- ${Yellow}Warning: libniftiio must be installed to install ${PROJECT_NAME}/Tensor_voxel_hip successfully!${ColourReset}") endif() -if(NOT libsnd_LIBS) - message("-- ${Yellow}Warning: libsndfile must be installed to install ${PROJECT_NAME}/Tensor_audio_host successfully!${ColourReset}") -else() - message("-- ${Green}${PROJECT_NAME} set to build with rpp and libsndfile ${ColourReset}") - set(COMPILER_FOR_HIP ${ROCM_PATH}/bin/hipcc) - set(CMAKE_CXX_COMPILER ${COMPILER_FOR_HIP}) - include_directories(${ROCM_PATH}/include ${ROCM_PATH}/include/rpp /usr/local/include) - link_directories(${ROCM_PATH}/lib /usr/local/lib) - include_directories(${SndFile_INCLUDE_DIRS}) - link_directories(${SndFile_LIBRARIES_DIR} /usr/local/lib/) - - add_executable(Tensor_audio_hip Tensor_audio_hip.cpp) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++17") - if(NOT APPLE) - set(LINK_LIBRARY_LIST ${LINK_LIBRARY_LIST} stdc++fs) +if(RPP_AUDIO_SUPPORT) + if(NOT libsnd_LIBS) + message("-- ${Yellow}Warning: libsndfile must be installed to install ${PROJECT_NAME}/Tensor_audio_hip successfully!${ColourReset}") + else() + message("-- ${Green}${PROJECT_NAME} set to build with rpp and libsndfile ${ColourReset}") + set(COMPILER_FOR_HIP ${ROCM_PATH}/bin/hipcc) + set(CMAKE_CXX_COMPILER ${COMPILER_FOR_HIP}) + include_directories(${ROCM_PATH}/include ${ROCM_PATH}/include/rpp /usr/local/include) + link_directories(${ROCM_PATH}/lib /usr/local/lib) + include_directories(${SndFile_INCLUDE_DIRS}) + link_directories(${SndFile_LIBRARIES_DIR} /usr/local/lib/) + + add_executable(Tensor_audio_hip Tensor_audio_hip.cpp) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++17") + if(NOT APPLE) + set(LINK_LIBRARY_LIST ${LINK_LIBRARY_LIST} stdc++fs) + endif() + 
target_link_libraries(Tensor_audio_hip ${libsnd_LIBS} -lsndfile -lrpp pthread ${LINK_LIBRARY_LIST}) endif() - target_link_libraries(Tensor_audio_hip ${libsnd_LIBS} -lsndfile -lrpp pthread ${LINK_LIBRARY_LIST}) endif() diff --git a/utilities/test_suite/HIP/Tensor_audio_hip.cpp b/utilities/test_suite/HIP/Tensor_audio_hip.cpp index 1b6014493..ba7a3d46a 100644 --- a/utilities/test_suite/HIP/Tensor_audio_hip.cpp +++ b/utilities/test_suite/HIP/Tensor_audio_hip.cpp @@ -108,8 +108,7 @@ int main(int argc, char **argv) if(testCase == 3) maxDstChannels = 1; set_audio_descriptor_dims_and_strides(dstDescPtr, batchSize, maxDstHeight, maxDstWidth, maxDstChannels, offsetInBytes); - srcDescPtr->numDims = 2; - dstDescPtr->numDims = 2; + // set buffer sizes for src/dst iBufferSize = (Rpp64u)srcDescPtr->h * (Rpp64u)srcDescPtr->w * (Rpp64u)srcDescPtr->c * (Rpp64u)srcDescPtr->n; oBufferSize = (Rpp64u)dstDescPtr->h * (Rpp64u)dstDescPtr->w * (Rpp64u)dstDescPtr->c * (Rpp64u)dstDescPtr->n; @@ -132,7 +131,7 @@ int main(int argc, char **argv) CHECK_RETURN_STATUS(hipHostMalloc(&srcDims, batchSize * sizeof(RpptImagePatch))); CHECK_RETURN_STATUS(hipHostMalloc(&dstDims, batchSize * sizeof(RpptImagePatch))); - Rpp32f *detectedIndex = nullptr, *detectionLength = nullptr; + Rpp32s *detectedIndex = nullptr, *detectionLength = nullptr; if(testCase == 0) { CHECK_RETURN_STATUS(hipHostMalloc(&detectedIndex, batchSize * sizeof(Rpp32f))); @@ -160,6 +159,19 @@ int main(int argc, char **argv) double wallTime; switch (testCase) { + case 0: + { + testCaseName = "non_silent_region_detection"; + Rpp32f cutOffDB = -60.0; + Rpp32s windowLength = 2048; + Rpp32f referencePower = 0.0f; + Rpp32s resetInterval = 8192; + + startWallTime = omp_get_wtime(); + rppt_non_silent_region_detection_gpu(d_inputf32, srcDescPtr, srcLengthTensor, detectedIndex, detectionLength, cutOffDB, windowLength, referencePower, resetInterval, handle); + + break; + } case 1: { testCaseName = "to_decibels"; @@ -203,11 +215,14 @@ int main(int 
argc, char **argv) if (testType == 0) { CHECK_RETURN_STATUS(hipMemcpy(outputf32, d_outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyDeviceToHost)); + CHECK_RETURN_STATUS(hipDeviceSynchronize()); /* Run only if testCase is not 0 For testCase 0 verify_non_silent_region_detection function is used for QA testing */ if (testCase != 0) verify_output(outputf32, dstDescPtr, dstDims, testCaseName, dst, scriptPath, "HIP"); + else + verify_non_silent_region_detection(detectedIndex, detectionLength, testCaseName, batchSize, audioNames, dst); /* Dump the outputs to csv files for debugging Runs only if @@ -240,14 +255,14 @@ int main(int argc, char **argv) cout << endl; // free memory - free(srcDims); - free(dstDims); free(inputf32); free(outputf32); CHECK_RETURN_STATUS(hipFree(d_inputf32)); CHECK_RETURN_STATUS(hipFree(d_outputf32)); CHECK_RETURN_STATUS(hipHostFree(srcLengthTensor)); CHECK_RETURN_STATUS(hipHostFree(channelsTensor)); + CHECK_RETURN_STATUS(hipHostFree(srcDims)); + CHECK_RETURN_STATUS(hipHostFree(dstDims)); if (detectedIndex != nullptr) CHECK_RETURN_STATUS(hipHostFree(detectedIndex)); if (detectionLength != nullptr) diff --git a/utilities/test_suite/HIP/Tensor_hip.cpp b/utilities/test_suite/HIP/Tensor_hip.cpp index aad78241e..ec1b47d9b 100644 --- a/utilities/test_suite/HIP/Tensor_hip.cpp +++ b/utilities/test_suite/HIP/Tensor_hip.cpp @@ -66,7 +66,8 @@ int main(int argc, char **argv) bool additionalParamCase = (testCase == 8 || testCase == 21 || testCase == 23|| testCase == 24 || testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54 || testCase == 79); bool kernelSizeCase = (testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54); bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 68); - bool randomOutputCase = (testCase == 8 || testCase == 84 || testCase == 49 || testCase == 54); + bool randomOutputCase = (testCase == 6 || testCase == 8 || testCase 
== 84 || testCase == 49 || testCase == 54); + bool nonQACase = (testCase == 24); bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24 || testCase == 79); bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89 || testCase == 90 || testCase == 91); bool noiseTypeCase = (testCase == 8); @@ -406,6 +407,10 @@ int main(int argc, char **argv) if(testCase == 46) CHECK_RETURN_STATUS(hipHostMalloc(&intensity, batchSize * sizeof(Rpp32f))); + Rpp32u *kernelSizeTensor; + if(testCase == 6) + CHECK_RETURN_STATUS(hipHostMalloc(&kernelSizeTensor, batchSize * sizeof(Rpp32u))); + RpptChannelOffsets *rgbOffsets; if(testCase == 35) CHECK_RETURN_STATUS(hipHostMalloc(&rgbOffsets, batchSize * sizeof(RpptChannelOffsets))); @@ -561,6 +566,22 @@ int main(int argc, char **argv) break; } + case 6: + { + testCaseName = "jitter"; + + Rpp32u seed = 1255459; + for (i = 0; i < batchSize; i++) + kernelSizeTensor[i] = 5; + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_jitter_gpu(d_input, srcDescPtr, d_output, dstDescPtr, kernelSizeTensor, seed, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 8: { testCaseName = "noise"; @@ -709,6 +730,36 @@ int main(int argc, char **argv) break; } + case 24: + { + testCaseName = "warp_affine"; + + if ((interpolationType != RpptInterpolationType::BILINEAR) && (interpolationType != RpptInterpolationType::NEAREST_NEIGHBOR)) + { + missingFuncFlag = 1; + break; + } + + Rpp32f6 affineTensor_f6[batchSize]; + Rpp32f *affineTensor = (Rpp32f *)affineTensor_f6; + for (i = 0; i < batchSize; i++) + { + affineTensor_f6[i].data[0] = 1.23; + affineTensor_f6[i].data[1] = 0.5; + affineTensor_f6[i].data[2] = 0; + affineTensor_f6[i].data[3] = -0.8; + affineTensor_f6[i].data[4] = 0.83; + affineTensor_f6[i].data[5] = 0; + } + + startWallTime = omp_get_wtime(); + if (inputBitDepth == 0 || inputBitDepth == 1 || 
inputBitDepth == 2 || inputBitDepth == 5) + rppt_warp_affine_gpu(d_input, srcDescPtr, d_output, dstDescPtr, affineTensor, interpolationType, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 26: { testCaseName = "lens_correction"; @@ -1448,7 +1499,7 @@ int main(int argc, char **argv) 1.QA Flag is set 2.input bit depth 0 (U8) 3.source and destination layout are the same*/ - if(qaFlag && inputBitDepth == 0 && (srcDescPtr->layout == dstDescPtr->layout) && !(randomOutputCase)) + if(qaFlag && inputBitDepth == 0 && (srcDescPtr->layout == dstDescPtr->layout) && !(randomOutputCase) && !(nonQACase)) { if (testCase == 87) compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); @@ -1516,7 +1567,7 @@ int main(int argc, char **argv) 2.input bit depth 0 (Input U8 && Output U8) 3.source and destination layout are the same 4.augmentation case does not generate random output*/ - if(qaFlag && inputBitDepth == 0 && ((srcDescPtr->layout == dstDescPtr->layout) || pln1OutTypeCase) && !(randomOutputCase)) + if(qaFlag && inputBitDepth == 0 && ((srcDescPtr->layout == dstDescPtr->layout) || pln1OutTypeCase) && !(randomOutputCase) && !(nonQACase)) compare_output(outputu8, testCaseName, srcDescPtr, dstDescPtr, dstImgSizes, batchSize, interpolationTypeName, noiseTypeName, testCase, dst, scriptPath); // Calculate exact dstROI in XYWH format for OpenCV dump @@ -1603,6 +1654,8 @@ int main(int argc, char **argv) CHECK_RETURN_STATUS(hipHostFree(shapeTensor)); if(roiTensor != NULL) CHECK_RETURN_STATUS(hipHostFree(roiTensor)); + if(testCase == 6) + CHECK_RETURN_STATUS(hipHostFree(kernelSizeTensor)); free(input); free(input_second); free(output); diff --git a/utilities/test_suite/HIP/runAudioTests.py b/utilities/test_suite/HIP/runAudioTests.py index e55010b38..f500b7f0e 100644 --- a/utilities/test_suite/HIP/runAudioTests.py +++ b/utilities/test_suite/HIP/runAudioTests.py @@ -35,7 +35,7 @@ inFilePath = 
scriptPath + "/../TEST_AUDIO_FILES/three_samples_single_channel_src1" outFolderPath = os.getcwd() buildFolderPath = os.getcwd() -caseMin = 1 +caseMin = 0 caseMax = 1 @@ -224,7 +224,7 @@ def rpp_test_suite_parser_and_validator(): subprocess.call(["make", "-j16"], cwd=".") # nosec # List of cases supported -supportedCaseList = ['1'] +supportedCaseList = ['0', '1'] if qaMode and batchSize != 3: print("QA tests can only run with a batch size of 3.") exit(0) diff --git a/utilities/test_suite/HIP/runMiscTests.py b/utilities/test_suite/HIP/runMiscTests.py index f4adbde28..ee97f4547 100644 --- a/utilities/test_suite/HIP/runMiscTests.py +++ b/utilities/test_suite/HIP/runMiscTests.py @@ -74,24 +74,25 @@ def generate_performance_reports(RESULTS_DIR): print(dfPrint_noIndices) def run_unit_test_cmd(numDims, case, numRuns, testType, toggle, batchSize, outFilePath, additionalArg): - print(f"./Tensor_misc_hip {case} {testType} {toggle} {numDims} {batchSize} {numRuns} {additionalArg}") - result = subprocess.run([buildFolderPath + "/build/Tensor_misc_hip", str(case), str(testType), str(toggle), str(numDims), str(batchSize), str(numRuns), str(additionalArg), outFilePath, scriptPath], stdout=subprocess.PIPE) # nosec - print(result.stdout.decode()) + print("./Tensor_misc_hip " + str(case) + " " + str(testType) + " " + str(toggle) + " " + str(numDims) + " " + str(batchSize) + " " + str(numRuns) + " " + str(additionalArg)) + result = subprocess.Popen([buildFolderPath + "/build/Tensor_misc_hip", str(case), str(testType), str(toggle), str(numDims), str(batchSize), str(numRuns), str(additionalArg), outFilePath, scriptPath], stdout=subprocess.PIPE) # nosec + stdout_data, stderr_data = result.communicate() + print(stdout_data.decode()) print("------------------------------------------------------------------------------------------") def run_performance_test_cmd(loggingFolder, numDims, case, numRuns, testType, toggle, batchSize, outFilePath, additionalArg): - with 
open("{}/Tensor_misc_hip_raw_performance_log.txt".format(loggingFolder), "a") as logFile: - print(f"./Tensor_misc_hip {case} {testType} {toggle} {numDims} {batchSize} {numRuns} {additionalArg}") - process = subprocess.Popen([buildFolderPath + "/build/Tensor_misc_hip", str(case), str(testType), str(toggle), str(numDims), str(batchSize), str(numRuns), str(additionalArg), outFilePath, scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) # nosec + with open(loggingFolder + "/Tensor_misc_hip_raw_performance_log.txt", "a") as logFile: + logFile.write("./Tensor_misc_hip " + str(case) + " " + str(testType) + " " + str(toggle) + " " + str(numDims) + " " + str(batchSize) + " " + str(numRuns) + " " + str(additionalArg) + "\n") + process = subprocess.Popen([buildFolderPath + "/build/Tensor_misc_hip", str(case), str(testType), str(toggle), str(numDims), str(batchSize), str(numRuns), str(additionalArg), outFilePath, scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec read_from_subprocess_and_write_to_log(process, logFile) def run_performance_test_with_profiler_cmd(loggingFolder, numDims, case, numRuns, testType, toggle, batchSize, outFilePath, additionalArg): - if not os.path.exists(f"{outFilePath}/case_{case}"): - os.mkdir(f"{outFilePath}/case_{case}") + if not os.path.exists(outFilePath + "/case_" + str(case)): + os.mkdir(outFilePath + "/case_" + str(case)) - with open("{}/Tensor_misc_hip_raw_performance_log.txt".format(loggingFolder), "a") as logFile: - print(f"\nrocprof --basenames on --timestamp on --stats -o {outFilePath}/case_{case}/output_case{case}.csv ./Tensor_misc_hip {case} {testType} {toggle} {numDims} {batchSize} {numRuns} {additionalArg}") - process = subprocess.Popen([ 'rocprof', '--basenames', 'on', '--timestamp', 'on', '--stats', '-o', f"{outFilePath}/case_{case}/output_case{case}.csv", "./Tensor_misc_hip", str(case), str(testType), str(toggle), str(numDims), str(batchSize), str(numRuns), str(additionalArg), outFilePath, 
scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) # nosec + with open(loggingFolder + "/Tensor_misc_hip_raw_performance_log.txt", "a") as logFile: + logFile.write("\nrocprof --basenames on --timestamp on --stats -o " + outFilePath + "/case_" + str(case) + "/output_case" + str(case) + ".csv ./Tensor_misc_hip " + str(case) + " " + str(testType) + " " + str(toggle) + " " + str(numDims) + " " + str(batchSize) + " " + str(numRuns) + " " + str(additionalArg) + "\n") + process = subprocess.Popen(['rocprof', '--basenames', 'on', '--timestamp', 'on', '--stats', '-o', outFilePath + "/case_" + str(case) + "/output_case" + str(case) + ".csv", "./Tensor_misc_hip", str(case), str(testType), str(toggle), str(numDims), str(batchSize), str(numRuns), str(additionalArg), outFilePath, scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec read_from_subprocess_and_write_to_log(process, logFile) print("------------------------------------------------------------------------------------------") @@ -206,8 +207,8 @@ def rpp_test_suite_parser_and_validator(): os.chdir(buildFolderPath + "/build") # Run cmake and make commands -subprocess.run(["cmake", scriptPath], cwd=".") # nosec -subprocess.run(["make", "-j16"], cwd=".") # nosec +subprocess.call(["cmake", scriptPath], cwd=".") # nosec +subprocess.call(["make", "-j16"], cwd=".") # nosec supportedCaseList = ['0', '1', '2'] for case in caseList: @@ -253,7 +254,7 @@ def rpp_test_suite_parser_and_validator(): continue new_file.close() - subprocess.call(['chown', '{}:{}'.format(os.getuid(), os.getgid()), CONSOLIDATED_FILE]) # nosec + subprocess.call(['chown', str(os.getuid()) + ':' + str(os.getgid()), CONSOLIDATED_FILE]) # nosec try: generate_performance_reports(RESULTS_DIR) except ImportError: diff --git a/utilities/test_suite/HIP/runTests.py b/utilities/test_suite/HIP/runTests.py index 01da79c8d..cb4bc8bda 100644 --- a/utilities/test_suite/HIP/runTests.py +++ b/utilities/test_suite/HIP/runTests.py @@ 
-70,35 +70,39 @@ def run_unit_test(srcPath1, srcPath2, dstPathTemp, case, numRuns, testType, layo if case == "40" or case == "41" or case == "49" or case == "54": for kernelSize in range(3, 10, 2): - print(f"./Tensor_hip {srcPath1} {srcPath2} {dstPath} {bitDepth} {outputFormatToggle} {case} {kernelSize}") - result = subprocess.run([buildFolderPath + "/build/Tensor_hip", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), str(kernelSize), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE) # nosec - print(result.stdout.decode()) + print("./Tensor_hip " + srcPath1 + " " + srcPath2 + " " + dstPath + " " + str(bitDepth) + " " + str(outputFormatToggle) + " " + str(case) + " " + str(kernelSize)) + result = subprocess.Popen([buildFolderPath + "/build/Tensor_hip", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), str(kernelSize), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE) # nosec + stdout_data, stderr_data = result.communicate() + print(stdout_data.decode()) elif case == "8": # Run all variants of noise type functions with additional argument of noiseType = gausssianNoise / shotNoise / saltandpepperNoise for noiseType in range(3): - print(f"./Tensor_hip {srcPath1} {srcPath2} {dstPathTemp} {bitDepth} {outputFormatToggle} {case} {noiseType} ") - result = subprocess.run([buildFolderPath + "/build/Tensor_hip", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), str(noiseType), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE) # nosec - print(result.stdout.decode()) + print("./Tensor_hip " + srcPath1 + " " + srcPath2 + " " + dstPathTemp + " " + str(bitDepth) + " " + str(outputFormatToggle) + " " + 
str(case) + " " + str(noiseType)) + result = subprocess.Popen([buildFolderPath + "/build/Tensor_hip", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), str(noiseType), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE) # nosec + stdout_data, stderr_data = result.communicate() + print(stdout_data.decode()) elif case == "21" or case == "23" or case == "24" or case == "79": # Run all variants of interpolation functions with additional argument of interpolationType = bicubic / bilinear / gaussian / nearestneigbor / lanczos / triangular interpolationRange = 6 if case =='79': interpolationRange = 2 for interpolationType in range(interpolationRange): - print(f"./Tensor_hip {srcPath1} {srcPath2} {dstPathTemp} {bitDepth} {outputFormatToggle} {case} {interpolationType}") - result = subprocess.run([buildFolderPath + "/build/Tensor_hip", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), str(interpolationType), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE) # nosec - print(result.stdout.decode()) + print("./Tensor_hip " + srcPath1 + " " + srcPath2 + " " + dstPathTemp + " " + str(bitDepth) + " " + str(outputFormatToggle) + " " + str(case) + " " + str(interpolationType)) + result = subprocess.Popen([buildFolderPath + "/build/Tensor_hip", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), str(interpolationType), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE) # nosec + stdout_data, stderr_data = result.communicate() + print(stdout_data.decode()) else: - print(f"./Tensor_hip {srcPath1} {srcPath2} {dstPathTemp} {bitDepth} {outputFormatToggle} {case} 0 {numRuns} {testType} {layout}") - result = 
subprocess.run([buildFolderPath + "/build/Tensor_hip", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), "0", str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE) # nosec - print(result.stdout.decode()) + print("./Tensor_hip " + srcPath1 + " " + srcPath2 + " " + dstPathTemp + " " + str(bitDepth) + " " + str(outputFormatToggle) + " " + str(case) + " 0 " + str(numRuns) + " " + str(testType) + " " + str(layout)) + result = subprocess.Popen([buildFolderPath + "/build/Tensor_hip", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), "0", str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE) # nosec + stdout_data, stderr_data = result.communicate() + print(stdout_data.decode()) print("------------------------------------------------------------------------------------------") def run_performance_test_cmd(loggingFolder, logFileLayout, srcPath1, srcPath2, dstPath, bitDepth, outputFormatToggle, case, additionalParam, numRuns, testType, layout, qaMode, decoderType, batchSize, roiList): - with open("{}/Tensor_hip_{}_raw_performance_log.txt".format(loggingFolder, logFileLayout), "a") as logFile: - print(f"./Tensor_hip {srcPath1} {srcPath2} {dstPath} {bitDepth} {outputFormatToggle} {case} {additionalParam} 0 ") - process = subprocess.Popen([buildFolderPath + "/build/Tensor_hip", srcPath1, srcPath2, dstPath, str(bitDepth), str(outputFormatToggle), str(case), str(additionalParam), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) # nosec + with open(loggingFolder + "/Tensor_hip_" + logFileLayout + "_raw_performance_log.txt", "a") as logFile: + print("./Tensor_hip " + srcPath1 + " " + srcPath2 + " " + dstPath + 
" " + str(bitDepth) + " " + str(outputFormatToggle) + " " + str(case) + " " + str(additionalParam)) + process = subprocess.Popen([buildFolderPath + "/build/Tensor_hip", srcPath1, srcPath2, dstPath, str(bitDepth), str(outputFormatToggle), str(case), str(additionalParam), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec read_from_subprocess_and_write_to_log(process, logFile) def run_performance_test(loggingFolder, logFileLayout, srcPath1, srcPath2, dstPath, case, numRuns, testType, layout, qaMode, decoderType, batchSize, roiList): @@ -133,11 +137,11 @@ def run_performance_test(loggingFolder, logFileLayout, srcPath1, srcPath2, dstPa def run_performance_test_with_profiler(loggingFolder, logFileLayout, srcPath1, srcPath2, dstPath, bitDepth, outputFormatToggle, case, additionalParam, additionalParamType, numRuns, testType, layout, qaMode, decoderType, batchSize, roiList): addtionalParamString = additionalParamType + str(additionalParam) layoutName = get_layout_name(layout) - if not os.path.isdir(f"{dstPath}/Tensor_{layoutName}/case_{case}"): - os.mkdir(f"{dstPath}/Tensor_{layoutName}/case_{case}") - with open(f"{loggingFolder}/Tensor_hip_{logFileLayout}_raw_performance_log.txt", "a") as logFile: - print(f'rocprof --basenames on --timestamp on --stats -o {dstPath}/Tensor_{layoutName}/case_{case}/output_case{case}_bitDepth{bitDepth}_oft{outputFormatToggle}{addtionalParamString}.csv ./Tensor_hip {srcPath1} {srcPath2} {bitDepth} {outputFormatToggle} {case} {additionalParam} 0') - process = subprocess.Popen(['rocprof', '--basenames', 'on', '--timestamp', 'on', '--stats', '-o', f'{dstPath}/Tensor_{layoutName}/case_{case}/output_case{case}_bitDepth{bitDepth}_oft{outputFormatToggle}{addtionalParamString}.csv', buildFolderPath + "/build/Tensor_hip", srcPath1, srcPath2, dstPath, str(bitDepth), str(outputFormatToggle), str(case), str(additionalParam), 
str(numRuns), str(testType), str(layout), '0', str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec + if not os.path.isdir(dstPath + "/Tensor_" + layoutName + "/case_" + str(case)): + os.makedirs(dstPath + "/Tensor_" + layoutName + "/case_" + str(case)) + with open(loggingFolder + "/Tensor_hip_" + logFileLayout + "_raw_performance_log.txt", "a") as logFile: + logFile.write("rocprof --basenames on --timestamp on --stats -o " + dstPath + "/Tensor_" + layoutName + "/case_" + str(case) + "/output_case" + str(case) + "_bitDepth" + str(bitDepth) + "_oft" + addtionalParamString + ".csv ./Tensor_hip " + srcPath1 + " " + srcPath2 + " " + str(bitDepth) + " " + str(outputFormatToggle) + " " + str(case) + " " + str(additionalParam) + " 0\n") + process = subprocess.Popen(['rocprof', '--basenames', 'on', '--timestamp', 'on', '--stats', '-o', dstPath + "/Tensor_" + layoutName + "/case_" + str(case) + "/output_case" + str(case) + "_bitDepth" + str(bitDepth) + "_oft" + addtionalParamString + ".csv", buildFolderPath + "/build/Tensor_hip", srcPath1, srcPath2, dstPath, str(bitDepth), str(outputFormatToggle), str(case), str(additionalParam), str(numRuns), str(testType), str(layout), '0', str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec while True: output = process.stdout.readline() if not output and process.poll() is not None: @@ -172,7 +176,7 @@ def rpp_test_suite_parser_and_validator(): # validate the parameters passed by user if ((args.case_start < caseMin or args.case_start > caseMax) or (args.case_end < caseMin or args.case_end > caseMax)): - print(f"Starting case# and Ending case# must be in the {caseMin}:{caseMax} range. Aborting!") + print("Starting case# and Ending case# must be in the " + str(caseMin) + ":" + str(caseMax) + " range. 
Aborting!") exit(0) elif args.case_end < args.case_start: print("Ending case# must be greater than starting case#. Aborting!") @@ -214,7 +218,7 @@ def rpp_test_suite_parser_and_validator(): else: for case in args.case_list: if int(case) < caseMin or int(case) > caseMax: - print(f"Invalid case number {case}! Case number must be in the {caseMin}:{caseMax} range. Aborting!") + print("Invalid case number " + str(case) + "! Case number must be in the " + str(caseMin) + ":" + str(caseMax) + " range. Aborting!") exit(0) return args @@ -272,17 +276,17 @@ def rpp_test_suite_parser_and_validator(): os.chdir(buildFolderPath + "/build") # Run cmake and make commands -subprocess.run(["cmake", scriptPath], cwd=".") # nosec -subprocess.run(["make", "-j16"], cwd=".") # nosec +subprocess.call(["cmake", scriptPath], cwd=".") # nosec +subprocess.call(["make", "-j16"], cwd=".") # nosec # List of cases supported -supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '26', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '79', '80', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92'] +supportedCaseList = ['0', '1', '2', '4', '6', '8', '13', '20', '21', '23', '26', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '79', '80', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92'] # Create folders based on testType and profilingOption if testType == 1 and profilingOption == "YES": - os.makedirs(f"{dstPath}/Tensor_PKD3") - os.makedirs(f"{dstPath}/Tensor_PLN1") - os.makedirs(f"{dstPath}/Tensor_PLN3") + os.makedirs(dstPath + "/Tensor_PKD3") + os.makedirs(dstPath + "/Tensor_PLN1") + os.makedirs(dstPath + "/Tensor_PLN3") print("\n\n\n\n\n") print("##########################################################################################") @@ -453,7 +457,7 @@ def rpp_test_suite_parser_and_validator(): continue new_file.close() 
- subprocess.call(['chown', '{}:{}'.format(os.getuid(), os.getgid()), RESULTS_DIR + "/consolidated_results_" + TYPE + ".stats.csv"]) # nosec + subprocess.call(['chown', str(os.getuid()) + ':' + str(os.getgid()), RESULTS_DIR + "/consolidated_results_" + TYPE + ".stats.csv"]) # nosec try: generate_performance_reports(d_counter, TYPE_LIST, RESULTS_DIR) @@ -484,7 +488,7 @@ def rpp_test_suite_parser_and_validator(): print_performance_tests_summary(logFile, functionalityGroupList, numRuns) # print the results of qa tests -nonQACaseList = ['8', '24', '54', '84'] # Add cases present in supportedCaseList, but without QA support +nonQACaseList = ['6', '8', '24', '54', '84'] # Add cases present in supportedCaseList, but without QA support if qaMode and testType == 0: qaFilePath = os.path.join(outFilePath, "QA_results.txt") diff --git a/utilities/test_suite/HIP/runVoxelTests.py b/utilities/test_suite/HIP/runVoxelTests.py index f3ad38025..31c9dd22f 100644 --- a/utilities/test_suite/HIP/runVoxelTests.py +++ b/utilities/test_suite/HIP/runVoxelTests.py @@ -57,20 +57,23 @@ def func_group_finder(case_number): return "miscellaneous" def run_unit_test_cmd(headerPath, dataPath, dstPathTemp, layout, case, numRuns, testType, qaMode, batchSize): - print(f"./Tensor_voxel_hip {headerPath} {dataPath} {dstPathTemp} {layout} {case} {numRuns} {testType} {qaMode} {batchSize} {bitDepth}") - result = subprocess.run([buildFolderPath + "/build/Tensor_voxel_hip", headerPath, dataPath, dstPathTemp, str(layout), str(case), str(numRuns), str(testType), str(qaMode), str(batchSize), str(bitDepth), scriptPath], stdout=subprocess.PIPE) # nosec - print(result.stdout.decode()) + print("./Tensor_voxel_hip " + headerPath + " " + dataPath + " " + dstPathTemp + " " + str(layout) + " " + str(case) + " " + str(numRuns) + " " + str(testType) + " " + str(qaMode) + " " + str(batchSize) + " " + str(bitDepth)) + result = subprocess.Popen([buildFolderPath + "/build/Tensor_voxel_hip", headerPath, dataPath, dstPathTemp, 
str(layout), str(case), str(numRuns), str(testType), str(qaMode), str(batchSize), str(bitDepth), scriptPath], stdout=subprocess.PIPE) # nosec + stdout_data, stderr_data = result.communicate() + print(stdout_data.decode()) print("------------------------------------------------------------------------------------------") def run_performance_test_cmd(loggingFolder, logFileLayout, headerPath, dataPath, dstPathTemp, layout, case, numRuns, testType, qaMode, batchSize): - with open(f"{loggingFolder}/Tensor_voxel_hip_{logFileLayout}_raw_performance_log.txt", "a") as logFile: - print(f"./Tensor_voxel_hip {headerPath} {dataPath} {dstPathTemp} {layout} {case} {numRuns} {testType} {qaMode} {batchSize} {bitDepth}") - process = subprocess.Popen([buildFolderPath + "/build/Tensor_voxel_hip", headerPath, dataPath, dstPathTemp, str(layout), str(case), str(numRuns), str(testType), str(qaMode), str(batchSize), str(bitDepth), scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) # nosec + with open(loggingFolder + "/Tensor_voxel_hip_" + logFileLayout + "_raw_performance_log.txt", "a") as logFile: + logFile.write("./Tensor_voxel_hip " + headerPath + " " + dataPath + " " + dstPathTemp + " " + str(layout) + " " + str(case) + " " + str(numRuns) + " " + str(testType) + " " + str(qaMode) + " " + str(batchSize) + " " + str(bitDepth) + "\n") + process = subprocess.Popen([buildFolderPath + "/build/Tensor_voxel_hip", headerPath, dataPath, dstPathTemp, str(layout), str(case), str(numRuns), str(testType), str(qaMode), str(batchSize), str(bitDepth), scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec while True: output = process.stdout.readline() if not output and process.poll() is not None: break - print(output.strip()) + output = output.decode().strip() # Decode bytes to string and strip extra whitespace + print(output) + logFile.write(output) if "Running" in output or "max,min,avg wall times" in output: cleanedOutput = ''.join(char for char in output if 32 
<= ord(char) <= 126) # Remove control characters cleanedOutput = cleanedOutput.strip() # Remove leading/trailing whitespace @@ -81,14 +84,15 @@ def run_performance_test_cmd(loggingFolder, logFileLayout, headerPath, dataPath, def run_performance_test_with_profiler_cmd(loggingFolder, logFileLayout, headerPath, dataPath, dstPathTemp, layout, case, numRuns, testType, qaMode, batchSize): layoutName = get_layout_name(layout) - if not os.path.exists(f"{loggingFolder}/Tensor_{layoutName}/case_{case}"): - os.mkdir(f"{loggingFolder}/Tensor_{layoutName}/case_{case}") + directory_path = os.path.join(loggingFolder, "Tensor_" + layoutName, "case_" + str(case)) + if not os.path.exists(directory_path): + os.mkdir(directory_path) bitDepths = [0, 2] for bitDepth in bitDepths: - with open(f"{loggingFolder}/Tensor_voxel_hip_{logFileLayout}_raw_performance_log.txt", "a") as logFile: - print(f"\nrocprof --basenames on --timestamp on --stats -o {dstPathTemp}/Tensor_{layoutName}/case_{case}/output_case{case}.csv ./Tensor_voxel_hip {headerPath} {dataPath} {dstPathTemp} {layout} {case}{numRuns} {testType} {qaMode} {batchSize} {bitDepth}") - process = subprocess.Popen([ 'rocprof', '--basenames', 'on', '--timestamp', 'on', '--stats', '-o', f"{dstPath}/Tensor_{layoutName}/case_{case}/output_case{case}.csv", buildFolderPath + "/build/Tensor_voxel_hip", headerPath, dataPath, dstPathTemp, str(layout), str(case), str(numRuns), str(testType), str(qaMode), str(batchSize), str(bitDepth), scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec + with open(loggingFolder + "/Tensor_voxel_hip_" + logFileLayout + "_raw_performance_log.txt", "a") as logFile: + logFile.write("\nrocprof --basenames on --timestamp on --stats -o " + dstPathTemp + "/Tensor_" + layoutName + "/case_" + str(case) + "/output_case" + str(case) + ".csv ./Tensor_voxel_hip " + headerPath + " " + dataPath + " " + dstPathTemp + " " + str(layout) + " " + str(case) + " " + str(numRuns) + " " + str(testType) + " " + 
str(qaMode) + " " + str(batchSize) + " " + str(bitDepth) + "\n") + process = subprocess.Popen(['rocprof', '--basenames', 'on', '--timestamp', 'on', '--stats', '-o', dstPath + "/Tensor_" + layoutName + "/case_" + str(case) + "/output_case" + str(case) + ".csv", buildFolderPath + "/build/Tensor_voxel_hip", headerPath, dataPath, dstPathTemp, str(layout), str(case), str(numRuns), str(testType), str(qaMode), str(batchSize), str(bitDepth), scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec while True: output = process.stdout.readline() if not output and process.poll() is not None: @@ -227,17 +231,17 @@ def rpp_test_suite_parser_and_validator(): os.chdir(buildFolderPath + "/build") # Run cmake and make commands -subprocess.run(["cmake", scriptPath], cwd=".") # nosec -subprocess.run(["make", "-j16"], cwd=".") # nosec +subprocess.call(["cmake", scriptPath], cwd=".") # nosec +subprocess.call(["make", "-j16"], cwd=".") # nosec # List of cases supported supportedCaseList = ['0', '1', '2', '3', '4', '5', '6'] # Create folders based on testType and profilingOption if testType == 1 and profilingOption == "YES": - os.makedirs(f"{dstPath}/Tensor_PKD3") - os.makedirs(f"{dstPath}/Tensor_PLN1") - os.makedirs(f"{dstPath}/Tensor_PLN3") + os.makedirs(dstPath + "/Tensor_PKD3") + os.makedirs(dstPath + "/Tensor_PLN1") + os.makedirs(dstPath + "/Tensor_PLN3") print("\n\n\n\n\n") print("##########################################################################################") @@ -322,7 +326,7 @@ def rpp_test_suite_parser_and_validator(): continue new_file.close() - subprocess.call(['chown', '{}:{}'.format(os.getuid(), os.getgid()), RESULTS_DIR + "/consolidated_results_" + TYPE + ".stats.csv"]) # nosec + subprocess.call(['chown', str(os.getuid()) + ':' + str(os.getgid()), RESULTS_DIR + "/consolidated_results_" + TYPE + ".stats.csv"]) # nosec try: generate_performance_reports(d_counter, TYPE_LIST, RESULTS_DIR) diff --git a/utilities/test_suite/HOST/Tensor_host.cpp 
b/utilities/test_suite/HOST/Tensor_host.cpp index 4c3d4f0e8..bb1312a5e 100644 --- a/utilities/test_suite/HOST/Tensor_host.cpp +++ b/utilities/test_suite/HOST/Tensor_host.cpp @@ -66,7 +66,8 @@ int main(int argc, char **argv) bool additionalParamCase = (testCase == 8 || testCase == 21 || testCase == 23 || testCase == 24 || testCase == 79); bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 33 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 68); - bool randomOutputCase = (testCase == 8 || testCase == 84); + bool randomOutputCase = (testCase == 6 || testCase == 8 || testCase == 84); + bool nonQACase = (testCase == 24); bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24 || testCase == 79); bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89 || testCase == 90 || testCase == 91); bool noiseTypeCase = (testCase == 8); @@ -517,6 +518,24 @@ int main(int argc, char **argv) break; } + case 6: + { + testCaseName = "jitter"; + + Rpp32u kernelSizeTensor[batchSize]; + Rpp32u seed = 1255459; + for (i = 0; i < batchSize; i++) + kernelSizeTensor[i] = 5; + + startWallTime = omp_get_wtime(); + startCpuTime = clock(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_jitter_host(input, srcDescPtr, output, dstDescPtr, kernelSizeTensor, seed, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 8: { testCaseName = "noise"; @@ -672,6 +691,37 @@ int main(int argc, char **argv) break; } + case 24: + { + testCaseName = "warp_affine"; + + if ((interpolationType != RpptInterpolationType::BILINEAR) && (interpolationType != RpptInterpolationType::NEAREST_NEIGHBOR)) + { + missingFuncFlag = 1; + break; + } + + Rpp32f6 affineTensor_f6[batchSize]; + Rpp32f *affineTensor = (Rpp32f *)affineTensor_f6; + for (i = 0; i < batchSize; i++) + { + affineTensor_f6[i].data[0] = 1.23; + affineTensor_f6[i].data[1] = 0.5; + 
affineTensor_f6[i].data[2] = 0; + affineTensor_f6[i].data[3] = -0.8; + affineTensor_f6[i].data[4] = 0.83; + affineTensor_f6[i].data[5] = 0; + } + + startWallTime = omp_get_wtime(); + startCpuTime = clock(); + if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5) + rppt_warp_affine_host(input, srcDescPtr, output, dstDescPtr, affineTensor, interpolationType, roiTensorPtrSrc, roiTypeSrc, handle); + else + missingFuncFlag = 1; + + break; + } case 26: { testCaseName = "lens_correction"; @@ -1462,7 +1512,7 @@ int main(int argc, char **argv) 1.QA Flag is set 2.input bit depth 0 (U8) 3.source and destination layout are the same*/ - if(qaFlag && inputBitDepth == 0 && (srcDescPtr->layout == dstDescPtr->layout) && !(randomOutputCase)) + if(qaFlag && inputBitDepth == 0 && (srcDescPtr->layout == dstDescPtr->layout) && !(randomOutputCase) && !(nonQACase)) { if (testCase == 87) compare_reduction_output(static_cast(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath); @@ -1528,7 +1578,7 @@ int main(int argc, char **argv) 2.input bit depth 0 (Input U8 && Output U8) 3.source and destination layout are the same 4.augmentation case does not generate random output*/ - if(qaFlag && inputBitDepth == 0 && ((srcDescPtr->layout == dstDescPtr->layout) || pln1OutTypeCase) && !(randomOutputCase)) + if(qaFlag && inputBitDepth == 0 && ((srcDescPtr->layout == dstDescPtr->layout) || pln1OutTypeCase) && !(randomOutputCase) && !(nonQACase)) compare_output(outputu8, testCaseName, srcDescPtr, dstDescPtr, dstImgSizes, batchSize, interpolationTypeName, noiseTypeName, testCase, dst, scriptPath); // Calculate exact dstROI in XYWH format for OpenCV dump diff --git a/utilities/test_suite/HOST/runAudioTests.py b/utilities/test_suite/HOST/runAudioTests.py index c0600c057..a1771716b 100644 --- a/utilities/test_suite/HOST/runAudioTests.py +++ b/utilities/test_suite/HOST/runAudioTests.py @@ -45,15 +45,16 @@ def get_log_file_list(): ] def 
run_unit_test_cmd(srcPath, case, numRuns, testType, batchSize, outFilePath): - print(f"./Tensor_audio_host {srcPath} {case} {numRuns} {testType} {numRuns} {batchSize}") - result = subprocess.run([buildFolderPath + "/build/Tensor_audio_host", srcPath, str(case), str(testType), str(numRuns), str(batchSize), outFilePath, scriptPath], stdout=subprocess.PIPE) # nosec - print(result.stdout.decode()) + print("./Tensor_audio_host " + srcPath + " " + str(case) + " " + str(numRuns) + " " + str(testType) + " " + str(numRuns) + " " + str(batchSize)) + result = subprocess.Popen([buildFolderPath + "/build/Tensor_audio_host", srcPath, str(case), str(testType), str(numRuns), str(batchSize), outFilePath, scriptPath], stdout=subprocess.PIPE) # nosec + stdout_data, stderr_data = result.communicate() + print(stdout_data.decode()) print("------------------------------------------------------------------------------------------") def run_performance_test_cmd(loggingFolder, srcPath, case, numRuns, testType, batchSize, outFilePath): - with open("{}/Tensor_audio_host_raw_performance_log.txt".format(loggingFolder), "a") as logFile: - print(f"./Tensor_audio_host {srcPath} {case} {numRuns} {testType} {numRuns} {batchSize} ") - process = subprocess.Popen([buildFolderPath + "/build/Tensor_audio_host", srcPath, str(case), str(testType), str(numRuns), str(batchSize), outFilePath, scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) # nosec + with open(loggingFolder + "/Tensor_audio_host_raw_performance_log.txt", "a") as logFile: + logFile.write("./Tensor_audio_host " + srcPath + " " + str(case) + " " + str(numRuns) + " " + str(testType) + " " + str(numRuns) + " " + str(batchSize) + "\n") + process = subprocess.Popen([buildFolderPath + "/build/Tensor_audio_host", srcPath, str(case), str(testType), str(numRuns), str(batchSize), outFilePath, scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec read_from_subprocess_and_write_to_log(process, logFile) 
print("------------------------------------------------------------------------------------------") @@ -87,7 +88,7 @@ def rpp_test_suite_parser_and_validator(): # validate the parameters passed by user if ((args.case_start < caseMin or args.case_start > caseMax) or (args.case_end < caseMin or args.case_end > caseMax)): - print(f"Starting case# and Ending case# must be in the {caseMin}:{caseMax} range. Aborting!") + print("Starting case# and Ending case# must be in the " + str(caseMin) + ":" + str(caseMax) + " range. Aborting!") exit(0) elif args.case_end < args.case_start: print("Ending case# must be greater than starting case#. Aborting!") @@ -120,7 +121,7 @@ def rpp_test_suite_parser_and_validator(): else: for case in args.case_list: if int(case) < caseMin or int(case) > caseMax: - print(f"Invalid case number {case}! Case number must be in the {caseMin}:{caseMax} range. Aborting!") + print("Invalid case number " + str(case) + "! Case number must be in the " + str(caseMin) + ":" + str(caseMax) + " range. 
Aborting!") exit(0) return args @@ -171,8 +172,8 @@ def rpp_test_suite_parser_and_validator(): os.chdir(buildFolderPath + "/build") # Run cmake and make commands -subprocess.run(["cmake", scriptPath], cwd=".") # nosec -subprocess.run(["make", "-j16"], cwd=".") # nosec +subprocess.call(["cmake", scriptPath], cwd=".") # nosec +subprocess.call(["make", "-j16"], cwd=".") # nosec # List of cases supported supportedCaseList = ['0', '1', '2', '3', '4', '5', '6', '7'] diff --git a/utilities/test_suite/HOST/runMiscTests.py b/utilities/test_suite/HOST/runMiscTests.py index 0f428fe40..931838f71 100644 --- a/utilities/test_suite/HOST/runMiscTests.py +++ b/utilities/test_suite/HOST/runMiscTests.py @@ -47,15 +47,16 @@ def get_log_file_list(): ] def run_unit_test_cmd(numDims, case, numRuns, testType, toggle, batchSize, outFilePath, additionalArg): - print(f"./Tensor_misc_host {case} {testType} {toggle} {numDims} {batchSize} {numRuns} {additionalArg}") - result = subprocess.run([buildFolderPath + "/build/Tensor_misc_host", str(case), str(testType), str(toggle), str(numDims), str(batchSize), str(numRuns), str(additionalArg), outFilePath, scriptPath], stdout=subprocess.PIPE) # nosec - print(result.stdout.decode()) + print("./Tensor_misc_host " + str(case) + " " + str(testType) + " " + str(toggle) + " " + str(numDims) + " " + str(batchSize) + " " + str(numRuns) + " " + str(additionalArg)) + result = subprocess.Popen([buildFolderPath + "/build/Tensor_misc_host", str(case), str(testType), str(toggle), str(numDims), str(batchSize), str(numRuns), str(additionalArg), outFilePath, scriptPath], stdout=subprocess.PIPE) # nosec + stdout_data, stderr_data = result.communicate() + print(stdout_data.decode()) print("------------------------------------------------------------------------------------------") def run_performance_test_cmd(loggingFolder, numDims, case, numRuns, testType, toggle, batchSize, outFilePath, additionalArg): - with 
open("{}/Tensor_misc_host_raw_performance_log.txt".format(loggingFolder), "a") as logFile: - print(f"./Tensor_misc_host {case} {testType} {toggle} {numDims} {batchSize} {numRuns} {additionalArg}") - process = subprocess.Popen([buildFolderPath + "/build/Tensor_misc_host", str(case), str(testType), str(toggle), str(numDims), str(batchSize), str(numRuns), str(additionalArg), outFilePath, scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) # nosec + with open(loggingFolder + "/Tensor_misc_host_raw_performance_log.txt", "a") as logFile: + logFile.write("./Tensor_misc_host " + str(case) + " " + str(testType) + " " + str(toggle) + " " + str(numDims) + " " + str(batchSize) + " " + str(numRuns) + " " + str(additionalArg) + "\n") + process = subprocess.Popen([buildFolderPath + "/build/Tensor_misc_host", str(case), str(testType), str(toggle), str(numDims), str(batchSize), str(numRuns), str(additionalArg), outFilePath, scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec read_from_subprocess_and_write_to_log(process, logFile) def run_test(loggingFolder, numDims, case, numRuns, testType, toggle, batchSize, outFilePath, additionalArg = ""): @@ -162,8 +163,8 @@ def rpp_test_suite_parser_and_validator(): os.chdir(buildFolderPath + "/build") # Run cmake and make commands -subprocess.run(["cmake", scriptPath], cwd=".") # nosec -subprocess.run(["make", "-j16"], cwd=".") # nosec +subprocess.call(["cmake", scriptPath], cwd=".") # nosec +subprocess.call(["make", "-j16"], cwd=".") # nosec supportedCaseList = ['0', '1', '2'] for case in caseList: diff --git a/utilities/test_suite/HOST/runTests.py b/utilities/test_suite/HOST/runTests.py index 93cd64713..7386b364b 100644 --- a/utilities/test_suite/HOST/runTests.py +++ b/utilities/test_suite/HOST/runTests.py @@ -71,34 +71,37 @@ def run_unit_test(srcPath1, srcPath2, dstPathTemp, case, numRuns, testType, layo if case == "8": # Run all variants of noise type functions with additional argument of noiseType 
= gausssianNoise / shotNoise / saltandpepperNoise for noiseType in range(3): - print(f"./Tensor_host {srcPath1} {srcPath2} {dstPathTemp} {bitDepth} {outputFormatToggle} {case} {noiseType} 0 ") - result = subprocess.run([buildFolderPath + "/build/Tensor_host", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), str(noiseType), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE) # nosec - print(result.stdout.decode()) + print("./Tensor_host " + srcPath1 + " " + srcPath2 + " " + dstPathTemp + " " + str(bitDepth) + " " + str(outputFormatToggle) + " " + str(case) + " " + str(noiseType) + " 0") + result = subprocess.Popen([buildFolderPath + "/build/Tensor_host", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), str(noiseType), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) # nosec + stdout_data, stderr_data = result.communicate() + print(stdout_data.decode()) elif case == "21" or case == "23" or case == "24" or case == "79": # Run all variants of interpolation functions with additional argument of interpolationType = bicubic / bilinear / gaussian / nearestneigbor / lanczos / triangular interpolationRange = 6 if case =='79': interpolationRange = 2 for interpolationType in range(interpolationRange): - print(f"./Tensor_host {srcPath1} {srcPath2} {dstPathTemp} {bitDepth} {outputFormatToggle} {case} {interpolationType} 0") - result = subprocess.run([buildFolderPath + "/build/Tensor_host", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), str(interpolationType), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE) # nosec - print(result.stdout.decode()) + 
print("./Tensor_host " + srcPath1 + " " + srcPath2 + " " + dstPathTemp + " " + str(bitDepth) + " " + str(outputFormatToggle) + " " + str(case) + " " + str(interpolationType) + " 0") + result = subprocess.Popen([buildFolderPath + "/build/Tensor_host", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), str(interpolationType), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) # nosec + stdout_data, stderr_data = result.communicate() + print(stdout_data.decode()) else: - print(f"./Tensor_host {srcPath1} {srcPath2} {dstPathTemp} {bitDepth} {outputFormatToggle} {case} 0 {numRuns} {testType} {layout} 0") - result = subprocess.run([buildFolderPath + "/build/Tensor_host", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), "0", str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE) # nosec - print(result.stdout.decode()) + print("./Tensor_host " + srcPath1 + " " + srcPath2 + " " + dstPathTemp + " " + str(bitDepth) + " " + str(outputFormatToggle) + " " + str(case) + " 0 " + str(numRuns) + " " + str(testType) + " " + str(layout) + " 0") + result = subprocess.Popen([buildFolderPath + "/build/Tensor_host", srcPath1, srcPath2, dstPathTemp, str(bitDepth), str(outputFormatToggle), str(case), "0", str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE) # nosec + stdout_data, stderr_data = result.communicate() + print(stdout_data.decode()) print("------------------------------------------------------------------------------------------") def run_performance_test_cmd(loggingFolder, logFileLayout, srcPath1, srcPath2, dstPath, bitDepth, outputFormatToggle, case, additionalParam, numRuns, testType, layout, qaMode, 
decoderType, batchSize, roiList): if qaMode == 1: - with open("{}/BatchPD_host_{}_raw_performance_log.txt".format(loggingFolder, logFileLayout), "a") as logFile: - process = subprocess.Popen([buildFolderPath + "/build/BatchPD_host_" + logFileLayout, srcPath1, srcPath2, str(bitDepth), str(outputFormatToggle), str(case), str(additionalParam), "0"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) # nosec + with open(loggingFolder + "/BatchPD_host_" + logFileLayout + "_raw_performance_log.txt", "a") as logFile: + process = subprocess.Popen([buildFolderPath + "/build/BatchPD_host_" + logFileLayout, srcPath1, srcPath2, str(bitDepth), str(outputFormatToggle), str(case), str(additionalParam), "0"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec read_from_subprocess_and_write_to_log(process, logFile) - with open("{}/Tensor_host_{}_raw_performance_log.txt".format(loggingFolder, logFileLayout), "a") as logFile: - print(f"./Tensor_host {srcPath1} {srcPath2} {dstPath} {bitDepth} {outputFormatToggle} {case} {additionalParam} 0 ") - process = subprocess.Popen([buildFolderPath + "/build/Tensor_host", srcPath1, srcPath2, dstPath, str(bitDepth), str(outputFormatToggle), str(case), str(additionalParam), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + [scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) # nosec + with open(loggingFolder + "/Tensor_host_" + logFileLayout + "_raw_performance_log.txt", "a") as logFile: + logFile.write("./Tensor_host " + srcPath1 + " " + srcPath2 + " " + dstPath + " " + str(bitDepth) + " " + str(outputFormatToggle) + " " + str(case) + " " + str(additionalParam) + " 0\n") + process = subprocess.Popen([buildFolderPath + "/build/Tensor_host", srcPath1, srcPath2, dstPath, str(bitDepth), str(outputFormatToggle), str(case), str(additionalParam), str(numRuns), str(testType), str(layout), "0", str(qaMode), str(decoderType), str(batchSize)] + roiList + 
[scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec read_from_subprocess_and_write_to_log(process, logFile) def run_performance_test(loggingFolder, logFileLayout, srcPath1, srcPath2, dstPath, case, numRuns, testType, layout, qaMode, decoderType, batchSize, roiList): @@ -154,7 +157,7 @@ def rpp_test_suite_parser_and_validator(): # validate the parameters passed by user if ((args.case_start < caseMin or args.case_start > caseMax) or (args.case_end < caseMin or args.case_end > caseMax)): - print(f"Starting case# and Ending case# must be in the {caseMin}:{caseMax} range. Aborting!") + print("Starting case# and Ending case# must be in the " + str(caseMin) + ":" + str(caseMax) + " range. Aborting!") exit(0) elif args.case_end < args.case_start: print("Ending case# must be greater than starting case#. Aborting!") @@ -193,7 +196,7 @@ def rpp_test_suite_parser_and_validator(): else: for case in args.case_list: if int(case) < caseMin or int(case) > caseMax: - print(f"Invalid case number {case}! Case number must be in the {caseMin}:{caseMax} range. Aborting!") + print("Invalid case number " + str(case) + "! Case number must be in the " + str(caseMin) + ":" + str(caseMax) + " range. 
Aborting!") exit(0) return args @@ -254,11 +257,11 @@ def rpp_test_suite_parser_and_validator(): os.chdir(buildFolderPath + "/build") # Run cmake and make commands -subprocess.run(["cmake", scriptPath], cwd=".") # nosec -subprocess.run(["make", "-j16"], cwd=".") # nosec +subprocess.call(["cmake", scriptPath], cwd=".") # nosec +subprocess.call(["make", "-j16"], cwd=".") # nosec # List of cases supported -supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '26', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92'] +supportedCaseList = ['0', '1', '2', '4', '6', '8', '13', '20', '21', '23', '26', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92'] print("\n\n\n\n\n") print("##########################################################################################") @@ -309,7 +312,7 @@ def rpp_test_suite_parser_and_validator(): run_performance_test(loggingFolder, logFileLayout, srcPath1, srcPath2, dstPath, case, numRuns, testType, layout, qaMode, decoderType, batchSize, roiList) # print the results of qa tests -nonQACaseList = ['8', '24', '54', '84'] # Add cases present in supportedCaseList, but without QA support +nonQACaseList = ['6', '8', '24', '54', '84'] # Add cases present in supportedCaseList, but without QA support if qaMode and testType == 0: qaFilePath = os.path.join(outFilePath, "QA_results.txt") @@ -443,23 +446,23 @@ def rpp_test_suite_parser_and_validator(): passedCases = df['Test_Result'].eq('PASSED').sum() failedCases = df['Test_Result'].eq('FAILED').sum() - summaryRow = {'BatchPD_Augmentation_Type': pd.NA, - 'Tensor_Augmentation_Type': pd.NA, - 'Performance Speedup (%)': pd.NA, - 'Test_Result': f'Final Results of Tests: Passed: {passedCases}, Failed: 
{failedCases}'} + summaryRow = {'BatchPD_Augmentation_Type': None, + 'Tensor_Augmentation_Type': None, + 'Performance Speedup (%)': None, + 'Test_Result': 'Final Results of Tests: Passed: ' + str(passedCases) + ', Failed: ' + str(failedCases)} - print("\n", df.to_markdown()) + print("\n" + dataframe_to_markdown(df)) # Append the summary row to the DataFrame # Convert the dictionary to a DataFrame summaryRow = pd.DataFrame([summaryRow]) - df = pd.concat([df, summaryRow], ignore_index=True) + df = pd.concat([df, summaryRow], ignore_index=True, sort = True) df.to_excel(excelFilePath, index=False) print("\n-------------------------------------------------------------------" + resultsInfo + "\n\n-------------------------------------------------------------------") print("\nIMPORTANT NOTE:") print("- The following performance comparison shows Performance Speedup percentages between times measured on previous generation RPP-BatchPD APIs against current generation RPP-Tensor APIs.") - print(f"- All APIs have been improved for performance ranging from {0}% (almost same) to {100}% faster.") + print("- All APIs have been improved for performance ranging from " + str(0) + "% (almost same) to " + str(100) + "% faster.") print("- Random observations of negative speedups might always occur due to current test machine temperature/load variances or other CPU/GPU state-dependent conditions.") print("\n-------------------------------------------------------------------\n") elif (testType == 1 and qaMode == 0): diff --git a/utilities/test_suite/HOST/runVoxelTests.py b/utilities/test_suite/HOST/runVoxelTests.py index 3dbe0baa5..f44c05f78 100644 --- a/utilities/test_suite/HOST/runVoxelTests.py +++ b/utilities/test_suite/HOST/runVoxelTests.py @@ -58,20 +58,23 @@ def func_group_finder(case_number): return "miscellaneous" def run_unit_test_cmd(headerPath, dataPath, dstPathTemp, layout, case, numRuns, testType, qaMode, batchSize): - print(f"./Tensor_voxel_host {headerPath} {dataPath} 
{dstPathTemp} {layout} {case} {numRuns} {testType} {qaMode} {batchSize} {bitDepth}") - result = subprocess.run([buildFolderPath + "/build/Tensor_voxel_host", headerPath, dataPath, dstPathTemp, str(layout), str(case), str(numRuns), str(testType), str(qaMode), str(batchSize), str(bitDepth), scriptPath], stdout=subprocess.PIPE) # nosec - print(result.stdout.decode()) + print("./Tensor_voxel_host " + headerPath + " " + dataPath + " " + dstPathTemp + " " + str(layout) + " " + str(case) + " " + str(numRuns) + " " + str(testType) + " " + str(qaMode) + " " + str(batchSize) + " " + str(bitDepth)) + result = subprocess.Popen([buildFolderPath + "/build/Tensor_voxel_host", headerPath, dataPath, dstPathTemp, str(layout), str(case), str(numRuns), str(testType), str(qaMode), str(batchSize), str(bitDepth), scriptPath], stdout=subprocess.PIPE) # nosec + stdout_data, stderr_data = result.communicate() + print(stdout_data.decode()) print("------------------------------------------------------------------------------------------") def run_performance_test_cmd(loggingFolder, logFileLayout, headerPath, dataPath, dstPathTemp, layout, case, numRuns, testType, qaMode, batchSize): - with open(f"{loggingFolder}/Tensor_voxel_host_{logFileLayout}_raw_performance_log.txt", "a") as logFile: - print(f"./Tensor_voxel_host {headerPath} {dataPath} {dstPathTemp} {layout} {case} {numRuns} {testType} {qaMode} {batchSize} {bitDepth}") - process = subprocess.Popen([buildFolderPath + "/build/Tensor_voxel_host", headerPath, dataPath, dstPathTemp, str(layout), str(case), str(numRuns), str(testType), str(qaMode), str(batchSize), str(bitDepth), scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) # nosec + with open(loggingFolder + "/Tensor_voxel_host_" + logFileLayout + "_raw_performance_log.txt", "a") as logFile: + logFile.write("./Tensor_voxel_host " + headerPath + " " + dataPath + " " + dstPathTemp + " " + str(layout) + " " + str(case) + " " + str(numRuns) + " " + str(testType) + " " 
+ str(qaMode) + " " + str(batchSize) + " " + str(bitDepth) + "\n") + process = subprocess.Popen([buildFolderPath + "/build/Tensor_voxel_host", headerPath, dataPath, dstPathTemp, str(layout), str(case), str(numRuns), str(testType), str(qaMode), str(batchSize), str(bitDepth), scriptPath], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # nosec while True: output = process.stdout.readline() if not output and process.poll() is not None: break - print(output.strip()) + output = output.decode().strip() # Decode bytes to string and strip extra whitespace + print(output) + logFile.write(output) if "Running" in output or "max,min,avg wall times" in output: cleanedOutput = ''.join(char for char in output if 32 <= ord(char) <= 126) # Remove control characters cleanedOutput = cleanedOutput.strip() # Remove leading/trailing whitespace @@ -203,8 +206,8 @@ def rpp_test_suite_parser_and_validator(): os.chdir(buildFolderPath + "/build") # Run cmake and make commands -subprocess.run(["cmake", scriptPath], cwd=".") # nosec -subprocess.run(["make", "-j16"], cwd=".") # nosec +subprocess.call(["cmake", scriptPath], cwd=".") # nosec +subprocess.call(["make", "-j16"], cwd=".") # nosec # List of cases supported supportedCaseList = ['0', '1', '2', '3', '4', '5', '6'] diff --git a/utilities/test_suite/common.py b/utilities/test_suite/common.py index 699495b39..a0f37ffa2 100644 --- a/utilities/test_suite/common.py +++ b/utilities/test_suite/common.py @@ -27,6 +27,7 @@ import sys import datetime import shutil +import pandas as pd try: from errno import FileExistsError @@ -179,7 +180,7 @@ def case_file_check(CASE_FILE_PATH, TYPE, TENSOR_TYPE_LIST, new_file, d_counter) def directory_name_generator(qaMode, affinity, layoutType, case, path, func_group_finder): if qaMode == 0: functionality_group = func_group_finder(int(case)) - dst_folder_temp = f"{path}/rpp_{affinity}_{layoutType}_{functionality_group}" + dst_folder_temp = path + "/rpp_" + affinity + "_" + layoutType + "_" + functionality_group 
else: dst_folder_temp = path @@ -360,3 +361,22 @@ def func_group_finder(case_number): if case_number in value: return key return "miscellaneous" + +def dataframe_to_markdown(df): + # Calculate the maximum width of each column + column_widths = {} + for col in df.columns: + max_length = len(col) + for value in df[col]: + max_length = max(max_length, len(str(value))) + column_widths[col] = max_length + + # Create the header row + md = '| ' + ' | '.join([col.ljust(column_widths[col]) for col in df.columns]) + ' |\n' + md += '| ' + ' | '.join(['-' * column_widths[col] for col in df.columns]) + ' |\n' + + # Create the data rows + for i, row in df.iterrows(): + md += '| ' + ' | '.join([str(value).ljust(column_widths[df.columns[j]]) for j, value in enumerate(row.values)]) + ' |\n' + + return md diff --git a/utilities/test_suite/rpp_test_suite_common.h b/utilities/test_suite/rpp_test_suite_common.h index 71ca9fb34..eddf78702 100644 --- a/utilities/test_suite/rpp_test_suite_common.h +++ b/utilities/test_suite/rpp_test_suite_common.h @@ -75,11 +75,13 @@ std::map augmentationMap = {1, "gamma_correction"}, {2, "blend"}, {4, "contrast"}, + {6, "jitter"}, {8, "noise"}, {13, "exposure"}, {20, "flip"}, {21, "resize"}, {23, "rotate"}, + {24, "warp_afffine"}, {26, "lens_correction"}, {29, "water"}, {30, "non_linear_blend"}, diff --git a/utilities/test_suite/rpp_test_suite_misc.h b/utilities/test_suite/rpp_test_suite_misc.h index 0a4197caa..9ef118c48 100644 --- a/utilities/test_suite/rpp_test_suite_misc.h +++ b/utilities/test_suite/rpp_test_suite_misc.h @@ -98,6 +98,7 @@ void fill_roi_values(Rpp32u nDim, Rpp32u batchSize, Rpp32u *roiTensor, bool qaMo case 3: { std::array roi = {0, 0, 0, 50, 50, 8}; + for(int i = 0, j = 0; i < batchSize ; i++, j += 6) std::copy(roi.begin(), roi.end(), &roiTensor[j]); break; exit(0);