diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in
index 72eac3392..a4621a9ab 100644
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1 +1 @@
-rocm-docs-core[api_reference]==1.7.1
+rocm-docs-core[api_reference]==1.7.2
diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt
index 6de43cd87..236b5c21d 100644
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -92,7 +92,7 @@ requests==2.32.3
 # via
 #   pygithub
 #   sphinx
-rocm-docs-core[api-reference]==1.7.1
+rocm-docs-core[api-reference]==1.7.2
 # via -r requirements.in
 smmap==5.0.1
 # via gitdb
diff --git a/include/rppdefs.h b/include/rppdefs.h
index caf8f4487..2c0430ebc 100644
--- a/include/rppdefs.h
+++ b/include/rppdefs.h
@@ -52,6 +52,8 @@ SOFTWARE.
 #define RPP_MAX_8U ( 255 )
 /*! \brief RPP maximum dimensions in tensor \ingroup group_rppdefs \page subpage_rppt */
 #define RPPT_MAX_DIMS ( 5 )
+/*! \brief RPP maximum channels in audio tensor \ingroup group_rppdefs \page subpage_rppt */
+#define RPPT_MAX_AUDIO_CHANNELS ( 16 )
 
 #define CHECK_RETURN_STATUS(x) do { \
     int retval = (x); \
@@ -62,6 +64,7 @@ SOFTWARE.
 } while (0)
 #ifdef HIP_COMPILE
+#include <hip/hip_runtime.h>
 #define RPP_HOST_DEVICE __host__ __device__
 #else
 #define RPP_HOST_DEVICE
@@ -708,7 +711,7 @@ typedef struct GenericFilter
 */
 typedef struct RpptResamplingWindow
 {
-    inline void input_range(Rpp32f x, Rpp32s *loc0, Rpp32s *loc1)
+    inline RPP_HOST_DEVICE void input_range(Rpp32f x, Rpp32s *loc0, Rpp32s *loc1)
     {
         Rpp32s xc = std::ceil(x);
         *loc0 = xc - lobes;
@@ -742,7 +745,7 @@ typedef struct RpptResamplingWindow
     Rpp32f scale = 1, center = 1;
     Rpp32s lobes = 0, coeffs = 0;
     Rpp32s lookupSize = 0;
-    std::vector<Rpp32f> lookup;
+    Rpp32f *lookup = nullptr;
     __m128 pCenter, pScale;
 } RpptResamplingWindow;
@@ -806,6 +809,43 @@ struct SlaneyMelScale : public BaseMelScale
 public:
     ~SlaneyMelScale() {};
 };
+inline Rpp32f sinc(Rpp32f x)
+{
+    x *= M_PI;
+    return (std::abs(x) < 1e-5f) ? (1.f - (x * x * 0.16666667)) : std::sin(x) / x;
+}
+
+inline Rpp64f hann(Rpp64f x)
+{
+    return 0.5 * (1 + std::cos(x * M_PI));
+}
+
+// Initialization function that fills the resampling window (RpptResamplingWindow).
+// From the given coeffs and lobes values it generates a LUT (lookup table), which is then used by the Resample audio augmentation.
+inline void windowed_sinc(RpptResamplingWindow &window, Rpp32s coeffs, Rpp32s lobes)
+{
+    Rpp32f scale = 2.0f * lobes / (coeffs - 1);
+    Rpp32f scale_envelope = 2.0f / coeffs;
+    window.coeffs = coeffs;
+    window.lobes = lobes;
+    window.lookupSize = coeffs + 5;
+#ifdef GPU_SUPPORT
+    CHECK_RETURN_STATUS(hipHostMalloc(&(window.lookup), window.lookupSize * sizeof(Rpp32f)));
+#else
+    window.lookup = static_cast<Rpp32f *>(malloc(window.lookupSize * sizeof(Rpp32f)));
+#endif
+    Rpp32s center = (coeffs - 1) * 0.5f;
+    for (int i = 0; i < coeffs; i++) {
+        Rpp32f x = (i - center) * scale;
+        Rpp32f y = (i - center) * scale_envelope;
+        Rpp32f w = sinc(x) * hann(y);
+        window.lookup[i + 1] = w;
+    }
+    window.center = center + 1;
+    window.scale = 1 / scale;
+    window.pCenter = _mm_set1_ps(window.center);
+    window.pScale = _mm_set1_ps(window.scale);
+}
 
 /******************** HOST memory typedefs ********************/
diff --git a/include/rppt_tensor_audio_augmentations.h b/include/rppt_tensor_audio_augmentations.h
index 52b8cea38..0401fec14 100644
--- a/include/rppt_tensor_audio_augmentations.h
+++ b/include/rppt_tensor_audio_augmentations.h
@@ -49,7 +49,7 @@ extern "C" {
 \n Finds the starting index and length of non silent region in the audio buffer by comparing the calculated short-term power with cutoff value passed
 * \param [in] srcPtr source tensor in HOST memory
- * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 2, offsetInBytes >= 0, dataType = F32)
 * \param [in] srcLengthTensor source audio buffer length (1D tensor in HOST memory, of size batchSize)
 * \param [out] detectedIndexTensor beginning index of non silent region (1D tensor in HOST memory, of size batchSize)
 * \param [out] detectionLengthTensor length of non silent region (1D tensor in HOST memory, of size batchSize)
@@ -70,7 +70,7 @@ RppStatus rppt_non_silent_region_detection_host(RppPtr_t srcPtr, RpptDescPtr src
 \n Finds the starting index and length of non silent region in the audio buffer by comparing the calculated short-term power with cutoff value passed
 * \param [in] srcPtr source tensor in HIP memory
- * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 2, offsetInBytes >= 0, dataType = F32)
 * \param [in] srcLengthTensor source audio buffer length (1D tensor in Pinned/HIP memory, of size batchSize)
 * \param [out] detectedIndexTensor beginning index of non silent region (1D tensor in Pinned/HIP memory, of size batchSize)
 * \param [out] detectionLengthTensor length of non silent region (1D tensor in Pinned/HIP memory, of size batchSize)
@@ -254,9 +254,9 @@ RppStatus rppt_mel_filter_bank_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppP
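For orientation, the following host-side sketch (editorial, not part of the patch) shows how the windowed_sinc() initializer added in rppdefs.h above and the resample entry points documented below fit together. The sampling rates, the quality-to-lobes mapping and the pinned allocations mirror test case 6 of the HIP audio test suite later in this diff; batchSize, srcLengthTensor, channelsTensor, the tensor descriptors, the device buffers d_inputf32/d_outputf32 and the RPP handle are assumed to be set up as in that test.

// Per-sample rates and dims (pinned host memory, which is what the GPU API expects)
Rpp32f *inRateTensor, *outRateTensor;
Rpp32s *srcDimsTensor;
CHECK_RETURN_STATUS(hipHostMalloc(&inRateTensor, batchSize * sizeof(Rpp32f)));
CHECK_RETURN_STATUS(hipHostMalloc(&outRateTensor, batchSize * sizeof(Rpp32f)));
CHECK_RETURN_STATUS(hipHostMalloc(&srcDimsTensor, batchSize * 2 * sizeof(Rpp32s)));
for (int i = 0, j = 0; i < batchSize; i++, j += 2)
{
    inRateTensor[i] = 16000.0f;               // source sampling rate
    outRateTensor[i] = 16000.0f * 1.15f;      // target sampling rate (18400 Hz)
    srcDimsTensor[j] = srcLengthTensor[i];    // samples per channel
    srcDimsTensor[j + 1] = channelsTensor[i]; // interleaved channel count
}

// Build the windowed-sinc lookup table once and reuse it for every call;
// quality 50 gives lobes = 16 and a 1025-coefficient kernel, as in the test suite.
Rpp32f quality = 50.0f;
Rpp32s lobes = std::round(0.007 * quality * quality - 0.09 * quality + 3);
Rpp32s coeffs = lobes * 64 + 1;
RpptResamplingWindow *window;
CHECK_RETURN_STATUS(hipHostMalloc(&window, sizeof(RpptResamplingWindow)));   // pinned; windowed_sinc() fills the members
windowed_sinc(*window, coeffs, lobes);

// HIP backend call; rppt_resample_host takes the same argument list with host pointers.
rppt_resample_gpu(d_inputf32, srcDescPtr, d_outputf32, dstDescPtr,
                  inRateTensor, outRateTensor, srcDimsTensor, *window, handle);

Cleanup follows the same pattern as the test suite: hipHostFree(window->lookup) and then hipHostFree(window) once all calls have completed.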
/*! \brief Resample augmentation on HOST backend
* \details Resample augmentation for audio data
* \param [in] srcPtr source tensor in HOST memory
-* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
+* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel audio tensor), offsetInBytes >= 0, dataType = F32)
* \param [out] dstPtr destination tensor in HOST memory
-* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 3, offsetInBytes >= 0, dataType = F32)
+* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel audio tensor), offsetInBytes >= 0, dataType = F32)
* \param [in] inRate Input sampling rate (1D tensor in HOST memory, of size batchSize)
* \param [in] outRate Output sampling rate (1D tensor in HOST memory, of size batchSize)
* \param [in] srcDimsTensor source audio buffer length and number of channels (1D tensor in HOST memory, of size batchSize * 2)
@@ -268,6 +268,25 @@ RppStatus rppt_mel_filter_bank_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppP
 */
RppStatus rppt_resample_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *inRateTensor, Rpp32f *outRateTensor, Rpp32s *srcDimsTensor, RpptResamplingWindow &window, rppHandle_t rppHandle);
+#ifdef GPU_SUPPORT
+/*! \brief Resample augmentation on HIP backend
+* \details Resample augmentation for audio data
+* \param [in] srcPtr source tensor in HIP memory
+* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel audio tensor), offsetInBytes >= 0, dataType = F32)
+* \param [out] dstPtr destination tensor in HIP memory
+* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 2 or 3 (for single-channel or multi-channel audio tensor), offsetInBytes >= 0, dataType = F32)
+* \param [in] inRate Input sampling rate (1D tensor in Pinned memory, of size batchSize)
+* \param [in] outRate Output sampling rate (1D tensor in Pinned memory, of size batchSize)
+* \param [in] srcDimsTensor source audio buffer length and number of channels (1D tensor in Pinned memory, of size batchSize * 2)
+* \param [in] window Resampling window (struct of type RpptResamplingWindow in HIP/Pinned memory)
+* \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+* \return A \ref RppStatus enumeration.
+* \retval RPP_SUCCESS Successful completion.
+* \retval RPP_ERROR* Unsuccessful completion.
+*/
+RppStatus rppt_resample_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *inRateTensor, Rpp32f *outRateTensor, Rpp32s *srcDimsTensor, RpptResamplingWindow &window, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
+
/*! @} */
diff --git a/include/rppt_tensor_effects_augmentations.h b/include/rppt_tensor_effects_augmentations.h
index bd046c535..eaf5d84a5 100644
--- a/include/rppt_tensor_effects_augmentations.h
+++ b/include/rppt_tensor_effects_augmentations.h
@@ -470,7 +470,7 @@ RppStatus rppt_vignette_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t ds
 * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
 * - dstPtr depth ranges - Will be same depth as srcPtr.
* \image html img150x150.png Sample Input - * \image html effects_augmentations_jitter_img150x150.png Sample Output + * \image html effects_augmentations_jitter_150x150.png Sample Output * \param [in] srcPtr source tensor in HOST memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HOST memory @@ -491,7 +491,7 @@ RppStatus rppt_jitter_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dst * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). * - dstPtr depth ranges - Will be same depth as srcPtr. * \image html img150x150.png Sample Input - * \image html effects_augmentations_jitter_img150x150.png Sample Output + * \image html effects_augmentations_jitter_150x150.png Sample Output * \param [in] srcPtr source tensor in HIP memory * \param un[in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory @@ -576,8 +576,8 @@ RppStatus rppt_erase_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP * \details This function erases one or more user defined regions from an image, for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
* srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). * dstPtr depth ranges - Will be same depth as srcPtr. - * \image html img150x150.jpg Sample Input - * \image html effects_augmentations_erase_img150x150.jpg Sample Output + * \image html img150x150.png Sample Input + * \image html effects_augmentations_erase_img150x150.png Sample Output * \param [in] srcPtr source tensor in HIP memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory @@ -601,8 +601,8 @@ RppStatus rppt_erase_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPt * \details The glitch augmentation adds a glitch effect for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
* - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). * - dstPtr depth ranges - Will be same depth as srcPtr. - * \image html img150x150.jpg Sample Input - * \image html effects_augmentations_glitch_img150x150.jpg Sample Output + * \image html img150x150.png Sample Input + * \image html effects_augmentations_glitch_img150x150.png Sample Output * \param [in] srcPtr source tensor in HOST memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HOST memory @@ -623,8 +623,8 @@ RppStatus rppt_glitch_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dst * \details The glitch augmentation adds a glitch effect for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
* - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). * - dstPtr depth ranges - Will be same depth as srcPtr. - * \image html img150x150.jpg Sample Input - * \image html effects_augmentations_glitch_img150x150.jpg Sample Output + * \image html img150x150.png Sample Input + * \image html effects_augmentations_glitch_img150x150.png Sample Output * \param [in] srcPtr source tensor in HIP memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory @@ -645,8 +645,8 @@ RppStatus rppt_glitch_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP * \details The pixelate augmentation performs a pixelate transformation for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
* - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). * - dstPtr depth ranges - Will be same depth as srcPtr. - * \image html img150x150.jpg Sample Input - * \image html effects_augmentations_pixelate_img150x150.jpg Sample Output + * \image html img150x150.png Sample Input + * \image html effects_augmentations_pixelate_img150x150.png Sample Output * \param [in] srcPtr source tensor in HOST memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HOST memory @@ -667,8 +667,8 @@ RppStatus rppt_pixelate_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t d * \details The pixelate augmentation performs a pixelate transformation for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
* - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). * - dstPtr depth ranges - Will be same depth as srcPtr. - * \image html img150x150.jpg Sample Input - * \image html effects_augmentations_pixelate_img150x150.jpg Sample Output + * \image html img150x150.png Sample Input + * \image html effects_augmentations_pixelate_img150x150.png Sample Output * \param [in] srcPtr source tensor in HIP memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory diff --git a/include/rppt_tensor_geometric_augmentations.h b/include/rppt_tensor_geometric_augmentations.h index 28dd516e6..986c36a03 100644 --- a/include/rppt_tensor_geometric_augmentations.h +++ b/include/rppt_tensor_geometric_augmentations.h @@ -613,8 +613,8 @@ RppStatus rppt_remap_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstP * \details Performs a remap operation using user specified remap tables for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout. For each image, the output(x,y) = input(mapx(x, y), mapy(x, y)) for every (x,y) in the destination image.
* - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127). * - dstPtr depth ranges - Will be same depth as srcPtr. - * \image html img150x150.jpg Sample Input - * \image html geometric_augmentations_remap_img150x150.jpg Sample Output + * \image html img150x150.png Sample Input + * \image html geometric_augmentations_remap_img150x150.png Sample Output * \param [in] srcPtr source tensor in HIP memory * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3) * \param [out] dstPtr destination tensor in HIP memory diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp index 973d728c6..899aee456 100644 --- a/src/include/cpu/rpp_cpu_common.hpp +++ b/src/include/cpu/rpp_cpu_common.hpp @@ -5452,12 +5452,6 @@ inline void compute_bicubic_coefficient(Rpp32f weight, Rpp32f &coeff) coeff = (x >= 2) ? 0 : ((x > 1) ? (x * x * (-0.5f * x + 2.5f) - 4.0f * x + 2.0f) : (x * x * (1.5f * x - 2.5f) + 1.0f)); } -inline Rpp32f sinc(Rpp32f x) -{ - x *= M_PI; - return (std::abs(x) < 1e-5f) ? (1.0f - x * x * ONE_OVER_6) : std::sin(x) / x; -} - inline void compute_lanczos3_coefficient(Rpp32f weight, Rpp32f &coeff) { coeff = fabs(weight) >= 3 ? 0.0f : (sinc(weight) * sinc(weight * 0.333333f)); diff --git a/src/include/hip/rpp_hip_common.hpp b/src/include/hip/rpp_hip_common.hpp index 721800c80..d83506b9f 100644 --- a/src/include/hip/rpp_hip_common.hpp +++ b/src/include/hip/rpp_hip_common.hpp @@ -156,6 +156,11 @@ struct RPPTensorFunctionMetaData (byte & 0x02 ? '1' : '0'), \ (byte & 0x01 ? '1' : '0') +// float4 floor + +#define FLOOR4(src, dst) \ +dst = make_int4(floorf(src.x), floorf(src.y), floorf(src.z), floorf(src.w)); + /******************** HOST FUNCTIONS ********************/ inline int getplnpkdind(RppiChnFormat &format) diff --git a/src/modules/hip/handlehip.cpp b/src/modules/hip/handlehip.cpp index 883c78316..9fb6a992f 100644 --- a/src/modules/hip/handlehip.cpp +++ b/src/modules/hip/handlehip.cpp @@ -245,7 +245,7 @@ struct HandleImpl - 293 is the size required for storing reduction outputs for 600000 size sample - 128 is the size required for storing cutOffDB values for batch size 128 */ hipMalloc(&(this->initHandle->mem.mgpu.scratchBufferHip.floatmem), sizeof(Rpp32f) * 76853888); - hipHostMalloc(&(this->initHandle->mem.mgpu.scratchBufferPinned.floatmem), sizeof(Rpp32f) * 8294400); + hipHostMalloc(&(this->initHandle->mem.mgpu.scratchBufferPinned.floatmem), sizeof(Rpp32f) * 8294400); // 3840 x 2160 } }; diff --git a/src/modules/hip/hip_tensor_audio_augmentations.hpp b/src/modules/hip/hip_tensor_audio_augmentations.hpp index cbf057dc6..f97212d96 100644 --- a/src/modules/hip/hip_tensor_audio_augmentations.hpp +++ b/src/modules/hip/hip_tensor_audio_augmentations.hpp @@ -30,5 +30,6 @@ SOFTWARE. 
#include "kernel/mel_filter_bank.hpp" #include "kernel/pre_emphasis_filter.hpp" #include "kernel/to_decibels.hpp" +#include "kernel/resample.hpp" #endif // HIP_TENSOR_AUDIO_AUGMENTATIONS_HPP diff --git a/src/modules/hip/kernel/resample.hpp b/src/modules/hip/kernel/resample.hpp new file mode 100644 index 000000000..fcdf5064b --- /dev/null +++ b/src/modules/hip/kernel/resample.hpp @@ -0,0 +1,299 @@ +#include +#include "rpp_hip_common.hpp" + +// -------------------- Set 0 - resample kernel device helpers -------------------- + +__device__ __forceinline__ float resample_hip_compute(float &x, float &scale, float ¢er, float *lookup, int &lookupSize) +{ + float locRaw = x * scale + center; + int locFloor = std::floor(locRaw); + float weight = locRaw - locFloor; + locFloor = std::max(std::min(locFloor, lookupSize - 2), 0); + float current = lookup[locFloor]; + float next = lookup[locFloor + 1]; + return current + weight * (next - current); +} + +__device__ __forceinline__ void resample_hip_compute(float4 *src_f4, float4 *dst_f4, const float4 *scale_f4, const float4 *center_f4, float *lookup) +{ + float4 locRaw_f4 = (*src_f4) * (*scale_f4) + (*center_f4); + int4 locFloor_i4; + FLOOR4(locRaw_f4, locFloor_i4); + float4 weight_f4 = make_float4(locRaw_f4.x - locFloor_i4.x, locRaw_f4.y - locFloor_i4.y, locRaw_f4.z - locFloor_i4.z, locRaw_f4.w - locFloor_i4.w); + float4 current_f4 = make_float4(lookup[locFloor_i4.x], lookup[locFloor_i4.y], lookup[locFloor_i4.z], lookup[locFloor_i4.w]); + float4 next_f4 = make_float4(lookup[locFloor_i4.x + 1], lookup[locFloor_i4.y + 1], lookup[locFloor_i4.z + 1], lookup[locFloor_i4.w + 1]); + *dst_f4 = current_f4 + weight_f4 * (next_f4 - current_f4); +} + +// -------------------- Set 1 - resample kernel host helpers -------------------- + +inline void compute_output_dims(Rpp32f *inRateTensor, + Rpp32f *outRateTensor, + Rpp32s *srcLengthTensor, + Rpp32s *dstLengthTensor, + Rpp32u batchSize) +{ + for (Rpp32s i = 0, j = 0; i < batchSize; i++, j += 2) + { + dstLengthTensor[j] = std::ceil(srcLengthTensor[j] * outRateTensor[i] / inRateTensor[i]); + dstLengthTensor[j + 1] = srcLengthTensor[j + 1]; + } +} + +// -------------------- Set 2 - resample kernels -------------------- + +__global__ void resample_single_channel_hip_tensor(float *srcPtr, + float *dstPtr, + uint2 strides, + int2 *srcDimsTensor, + int2 *dstDimsTensor, + float *inRateTensor, + float *outRateTensor, + RpptResamplingWindow *window) +{ + int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; + + int srcLength = srcDimsTensor[id_z].x; + int dstLength = dstDimsTensor[id_z].x; + int outBlock = id_x * hipBlockDim_x; + int blockEnd = std::min(outBlock + static_cast(hipBlockDim_x), dstLength); + + if (dstLength != srcLength) + { + double scale = static_cast(inRateTensor[id_z]) / outRateTensor[id_z]; + extern __shared__ float lookup_smem[]; + + // copy all values from window lookup table to shared memory lookup table + for (int k = hipThreadIdx_x; k < window->lookupSize; k += hipBlockDim_x) + lookup_smem[k] = window->lookup[k]; + __syncthreads(); + + if (outBlock >= dstLength) + return; + + // extract the window scale, center and lookup size values from window + float windowScale = window->scale; + float windowCenter = window->center; + int lookupSize = window->lookupSize; + float4 windowScale_f4 = static_cast(windowScale); + float4 windowCenter_f4 = static_cast(windowCenter); + float4 increment_f4 = static_cast(8.0f); + d_float8 locInit_f8; + 
+
+// -------------------- Set 1 - resample kernel host helpers --------------------
+
+inline void compute_output_dims(Rpp32f *inRateTensor,
+                                Rpp32f *outRateTensor,
+                                Rpp32s *srcLengthTensor,
+                                Rpp32s *dstLengthTensor,
+                                Rpp32u batchSize)
+{
+    for (Rpp32s i = 0, j = 0; i < batchSize; i++, j += 2)
+    {
+        dstLengthTensor[j] = std::ceil(srcLengthTensor[j] * outRateTensor[i] / inRateTensor[i]);
+        dstLengthTensor[j + 1] = srcLengthTensor[j + 1];
+    }
+}
+
+// -------------------- Set 2 - resample kernels --------------------
+
+__global__ void resample_single_channel_hip_tensor(float *srcPtr,
+                                                   float *dstPtr,
+                                                   uint2 strides,
+                                                   int2 *srcDimsTensor,
+                                                   int2 *dstDimsTensor,
+                                                   float *inRateTensor,
+                                                   float *outRateTensor,
+                                                   RpptResamplingWindow *window)
+{
+    int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+    int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+    int srcLength = srcDimsTensor[id_z].x;
+    int dstLength = dstDimsTensor[id_z].x;
+    int outBlock = id_x * hipBlockDim_x;
+    int blockEnd = std::min(outBlock + static_cast<int>(hipBlockDim_x), dstLength);
+
+    if (dstLength != srcLength)
+    {
+        double scale = static_cast<double>(inRateTensor[id_z]) / outRateTensor[id_z];
+        extern __shared__ float lookup_smem[];
+
+        // copy all values from window lookup table to shared memory lookup table
+        for (int k = hipThreadIdx_x; k < window->lookupSize; k += hipBlockDim_x)
+            lookup_smem[k] = window->lookup[k];
+        __syncthreads();
+
+        if (outBlock >= dstLength)
+            return;
+
+        // extract the window scale, center and lookup size values from window
+        float windowScale = window->scale;
+        float windowCenter = window->center;
+        int lookupSize = window->lookupSize;
+        float4 windowScale_f4 = static_cast<float4>(windowScale);
+        float4 windowCenter_f4 = static_cast<float4>(windowCenter);
+        float4 increment_f4 = static_cast<float4>(8.0f);
+        d_float8 locInit_f8;
+        locInit_f8.f4[0] = make_float4(0, 1, 2, 3);
+        locInit_f8.f4[1] = make_float4(4, 5, 6, 7);
+
+        // compute block wise values required for processing
+        double inBlockRaw = outBlock * scale;
+        int inBlockRounded = static_cast<int>(inBlockRaw);
+        float inPos = inBlockRaw - inBlockRounded;
+        float fscale = scale;
+        uint dstIdx = id_z * strides.y + outBlock;
+        float *inBlockPtr = srcPtr + id_z * strides.x + inBlockRounded;
+
+        // process block size (256) elements in a single thread
+        for (int outPos = outBlock; outPos < blockEnd; outPos++, inPos += fscale, dstIdx++)
+        {
+            int loc0, loc1;
+            window->input_range(inPos, &loc0, &loc1);
+
+            // check if computed loc0, loc1 values are beyond the input dimensions and update accordingly
+            if (loc0 + inBlockRounded < 0)
+                loc0 = -inBlockRounded;
+            if (loc1 + inBlockRounded > srcLength)
+                loc1 = srcLength - inBlockRounded;
+            int locInWindow = loc0;
+            float locBegin = locInWindow - inPos;
+            float accum = 0.0f;
+
+            d_float8 locInWindow_f8, accum_f8;
+            locInWindow_f8.f4[0] = static_cast<float4>(locBegin) + locInit_f8.f4[0];
+            locInWindow_f8.f4[1] = static_cast<float4>(locBegin) + locInit_f8.f4[1];
+            accum_f8.f4[0] = static_cast<float4>(0.0f);
+            accum_f8.f4[1] = static_cast<float4>(0.0f);
+            for (; locInWindow + 7 < loc1; locInWindow += 8)
+            {
+                d_float8 weights_f8;
+                resample_hip_compute(&locInWindow_f8.f4[0], &weights_f8.f4[0], &windowScale_f4, &windowCenter_f4, lookup_smem);
+                resample_hip_compute(&locInWindow_f8.f4[1], &weights_f8.f4[1], &windowScale_f4, &windowCenter_f4, lookup_smem);
+
+                d_float8 src_f8;
+                rpp_hip_load8_and_unpack_to_float8(inBlockPtr + locInWindow, &src_f8);
+                accum_f8.f4[0] += src_f8.f4[0] * weights_f8.f4[0];
+                accum_f8.f4[1] += src_f8.f4[1] * weights_f8.f4[1];
+
+                locInWindow_f8.f4[0] += increment_f4;
+                locInWindow_f8.f4[1] += increment_f4;
+            }
+            accum_f8.f4[0] += accum_f8.f4[1];
+            accum += (accum_f8.f1[0] + accum_f8.f1[1] + accum_f8.f1[2] + accum_f8.f1[3]);    // reduce the float4 accumulator lanes to a single float
+
+            float x = locInWindow - inPos;
+            for (; locInWindow < loc1; locInWindow++, x++)
+            {
+                float w = resample_hip_compute(x, windowScale, windowCenter, lookup_smem, lookupSize);
+                accum += inBlockPtr[locInWindow] * w;
+            }
+
+            // Final store to dst
+            dstPtr[dstIdx] = accum;
+        }
+    }
+    // copy input to output if dstLength is same as srcLength
+    else
+    {
+        if (outBlock >= dstLength)
+            return;
+
+        uint srcIdx = id_z * strides.x + outBlock;
+        uint dstIdx = id_z * strides.y + outBlock;
+        for (int outPos = outBlock; outPos < blockEnd; outPos++, dstIdx++, srcIdx++)
+            dstPtr[dstIdx] = srcPtr[srcIdx];
+    }
+}
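// Editorial note: the multi-channel kernel below keeps one running accumulator per channel in
// registers, sized with RPPT_MAX_AUDIO_CHANNELS (16) from rppdefs.h, so it assumes the
// interleaved channel count does not exceed that bound; the single-channel kernel above instead
// vectorizes the accumulation over eight consecutive samples per iteration.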
+
+__global__ void resample_multi_channel_hip_tensor(float *srcPtr,
+                                                  float *dstPtr,
+                                                  uint2 strides,
+                                                  int2 *srcDimsTensor,
+                                                  int2 *dstDimsTensor,
+                                                  float *inRateTensor,
+                                                  float *outRateTensor,
+                                                  RpptResamplingWindow *window)
+{
+    int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+    int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+    int srcLength = srcDimsTensor[id_z].x;
+    int numChannels = srcDimsTensor[id_z].y;
+    int dstLength = dstDimsTensor[id_z].x;
+    int outBlock = id_x * hipBlockDim_x;
+    int blockEnd = std::min(outBlock + static_cast<int>(hipBlockDim_x), dstLength);
+
+    if (dstLength != srcLength)
+    {
+        double scale = static_cast<double>(inRateTensor[id_z]) / outRateTensor[id_z];
+        extern __shared__ float lookup_smem[];
+
+        // copy all values from window lookup table to shared memory lookup table
+        for (int k = hipThreadIdx_x; k < window->lookupSize; k += hipBlockDim_x)
+            lookup_smem[k] = window->lookup[k];
+        __syncthreads();
+
+        if (outBlock >= dstLength)
+            return;
+
+        // extract the window scale, center and lookup size values from window
+        float windowScale = window->scale;
+        float windowCenter = window->center;
+        int lookupSize = window->lookupSize;
+
+        // compute block wise values required for processing
+        double inBlockRaw = outBlock * scale;
+        int inBlockRounded = static_cast<int>(inBlockRaw);
+        float inPos = inBlockRaw - inBlockRounded;
+        float fscale = scale;
+        uint dstIdx = id_z * strides.y + outBlock * numChannels;
+        float *inBlockPtr = srcPtr + id_z * strides.x + (inBlockRounded * numChannels);
+
+        // process block size * channels (256 * channels) elements in a single thread
+        for (int outPos = outBlock; outPos < blockEnd; outPos++, inPos += fscale, dstIdx += numChannels)
+        {
+            int loc0, loc1;
+            window->input_range(inPos, &loc0, &loc1);
+
+            // check if computed loc0, loc1 values are beyond the input dimensions and update accordingly
+            if (loc0 + inBlockRounded < 0)
+                loc0 = -inBlockRounded;
+            if (loc1 + inBlockRounded > srcLength)
+                loc1 = srcLength - inBlockRounded;
+            float locInWindow = loc0 - inPos;
+            int2 offsetLocs_i2 = make_int2(loc0, loc1) * static_cast<int2>(numChannels);    // offset loc0, loc1 values for the multi-channel case
+
+            float accum[RPPT_MAX_AUDIO_CHANNELS] = {0.0f};
+            for (int offsetLoc = offsetLocs_i2.x; offsetLoc < offsetLocs_i2.y; offsetLoc += numChannels, locInWindow++)
+            {
+                float w = resample_hip_compute(locInWindow, windowScale, windowCenter, lookup_smem, lookupSize);
+                for (int c = 0; c < numChannels; c++)
+                    accum[c] += inBlockPtr[offsetLoc + c] * w;
+            }
+
+            // Final store to dst
+            for (int c = 0; c < numChannels; c++)
+                dstPtr[dstIdx + c] = accum[c];
+        }
+    }
+    else
+    {
+        if (outBlock >= dstLength)
+            return;
+
+        uint srcIdx = id_z * strides.x + outBlock * numChannels;
+        uint dstIdx = id_z * strides.y + outBlock * numChannels;
+        for (int outPos = outBlock; outPos < blockEnd; outPos++, dstIdx += numChannels, srcIdx += numChannels)
+        {
+            for (int c = 0; c < numChannels; c++)
+                dstPtr[dstIdx + c] = srcPtr[srcIdx + c];
+        }
+    }
+}
+
+// -------------------- Set 3 - resample kernels executor --------------------
+
+RppStatus hip_exec_resample_tensor(Rpp32f *srcPtr,
+                                   RpptDescPtr srcDescPtr,
+                                   Rpp32f *dstPtr,
+                                   RpptDescPtr dstDescPtr,
+                                   Rpp32f *inRateTensor,
+                                   Rpp32f *outRateTensor,
+                                   Rpp32s *srcDimsTensor,
+                                   RpptResamplingWindow &window,
+                                   rpp::Handle& handle)
+{
+    Rpp32s globalThreads_x = dstDescPtr->strides.hStride;
+    Rpp32s globalThreads_y = 1;
+    Rpp32s globalThreads_z = dstDescPtr->n;
+    Rpp32u tensorDims = srcDescPtr->numDims - 1;    // exclude batchsize from input dims
+    RppSize_t sharedMemorySizeInBytes = (window.lookupSize * sizeof(Rpp32f));    // shared memory size needed for resample kernel
+
+    // compute the output dims from the input and output sampling rates
+    Rpp32s *dstDimsTensor = reinterpret_cast<Rpp32s *>(handle.GetInitHandle()->mem.mgpu.scratchBufferPinned.floatmem);
+    compute_output_dims(inRateTensor, outRateTensor, srcDimsTensor, dstDimsTensor, dstDescPtr->n);
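    // Illustrative example: with inRate = 16000 Hz and outRate = 18400 Hz (the 1.15x case used by
    // the test suite), a 160000-sample input yields ceil(160000 * 18400 / 16000) = 184000 output
    // samples per channel; whenever the computed dstLength equals srcLength, the kernels above
    // simply copy the input through unchanged.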
+
+    // For 1D audio tensors (channels = 1)
+    if (tensorDims == 1)
+    {
+        hipLaunchKernelGGL(resample_single_channel_hip_tensor,
+                           dim3(ceil((Rpp32f)globalThreads_x/LOCAL_THREADS_X_1DIM), ceil((Rpp32f)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((Rpp32f)globalThreads_z/LOCAL_THREADS_Z_1DIM)),
+                           dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM),
+                           sharedMemorySizeInBytes,
+                           handle.GetStream(),
+                           srcPtr,
+                           dstPtr,
+                           make_uint2(srcDescPtr->strides.nStride, dstDescPtr->strides.nStride),
+                           reinterpret_cast<int2 *>(srcDimsTensor),
+                           reinterpret_cast<int2 *>(dstDimsTensor),
+                           inRateTensor,
+                           outRateTensor,
+                           &window);
+    }
+    // For 2D audio tensors (channels > 1)
+    else if (tensorDims == 2)
+    {
+        hipLaunchKernelGGL(resample_multi_channel_hip_tensor,
+                           dim3(ceil((Rpp32f)globalThreads_x/LOCAL_THREADS_X_1DIM), ceil((Rpp32f)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((Rpp32f)globalThreads_z/LOCAL_THREADS_Z_1DIM)),
+                           dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM),
+                           sharedMemorySizeInBytes,
+                           handle.GetStream(),
+                           srcPtr,
+                           dstPtr,
+                           make_uint2(srcDescPtr->strides.nStride, dstDescPtr->strides.nStride),
+                           reinterpret_cast<int2 *>(srcDimsTensor),
+                           reinterpret_cast<int2 *>(dstDimsTensor),
+                           inRateTensor,
+                           outRateTensor,
+                           &window);
+    }
+
+    return RPP_SUCCESS;
+}
diff --git a/src/modules/rppt_tensor_audio_augmentations.cpp b/src/modules/rppt_tensor_audio_augmentations.cpp
index 673f8d6de..536f537a3 100644
--- a/src/modules/rppt_tensor_audio_augmentations.cpp
+++ b/src/modules/rppt_tensor_audio_augmentations.cpp
@@ -298,9 +298,12 @@ RppStatus rppt_non_silent_region_detection_gpu(RppPtr_t srcPtr,
                                                rppHandle_t rppHandle)
 {
 #ifdef HIP_COMPILE
+    Rpp32u tensorDims = srcDescPtr->numDims - 1;    // exclude batchsize from input dims
+    if (tensorDims != 1)
+        return RPP_ERROR_INVALID_SRC_DIMS;
+
     if (srcDescPtr->dataType == RpptDataType::F32)
     {
-
         return hip_exec_non_silent_region_detection_tensor(static_cast<Rpp32f *>(srcPtr),
                                                            srcDescPtr,
                                                            srcLengthTensor,
@@ -480,7 +483,47 @@ RppStatus rppt_mel_filter_bank_gpu(RppPtr_t srcPtr,
         return RPP_ERROR_NOT_IMPLEMENTED;
     }
- #elif defined(OCL_COMPILE)
+#elif defined(OCL_COMPILE)
+    return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
+/******************** resample ********************/
+
+RppStatus rppt_resample_gpu(RppPtr_t srcPtr,
+                            RpptDescPtr srcDescPtr,
+                            RppPtr_t dstPtr,
+                            RpptDescPtr dstDescPtr,
+                            Rpp32f *inRateTensor,
+                            Rpp32f *outRateTensor,
+                            Rpp32s *srcDimsTensor,
+                            RpptResamplingWindow &window,
+                            rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+    Rpp32u tensorDims = srcDescPtr->numDims - 1;    // exclude batchsize from input dims
+    if (tensorDims != 1 && tensorDims != 2)
+        return RPP_ERROR_INVALID_SRC_DIMS;
+
+    if (srcDescPtr->dataType == RpptDataType::F32)
+    {
+        hip_exec_resample_tensor(static_cast<Rpp32f *>(srcPtr),
+                                 srcDescPtr,
+                                 static_cast<Rpp32f *>(dstPtr),
+                                 dstDescPtr,
+                                 inRateTensor,
+                                 outRateTensor,
+                                 srcDimsTensor,
+                                 window,
+                                 rpp::deref(rppHandle));
+        return RPP_SUCCESS;
+    }
+    else
+    {
+        return RPP_ERROR_NOT_IMPLEMENTED;
+    }
+
+#elif defined(OCL_COMPILE)
     return RPP_ERROR_NOT_IMPLEMENTED;
 #endif // backend
 }
diff --git a/src/modules/rppt_tensor_effects_augmentations.cpp b/src/modules/rppt_tensor_effects_augmentations.cpp
index 789767735..d17f6dd93 100644
--- a/src/modules/rppt_tensor_effects_augmentations.cpp
+++ b/src/modules/rppt_tensor_effects_augmentations.cpp
@@ -2181,7 +2181,7 @@ RppStatus rppt_pixelate_gpu(RppPtr_t srcPtr,
     interDesc = *srcDescPtr;
     RpptDescPtr interDescPtr = &interDesc;
-    RpptImagePatchPtr internalDstImgSizes = reinterpret_cast<RpptImagePatchPtr>(rpp::deref(rppHandle).GetInitHandle()->mem.mgpu.scratchBufferHip.floatmem);
+    RpptImagePatchPtr internalDstImgSizes = reinterpret_cast<RpptImagePatchPtr>(rpp::deref(rppHandle).GetInitHandle()->mem.mgpu.scratchBufferPinned.floatmem);
     RpptROI *internalRoiTensorPtrSrc = reinterpret_cast<RpptROI *>(internalDstImgSizes + dstDescPtr->n);
     for (int i = 0; i < srcDescPtr->n; i++)
diff --git a/utilities/test_suite/HIP/Tensor_audio_hip.cpp b/utilities/test_suite/HIP/Tensor_audio_hip.cpp
index 9d42680fe..6076d9238 100644
--- a/utilities/test_suite/HIP/Tensor_audio_hip.cpp
+++ 
b/utilities/test_suite/HIP/Tensor_audio_hip.cpp @@ -30,8 +30,8 @@ int main(int argc, char **argv) const int MIN_ARG_COUNT = 7; if (argc < MIN_ARG_COUNT) { - printf("\nImproper Usage! Needs all arguments!\n"); - printf("\nUsage: ./Tensor_audio_hip \n"); + cout << "\nImproper Usage! Needs all arguments!\n"; + cout << "\nUsage: ./Tensor_audio_hip \n"; return -1; } @@ -55,7 +55,7 @@ int main(int argc, char **argv) if (funcName.empty()) { if (testType == 0) - printf("\ncase %d is not supported\n", testCase); + cout << "\ncase " << testCase << " is not supported\n"; return -1; } @@ -111,7 +111,6 @@ int main(int argc, char **argv) maxDstChannels = 1; } set_audio_descriptor_dims_and_strides(dstDescPtr, batchSize, maxDstHeight, maxDstWidth, maxDstChannels, offsetInBytes); - // set buffer sizes for src/dst if(testCase == 7) { @@ -124,9 +123,14 @@ int main(int argc, char **argv) oBufferSize = (Rpp64u)dstDescPtr->h * (Rpp64u)dstDescPtr->w * (Rpp64u)dstDescPtr->c * (Rpp64u)dstDescPtr->n; } + // compute maximum possible buffer size of resample + unsigned long long resampleMaxBufferSize = dstDescPtr->n * dstDescPtr->strides.nStride * 1.15; + if (testCase == 6) + oBufferSize = resampleMaxBufferSize; + // allocate hip buffers for input & output - Rpp32f *inputf32 = (Rpp32f *)calloc(iBufferSize, sizeof(Rpp32f)); - Rpp32f *outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f)); + Rpp32f *inputf32 = static_cast(calloc(iBufferSize, sizeof(Rpp32f))); + Rpp32f *outputf32 = static_cast(calloc(oBufferSize, sizeof(Rpp32f))); void *d_inputf32, *d_outputf32; CHECK_RETURN_STATUS(hipMalloc(&d_inputf32, iBufferSize * sizeof(Rpp32f))); @@ -142,6 +146,10 @@ int main(int argc, char **argv) CHECK_RETURN_STATUS(hipHostMalloc(&srcDims, batchSize * sizeof(RpptImagePatch))); CHECK_RETURN_STATUS(hipHostMalloc(&dstDims, batchSize * sizeof(RpptImagePatch))); + // allocate the buffer for srcDimsTensor + Rpp32s *srcDimsTensor; + CHECK_RETURN_STATUS(hipHostMalloc(&srcDimsTensor, batchSize * 2 * sizeof(Rpp32s))); + Rpp32s *detectedIndex = nullptr, *detectionLength = nullptr; if(testCase == 0) { @@ -149,10 +157,16 @@ int main(int argc, char **argv) CHECK_RETURN_STATUS(hipHostMalloc(&detectionLength, batchSize * sizeof(Rpp32f))); } - // allocate the buffer for srcDimsTensor - Rpp32s *srcDimsTensor; - CHECK_RETURN_STATUS(hipHostMalloc(&srcDimsTensor, batchSize * 2 * sizeof(Rpp32s))); - Rpp32f *coeff; + // declare pointer of type RpptResamplingWindow used for resample augmentation + Rpp32f *inRateTensor = nullptr, *outRateTensor = nullptr; + RpptResamplingWindow *window = nullptr; + if (testCase == 6) + { + CHECK_RETURN_STATUS(hipHostMalloc(&inRateTensor, batchSize * sizeof(Rpp32f))); + CHECK_RETURN_STATUS(hipHostMalloc(&outRateTensor, batchSize * sizeof(Rpp32f))); + } + + Rpp32f *coeff = nullptr; if(testCase == 2) CHECK_RETURN_STATUS(hipHostMalloc(&coeff, batchSize * sizeof(Rpp32f))); @@ -165,7 +179,7 @@ int main(int argc, char **argv) int noOfIterations = static_cast(audioNames.size()) / batchSize; double maxWallTime = 0, minWallTime = 500, avgWallTime = 0; string testCaseName; - printf("\nRunning %s %d times (each time with a batch size of %d images) and computing mean statistics...", func.c_str(), numRuns, batchSize); + cout << "\nRunning " << func << " " << numRuns << " times (each time with a batch size of " << batchSize << " images) and computing mean statistics..."; for (int iterCount = 0; iterCount < noOfIterations; iterCount++) { // read and decode audio and fill the audio dim values @@ -242,6 +256,47 @@ int main(int argc, 
char **argv) break; } + case 6: + { + testCaseName = "resample"; + + maxDstWidth = 0; + for(int i = 0, j = 0; i < batchSize; i++, j += 2) + { + inRateTensor[i] = 16000; + outRateTensor[i] = 16000 * 1.15f; + Rpp32f scaleRatio = outRateTensor[i] / inRateTensor[i]; + srcDimsTensor[j] = srcLengthTensor[i]; + srcDimsTensor[j + 1] = channelsTensor[i]; + dstDims[i].width = static_cast(std::ceil(scaleRatio * srcLengthTensor[i])); + dstDims[i].height = 1; + maxDstWidth = std::max(maxDstWidth, static_cast(dstDims[i].width)); + } + Rpp32f quality = 50.0f; + Rpp32s lobes = std::round(0.007 * quality * quality - 0.09 * quality + 3); + Rpp32s lookupSize = lobes * 64 + 1; + if (window == nullptr) + { + CHECK_RETURN_STATUS(hipHostMalloc(&window, sizeof(RpptResamplingWindow))); + windowed_sinc(*window, lookupSize, lobes); + } + + dstDescPtr->w = maxDstWidth; + dstDescPtr->strides.nStride = dstDescPtr->c * dstDescPtr->w * dstDescPtr->h; + + // check if the required output buffer size is greater than predefined resampleMaxBufferSize + if (dstDescPtr->n * dstDescPtr->strides.nStride > resampleMaxBufferSize) + { + std::cout << "\nError! Requested resample output size is greater than predefined max size for resample in test suite." + "\nPlease modify resampleMaxBufferSize value in test suite as per your requirements for running resample kernel" << std::endl; + exit(0); + } + + startWallTime = omp_get_wtime(); + rppt_resample_gpu(d_inputf32, srcDescPtr, d_outputf32, dstDescPtr, inRateTensor, outRateTensor, srcDimsTensor, *window, handle); + + break; + } case 7: { testCaseName = "mel_filter_bank"; @@ -279,7 +334,7 @@ int main(int argc, char **argv) endWallTime = omp_get_wtime(); if (missingFuncFlag == 1) { - printf("\nThe functionality %s doesn't yet exist in RPP\n", func.c_str()); + cout << "\nThe functionality " << func << " doesn't yet exist in RPP\n"; return -1; } @@ -339,7 +394,7 @@ int main(int argc, char **argv) CHECK_RETURN_STATUS(hipFree(d_outputf32)); CHECK_RETURN_STATUS(hipHostFree(srcLengthTensor)); CHECK_RETURN_STATUS(hipHostFree(channelsTensor)); - if(testCase == 2) + if(coeff != nullptr) CHECK_RETURN_STATUS(hipHostFree(coeff)); CHECK_RETURN_STATUS(hipHostFree(srcDims)); CHECK_RETURN_STATUS(hipHostFree(dstDims)); @@ -348,5 +403,16 @@ int main(int argc, char **argv) CHECK_RETURN_STATUS(hipHostFree(detectedIndex)); if (detectionLength != nullptr) CHECK_RETURN_STATUS(hipHostFree(detectionLength)); + if (window != nullptr) + { + if (window->lookup != nullptr) + CHECK_RETURN_STATUS(hipHostFree(window->lookup)); + CHECK_RETURN_STATUS(hipHostFree(window)); + } + if (inRateTensor != nullptr) + CHECK_RETURN_STATUS(hipHostFree(inRateTensor)); + if (outRateTensor != nullptr) + CHECK_RETURN_STATUS(hipHostFree(outRateTensor)); + return 0; } diff --git a/utilities/test_suite/HIP/Tensor_hip.cpp b/utilities/test_suite/HIP/Tensor_hip.cpp index 685c21203..bfdc008f4 100644 --- a/utilities/test_suite/HIP/Tensor_hip.cpp +++ b/utilities/test_suite/HIP/Tensor_hip.cpp @@ -80,26 +80,26 @@ int main(int argc, char **argv) if (verbosity == 1) { - printf("\nInputs for this test case are:"); - printf("\nsrc1 = %s", argv[1]); - printf("\nsrc2 = %s", argv[2]); + cout << "\nInputs for this test case are:"; + cout << "\nsrc1 = " << argv[1]; + cout << "\nsrc2 = " << argv[2]; if (testType == 0) - printf("\ndst = %s", argv[3]); - printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[4]); - printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[5]); - printf("\ncase number (0:91) = %s", 
argv[6]); - printf("\nnumber of times to run = %s", argv[8]); - printf("\ntest type - (0 = unit tests / 1 = performance tests) = %s", argv[9]); - printf("\nlayout type - (0 = PKD3/ 1 = PLN3/ 2 = PLN1) = %s", argv[10]); - printf("\nqa mode - 0/1 = %s", argv[12]); - printf("\ndecoder type - (0 = TurboJPEG / 1 = OpenCV) = %s", argv[13]); - printf("\nbatch size = %s", argv[14]); + cout << "\ndst = " << argv[3]; + cout << "\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = " << argv[4]; + cout << "\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = " << argv[5]; + cout << "\ncase number (0:91) = " << argv[6]; + cout << "\nnumber of times to run = " << argv[8]; + cout << "\ntest type - (0 = unit tests / 1 = performance tests) = " << argv[9]; + cout << "\nlayout type - (0 = PKD3/ 1 = PLN3/ 2 = PLN1) = " << argv[10]; + cout << "\nqa mode - 0/1 = " << argv[12]; + cout << "\ndecoder type - (0 = TurboJPEG / 1 = OpenCV) = " << argv[13]; + cout << "\nbatch size = " << argv[14]; } if (argc < MIN_ARG_COUNT) { - printf("\nImproper Usage! Needs all arguments!\n"); - printf("\nUsage: f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> pkd = 0 / pkd->pln = 1)> 0> < qa mode (0/1)> 1> >\n"); + cout << "\nImproper Usage! Needs all arguments!\n"; + cout << "\nUsage: f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> pkd = 0 / pkd->pln = 1)> 0> 1> >\n"; return -1; } @@ -107,24 +107,24 @@ int main(int argc, char **argv) { if(testCase == 36 || testCase == 31 || testCase == 35 || testCase == 45 || testCase == 86) { - printf("\ncase %d does not exist for PLN1 layout\n", testCase); + cout << "\ncase " << testCase << " does not exist for PLN1 layout\n"; return -1; } else if (outputFormatToggle != 0) { - printf("\nPLN1 cases don't have outputFormatToggle! Please input outputFormatToggle = 0\n"); + cout << "\nPLN1 cases don't have outputFormatToggle! Please input outputFormatToggle = 0\n"; return -1; } } if(pln1OutTypeCase && outputFormatToggle != 0) { - printf("\ntest case %d don't have outputFormatToggle! Please input outputFormatToggle = 0\n", testCase); + cout << "\ntest case " << testCase << " don't have outputFormatToggle! Please input outputFormatToggle = 0\n"; return -1; } else if (reductionTypeCase && outputFormatToggle != 0) { - printf("\nReduction Kernels don't have outputFormatToggle! Please input outputFormatToggle = 0\n"); + cout << "\nReduction Kernels don't have outputFormatToggle! 
Please input outputFormatToggle = 0\n"; return -1; } else if(batchSize > MAX_BATCH_SIZE) @@ -143,7 +143,7 @@ int main(int argc, char **argv) if (funcName.empty()) { if (testType == 0) - printf("\ncase %d is not supported\n", testCase); + cout << "\ncase " << testCase << " is not supported\n"; return -1; } @@ -420,7 +420,7 @@ int main(int argc, char **argv) CHECK_RETURN_STATUS(hipHostMalloc(&d_interDstPtr, srcDescPtr->strides.nStride * srcDescPtr->n * sizeof(Rpp32f))); // case-wise RPP API and measure time script for Unit and Performance test - printf("\nRunning %s %d times (each time with a batch size of %d images) and computing mean statistics...", func.c_str(), numRuns, batchSize); + cout << "\nRunning " << func << " " << numRuns << " times (each time with a batch size of " << batchSize << " images) and computing mean statistics..."; for(int iterCount = 0; iterCount < noOfIterations; iterCount++) { vector::const_iterator imagesPathStart = imageNamesPath.begin() + (iterCount * batchSize); @@ -1462,7 +1462,7 @@ int main(int argc, char **argv) wallTime = endWallTime - startWallTime; if (missingFuncFlag == 1) { - printf("\nThe functionality %s doesn't yet exist in RPP\n", func.c_str()); + cout << "\nThe functionality " << func << " doesn't yet exist in RPP\n"; return -1; } @@ -1474,15 +1474,18 @@ int main(int argc, char **argv) if (testType == 0) { - cout << "\n\nGPU Backend Wall Time: " << wallTime <<" ms/batch"<< endl; + cout <<"\n\n"; + if(noOfIterations > 1) + cout <<"Execution Timings for Iteration "<< iterCount+1 <<":"<c == 3) - printf("\nReduction result (Batch of 3 channel images produces 4 results per image in batch): "); + cout << "\nReduction result (Batch of 3 channel images produces 4 results per image in batch): "; else if(srcDescPtr->c == 1) { - printf("\nReduction result (Batch of 1 channel images produces 1 result per image in batch): "); + cout << "\nReduction result (Batch of 1 channel images produces 1 result per image in batch): "; reductionFuncResultArrLength = srcDescPtr->n; } @@ -1511,7 +1514,7 @@ int main(int argc, char **argv) else print_array(static_cast(reductionFuncResultArr), reductionFuncResultArrLength, precision); } - printf("\n"); + cout << "\n"; /*Compare the output of the function with golden outputs only if 1.QA Flag is set diff --git a/utilities/test_suite/HIP/Tensor_misc_hip.cpp b/utilities/test_suite/HIP/Tensor_misc_hip.cpp index cb0d53b34..b4fa560e8 100644 --- a/utilities/test_suite/HIP/Tensor_misc_hip.cpp +++ b/utilities/test_suite/HIP/Tensor_misc_hip.cpp @@ -30,8 +30,8 @@ int main(int argc, char **argv) const int MIN_ARG_COUNT = 9; if (argc < MIN_ARG_COUNT) { - printf("\nImproper Usage! Needs all arguments!\n"); - printf("\nUsage: ./Tensor_misc_hip