diff --git a/include/rppt_tensor_statistical_operations.h b/include/rppt_tensor_statistical_operations.h
index b61af9dde..3cb49a82b 100644
--- a/include/rppt_tensor_statistical_operations.h
+++ b/include/rppt_tensor_statistical_operations.h
@@ -78,6 +78,78 @@ RppStatus rppt_tensor_sum_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t
RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t tensorSumArr, Rpp32u tensorSumArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT
+/*! \brief Tensor min operation on HOST backend for a NCHW/NHWC layout tensor
+ * \details The tensor min is a reduction operation that finds the channel-wise (R min / G min / B min) and overall min for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * - minArr depth ranges - Will be same depth as srcPtr.
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] minArr destination array in HOST memory
+ * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then minArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then minArrLength >= srcDescPtr->n * 4)
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_tensor_min_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t minArr, Rpp32u minArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
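+
+/* Illustrative usage sketch (hypothetical names; assumes a filled 3-channel U8 descriptor,
+ * HOST buffers, and a handle from rppCreateWithBatchSize) - each image yields 4 outputs:
+ *     Rpp8u *srcBatch;                                       // source batch in HOST memory
+ *     Rpp8u *minVals = (Rpp8u *)malloc(srcDescPtr->n * 4);   // per image: R min, G min, B min, overall min
+ *     rppt_tensor_min_host(srcBatch, srcDescPtr, minVals, srcDescPtr->n * 4, roiTensorPtrSrc, RpptRoiType::XYWH, handle);
+ */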
+
+#ifdef GPU_SUPPORT
+/*! \brief Tensor min operation on HIP backend for a NCHW/NHWC layout tensor
+ * \details The tensor min is a reduction operation that finds the channel-wise (R min / G min / B min) and overall min for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * - minArr depth ranges - Will be same depth as srcPtr.
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] minArr destination array in HIP memory
+ * \param [in] minArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then minArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then minArrLength >= srcDescPtr->n * 4)
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t minArr, Rpp32u minArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
+
+/*! \brief Tensor max operation on HOST backend for a NCHW/NHWC layout tensor
+ * \details The tensor max is a reduction operation that finds the channel-wise (R max / G max / B max) and overall max for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * - maxArr depth ranges - Will be same depth as srcPtr.
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] maxArr destination array in HOST memory
+ * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then maxArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then maxArrLength >= srcDescPtr->n * 4)
+ * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_tensor_max_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t maxArr, Rpp32u maxArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+
+#ifdef GPU_SUPPORT
+/*! \brief Tensor max operation on HIP backend for a NCHW/NHWC layout tensor
+ * \details The tensor max is a reduction operation that finds the channel-wise (R max / G max / B max) and overall max for each image in a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * - srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * - maxArr depth ranges - Will be same depth as srcPtr.
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] maxArr destination array in HIP memory
+ * \param [in] maxArrLength length of provided destination array (Restrictions - if srcDescPtr->c == 1 then maxArrLength >= srcDescPtr->n, and if srcDescPtr->c == 3 then maxArrLength >= srcDescPtr->n * 4)
+ * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) | (Restrictions - roiTensorPtrSrc[i].xywhROI.roiWidth <= 3840 and roiTensorPtrSrc[i].xywhROI.roiHeight <= 2160)
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t maxArr, Rpp32u maxArrLength, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
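+
+/* Illustrative usage sketch (hypothetical names; assumes HIP device buffers and a handle
+ * from rppCreateWithStreamAndBatchSize):
+ *     Rpp8u *d_srcBatch, *d_maxVals;                             // buffers in HIP memory
+ *     hipMalloc(&d_maxVals, srcDescPtr->n * 4 * sizeof(Rpp8u));  // 4 outputs per 3-channel image
+ *     rppt_tensor_max_gpu(d_srcBatch, srcDescPtr, d_maxVals, srcDescPtr->n * 4, d_roiTensorPtrSrc, RpptRoiType::XYWH, handle);
+ */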
+#endif // GPU_SUPPORT
+
/*! @}
*/
diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp
index 86391108d..1e748cc86 100644
--- a/src/include/cpu/rpp_cpu_common.hpp
+++ b/src/include/cpu/rpp_cpu_common.hpp
@@ -5996,4 +5996,284 @@ inline void compute_sum_24_host(__m256d *p, __m256d *pSumR, __m256d *pSumG, __m2
pSumB[0] = _mm256_add_pd(_mm256_add_pd(p[4], p[5]), pSumB[0]); //add 8B values and bring it down to 4
}
-#endif //RPP_CPU_COMMON_H
\ No newline at end of file
+inline void reduce_min_32_host(__m256i *pMin, __m128i *result) // folds 32 u8 lanes; bytes 0-1 of result hold two partial minima for the caller to combine
+{
+ __m128i px[2];
+ __m128i zero = _mm_setzero_si128();
+ __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,7);
+ px[0] = _mm256_castsi256_si128(pMin[0]);
+ px[1] = _mm256_extracti128_si256(pMin[0], 1);
+ px[0] = _mm_min_epu8(px[0], px[1]);
+ px[1] = _mm_unpacklo_epi8(zero, px[0]);
+ px[0] = _mm_unpackhi_epi8(zero, px[0]);
+ px[0] = _mm_min_epu8(px[0], px[1]);
+ px[1] = _mm_unpacklo_epi16(zero, px[0]);
+ px[0] = _mm_unpackhi_epi16(zero, px[0]);
+ px[0] = _mm_min_epu16(px[0], px[1]);
+ px[1] = _mm_unpacklo_epi32(zero, px[0]);
+ px[0] = _mm_unpackhi_epi32(zero, px[0]);
+ px[0] = _mm_min_epu32(px[0], px[1]);
+ result[0] = _mm_shuffle_epi8(px[0], mask);
+}
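+// The tree above folds 32 lanes 32 -> 16 -> 8 -> 4 -> 2: the 256-bit register is first
+// split into its two 128-bit halves, then each step unpacks against zero so matching
+// elements line up before a packed unsigned min. Two disjoint halves of the input survive
+// as result bytes 0 and 1, and the caller completes the reduction with one scalar std::min.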
+
+inline void compute_min_96_host(__m256i *p1, __m256i *pMinR, __m256i *pMinG, __m256i *pMinB)
+{
+ pMinR[0] = _mm256_min_epu8(p1[0], pMinR[0]); //compare and store min of 32 R values into global min
+ pMinG[0] = _mm256_min_epu8(p1[1], pMinG[0]); //compare and store min of 32 G values into global min
+ pMinB[0] = _mm256_min_epu8(p1[2], pMinB[0]); //compare and store min of 32 B values into global min
+}
+
+inline void reduce_min_96_host(__m256i *pMinR, __m256i *pMinG, __m256i *pMinB, __m128i *result) // bytes 0-2 of result hold the R, G and B minima
+{
+ __m128i px[2];
+ __m128i zero = _mm_setzero_si128();
+ px[0] = _mm_min_epu8(_mm256_castsi256_si128(pMinR[0]), _mm256_extracti128_si256(pMinR[0], 1));
+ px[1] = _mm_min_epu8(_mm256_castsi256_si128(pMinG[0]), _mm256_extracti128_si256(pMinG[0], 1));
+ px[1] = _mm_min_epu8(_mm_unpacklo_epi8(px[0], px[1]), _mm_unpackhi_epi8(px[0], px[1]));
+ px[0] = _mm_min_epu8(_mm256_castsi256_si128(pMinB[0]), _mm256_extracti128_si256(pMinB[0], 1));
+ px[0] = _mm_min_epu8(_mm_unpacklo_epi8(px[0], zero), _mm_unpackhi_epi8(px[0], zero));
+ px[1] = _mm_min_epu8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0]));
+ px[0] = _mm_min_epu8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero));
+ result[0] = _mm_min_epu8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero));
+}
+
+inline void compute_min_48_host(__m128i *p1, __m128i *pMinR, __m128i *pMinG, __m128i *pMinB)
+{
+ pMinR[0] = _mm_min_epu8(p1[0], pMinR[0]); //compare and store min of 16 R values into global min
+ pMinG[0] = _mm_min_epu8(p1[1], pMinG[0]); //compare and store min of 16 G values into global min
+ pMinB[0] = _mm_min_epu8(p1[2], pMinB[0]); //compare and store min of 16 B values into global min
+}
+
+inline void reduce_min_48_host(__m128i *pMinR, __m128i *pMinG, __m128i *pMinB, __m128i *result) // bytes 0-2 of result hold the R, G and B minima
+{
+ __m128i px[2];
+ __m128i zero = _mm_setzero_si128();
+ px[1] = _mm_min_epu8(_mm_unpacklo_epi8(pMinR[0], pMinG[0]), _mm_unpackhi_epi8(pMinR[0], pMinG[0]));
+ px[0] = _mm_min_epu8(_mm_unpacklo_epi8(pMinB[0], zero), _mm_unpackhi_epi8(pMinB[0], zero));
+ px[1] = _mm_min_epu8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0]));
+ px[0] = _mm_min_epu8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero));
+ result[0] = _mm_min_epu8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero));
+}
+
+inline void reduce_max_32_host(__m256i *pMax, __m128i *result) // folds 32 u8 lanes; bytes 0-1 of result hold two partial maxima for the caller to combine
+{
+ __m128i px;
+ __m128i zero = _mm_setzero_si128();
+ __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,7);
+ px = _mm_max_epu8(_mm256_castsi256_si128(pMax[0]), _mm256_extracti128_si256(pMax[0], 1));
+ px = _mm_max_epu8(_mm_unpacklo_epi8(zero, px), _mm_unpackhi_epi8(zero, px));
+ px = _mm_max_epu16(_mm_unpacklo_epi16(zero, px), _mm_unpackhi_epi16(zero, px));
+ px = _mm_max_epu32(_mm_unpacklo_epi32(zero, px), _mm_unpackhi_epi32(zero, px));
+ result[0] = _mm_shuffle_epi8(px, mask);
+}
+
+inline void compute_max_96_host(__m256i *p1, __m256i *pMaxR, __m256i *pMaxG, __m256i *pMaxB)
+{
+ pMaxR[0] = _mm256_max_epu8(p1[0], pMaxR[0]); //compare and store max of 32 R values into global max
+ pMaxG[0] = _mm256_max_epu8(p1[1], pMaxG[0]); //compare and store max of 32 G values into global max
+ pMaxB[0] = _mm256_max_epu8(p1[2], pMaxB[0]); //compare and store max of 32 B values into global max
+}
+
+inline void reduce_max_96_host(__m256i *pMaxR, __m256i *pMaxG, __m256i *pMaxB, __m128i *result) // bytes 0-2 of result hold the R, G and B maxima
+{
+ __m128i px[2];
+ __m128i zero = _mm_setzero_si128();
+ px[0] = _mm_max_epu8(_mm256_castsi256_si128(pMaxR[0]), _mm256_extracti128_si256(pMaxR[0], 1));
+ px[1] = _mm_max_epu8(_mm256_castsi256_si128(pMaxG[0]), _mm256_extracti128_si256(pMaxG[0], 1));
+ px[1] = _mm_max_epu8(_mm_unpacklo_epi8(px[0], px[1]), _mm_unpackhi_epi8(px[0], px[1]));
+ px[0] = _mm_max_epu8(_mm256_castsi256_si128(pMaxB[0]), _mm256_extracti128_si256(pMaxB[0], 1));
+ px[0] = _mm_max_epu8(_mm_unpacklo_epi8(px[0], zero), _mm_unpackhi_epi8(px[0], zero));
+ px[1] = _mm_max_epu8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0]));
+ px[0] = _mm_max_epu8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero));
+ result[0] = _mm_max_epu8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero));
+}
+
+inline void compute_max_48_host(__m128i *p1, __m128i *pMaxR, __m128i *pMaxG, __m128i *pMaxB)
+{
+ pMaxR[0] = _mm_max_epu8(p1[0], pMaxR[0]); //compare and store max of 16 R values into global max
+ pMaxG[0] = _mm_max_epu8(p1[1], pMaxG[0]); //compare and store max of 16 G values into global max
+ pMaxB[0] = _mm_max_epu8(p1[2], pMaxB[0]); //compare and store max of 16 B values into global max
+}
+
+inline void reduce_max_48_host(__m128i *pMaxR, __m128i *pMaxG, __m128i *pMaxB, __m128i *result) // bytes 0-2 of result hold the R, G and B maxima
+{
+ __m128i px[2];
+ __m128i zero = _mm_setzero_si128();
+ px[1] = _mm_max_epu8(_mm_unpacklo_epi8(pMaxR[0], pMaxG[0]), _mm_unpackhi_epi8(pMaxR[0], pMaxG[0]));
+ px[0] = _mm_max_epu8(_mm_unpacklo_epi8(pMaxB[0], zero), _mm_unpackhi_epi8(pMaxB[0], zero));
+ px[1] = _mm_max_epu8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0]));
+ px[0] = _mm_max_epu8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero));
+ result[0] = _mm_max_epu8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero));
+}
+
+inline void compute_min_float8_host(__m256 *p1, __m256 *pMin)
+{
+ pMin[0] = _mm256_min_ps(p1[0], pMin[0]); //compare and store min of 8 values into global min
+}
+
+inline void reduce_min_float8_host(__m256 *pMin, __m128 *result) // floats 0-1 of result hold two partial minima for the caller to combine
+{
+ __m128 px;
+ px = _mm_min_ps(_mm256_castps256_ps128(pMin[0]), _mm256_extractf128_ps(pMin[0], 1));
+ px = _mm_min_ps(_mm_unpacklo_ps(xmm_p0, px), _mm_unpackhi_ps(xmm_p0, px));
+ result[0] = _mm_shuffle_ps(px, px, 39);
+}
+
+inline void compute_min_float24_host(__m256 *p1, __m256 *pMinR, __m256 *pMinG, __m256 *pMinB)
+{
+ pMinR[0] = _mm256_min_ps(p1[0], pMinR[0]); //compare and store min of 8 R values into global min
+ pMinG[0] = _mm256_min_ps(p1[1], pMinG[0]); //compare and store min of 8 G values into global min
+ pMinB[0] = _mm256_min_ps(p1[2], pMinB[0]); //compare and store min of 8 B values into global min
+}
+
+inline void reduce_min_float24_host(__m256 *pMinR, __m256 *pMinG, __m256 *pMinB, __m256 *result) // floats 0-1 of result hold R partial minima, 2-3 hold G, 4-5 hold B
+{
+ __m128 px[2];
+ px[0] = _mm_min_ps(_mm256_castps256_ps128(pMinR[0]), _mm256_extractf128_ps(pMinR[0], 1));
+ px[1] = _mm_min_ps(_mm256_castps256_ps128(pMinG[0]), _mm256_extractf128_ps(pMinG[0], 1));
+ px[0] = _mm_min_ps(_mm_unpacklo_ps(px[0], px[1]), _mm_unpackhi_ps(px[0], px[1]));
+ px[0] = _mm_permute_ps(px[0], 0b11011000);
+ result[0] = _mm256_castps128_ps256(px[0]);
+ px[0] = _mm_min_ps(_mm256_castps256_ps128(pMinB[0]), _mm256_extractf128_ps(pMinB[0], 1));
+ px[1] = _mm_min_ps(_mm_unpacklo_ps(px[0], xmm_p0), _mm_unpackhi_ps(px[0], xmm_p0));
+ px[0] = _mm_shuffle_ps(px[1], px[1], 34);
+ result[0] = _mm256_insertf128_ps(result[0], px[0], 1);
+}
+
+inline void compute_max_float8_host(__m256 *p1, __m256 *pMax)
+{
+ pMax[0] = _mm256_max_ps(p1[0], pMax[0]); //compare and store max of 8 values into global max
+}
+
+inline void reduce_max_float8_host(__m256 *pMax, __m128 *result) // floats 0-1 of result hold two partial maxima for the caller to combine
+{
+ __m128 px;
+ px = _mm_max_ps(_mm256_castps256_ps128(pMax[0]), _mm256_extractf128_ps(pMax[0], 1));
+ px = _mm_max_ps(_mm_unpacklo_ps(xmm_p0, px), _mm_unpackhi_ps(xmm_p0, px));
+ result[0] = _mm_shuffle_ps(px, px, 39);
+}
+
+inline void compute_max_float24_host(__m256 *p1, __m256 *pMaxR, __m256 *pMaxG, __m256 *pMaxB)
+{
+ pMaxR[0] = _mm256_max_ps(p1[0], pMaxR[0]); //compare and store max of 8 R values into global max
+ pMaxG[0] = _mm256_max_ps(p1[1], pMaxG[0]); //compare and store max of 8 G values into global max
+ pMaxB[0] = _mm256_max_ps(p1[2], pMaxB[0]); //compare and store max of 8 B values into global max
+}
+
+inline void reduce_max_float24_host(__m256 *pMaxR, __m256 *pMaxG, __m256 *pMaxB, __m256 *result) // floats 0-1 of result hold R partial maxima, 2-3 hold G, 4-5 hold B
+{
+ __m128 px[2];
+ px[0] = _mm_max_ps(_mm256_castps256_ps128(pMaxR[0]), _mm256_extractf128_ps(pMaxR[0], 1));
+ px[1] = _mm_max_ps(_mm256_castps256_ps128(pMaxG[0]), _mm256_extractf128_ps(pMaxG[0], 1));
+ px[0] = _mm_max_ps(_mm_unpacklo_ps(px[0], px[1]), _mm_unpackhi_ps(px[0], px[1]));
+ px[0] = _mm_permute_ps(px[0], 0b11011000);
+ result[0] = _mm256_castps128_ps256(px[0]);
+ px[0] = _mm_max_ps(_mm256_castps256_ps128(pMaxB[0]), _mm256_extractf128_ps(pMaxB[0], 1));
+ px[1] = _mm_max_ps(_mm_unpacklo_ps(px[0], xmm_p0), _mm_unpackhi_ps(px[0], xmm_p0));
+ px[0] = _mm_shuffle_ps(px[1], px[1], 34);
+ result[0] = _mm256_insertf128_ps(result[0], px[0], 1);
+}
+
+inline void reduce_min_i32_host(__m256i *pMin, __m128i *result) // folds 32 i8 lanes; bytes 0-1 of result hold two partial minima for the caller to combine
+{
+ __m128i px;
+ __m128i zero = _mm_setzero_si128();
+ __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,7);
+ px = _mm_min_epi8(_mm256_castsi256_si128(pMin[0]), _mm256_extracti128_si256(pMin[0], 1));
+ px = _mm_min_epi8(_mm_unpacklo_epi8(zero, px), _mm_unpackhi_epi8(zero, px));
+ px = _mm_min_epi16(_mm_unpacklo_epi16(zero, px), _mm_unpackhi_epi16(zero, px));
+ px = _mm_min_epi32(_mm_unpacklo_epi32(zero, px), _mm_unpackhi_epi32(zero, px));
+ result[0] = _mm_shuffle_epi8(px, mask);
+}
+
+inline void compute_min_i96_host(__m256i *p1, __m256i *pMinR, __m256i *pMinG, __m256i *pMinB)
+{
+ pMinR[0] = _mm256_min_epi8(p1[0], pMinR[0]); //compare and store min of 32 R values into global min
+ pMinG[0] = _mm256_min_epi8(p1[1], pMinG[0]); //compare and store min of 32 G values into global min
+ pMinB[0] = _mm256_min_epi8(p1[2], pMinB[0]); //compare and store min of 32 B values into global min
+}
+
+inline void reduce_min_i96_host(__m256i *pMinR, __m256i *pMinG, __m256i *pMinB, __m128i *result) // bytes 0-2 of result hold the R, G and B minima
+{
+ __m128i px[2];
+ __m128i zero = _mm_setzero_si128();
+ px[0] = _mm_min_epi8(_mm256_castsi256_si128(pMinR[0]), _mm256_extracti128_si256(pMinR[0], 1));
+ px[1] = _mm_min_epi8(_mm256_castsi256_si128(pMinG[0]), _mm256_extracti128_si256(pMinG[0], 1));
+ px[1] = _mm_min_epi8(_mm_unpacklo_epi8(px[0], px[1]), _mm_unpackhi_epi8(px[0], px[1]));
+ px[0] = _mm_min_epi8(_mm256_castsi256_si128(pMinB[0]), _mm256_extracti128_si256(pMinB[0], 1));
+ px[0] = _mm_min_epi8(_mm_unpacklo_epi8(px[0], zero), _mm_unpackhi_epi8(px[0], zero));
+ px[1] = _mm_min_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0]));
+ px[0] = _mm_min_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero));
+ result[0] = _mm_min_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero));
+}
+
+inline void compute_min_i48_host(__m128i *p1, __m128i *pMinR, __m128i *pMinG, __m128i *pMinB)
+{
+ pMinR[0] = _mm_min_epi8(p1[0], pMinR[0]); //compare and store min of 16 R values into global min
+ pMinG[0] = _mm_min_epi8(p1[1], pMinG[0]); //compare and store min of 16 G values into global min
+ pMinB[0] = _mm_min_epi8(p1[2], pMinB[0]); //compare and store min of 16 B values into global min
+}
+
+inline void reduce_min_i48_host(__m128i *pMinR, __m128i *pMinG, __m128i *pMinB, __m128i *result) // bytes 0-2 of result hold the R, G and B minima
+{
+ __m128i px[2];
+ __m128i zero = _mm_setzero_si128();
+ px[1] = _mm_min_epi8(_mm_unpacklo_epi8(pMinR[0], pMinG[0]), _mm_unpackhi_epi8(pMinR[0], pMinG[0]));
+ px[0] = _mm_min_epi8(_mm_unpacklo_epi8(pMinB[0], zero), _mm_unpackhi_epi8(pMinB[0], zero));
+ px[1] = _mm_min_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0]));
+ px[0] = _mm_min_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero));
+ result[0] = _mm_min_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero));
+}
+
+inline void reduce_max_i32_host(__m256i *pMax, __m128i *result) // folds 32 i8 lanes; bytes 0-1 of result hold two partial maxima for the caller to combine
+{
+ __m128i px;
+ __m128i zero = _mm_setzero_si128();
+ __m128i mask = _mm_set_epi8(0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,7);
+ px = _mm_max_epi8(_mm256_castsi256_si128(pMax[0]), _mm256_extracti128_si256(pMax[0], 1));
+ px = _mm_max_epi8(_mm_unpacklo_epi8(zero, px), _mm_unpackhi_epi8(zero, px));
+ px = _mm_max_epi16(_mm_unpacklo_epi16(zero, px), _mm_unpackhi_epi16(zero, px));
+ px = _mm_max_epi32(_mm_unpacklo_epi32(zero, px), _mm_unpackhi_epi32(zero, px));
+ result[0] = _mm_shuffle_epi8(px, mask);
+}
+
+inline void compute_max_i96_host(__m256i *p1, __m256i *pMaxR, __m256i *pMaxG, __m256i *pMaxB)
+{
+ pMaxR[0] = _mm256_max_epi8(p1[0], pMaxR[0]); //compare and store max of 32 R values into global max
+ pMaxG[0] = _mm256_max_epi8(p1[1], pMaxG[0]); //compare and store max of 32 G values into global max
+ pMaxB[0] = _mm256_max_epi8(p1[2], pMaxB[0]); //compare and store max of 32 B values into global max
+}
+
+inline void reduce_max_i96_host(__m256i *pMaxR, __m256i *pMaxG, __m256i *pMaxB, __m128i *result) // bytes 0-2 of result hold the R, G and B maxima
+{
+ __m128i px[2];
+ __m128i zero = _mm_setzero_si128();
+ px[0] = _mm_max_epi8(_mm256_castsi256_si128(pMaxR[0]), _mm256_extracti128_si256(pMaxR[0], 1));
+ px[1] = _mm_max_epi8(_mm256_castsi256_si128(pMaxG[0]), _mm256_extracti128_si256(pMaxG[0], 1));
+ px[1] = _mm_max_epi8(_mm_unpacklo_epi8(px[0], px[1]), _mm_unpackhi_epi8(px[0], px[1]));
+ px[0] = _mm_max_epi8(_mm256_castsi256_si128(pMaxB[0]), _mm256_extracti128_si256(pMaxB[0], 1));
+ px[0] = _mm_max_epi8(_mm_unpacklo_epi8(px[0], zero), _mm_unpackhi_epi8(px[0], zero));
+ px[1] = _mm_max_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0]));
+ px[0] = _mm_max_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero));
+ result[0] = _mm_max_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero));
+}
+
+inline void compute_max_i48_host(__m128i *p1, __m128i *pMaxR, __m128i *pMaxG, __m128i *pMaxB)
+{
+ pMaxR[0] = _mm_max_epi8(p1[0], pMaxR[0]); //compare and store max of 16 R values into global max
+ pMaxG[0] = _mm_max_epi8(p1[1], pMaxG[0]); //compare and store max of 16 G values into global max
+ pMaxB[0] = _mm_max_epi8(p1[2], pMaxB[0]); //compare and store max of 16 B values into global max
+}
+
+inline void reduce_max_i48_host(__m128i *pMaxR, __m128i *pMaxG, __m128i *pMaxB, __m128i *result) // bytes 0-2 of result hold the R, G and B maxima
+{
+ __m128i px[2];
+ __m128i zero = _mm_setzero_si128();
+ px[1] = _mm_max_epi8(_mm_unpacklo_epi8(pMaxR[0], pMaxG[0]), _mm_unpackhi_epi8(pMaxR[0], pMaxG[0]));
+ px[0] = _mm_max_epi8(_mm_unpacklo_epi8(pMaxB[0], zero), _mm_unpackhi_epi8(pMaxB[0], zero));
+ px[1] = _mm_max_epi8(_mm_unpacklo_epi16(px[1], px[0]), _mm_unpackhi_epi16(px[1], px[0]));
+ px[0] = _mm_max_epi8(_mm_unpacklo_epi32(px[1], zero), _mm_unpackhi_epi32(px[1], zero));
+ result[0] = _mm_max_epi8(_mm_unpacklo_epi64(px[0], zero), _mm_unpackhi_epi64(px[0], zero));
+}
+
+#endif //RPP_CPU_COMMON_H
diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp
index ff30de027..d03ec0e79 100644
--- a/src/include/cpu/rpp_cpu_simd.hpp
+++ b/src/include/cpu/rpp_cpu_simd.hpp
@@ -75,7 +75,7 @@ typedef union
#define SIMD_GET_PS(name) (*(const __m128 *)_xmm_const_##name)
-const __m128 xmm_p0 = _mm_set1_ps(0.0f);
+const __m128 xmm_p0 = _mm_setzero_ps();
const __m128 xmm_p1 = _mm_set1_ps(1.0f);
const __m128 xmm_p2 = _mm_set1_ps(2.0f);
const __m128 xmm_pm2 = _mm_set1_ps(-2.0f);
@@ -243,7 +243,7 @@ inline void rpp_mm256_print_epi8(__m256i vPrintArray)
printf("\n");
for (int ct = 0; ct < 32; ct++)
{
- printf("%d ", printArray[ct]);
+ printf("%d ", (unsigned char)printArray[ct]);
}
}
@@ -1271,6 +1271,20 @@ inline void rpp_load16_u8_to_u32_avx(Rpp8u *srcPtr, __m256i *p)
p[1] = _mm256_setr_m128i(_mm_shuffle_epi8(px, xmm_pxMask08To11), _mm_shuffle_epi8(px, xmm_pxMask12To15)); /* Contains pixels 09-16 */
}
+inline void rpp_load96_u8_avx(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *srcPtrB, __m256i *p)
+{
+ p[0] = _mm256_loadu_si256((__m256i *)srcPtrR);
+ p[1] = _mm256_loadu_si256((__m256i *)srcPtrG);
+ p[2] = _mm256_loadu_si256((__m256i *)srcPtrB);
+}
+
+inline void rpp_load96_i8_avx(Rpp8s *srcPtrR, Rpp8s *srcPtrG, Rpp8s *srcPtrB, __m256i *p)
+{
+ p[0] = _mm256_loadu_si256((__m256i *)srcPtrR);    // unaligned loads - ROI start offsets need not be 32-byte aligned
+ p[1] = _mm256_loadu_si256((__m256i *)srcPtrG);
+ p[2] = _mm256_loadu_si256((__m256i *)srcPtrB);
+}
+
inline void rpp_load24_f32pkd3_to_f32pln3_avx(Rpp32f *srcPtr, __m256 *p)
{
__m128 p128[8];
@@ -1478,6 +1492,16 @@ inline void rpp_store4_f64_to_f64_avx(Rpp64f *dstPtr, __m256d *p)
_mm256_storeu_pd(dstPtr, p[0]);
}
+inline void rpp_store16_u8_to_u8(Rpp8u *dstPtr, __m128i *p)
+{
+ _mm_storeu_si128((__m128i *)dstPtr, p[0]);
+}
+
+inline void rpp_store16_i8(Rpp8s *dstPtr, __m128i *p)
+{
+ _mm_storeu_si128((__m128i *)dstPtr, p[0]);    // unaligned store - dstPtr need not be 16-byte aligned
+}
+
inline void rpp_store8_f32_to_f16_avx(Rpp16f *dstPtr, __m256 *p)
{
__m128i px128 = _mm256_cvtps_ph(p[0], _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
diff --git a/src/include/hip/rpp_hip_common.hpp b/src/include/hip/rpp_hip_common.hpp
index a7412aa2d..d9c0ce02d 100644
--- a/src/include/hip/rpp_hip_common.hpp
+++ b/src/include/hip/rpp_hip_common.hpp
@@ -184,6 +184,13 @@ inline void generate_gaussian_kernel_gpu(Rpp32f stdDev, Rpp32f* kernel, Rpp32u k
}
}
+// Retrieve the minimum and maximum pixel values for a given source data type (F16/F32 inputs use the same 0-255 working range as U8 here)
+
+inline void getImageBitDepthMinMax(uchar *srcPtr, float2 *bitDepthMinMax_f2) { *bitDepthMinMax_f2 = make_float2(0, 255); }
+inline void getImageBitDepthMinMax(float *srcPtr, float2 *bitDepthMinMax_f2) { *bitDepthMinMax_f2 = make_float2(0, 255); }
+inline void getImageBitDepthMinMax(half *srcPtr, float2 *bitDepthMinMax_f2) { *bitDepthMinMax_f2 = make_float2(0, 255); }
+inline void getImageBitDepthMinMax(schar *srcPtr, float2 *bitDepthMinMax_f2) { *bitDepthMinMax_f2 = make_float2(-128, 127); }
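+
+// Hypothetical kernel-side use: seed a reduction accumulator with the range extremum -
+//     float2 bitDepthMinMax_f2;
+//     getImageBitDepthMinMax(srcPtr, &bitDepthMinMax_f2);    // e.g. (0, 255) for uchar
+//     float pixelMax = bitDepthMinMax_f2.x;                  // a max-reduction starts at the range minimum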
+
/******************** DEVICE FUNCTIONS ********************/
// -------------------- Set 0 - Range checks and Range adjustment --------------------
@@ -1560,6 +1567,20 @@ __device__ __forceinline__ void rpp_hip_load24_pkd3_to_int24_pln3(schar *srcPtr,
// /******************** DEVICE MATH HELPER FUNCTIONS ********************/
+// float8 min
+
+__device__ __forceinline__ void rpp_hip_math_min8(d_float8 *srcPtr_f8, float *dstPtr)
+{
+ *dstPtr = fminf(fminf(fminf(fminf(fminf(fminf(fminf(srcPtr_f8->f1[0], srcPtr_f8->f1[1]), srcPtr_f8->f1[2]), srcPtr_f8->f1[3]), srcPtr_f8->f1[4]), srcPtr_f8->f1[5]), srcPtr_f8->f1[6]), srcPtr_f8->f1[7]);
+}
+
+// float8 max
+
+__device__ __forceinline__ void rpp_hip_math_max8(d_float8 *srcPtr_f8, float *dstPtr)
+{
+ *dstPtr = fmaxf(fmaxf(fmaxf(fmaxf(fmaxf(fmaxf(fmaxf(srcPtr_f8->f1[0], srcPtr_f8->f1[1]), srcPtr_f8->f1[2]), srcPtr_f8->f1[3]), srcPtr_f8->f1[4]), srcPtr_f8->f1[5]), srcPtr_f8->f1[6]), srcPtr_f8->f1[7]);
+}
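+
+// Both helpers above are the unrolled form of a simple 8-lane fold; illustrative scalar equivalent:
+//     float m = srcPtr_f8->f1[0];
+//     for (int i = 1; i < 8; i++)
+//         m = fminf(m, srcPtr_f8->f1[i]);    // fmaxf for the max8 variant
+//     *dstPtr = m;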
+
// d_float16 floor
__device__ __forceinline__ void rpp_hip_math_floor16(d_float16 *srcPtr_f16, d_float16 *dstPtr_f16)
diff --git a/src/modules/cpu/host_tensor_statistical_operations.hpp b/src/modules/cpu/host_tensor_statistical_operations.hpp
index dae3e6236..32b8b62b5 100644
--- a/src/modules/cpu/host_tensor_statistical_operations.hpp
+++ b/src/modules/cpu/host_tensor_statistical_operations.hpp
@@ -26,5 +26,7 @@ SOFTWARE.
#define HOST_TENSOR_STATISTICAL_OPERATIONS_HPP
#include "kernel/tensor_sum.hpp"
+#include "kernel/tensor_min.hpp"
+#include "kernel/tensor_max.hpp"
#endif // HOST_TENSOR_STATISTICAL_OPERATIONS_HPP
\ No newline at end of file
diff --git a/src/modules/cpu/kernel/tensor_max.hpp b/src/modules/cpu/kernel/tensor_max.hpp
new file mode 100644
index 000000000..0380f4ef6
--- /dev/null
+++ b/src/modules/cpu/kernel/tensor_max.hpp
@@ -0,0 +1,847 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+
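+// Kernel structure shared by the tensor_max variants below: for each image in the batch,
+// validate the ROI, run a vectorized main loop over the aligned region accumulating
+// per-channel vector maxima, handle the unaligned tail with scalar std::max, then
+// horizontally reduce the accumulators and write the R, G, B and overall maxima.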
+RppStatus tensor_max_u8_u8_host(Rpp8u *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp8u *maxArr,
+ Rpp32u maxArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(srcDescPtr->n)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp8u *srcPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp8u *srcPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+
+ Rpp32u alignedLength = (bufferLength / 96) * 96;
+ Rpp32u vectorIncrement = 96;
+ Rpp32u vectorIncrementPerChannel = 32;
+
+ // Tensor max 1 channel (NCHW)
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel;
+ vectorIncrement = vectorIncrementPerChannel;
+ Rpp8u max = 0;
+ Rpp8u resultAvx[16];
+
+ Rpp8u *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256i pMax = _mm256_setzero_si256();
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256i p1 = _mm256_loadu_si256((__m256i *)srcPtrTemp);
+ pMax = _mm256_max_epu8(p1, pMax); //compare and store max of 32 values into global max
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ max = std::max(*srcPtrTemp++, max);
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m128i result;
+ reduce_max_32_host(&pMax, &result);
+ rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result);
+
+ max = std::max(std::max(resultAvx[0], resultAvx[1]), max); // combine both partial maxima from reduce_max_32_host
+#endif
+ maxArr[batchCount] = max;
+ }
+ // Tensor max 3 channel (NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u maxArrIndex = batchCount * 4;
+ Rpp8u maxC = 0, maxR = 0, maxG = 0, maxB = 0;
+ Rpp8u resultAvx[16];
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+#if __AVX2__
+ __m256i pMaxR = _mm256_setzero_si256();
+ __m256i pMaxG = pMaxR;
+ __m256i pMaxB = pMaxR;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtrTempR, *srcPtrTempG, *srcPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i p[3];
+ rpp_simd_load(rpp_load96_u8_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p);
+ compute_max_96_host(p, &pMaxR, &pMaxG, &pMaxB);
+
+ srcPtrTempR += vectorIncrementPerChannel;
+ srcPtrTempG += vectorIncrementPerChannel;
+ srcPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ maxR = std::max(*srcPtrTempR++, maxR);
+ maxG = std::max(*srcPtrTempG++, maxG);
+ maxB = std::max(*srcPtrTempB++, maxB);
+ }
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m128i result;
+ reduce_max_96_host(&pMaxR, &pMaxG, &pMaxB, &result);
+ rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result);
+
+ maxR = std::max(resultAvx[0], maxR);
+ maxG = std::max(resultAvx[1], maxG);
+ maxB = std::max(resultAvx[2], maxB);
+#endif
+ }
+ maxC = std::max(std::max(maxR, maxG), maxB);
+ maxArr[maxArrIndex] = maxR;
+ maxArr[maxArrIndex + 1] = maxG;
+ maxArr[maxArrIndex + 2] = maxB;
+ maxArr[maxArrIndex + 3] = maxC;
+ }
+
+ // Tensor max 3 channel (NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u maxArrIndex = batchCount * 4;
+ Rpp32u alignedLength = (bufferLength / 48) * 48;
+ Rpp32u vectorIncrement = 48;
+ Rpp8u maxC = 0, maxR = 0, maxG = 0, maxB = 0;
+ Rpp8u resultAvx[16];
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp8u *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+
+ __m128i pMaxR = _mm_setzero_si128();
+ __m128i pMaxG = pMaxR;
+ __m128i pMaxB = pMaxR;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m128i p[3];
+ rpp_simd_load(rpp_load48_u8pkd3_to_u8pln3, srcPtrTemp, p);
+ compute_max_48_host(p, &pMaxR, &pMaxG, &pMaxB);
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ maxR = std::max(srcPtrTemp[0], maxR);
+ maxG = std::max(srcPtrTemp[1], maxG);
+ maxB = std::max(srcPtrTemp[2], maxB);
+ srcPtrTemp += 3;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m128i result;
+ reduce_max_48_host(&pMaxR, &pMaxG, &pMaxB, &result);
+ rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result);
+
+ maxR = std::max(resultAvx[0], maxR);
+ maxG = std::max(resultAvx[1], maxG);
+ maxB = std::max(resultAvx[2], maxB);
+#endif
+ }
+ maxC = std::max(std::max(maxR, maxG), maxB);
+ maxArr[maxArrIndex] = maxR;
+ maxArr[maxArrIndex + 1] = maxG;
+ maxArr[maxArrIndex + 2] = maxB;
+ maxArr[maxArrIndex + 3] = maxC;
+ }
+ }
+ return RPP_SUCCESS;
+}
+
+RppStatus tensor_max_f32_f32_host(Rpp32f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *maxArr,
+ Rpp32u maxArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(srcDescPtr->n)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32f *srcPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp32f *srcPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+
+ // Tensor max 1 channel (NCHW)
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel;
+ vectorIncrement = vectorIncrementPerChannel;
+ Rpp32f max = 0.0;
+ Rpp32f resultAvx[4];
+
+ Rpp32f *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256 pMax = _mm256_setzero_ps();
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p1;
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp, &p1);
+ compute_max_float8_host(&p1, &pMax);
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ max = std::max(*srcPtrTemp++, max);
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m128 result;
+ reduce_max_float8_host(&pMax, &result);
+ rpp_simd_store(rpp_store4_f32_to_f32, resultAvx, &result);
+ max = std::max(std::max(resultAvx[0], resultAvx[1]), max);
+#endif
+ maxArr[batchCount] = max;
+ }
+
+ // Tensor max 3 channel (NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u maxArrIndex = batchCount * 4;
+ Rpp32f maxC = 0.0, maxR = 0.0, maxG = 0.0, maxB = 0.0;
+ Rpp32f resultAvx[8];
+
+ Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+#if __AVX2__
+ __m256 pMaxR = _mm256_setzero_ps();
+ __m256 pMaxG = pMaxR;
+ __m256 pMaxB = pMaxR;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p[3];
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p);
+ compute_max_float24_host(p, &pMaxR, &pMaxG, &pMaxB);
+
+ srcPtrTempR += vectorIncrementPerChannel;
+ srcPtrTempG += vectorIncrementPerChannel;
+ srcPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ maxR = std::max(*srcPtrTempR++, maxR);
+ maxG = std::max(*srcPtrTempG++, maxG);
+ maxB = std::max(*srcPtrTempB++, maxB);
+ }
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m256 result;
+ reduce_max_float24_host(&pMaxR, &pMaxG, &pMaxB, &result);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result);
+
+ maxR = std::max(std::max(resultAvx[0], resultAvx[1]), maxR);
+ maxG = std::max(std::max(resultAvx[2], resultAvx[3]), maxG);
+ maxB = std::max(std::max(resultAvx[4], resultAvx[5]), maxB);
+#endif
+ maxC = std::max(std::max(maxR, maxG), maxB);
+ maxArr[maxArrIndex] = maxR;
+ maxArr[maxArrIndex + 1] = maxG;
+ maxArr[maxArrIndex + 2] = maxB;
+ maxArr[maxArrIndex + 3] = maxC;
+ }
+
+ // Tensor max 3 channel (NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u maxArrIndex = batchCount * 4;
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32f maxC = 0.0, maxR = 0.0, maxG = 0.0, maxB = 0.0;
+ Rpp32f resultAvx[8];
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp32f *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+
+#if __AVX2__
+ __m256 pMaxR = _mm256_setzero_ps();
+ __m256 pMaxG = pMaxR;
+ __m256 pMaxB = pMaxR;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[3];
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp, p);
+ compute_max_float24_host(p, &pMaxR, &pMaxG, &pMaxB);
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ maxR = std::max(srcPtrTemp[0], maxR);
+ maxG = std::max(srcPtrTemp[1], maxG);
+ maxB = std::max(srcPtrTemp[2], maxB);
+ srcPtrTemp += 3;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m256 result;
+ reduce_max_float24_host(&pMaxR, &pMaxG, &pMaxB, &result);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result);
+
+ maxR = std::max(std::max(resultAvx[0], resultAvx[1]), maxR);
+ maxG = std::max(std::max(resultAvx[2], resultAvx[3]), maxG);
+ maxB = std::max(std::max(resultAvx[4], resultAvx[5]), maxB);
+#endif
+ }
+ maxC = std::max(std::max(maxR, maxG), maxB);
+ maxArr[maxArrIndex] = maxR;
+ maxArr[maxArrIndex + 1] = maxG;
+ maxArr[maxArrIndex + 2] = maxB;
+ maxArr[maxArrIndex + 3] = maxC;
+ }
+ }
+ return RPP_SUCCESS;
+}
+
+RppStatus tensor_max_f16_f16_host(Rpp16f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp16f *maxArr,
+ Rpp32u maxArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(srcDescPtr->n)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp16f *srcPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp16f *srcPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+
+ // Tensor max 1 channel (NCHW)
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel;
+ vectorIncrement = vectorIncrementPerChannel;
+ Rpp32f max = 0.0;
+ Rpp32f resultAvx[4];
+
+ Rpp16f *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256 pMax = _mm256_setzero_ps();
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ Rpp32f srcPtrTemp_ps[8];
+ for(int cnt = 0; cnt < vectorIncrement; cnt++)
+ {
+ srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt];
+ }
+ __m256 p1;
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp_ps, &p1);
+ compute_max_float8_host(&p1, &pMax);
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ max = std::max((Rpp32f)*srcPtrTemp++, max);
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m128 result;
+ reduce_max_float8_host(&pMax, &result);
+ rpp_simd_store(rpp_store4_f32_to_f32, resultAvx, &result);
+ max = std::max(std::max(resultAvx[0], resultAvx[1]), max);
+#endif
+ maxArr[batchCount] = (Rpp16f)max;
+ }
+
+ // Tensor max 3 channel (NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u maxArrIndex = batchCount * 4;
+ Rpp32f maxC = 0.0, maxR = 0.0, maxG = 0.0, maxB = 0.0;
+ Rpp32f resultAvx[8];
+
+ Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+#if __AVX2__
+ __m256 pMaxR = _mm256_setzero_ps();
+ __m256 pMaxG = pMaxR;
+ __m256 pMaxB = pMaxR;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ Rpp32f srcPtrTempR_ps[8], srcPtrTempG_ps[8], srcPtrTempB_ps[8];
+ for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++)
+ {
+ srcPtrTempR_ps[cnt] = (Rpp32f) srcPtrTempR[cnt];
+ srcPtrTempG_ps[cnt] = (Rpp32f) srcPtrTempG[cnt];
+ srcPtrTempB_ps[cnt] = (Rpp32f) srcPtrTempB[cnt];
+ }
+ __m256 p[3];
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p);
+ compute_max_float24_host(p, &pMaxR, &pMaxG, &pMaxB);
+
+ srcPtrTempR += vectorIncrementPerChannel;
+ srcPtrTempG += vectorIncrementPerChannel;
+ srcPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ maxR = std::max((Rpp32f)*srcPtrTempR++, maxR);
+ maxG = std::max((Rpp32f)*srcPtrTempG++, maxG);
+ maxB = std::max((Rpp32f)*srcPtrTempB++, maxB);
+ }
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m256 result;
+ reduce_max_float24_host(&pMaxR, &pMaxG, &pMaxB, &result);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result);
+
+ maxR = std::max(std::max(resultAvx[0], resultAvx[1]), maxR);
+ maxG = std::max(std::max(resultAvx[2], resultAvx[3]), maxG);
+ maxB = std::max(std::max(resultAvx[4], resultAvx[5]), maxB);
+
+#endif
+ maxC = std::max(std::max(maxR, maxG), maxB);
+ maxArr[maxArrIndex] = (Rpp16f)maxR;
+ maxArr[maxArrIndex + 1] = (Rpp16f)maxG;
+ maxArr[maxArrIndex + 2] = (Rpp16f)maxB;
+ maxArr[maxArrIndex + 3] = (Rpp16f)maxC;
+ }
+
+ // Tensor max 3 channel (NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u maxArrIndex = batchCount * 4;
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32f maxC = 0.0, maxR = 0.0, maxG = 0.0, maxB = 0.0;
+ Rpp32f resultAvx[8];
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp16f *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+
+#if __AVX2__
+ __m256 pMaxR = _mm256_setzero_ps();
+ __m256 pMaxG = pMaxR;
+ __m256 pMaxB = pMaxR;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ Rpp32f srcPtrTemp_ps[24];
+ for(int cnt = 0; cnt < vectorIncrement; cnt++)
+ {
+ srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt];
+ }
+ __m256 p[3];
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp_ps, p);
+ compute_max_float24_host(p, &pMaxR, &pMaxG, &pMaxB);
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ maxR = std::max((Rpp32f)srcPtrTemp[0], maxR);
+ maxG = std::max((Rpp32f)srcPtrTemp[1], maxG);
+ maxB = std::max((Rpp32f)srcPtrTemp[2], maxB);
+ srcPtrTemp += 3;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m256 result;
+ reduce_max_float24_host(&pMaxR, &pMaxG, &pMaxB, &result);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result);
+
+ maxR = std::max(std::max(resultAvx[0], resultAvx[1]), maxR);
+ maxG = std::max(std::max(resultAvx[2], resultAvx[3]), maxG);
+ maxB = std::max(std::max(resultAvx[4], resultAvx[5]), maxB);
+#endif
+ }
+ maxC = std::max(std::max(maxR, maxG), maxB);
+ maxArr[maxArrIndex] = (Rpp16f)maxR;
+ maxArr[maxArrIndex + 1] = (Rpp16f)maxG;
+ maxArr[maxArrIndex + 2] = (Rpp16f)maxB;
+ maxArr[maxArrIndex + 3] = (Rpp16f)maxC;
+ }
+ }
+ return RPP_SUCCESS;
+}
+
+RppStatus tensor_max_i8_i8_host(Rpp8s *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp8s *maxArr,
+ Rpp32u maxArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(srcDescPtr->n)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp8s *srcPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp8s *srcPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+
+ Rpp32u alignedLength = (bufferLength / 96) * 96;
+ Rpp32u vectorIncrement = 96;
+ Rpp32u vectorIncrementPerChannel = 32;
+
+ // Tensor max 1 channel (NCHW)
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel;
+ vectorIncrement = vectorIncrementPerChannel;
+ Rpp8s max = INT8_MIN;
+ Rpp8s resultAvx[16];
+
+ Rpp8s *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256i pMax = _mm256_set1_epi8(INT8_MIN);
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256i p1 = _mm256_loadu_si256((__m256i *)srcPtrTemp);
+ pMax = _mm256_max_epi8(p1, pMax); //compare and store max of 32 values into global max
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ max = std::max(*srcPtrTemp++, max);
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m128i result;
+ reduce_max_i32_host(&pMax, &result);
+ rpp_simd_store(rpp_store16_i8, resultAvx, &result);
+
+ max = std::max(std::max(resultAvx[0], resultAvx[1]), max); // combine both partial maxima from reduce_max_i32_host
+#endif
+ maxArr[batchCount] = max;
+ }
+ // Tensor max 3 channel (NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u maxArrIndex = batchCount * 4;
+ Rpp8s maxC = INT8_MIN, maxR = INT8_MIN, maxG = INT8_MIN, maxB = INT8_MIN;
+ Rpp8s resultAvx[16];
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+#if __AVX2__
+ __m256i pMaxR = _mm256_set1_epi8(INT8_MIN);
+ __m256i pMaxG = pMaxR;
+ __m256i pMaxB = pMaxR;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtrTempR, *srcPtrTempG, *srcPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i p[3];
+ rpp_simd_load(rpp_load96_i8_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p);
+ compute_max_i96_host(p, &pMaxR, &pMaxG, &pMaxB);
+
+ srcPtrTempR += vectorIncrementPerChannel;
+ srcPtrTempG += vectorIncrementPerChannel;
+ srcPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ maxR = std::max(*srcPtrTempR++, maxR);
+ maxG = std::max(*srcPtrTempG++, maxG);
+ maxB = std::max(*srcPtrTempB++, maxB);
+ }
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m128i result;
+ reduce_max_i96_host(&pMaxR, &pMaxG, &pMaxB, &result);
+ rpp_simd_store(rpp_store16_i8, resultAvx, &result);
+
+ maxR = std::max(resultAvx[0], maxR);
+ maxG = std::max(resultAvx[1], maxG);
+ maxB = std::max(resultAvx[2], maxB);
+#endif
+ }
+ maxC = std::max(std::max(maxR, maxG), maxB);
+ maxArr[maxArrIndex] = maxR;
+ maxArr[maxArrIndex + 1] = maxG;
+ maxArr[maxArrIndex + 2] = maxB;
+ maxArr[maxArrIndex + 3] = maxC;
+ }
+
+ // Tensor max 3 channel (NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u maxArrIndex = batchCount * 4;
+ Rpp32u alignedLength = (bufferLength / 48) * 48;
+ Rpp32u vectorIncrement = 48;
+ Rpp8s maxC = INT8_MIN, maxR = INT8_MIN, maxG = INT8_MIN, maxB = INT8_MIN;
+ Rpp8s resultAvx[16];
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp8s *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+
+ __m128i pMaxR = _mm_set1_epi8(INT8_MIN);
+ __m128i pMaxG = pMaxR;
+ __m128i pMaxB = pMaxR;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m128i p[3];
+ rpp_simd_load(rpp_load48_i8pkd3_to_i8pln3, srcPtrTemp, p);
+ compute_max_i48_host(p, &pMaxR, &pMaxG, &pMaxB);
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ maxR = std::max(srcPtrTemp[0], maxR);
+ maxG = std::max(srcPtrTemp[1], maxG);
+ maxB = std::max(srcPtrTemp[2], maxB);
+ srcPtrTemp += 3;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m128i result;
+ reduce_max_i48_host(&pMaxR, &pMaxG, &pMaxB, &result);
+ rpp_simd_store(rpp_store16_i8, resultAvx, &result);
+
+ maxR = std::max(resultAvx[0], maxR);
+ maxG = std::max(resultAvx[1], maxG);
+ maxB = std::max(resultAvx[2], maxB);
+#endif
+ }
+ maxC = std::max(std::max(maxR, maxG), maxB);
+ maxArr[maxArrIndex] = maxR;
+ maxArr[maxArrIndex + 1] = maxG;
+ maxArr[maxArrIndex + 2] = maxB;
+ maxArr[maxArrIndex + 3] = maxC;
+ }
+ }
+ return RPP_SUCCESS;
+}
diff --git a/src/modules/cpu/kernel/tensor_min.hpp b/src/modules/cpu/kernel/tensor_min.hpp
new file mode 100644
index 000000000..15b9b77ba
--- /dev/null
+++ b/src/modules/cpu/kernel/tensor_min.hpp
@@ -0,0 +1,845 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+
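+// The kernels below mirror tensor_max.hpp: a vectorized main loop accumulates per-channel
+// vector minima, a scalar std::min tail covers the unaligned remainder, and a horizontal
+// reduce yields the per-image R, G, B and overall minima.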
+RppStatus tensor_min_u8_u8_host(Rpp8u *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp8u *minArr,
+ Rpp32u minArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(srcDescPtr->n)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp8u *srcPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp8u *srcPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+
+ Rpp32u alignedLength = (bufferLength / 96) * 96;
+ Rpp32u vectorIncrement = 96;
+ Rpp32u vectorIncrementPerChannel = 32;
+
+ // Tensor min 1 channel (NCHW)
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel;
+ vectorIncrement = vectorIncrementPerChannel;
+ Rpp8u min = 255;
+ Rpp8u resultAvx[16];
+
+ Rpp8u *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256i pMin = _mm256_set1_epi8((char)255);
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256i p1 = _mm256_loadu_si256((__m256i *)srcPtrTemp);
+ pMin = _mm256_min_epu8(p1, pMin);
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ min = std::min(*srcPtrTemp++, min);
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m128i result;
+ reduce_min_32_host(&pMin, &result);
+ rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result);
+
+ min = std::min(std::min(resultAvx[0], resultAvx[1]), min);
+#endif
+ minArr[batchCount] = min;
+ }
+
+ // Tensor min 3 channel (NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u minArrIndex = batchCount * 4;
+ Rpp8u minC = 255, minR = 255, minG = 255, minB = 255;
+ Rpp8u resultAvx[16];
+
+ Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+#if __AVX2__
+ __m256i pMinR = _mm256_set1_epi8((char)255);
+ __m256i pMinG = pMinR;
+ __m256i pMinB = pMinR;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtrTempR, *srcPtrTempG, *srcPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i p[3];
+ rpp_simd_load(rpp_load96_u8_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p);
+ compute_min_96_host(p, &pMinR, &pMinG, &pMinB);
+
+ srcPtrTempR += vectorIncrementPerChannel;
+ srcPtrTempG += vectorIncrementPerChannel;
+ srcPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ minR = std::min(*srcPtrTempR++, minR);
+ minG = std::min(*srcPtrTempG++, minG);
+ minB = std::min(*srcPtrTempB++, minB);
+ }
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m128i result;
+ reduce_min_96_host(&pMinR, &pMinG, &pMinB, &result);
+ rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result);
+
+ minR = std::min(resultAvx[0], minR);
+ minG = std::min(resultAvx[1], minG);
+ minB = std::min(resultAvx[2], minB);
+#endif
+ minC = std::min(std::min(minR, minG), minB);
+ minArr[minArrIndex] = minR;
+ minArr[minArrIndex + 1] = minG;
+ minArr[minArrIndex + 2] = minB;
+ minArr[minArrIndex + 3] = minC;
+ }
+
+ // Tensor min 3 channel (NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u minArrIndex = batchCount * 4;
+ Rpp32u alignedLength = (bufferLength / 48) * 48;
+ Rpp32u vectorIncrement = 48;
+ Rpp8u minC = 255, minR = 255, minG = 255, minB = 255;
+ Rpp8u resultAvx[16];
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp8u *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+
+ __m128i pMinR = _mm_set1_epi8((char)255);
+ __m128i pMinG = pMinR;
+ __m128i pMinB = pMinR;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m128i p[3];
+ rpp_simd_load(rpp_load48_u8pkd3_to_u8pln3, srcPtrTemp, p);
+ compute_min_48_host(p, &pMinR, &pMinG, &pMinB);
+
+ srcPtrTemp += vectorIncrement;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ minR = std::min(srcPtrTemp[0], minR);
+ minG = std::min(srcPtrTemp[1], minG);
+ minB = std::min(srcPtrTemp[2], minB);
+ srcPtrTemp += 3;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+
+ __m128i result;
+ reduce_min_48_host(&pMinR, &pMinG, &pMinB, &result);
+ rpp_simd_store(rpp_store16_u8_to_u8, resultAvx, &result);
+
+ minR = std::min(resultAvx[0], minR);
+ minG = std::min(resultAvx[1], minG);
+ minB = std::min(resultAvx[2], minB);
+ }
+ minC = std::min(std::min(minR, minG), minB);
+ minArr[minArrIndex] = minR;
+ minArr[minArrIndex + 1] = minG;
+ minArr[minArrIndex + 2] = minB;
+ minArr[minArrIndex + 3] = minC;
+ }
+ }
+ return RPP_SUCCESS;
+}
+
+RppStatus tensor_min_f32_f32_host(Rpp32f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *minArr,
+ Rpp32u minArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(srcDescPtr->n)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32f *srcPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp32f *srcPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+
+ // Tensor min 1 channel (NCHW)
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel;
+ vectorIncrement = vectorIncrementPerChannel;
+ Rpp32f min = 255.0;
+ Rpp32f resultAvx[4];
+
+ Rpp32f *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256 pMin = _mm256_set1_ps(255.0);
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p1;
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp, &p1);
+ compute_min_float8_host(&p1, &pMin);
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ min = std::min(*srcPtrTemp++, min);
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+
+#if __AVX2__
+ __m128 result;
+ reduce_min_float8_host(&pMin, &result);
+ rpp_simd_store(rpp_store4_f32_to_f32, resultAvx, &result);
+ min = std::min(std::min(resultAvx[0], resultAvx[1]), min);
+#endif
+ minArr[batchCount] = min;
+ }
+
+ // Tensor min 3 channel (NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u minArrIndex = batchCount * 4;
+ Rpp32f minC = 255.0, minR = 255.0, minG = 255.0, minB = 255.0;
+ Rpp32f resultAvx[8];
+
+ Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+#if __AVX2__
+ __m256 pMinR = _mm256_set1_ps(255.0);
+ __m256 pMinG = pMinR;
+ __m256 pMinB = pMinR;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p[3];
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p);
+ compute_min_float24_host(p, &pMinR, &pMinG, &pMinB);
+
+ srcPtrTempR += vectorIncrementPerChannel;
+ srcPtrTempG += vectorIncrementPerChannel;
+ srcPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ minR = std::min(*srcPtrTempR++, minR);
+ minG = std::min(*srcPtrTempG++, minG);
+ minB = std::min(*srcPtrTempB++, minB);
+ }
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m256 result;
+ reduce_min_float24_host(&pMinR, &pMinG, &pMinB, &result);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result);
+
+ minR = std::min(std::min(resultAvx[0], resultAvx[1]), minR);
+ minG = std::min(std::min(resultAvx[2], resultAvx[3]), minG);
+ minB = std::min(std::min(resultAvx[4], resultAvx[5]), minB);
+#endif
+ minC = std::min(std::min(minR, minG), minB);
+ minArr[minArrIndex] = minR;
+ minArr[minArrIndex + 1] = minG;
+ minArr[minArrIndex + 2] = minB;
+ minArr[minArrIndex + 3] = minC;
+ }
+
+ // Tensor min 3 channel (NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u minArrIndex = batchCount * 4;
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32f minC = 255.0, minR = 255.0, minG = 255.0, minB = 255.0;
+ Rpp32f resultAvx[8];
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp32f *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+
+#if __AVX2__
+ __m256 pMinR = _mm256_set1_ps(255.0);
+ __m256 pMinG = pMinR;
+ __m256 pMinB = pMinR;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[3];
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp, p);
+ compute_min_float24_host(p, &pMinR, &pMinG, &pMinB);
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ minR = std::min(srcPtrTemp[0], minR);
+ minG = std::min(srcPtrTemp[1], minG);
+ minB = std::min(srcPtrTemp[2], minB);
+ srcPtrTemp += 3;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+
+#if __AVX2__
+ __m256 result;
+ reduce_min_float24_host(&pMinR, &pMinG, &pMinB, &result);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result);
+
+ minR = std::min(std::min(resultAvx[0], resultAvx[1]), minR);
+ minG = std::min(std::min(resultAvx[2], resultAvx[3]), minG);
+ minB = std::min(std::min(resultAvx[4], resultAvx[5]), minB);
+#endif
+ }
+ minC = std::min(std::min(minR, minG), minB);
+ minArr[minArrIndex] = minR;
+ minArr[minArrIndex + 1] = minG;
+ minArr[minArrIndex + 2] = minB;
+ minArr[minArrIndex + 3] = minC;
+ }
+ }
+ return RPP_SUCCESS;
+}
+
+RppStatus tensor_min_f16_f16_host(Rpp16f *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp16f *minArr,
+ Rpp32u minArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(srcDescPtr->n)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp16f *srcPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp16f *srcPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+
+ // Tensor min 1 channel (NCHW)
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel;
+ vectorIncrement = vectorIncrementPerChannel;
+ Rpp32f min = 255.0;
+ Rpp32f resultAvx[4];
+
+ Rpp16f *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256 pMin = _mm256_set1_ps(255.0);
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ Rpp32f srcPtrTemp_ps[8];
+ for(int cnt = 0; cnt < vectorIncrement; cnt++)
+ {
+ srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt];
+ }
+ __m256 p1;
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp_ps, &p1);
+ compute_min_float8_host(&p1, &pMin);
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ min = std::min((Rpp32f)*srcPtrTemp++, min);
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+
+#if __AVX2__
+ __m128 result;
+ reduce_min_float8_host(&pMin, &result);
+ rpp_simd_store(rpp_store4_f32_to_f32, resultAvx, &result);
+ min = std::min(std::min(resultAvx[0], resultAvx[1]), min);
+#endif
+ minArr[batchCount] = (Rpp16f) min;
+ }
+
+ // Tensor min 3 channel (NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u minArrIndex = batchCount * 4;
+ Rpp32f minC = 255.0, minR = 255.0, minG = 255.0, minB = 255.0;
+ Rpp32f resultAvx[8];
+
+ Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+#if __AVX2__
+ __m256 pMinR = _mm256_set1_ps(255.0);
+ __m256 pMinG = pMinR;
+ __m256 pMinB = pMinR;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ Rpp32f srcPtrTempR_ps[8], srcPtrTempG_ps[8], srcPtrTempB_ps[8];
+ for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++)
+ {
+ srcPtrTempR_ps[cnt] = (Rpp32f) srcPtrTempR[cnt];
+ srcPtrTempG_ps[cnt] = (Rpp32f) srcPtrTempG[cnt];
+ srcPtrTempB_ps[cnt] = (Rpp32f) srcPtrTempB[cnt];
+ }
+ __m256 p[3];
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p);
+ compute_min_float24_host(p, &pMinR, &pMinG, &pMinB);
+
+ srcPtrTempR += vectorIncrementPerChannel;
+ srcPtrTempG += vectorIncrementPerChannel;
+ srcPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ minR = std::min((Rpp32f)*srcPtrTempR++, minR);
+ minG = std::min((Rpp32f)*srcPtrTempG++, minG);
+ minB = std::min((Rpp32f)*srcPtrTempB++, minB);
+ }
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m256 result;
+ reduce_min_float24_host(&pMinR, &pMinG, &pMinB, &result);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result);
+
+ minR = std::min(std::min(resultAvx[0], resultAvx[1]), minR);
+ minG = std::min(std::min(resultAvx[2], resultAvx[3]), minG);
+ minB = std::min(std::min(resultAvx[4], resultAvx[5]), minB);
+#endif
+ minC = std::min(std::min(minR, minG), minB);
+ minArr[minArrIndex] = (Rpp16f) minR;
+ minArr[minArrIndex + 1] = (Rpp16f) minG;
+ minArr[minArrIndex + 2] = (Rpp16f) minB;
+ minArr[minArrIndex + 3] = (Rpp16f) minC;
+ }
+
+ // Tensor min 3 channel (NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u minArrIndex = batchCount * 4;
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32f minC = 255.0, minR = 255.0, minG = 255.0, minB = 255.0;
+ Rpp32f resultAvx[8];
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp16f *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+
+#if __AVX2__
+ __m256 pMinR = _mm256_set1_ps(255.0);
+ __m256 pMinG = pMinR;
+ __m256 pMinB = pMinR;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ Rpp32f srcPtrTemp_ps[24];
+ for(int cnt = 0; cnt < vectorIncrement; cnt++)
+ {
+ srcPtrTemp_ps[cnt] = (Rpp32f) srcPtrTemp[cnt];
+ }
+ __m256 p[3];
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp_ps, p);
+ compute_min_float24_host(p, &pMinR, &pMinG, &pMinB);
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ minR = std::min((Rpp32f)srcPtrTemp[0], minR);
+ minG = std::min((Rpp32f)srcPtrTemp[1], minG);
+ minB = std::min((Rpp32f)srcPtrTemp[2], minB);
+ srcPtrTemp += 3;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+
+#if __AVX2__
+ __m256 result;
+ reduce_min_float24_host(&pMinR, &pMinG, &pMinB, &result);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, resultAvx, &result);
+
+ minR = std::min(std::min(resultAvx[0], resultAvx[1]), minR);
+ minG = std::min(std::min(resultAvx[2], resultAvx[3]), minG);
+ minB = std::min(std::min(resultAvx[4], resultAvx[5]), minB);
+#endif
+ }
+ minC = std::min(std::min(minR, minG), minB);
+ minArr[minArrIndex] = (Rpp16f) minR;
+ minArr[minArrIndex + 1] = (Rpp16f) minG;
+ minArr[minArrIndex + 2] = (Rpp16f) minB;
+ minArr[minArrIndex + 3] = (Rpp16f) minC;
+ }
+ }
+ return RPP_SUCCESS;
+}
+
+RppStatus tensor_min_i8_i8_host(Rpp8s *srcPtr,
+ RpptDescPtr srcDescPtr,
+ Rpp8s *minArr,
+ Rpp32u minArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(srcDescPtr->n)
+ for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp8s *srcPtrImage;
+ srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp8s *srcPtrChannel;
+ srcPtrChannel = srcPtrImage + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+
+ Rpp32u alignedLength = (bufferLength / 96) * 96;
+ Rpp32u vectorIncrement = 96;
+ Rpp32u vectorIncrementPerChannel = 32;
+
+ // Tensor min 1 channel (NCHW)
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ alignedLength = (bufferLength / vectorIncrementPerChannel) * vectorIncrementPerChannel;
+ vectorIncrement = vectorIncrementPerChannel;
+ Rpp8s min = 127;
+ Rpp8s resultAvx[16];
+
+ Rpp8s *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+#if __AVX2__
+ __m256i pMin = _mm256_set1_epi8((char)127);
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256i p1 = _mm256_loadu_si256((__m256i *)srcPtrTemp); // unaligned load, as in the U8 variant; srcPtrTemp is not guaranteed 32-byte aligned
+ pMin = _mm256_min_epi8(p1, pMin); // compare and store min of 32 values into global min
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ min = std::min((*srcPtrTemp++), min);
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+
+#if __AVX2__
+ __m128i result;
+ reduce_min_i32_host(&pMin, &result);
+ rpp_simd_store(rpp_store16_i8, resultAvx, &result);
+
+ min = std::min(std::min(resultAvx[0], resultAvx[1]), min);
+#endif
+ minArr[batchCount] = min;
+ }
+
+ // Tensor min 3 channel (NCHW)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u minArrIndex = batchCount * 4;
+ Rpp8s minC = 127, minR = 127, minG = 127, minB = 127;
+ Rpp8s resultAvx[16];
+
+ Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB;
+ srcPtrRowR = srcPtrChannel;
+ srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+ srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+#if __AVX2__
+ __m256i pMinR = _mm256_set1_epi8((char)127);
+ __m256i pMinG = pMinR;
+ __m256i pMinB = pMinR;
+#endif
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtrTempR, *srcPtrTempG, *srcPtrTempB;
+ srcPtrTempR = srcPtrRowR;
+ srcPtrTempG = srcPtrRowG;
+ srcPtrTempB = srcPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256i p[3];
+ rpp_simd_load(rpp_load96_i8_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p);
+ compute_min_i96_host(p, &pMinR, &pMinG, &pMinB);
+
+ srcPtrTempR += vectorIncrementPerChannel;
+ srcPtrTempG += vectorIncrementPerChannel;
+ srcPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ minR = std::min(*srcPtrTempR++, minR);
+ minG = std::min(*srcPtrTempG++, minG);
+ minB = std::min(*srcPtrTempB++, minB);
+ }
+ srcPtrRowR += srcDescPtr->strides.hStride;
+ srcPtrRowG += srcDescPtr->strides.hStride;
+ srcPtrRowB += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m128i result;
+ reduce_min_i96_host(&pMinR, &pMinG, &pMinB, &result);
+ rpp_simd_store(rpp_store16_i8, resultAvx, &result);
+
+ minR = std::min(resultAvx[0], minR);
+ minG = std::min(resultAvx[1], minG);
+ minB = std::min(resultAvx[2], minB);
+#endif
+ minC = std::min(std::min(minR, minG), minB);
+ minArr[minArrIndex] = minR;
+ minArr[minArrIndex + 1] = minG;
+ minArr[minArrIndex + 2] = minB;
+ minArr[minArrIndex + 3] = minC;
+ }
+
+ // Tensor min 3 channel (NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u minArrIndex = batchCount * 4;
+ Rpp32u alignedLength = (bufferLength / 48) * 48;
+ Rpp32u vectorIncrement = 48;
+ Rpp8s minC = 127, minR = 127, minG = 127, minB = 127;
+ Rpp8s resultAvx[16];
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp8s *srcPtrRow;
+ srcPtrRow = srcPtrChannel;
+
+ __m128i pMinR = _mm_set1_epi8((char)127);
+ __m128i pMinG = pMinR;
+ __m128i pMinB = pMinR;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtrTemp;
+ srcPtrTemp = srcPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m128i p[3];
+ rpp_simd_load(rpp_load48_i8pkd3_to_i8pln3, srcPtrTemp, p);
+ compute_min_i48_host(p, &pMinR, &pMinG, &pMinB);
+
+ srcPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ minR = std::min(srcPtrTemp[0], minR);
+ minG = std::min(srcPtrTemp[1], minG);
+ minB = std::min(srcPtrTemp[2], minB);
+ srcPtrTemp += 3;
+ }
+ srcPtrRow += srcDescPtr->strides.hStride;
+ }
+#if __AVX2__
+ __m128i result;
+ reduce_min_i48_host(&pMinR, &pMinG, &pMinB, &result);
+ rpp_simd_store(rpp_store16_i8, resultAvx, &result);
+
+ minR = std::min(resultAvx[0], minR);
+ minG = std::min(resultAvx[1], minG);
+ minB = std::min(resultAvx[2], minB);
+#endif
+ }
+ minC = std::min(std::min(minR, minG), minB);
+ minArr[minArrIndex] = minR;
+ minArr[minArrIndex + 1] = minG;
+ minArr[minArrIndex + 2] = minB;
+ minArr[minArrIndex + 3] = minC;
+ }
+ }
+ return RPP_SUCCESS;
+}
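
The kernels above keep per-lane minima in a vector accumulator and fold across lanes only once per image, via the reduce_min_* helpers from rpp_cpu_simd.hpp. The standalone sketch below (`hmin_epu8_avx2` is an assumed name, not a library function) shows the same horizontal fold for one 32-lane u8 accumulator.

```cpp
#include <immintrin.h>
#include <cstdint>

// Sketch of the horizontal fold performed by the reduce_min_* helpers:
// collapse 32 packed u8 lanes of a __m256i accumulator to one scalar min.
static inline uint8_t hmin_epu8_avx2(__m256i v)
{
    __m128i m = _mm_min_epu8(_mm256_castsi256_si128(v),
                             _mm256_extracti128_si256(v, 1)); // 32 -> 16 lanes
    m = _mm_min_epu8(m, _mm_srli_si128(m, 8));                // 16 -> 8
    m = _mm_min_epu8(m, _mm_srli_si128(m, 4));                //  8 -> 4
    m = _mm_min_epu8(m, _mm_srli_si128(m, 2));                //  4 -> 2
    m = _mm_min_epu8(m, _mm_srli_si128(m, 1));                //  2 -> 1
    return (uint8_t)_mm_cvtsi128_si32(m);                     // low byte holds the min
}
```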
diff --git a/src/modules/hip/hip_tensor_statistical_operations.hpp b/src/modules/hip/hip_tensor_statistical_operations.hpp
index 328a232a1..c79e0a951 100644
--- a/src/modules/hip/hip_tensor_statistical_operations.hpp
+++ b/src/modules/hip/hip_tensor_statistical_operations.hpp
@@ -23,8 +23,10 @@ SOFTWARE.
*/
#ifndef HIP_TENSOR_STATISTICAL_OPERATIONS_HPP
 #define HIP_TENSOR_STATISTICAL_OPERATIONS_HPP
#include "kernel/tensor_sum.hpp"
+#include "kernel/tensor_min.hpp"
+#include "kernel/tensor_max.hpp"
-#endif // HIP_TENSOR_STATISTICAL_OPERATIONS_HPP
\ No newline at end of file
+#endif // HIP_TENSOR_STATISTICAL_OPERATIONS_HPP
diff --git a/src/modules/hip/kernel/tensor_max.hpp b/src/modules/hip/kernel/tensor_max.hpp
new file mode 100644
index 000000000..b47fce024
--- /dev/null
+++ b/src/modules/hip/kernel/tensor_max.hpp
@@ -0,0 +1,409 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+
+// -------------------- Set 0 - Reduction Stage 2 --------------------
+
+template <typename T>
+__global__ void tensor_max_grid_3channel_result_hip(float *srcPtr,
+ uint xBufferLength,
+ T *dstPtr)
+{
+ int id_x = hipThreadIdx_x * 8;
+ int id_z = hipBlockIdx_z;
+
+ __shared__ float partialRMax_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block
+ __shared__ float partialGMax_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block
+ __shared__ float partialBMax_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block
+
+ uint srcIdx = (id_z * xBufferLength) * 3;
+ partialRMax_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start of R channel using all 256 x 1 threads
+ partialGMax_smem[hipThreadIdx_x] = srcPtr[srcIdx + 1]; // initialization of LDS for G channel to start of G channel using all 256 x 1 threads
+ partialBMax_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2]; // initialization of LDS for B channel to start of B channel using all 256 x 1 threads
+
+ if (id_x >= xBufferLength)
+ return;
+
+ srcIdx += id_x * 3;
+
+ if (id_x + 8 > xBufferLength)
+ srcIdx -= ((8 - (xBufferLength - (xBufferLength & ~7))) * 3); // using difference between bufferLength and alignedLength, where alignedLength = (xBufferLength & ~7)
+
+ d_float24 src_f24;
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local memory
+
+ rpp_hip_math_max8(&src_f24.f8[0], &partialRMax_smem[hipThreadIdx_x]);
+ rpp_hip_math_max8(&src_f24.f8[1], &partialGMax_smem[hipThreadIdx_x]);
+ rpp_hip_math_max8(&src_f24.f8[2], &partialBMax_smem[hipThreadIdx_x]);
+ __syncthreads(); // syncthreads after max compute
+
+ // Reduction of 256 floats on 256 threads per block in x dimension
+ for (int threadMax = 128; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ {
+ partialRMax_smem[hipThreadIdx_x] = fmaxf(partialRMax_smem[hipThreadIdx_x], partialRMax_smem[hipThreadIdx_x + threadMax]);
+ partialGMax_smem[hipThreadIdx_x] = fmaxf(partialGMax_smem[hipThreadIdx_x], partialGMax_smem[hipThreadIdx_x + threadMax]);
+ partialBMax_smem[hipThreadIdx_x] = fmaxf(partialBMax_smem[hipThreadIdx_x], partialBMax_smem[hipThreadIdx_x + threadMax]);
+ }
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_x == 0)
+ {
+ int dstIdx = hipBlockIdx_z * 4;
+ dstPtr[dstIdx] = (T) partialRMax_smem[0];
+ dstPtr[dstIdx + 1] = (T) partialGMax_smem[0];
+ dstPtr[dstIdx + 2] = (T) partialBMax_smem[0];
+ dstPtr[dstIdx + 3] = (T) (fmaxf(fmaxf(partialRMax_smem[0], partialGMax_smem[0]), partialBMax_smem[0]));
+ }
+}
+
+template <typename T>
+__global__ void tensor_max_grid_result_hip(float *srcPtr,
+ uint xBufferLength,
+ T *dstPtr)
+{
+ int id_x = hipThreadIdx_x * 8;
+ int id_z = hipBlockIdx_z;
+
+ __shared__ float partialMax_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block
+
+ uint srcIdx = (id_z * xBufferLength);
+ partialMax_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start of buffer using all 256 x 1 threads
+
+ if (id_x >= xBufferLength)
+ return;
+
+ srcIdx += id_x;
+
+ if (id_x + 8 > xBufferLength)
+ srcIdx -= (8 - (xBufferLength - (xBufferLength & ~7))); // using difference between bufferLength and alignedLength, where alignedLength = (xBufferLength & ~7)
+
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+ rpp_hip_math_max8(&src_f8, &partialMax_smem[hipThreadIdx_x]);
+ __syncthreads(); // syncthreads after max compute
+
+ // Reduction of 256 floats on 256 threads per block in x dimension
+ for (int threadMax = 128; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ partialMax_smem[hipThreadIdx_x] = fmaxf(partialMax_smem[hipThreadIdx_x], partialMax_smem[hipThreadIdx_x + threadMax]);
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_x == 0)
+ dstPtr[hipBlockIdx_z] = (T) (partialMax_smem[0]);
+}
+
+
+// -------------------- Set 1 - Reduction Stage 1 --------------------
+
+template <typename T>
+__global__ void tensor_max_pkd3_hip(T *srcPtr,
+ uint2 srcStridesNH,
+ float *maxArr,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ __shared__ float partialRMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for R channel
+ __shared__ float partialGMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for G channel
+ __shared__ float partialBMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for B channel
+
+ float *partialRMaxRowPtr_smem = &partialRMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for R Channel
+ float *partialGMaxRowPtr_smem = &partialGMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for G Channel
+ float *partialBMaxRowPtr_smem = &partialBMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for B Channel
+ uint srcIdx = (id_z * srcStridesNH.x);
+ partialRMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start value of R channel using all 16 x 16 threads
+ partialGMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 1]; // initialization of LDS for G channel to start value of G channel using all 16 x 16 threads
+ partialBMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2]; // initialization of LDS for B channel to start value of B channel using all 16 x 16 threads
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ return;
+
+ srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3);
+
+ if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth)
+ srcIdx -= (id_x + 8 - roiTensorPtrSrc[id_z].xywhROI.roiWidth) * 3; // clamp the final 8-pixel load to the ROI edge, as in the min kernels
+
+ d_float24 src_f24;
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local memory
+
+ rpp_hip_math_max8(&src_f24.f8[0], &partialRMaxRowPtr_smem[hipThreadIdx_x]);
+ rpp_hip_math_max8(&src_f24.f8[1], &partialGMaxRowPtr_smem[hipThreadIdx_x]);
+ rpp_hip_math_max8(&src_f24.f8[2], &partialBMaxRowPtr_smem[hipThreadIdx_x]);
+ __syncthreads();
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ for (int threadMax = 8; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ {
+ partialRMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialRMaxRowPtr_smem[hipThreadIdx_x], partialRMaxRowPtr_smem[hipThreadIdx_x + threadMax]);
+ partialGMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialGMaxRowPtr_smem[hipThreadIdx_x], partialGMaxRowPtr_smem[hipThreadIdx_x + threadMax]);
+ partialBMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialBMaxRowPtr_smem[hipThreadIdx_x], partialBMaxRowPtr_smem[hipThreadIdx_x + threadMax]);
+ }
+ __syncthreads();
+ }
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in y dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ {
+ partialRMaxRowPtr_smem[0] = fmaxf(partialRMaxRowPtr_smem[0], partialRMaxRowPtr_smem[increment]);
+ partialGMaxRowPtr_smem[0] = fmaxf(partialGMaxRowPtr_smem[0], partialGMaxRowPtr_smem[increment]);
+ partialBMaxRowPtr_smem[0] = fmaxf(partialBMaxRowPtr_smem[0], partialBMaxRowPtr_smem[increment]);
+ }
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ {
+ int idx = ((hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x) * 3;
+ maxArr[idx] = partialRMaxRowPtr_smem[0];
+ maxArr[idx + 1] = partialGMaxRowPtr_smem[0];
+ maxArr[idx + 2] = partialBMaxRowPtr_smem[0];
+ }
+ }
+}
+
+template <typename T>
+__global__ void tensor_max_pln3_hip(T *srcPtr,
+ uint3 srcStridesNCH,
+ float *maxArr,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ __shared__ float partialRMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block
+ __shared__ float partialGMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block
+ __shared__ float partialBMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block
+
+ float *partialRMaxRowPtr_smem = &partialRMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS
+ float *partialGMaxRowPtr_smem = &partialGMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS
+ float *partialBMaxRowPtr_smem = &partialBMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS
+ uint srcIdx = (id_z * srcStridesNCH.x);
+ partialRMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start value of R channel using all 16 x 16 threads
+ partialGMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + srcStridesNCH.y]; // initialization of LDS for G channel to start value of G channel using all 16 x 16 threads
+ partialBMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2 * srcStridesNCH.y]; // initialization of LDS for B channel to start value of B channel using all 16 x 16 threads
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ return;
+
+ srcIdx += ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+
+ if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth)
+ srcIdx -= (id_x + 8 - roiTensorPtrSrc[id_z].xywhROI.roiWidth); // clamp the final 8-pixel load to the ROI edge, as in the min kernels
+
+ d_float24 src_f24;
+ rpp_hip_load24_pln3_and_unpack_to_float24_pln3(srcPtr + srcIdx, srcStridesNCH.y, &src_f24);
+
+ rpp_hip_math_max8(&src_f24.f8[0], &partialRMaxRowPtr_smem[hipThreadIdx_x]);
+ rpp_hip_math_max8(&src_f24.f8[1], &partialGMaxRowPtr_smem[hipThreadIdx_x]);
+ rpp_hip_math_max8(&src_f24.f8[2], &partialBMaxRowPtr_smem[hipThreadIdx_x]);
+ __syncthreads(); // syncthreads after max compute
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ for (int threadMax = 8; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ {
+ partialRMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialRMaxRowPtr_smem[hipThreadIdx_x], partialRMaxRowPtr_smem[hipThreadIdx_x + threadMax]);
+ partialGMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialGMaxRowPtr_smem[hipThreadIdx_x], partialGMaxRowPtr_smem[hipThreadIdx_x + threadMax]);
+ partialBMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialBMaxRowPtr_smem[hipThreadIdx_x], partialBMaxRowPtr_smem[hipThreadIdx_x + threadMax]);
+ }
+ __syncthreads();
+ }
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in y dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ {
+ partialRMaxRowPtr_smem[0] = fmaxf(partialRMaxRowPtr_smem[0], partialRMaxRowPtr_smem[increment]);
+ partialGMaxRowPtr_smem[0] = fmaxf(partialGMaxRowPtr_smem[0], partialGMaxRowPtr_smem[increment]);
+ partialBMaxRowPtr_smem[0] = fmaxf(partialBMaxRowPtr_smem[0], partialBMaxRowPtr_smem[increment]);
+ }
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ {
+ int idx = ((hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x) * 3;
+ maxArr[idx] = partialRMaxRowPtr_smem[0];
+ maxArr[idx + 1] = partialGMaxRowPtr_smem[0];
+ maxArr[idx + 2] = partialBMaxRowPtr_smem[0];
+ }
+ }
+}
+
+template <typename T>
+__global__ void tensor_max_pln1_hip(T *srcPtr,
+ uint2 srcStridesNH,
+ float *maxArr,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ __shared__ float partialMax_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block
+
+ uint srcIdx = (id_z * srcStridesNH.x);
+ float *partialMaxRowPtr_smem = &partialMax_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS
+ partialMaxRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 16 x 16 threads
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ return;
+
+ srcIdx += ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+
+ if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth)
+ srcIdx -= (id_x + 8 - roiTensorPtrSrc[id_z].xywhROI.roiWidth); // clamp the final 8-pixel load to the ROI edge, as in the min kernels
+
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+
+ rpp_hip_math_max8(&src_f8, &partialMaxRowPtr_smem[hipThreadIdx_x]);
+ __syncthreads(); // syncthreads after max compute
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ for (int threadMax = 8; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ partialMaxRowPtr_smem[hipThreadIdx_x] = fmaxf(partialMaxRowPtr_smem[hipThreadIdx_x], partialMaxRowPtr_smem[hipThreadIdx_x + threadMax]);
+ __syncthreads();
+ }
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in y dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ partialMaxRowPtr_smem[0] = fmaxf(partialMaxRowPtr_smem[0], partialMaxRowPtr_smem[increment]);
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ maxArr[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialMaxRowPtr_smem[0];
+ }
+}
+
+
+// -------------------- Set 2 - Kernel Executors --------------------
+
+template <typename T, typename U>
+RppStatus hip_exec_tensor_max(T *srcPtr,
+ RpptDescPtr srcDescPtr,
+ U *maxArr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rpp::Handle& handle)
+{
+ if (roiType == RpptRoiType::LTRB)
+ hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle);
+
+ int globalThreads_x = (srcDescPtr->w + 7) >> 3;
+ int globalThreads_y = srcDescPtr->h;
+ int globalThreads_z = handle.GetBatchSize();
+ int gridDim_x = (int) ceil((float)globalThreads_x/LOCAL_THREADS_X);
+ int gridDim_y = (int) ceil((float)globalThreads_y/LOCAL_THREADS_Y);
+ int gridDim_z = (int) ceil((float)globalThreads_z/LOCAL_THREADS_Z);
+ float2 bitDepthMinMax_f2;
+ getImageBitDepthMinMax(srcPtr, &bitDepthMinMax_f2);
+ float minimum = bitDepthMinMax_f2.x;
+
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u partialMaxArrLength = gridDim_x * gridDim_y * gridDim_z;
+ float *partialMaxArr;
+ partialMaxArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem;
+ hipMemsetAsync(partialMaxArr, minimum, partialMaxArrLength * sizeof(float), handle.GetStream());
+ hipLaunchKernelGGL(tensor_max_pln1_hip,
+ dim3(gridDim_x, gridDim_y, gridDim_z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ partialMaxArr,
+ roiTensorPtrSrc);
+ hipStreamSynchronize(handle.GetStream());
+ hipLaunchKernelGGL(tensor_max_grid_result_hip,
+ dim3(1, 1, gridDim_z),
+ dim3(256, 1, 1),
+ 0,
+ handle.GetStream(),
+ partialMaxArr,
+ gridDim_x * gridDim_y,
+ maxArr);
+ }
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u partialMaxArrLength = gridDim_x * gridDim_y * gridDim_z * 3;
+ float *partialMaxArr;
+ partialMaxArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem;
+ hipMemsetAsync(partialMaxArr, minimum, partialMaxArrLength * sizeof(float), handle.GetStream());
+ hipLaunchKernelGGL(tensor_max_pln3_hip,
+ dim3(gridDim_x, gridDim_y, gridDim_z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
+ partialMaxArr,
+ roiTensorPtrSrc);
+ hipStreamSynchronize(handle.GetStream());
+ hipLaunchKernelGGL(tensor_max_grid_3channel_result_hip,
+ dim3(1, 1, gridDim_z),
+ dim3(256, 1, 1),
+ 0,
+ handle.GetStream(),
+ partialMaxArr,
+ gridDim_x * gridDim_y,
+ maxArr);
+ }
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u partialMaxArrLength = gridDim_x * gridDim_y * gridDim_z * 3;
+ float *partialMaxArr;
+ partialMaxArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem;
+ hipMemsetAsync(partialMaxArr, minimum, partialMaxArrLength * sizeof(float), handle.GetStream());
+ hipLaunchKernelGGL(tensor_max_pkd3_hip,
+ dim3(gridDim_x, gridDim_y, gridDim_z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ partialMaxArr,
+ roiTensorPtrSrc);
+ hipStreamSynchronize(handle.GetStream());
+ hipLaunchKernelGGL(tensor_max_grid_3channel_result_hip,
+ dim3(1, 1, gridDim_z),
+ dim3(256, 1, 1),
+ 0,
+ handle.GetStream(),
+ partialMaxArr,
+ gridDim_x * gridDim_y,
+ maxArr);
+ }
+
+ return RPP_SUCCESS;
+}
\ No newline at end of file
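
The executor above implements a two-stage reduction: stage 1 launches one 16 x 16 thread block per 128 x 16 pixel tile (8 pixels per thread in x) and emits one partial max per tile, then stage 2 folds all of an image's partials with a single 256-thread block. A host-side sketch of the same dataflow, offered as an assumed reference for single-channel float input (`two_stage_max_reference` is not part of this patch):

```cpp
#include <algorithm>
#include <vector>

// Host-side emulation of the two-stage max reduction: stage 1 computes one
// partial max per 128 x 16 tile (like partialMaxArr), stage 2 folds them.
float two_stage_max_reference(const std::vector<float> &img, int w, int h)
{
    const int tileW = 128, tileH = 16;  // pixels covered by one 16 x 16 thread block
    int gx = (w + tileW - 1) / tileW, gy = (h + tileH - 1) / tileH;
    std::vector<float> partial(gx * gy, img[0]);  // stage-1 output buffer
    for (int by = 0; by < gy; by++)
        for (int bx = 0; bx < gx; bx++)
            for (int y = by * tileH; y < std::min((by + 1) * tileH, h); y++)
                for (int x = bx * tileW; x < std::min((bx + 1) * tileW, w); x++)
                    partial[by * gx + bx] = std::max(partial[by * gx + bx], img[y * w + x]);
    return *std::max_element(partial.begin(), partial.end());  // stage 2
}
```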
diff --git a/src/modules/hip/kernel/tensor_min.hpp b/src/modules/hip/kernel/tensor_min.hpp
new file mode 100644
index 000000000..a883c4f3b
--- /dev/null
+++ b/src/modules/hip/kernel/tensor_min.hpp
@@ -0,0 +1,410 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+
+// -------------------- Set 0 - Reduction Stage 2 --------------------
+
+template <typename T>
+__global__ void tensor_min_grid_3channel_result_hip(float *srcPtr,
+ uint xBufferLength,
+ T *dstPtr)
+{
+ int id_x = hipThreadIdx_x * 8;
+ int id_z = hipBlockIdx_z;
+
+ __shared__ float partialRMin_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block
+ __shared__ float partialGMin_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block
+ __shared__ float partialBMin_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block
+
+ uint srcIdx = (id_z * xBufferLength) * 3;
+ partialRMin_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start of R channel using all 256 x 1 threads
+ partialGMin_smem[hipThreadIdx_x] = srcPtr[srcIdx + 1]; // initialization of LDS for G channel to start of G channel using all 256 x 1 threads
+ partialBMin_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2]; // initialization of LDS for B channel to start of B channel using all 256 x 1 threads
+
+ if (id_x >= xBufferLength)
+ return;
+
+ srcIdx += id_x * 3;
+
+ if (id_x + 8 > xBufferLength)
+ srcIdx -= ((8 - (xBufferLength - (xBufferLength & ~7))) * 3); // using difference between bufferLength and alignedLength, where alignedLength = (xBufferLength & ~7)
+
+ d_float24 src_f24;
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local memory
+
+ rpp_hip_math_min8(&src_f24.f8[0], &partialRMin_smem[hipThreadIdx_x]);
+ rpp_hip_math_min8(&src_f24.f8[1], &partialGMin_smem[hipThreadIdx_x]);
+ rpp_hip_math_min8(&src_f24.f8[2], &partialBMin_smem[hipThreadIdx_x]);
+ __syncthreads(); // syncthreads after min compute
+
+ // Reduction of 256 floats on 256 threads per block in x dimension
+ for (int threadMax = 128; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ {
+ partialRMin_smem[hipThreadIdx_x] = fminf(partialRMin_smem[hipThreadIdx_x], partialRMin_smem[hipThreadIdx_x + threadMax]);
+ partialGMin_smem[hipThreadIdx_x] = fminf(partialGMin_smem[hipThreadIdx_x], partialGMin_smem[hipThreadIdx_x + threadMax]);
+ partialBMin_smem[hipThreadIdx_x] = fminf(partialBMin_smem[hipThreadIdx_x], partialBMin_smem[hipThreadIdx_x + threadMax]);
+ }
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_x == 0)
+ {
+ int dstIdx = hipBlockIdx_z * 4;
+ dstPtr[dstIdx] = (T) partialRMin_smem[0];
+ dstPtr[dstIdx + 1] = (T) partialGMin_smem[0];
+ dstPtr[dstIdx + 2] = (T) partialBMin_smem[0];
+ dstPtr[dstIdx + 3] = (T) (fminf(fminf(partialRMin_smem[0], partialGMin_smem[0]), partialBMin_smem[0]));
+ }
+}
+
+template <typename T>
+__global__ void tensor_min_grid_result_hip(float *srcPtr,
+ uint xBufferLength,
+ T *dstPtr)
+{
+ int id_x = hipThreadIdx_x * 8;
+ int id_z = hipBlockIdx_z;
+
+ __shared__ float partialMin_smem[256]; // 1024 floats of src reduced to 256 in a 256 x 1 thread block
+
+ uint srcIdx = (id_z * xBufferLength);
+ partialMin_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start of buffer using all 256 x 1 threads
+
+ if (id_x >= xBufferLength)
+ return;
+
+ srcIdx += id_x;
+
+ if (id_x + 8 > xBufferLength)
+ srcIdx -= (8 - (xBufferLength - (xBufferLength & ~7))); // using difference between bufferLength and alignedLength, where alignedLength = (xBufferLength & ~7)
+
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+ rpp_hip_math_min8(&src_f8, &partialMin_smem[hipThreadIdx_x]);
+ __syncthreads(); // syncthreads after min compute
+
+ // Reduction of 256 floats on 256 threads per block in x dimension
+ for (int threadMax = 128; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ partialMin_smem[hipThreadIdx_x] = fminf(partialMin_smem[hipThreadIdx_x], partialMin_smem[hipThreadIdx_x + threadMax]);
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_x == 0)
+ dstPtr[hipBlockIdx_z] = (T) (partialMin_smem[0]);
+}
+
+
+// -------------------- Set 1 - Reduction Stage 1 --------------------
+
+template <typename T>
+__global__ void tensor_min_pkd3_hip(T *srcPtr,
+ uint2 srcStridesNH,
+ float *minArr,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ __shared__ float partialRMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for R channel
+ __shared__ float partialGMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for G channel
+ __shared__ float partialBMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block for B channel
+
+ float *partialRMinRowPtr_smem = &partialRMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for R Channel
+ float *partialGMinRowPtr_smem = &partialGMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for G Channel
+ float *partialBMinRowPtr_smem = &partialBMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS for B Channel
+
+ uint srcIdx = (id_z * srcStridesNH.x);
+ partialRMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start value of R channel using all 16 x 16 threads
+ partialGMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 1]; // initialization of LDS for G channel to start value of G channel using all 16 x 16 threads
+ partialBMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2]; // initialization of LDS for B channel to start value of B channel using all 16 x 16 threads
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ return;
+
+ srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3);
+
+ if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth)
+ srcIdx -= (id_x + 8 - roiTensorPtrSrc[id_z].xywhROI.roiWidth) * 3;
+
+ d_float24 src_f24;
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr + srcIdx, &src_f24); // load 24 pixels to local memory
+
+ rpp_hip_math_min8(&src_f24.f8[0], &partialRMinRowPtr_smem[hipThreadIdx_x]);
+ rpp_hip_math_min8(&src_f24.f8[1], &partialGMinRowPtr_smem[hipThreadIdx_x]);
+ rpp_hip_math_min8(&src_f24.f8[2], &partialBMinRowPtr_smem[hipThreadIdx_x]);
+ __syncthreads();
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ for (int threadMax = 8; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ {
+ partialRMinRowPtr_smem[hipThreadIdx_x] = fminf(partialRMinRowPtr_smem[hipThreadIdx_x], partialRMinRowPtr_smem[hipThreadIdx_x + threadMax]);
+ partialGMinRowPtr_smem[hipThreadIdx_x] = fminf(partialGMinRowPtr_smem[hipThreadIdx_x], partialGMinRowPtr_smem[hipThreadIdx_x + threadMax]);
+ partialBMinRowPtr_smem[hipThreadIdx_x] = fminf(partialBMinRowPtr_smem[hipThreadIdx_x], partialBMinRowPtr_smem[hipThreadIdx_x + threadMax]);
+ }
+ __syncthreads();
+ }
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in y dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ {
+ partialRMinRowPtr_smem[0] = fminf(partialRMinRowPtr_smem[0], partialRMinRowPtr_smem[increment]);
+ partialGMinRowPtr_smem[0] = fminf(partialGMinRowPtr_smem[0], partialGMinRowPtr_smem[increment]);
+ partialBMinRowPtr_smem[0] = fminf(partialBMinRowPtr_smem[0], partialBMinRowPtr_smem[increment]);
+ }
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ {
+ int idx = ((hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x) * 3;
+ minArr[idx] = partialRMinRowPtr_smem[0];
+ minArr[idx + 1] = partialGMinRowPtr_smem[0];
+ minArr[idx + 2] = partialBMinRowPtr_smem[0];
+ }
+ }
+}
+
+template <typename T>
+__global__ void tensor_min_pln3_hip(T *srcPtr,
+ uint3 srcStridesNCH,
+ float *minArr,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ __shared__ float partialRMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block
+ __shared__ float partialGMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block
+ __shared__ float partialBMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block
+
+ float *partialRMinRowPtr_smem = &partialRMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS
+ float *partialGMinRowPtr_smem = &partialGMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS
+ float *partialBMinRowPtr_smem = &partialBMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS
+
+ uint srcIdx = (id_z * srcStridesNCH.x);
+ partialRMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS for R channel to start value of R channel using all 16 x 16 threads
+ partialGMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + srcStridesNCH.y]; // initialization of LDS for G channel to start value of G channel using all 16 x 16 threads
+ partialBMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx + 2 * srcStridesNCH.y]; // initialization of LDS for B channel to start value of B channel using all 16 x 16 threads
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ return;
+
+ srcIdx += ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+
+ if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth)
+ srcIdx -= (id_x + 8 - roiTensorPtrSrc[id_z].xywhROI.roiWidth);
+
+ d_float24 src_f24;
+ rpp_hip_load24_pln3_and_unpack_to_float24_pln3(srcPtr + srcIdx, srcStridesNCH.y, &src_f24);
+
+ rpp_hip_math_min8(&src_f24.f8[0], &partialRMinRowPtr_smem[hipThreadIdx_x]);
+ rpp_hip_math_min8(&src_f24.f8[1], &partialGMinRowPtr_smem[hipThreadIdx_x]);
+ rpp_hip_math_min8(&src_f24.f8[2], &partialBMinRowPtr_smem[hipThreadIdx_x]);
+ __syncthreads(); // syncthreads after min compute
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ for (int threadMax = 8; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ {
+ partialRMinRowPtr_smem[hipThreadIdx_x] = fminf(partialRMinRowPtr_smem[hipThreadIdx_x], partialRMinRowPtr_smem[hipThreadIdx_x + threadMax]);
+ partialGMinRowPtr_smem[hipThreadIdx_x] = fminf(partialGMinRowPtr_smem[hipThreadIdx_x], partialGMinRowPtr_smem[hipThreadIdx_x + threadMax]);
+ partialBMinRowPtr_smem[hipThreadIdx_x] = fminf(partialBMinRowPtr_smem[hipThreadIdx_x], partialBMinRowPtr_smem[hipThreadIdx_x + threadMax]);
+ }
+ __syncthreads();
+ }
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in y dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ {
+ partialRMinRowPtr_smem[0] = fminf(partialRMinRowPtr_smem[0], partialRMinRowPtr_smem[increment]);
+ partialGMinRowPtr_smem[0] = fminf(partialGMinRowPtr_smem[0], partialGMinRowPtr_smem[increment]);
+ partialBMinRowPtr_smem[0] = fminf(partialBMinRowPtr_smem[0], partialBMinRowPtr_smem[increment]);
+ }
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ {
+ int idx = ((hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x) * 3;
+ minArr[idx] = partialRMinRowPtr_smem[0];
+ minArr[idx + 1] = partialGMinRowPtr_smem[0];
+ minArr[idx + 2] = partialBMinRowPtr_smem[0];
+ }
+ }
+}
+
+template <typename T>
+__global__ void tensor_min_pln1_hip(T *srcPtr,
+ uint2 srcStridesNH,
+ float *minArr,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ __shared__ float partialMin_smem[16][16]; // 16 rows of src, 128 reduced cols of src in a 16 x 16 thread block
+
+ uint srcIdx = (id_z * srcStridesNH.x);
+ float *partialMinRowPtr_smem = &partialMin_smem[hipThreadIdx_y][0]; // float pointer to beginning of each row in LDS
+ partialMinRowPtr_smem[hipThreadIdx_x] = srcPtr[srcIdx]; // initialization of LDS to start value using all 16 x 16 threads
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ return;
+
+ srcIdx += ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+
+ if (id_x + 8 > roiTensorPtrSrc[id_z].xywhROI.roiWidth)
+ srcIdx -= (id_x + 8 - roiTensorPtrSrc[id_z].xywhROI.roiWidth);
+
+ d_float8 src_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); // load 8 pixels to local memory
+ rpp_hip_math_min8(&src_f8, &partialMinRowPtr_smem[hipThreadIdx_x]);
+ __syncthreads(); // syncthreads after min compute
+
+ // Reduction of 16 floats on 16 threads per block in x dimension (for every y dimension)
+ for (int threadMax = 8; threadMax >= 1; threadMax /= 2)
+ {
+ if (hipThreadIdx_x < threadMax)
+ partialMinRowPtr_smem[hipThreadIdx_x] = fminf(partialMinRowPtr_smem[hipThreadIdx_x], partialMinRowPtr_smem[hipThreadIdx_x + threadMax]);
+ __syncthreads();
+ }
+
+ if (hipThreadIdx_x == 0)
+ {
+ // Reduction of 16 floats on 16 threads per block in y dimension
+ for (int threadMax = 8, increment = 128; threadMax >= 1; threadMax /= 2, increment /= 2)
+ {
+ if (hipThreadIdx_y < threadMax)
+ partialMinRowPtr_smem[0] = fminf(partialMinRowPtr_smem[0], partialMinRowPtr_smem[increment]);
+ __syncthreads();
+ }
+
+ // Final store to dst
+ if (hipThreadIdx_y == 0)
+ minArr[(hipBlockIdx_z * hipGridDim_y + hipBlockIdx_y) * hipGridDim_x + hipBlockIdx_x] = partialMinRowPtr_smem[0];
+ }
+}
+
+
+// -------------------- Set 2 - Kernel Executors --------------------
+
+template <typename T, typename U>
+RppStatus hip_exec_tensor_min(T *srcPtr,
+ RpptDescPtr srcDescPtr,
+ U *minArr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rpp::Handle &handle)
+{
+ if (roiType == RpptRoiType::LTRB)
+ hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle);
+
+ int globalThreads_x = (srcDescPtr->w + 7) >> 3;
+ int globalThreads_y = srcDescPtr->h;
+ int globalThreads_z = handle.GetBatchSize();
+ int gridDim_x = (int) ceil((float)globalThreads_x/LOCAL_THREADS_X);
+ int gridDim_y = (int) ceil((float)globalThreads_y/LOCAL_THREADS_Y);
+ int gridDim_z = (int) ceil((float)globalThreads_z/LOCAL_THREADS_Z);
+ float2 bitDepthMinMax_f2;
+ getImageBitDepthMinMax(srcPtr, &bitDepthMinMax_f2);
+ float maximum = bitDepthMinMax_f2.y;
+
+ if ((srcDescPtr->c == 1) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u partialMinArrLength = gridDim_x * gridDim_y * gridDim_z;
+ float *partialMinArr;
+ partialMinArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem;
+ hipMemsetAsync(partialMinArr, maximum, partialMinArrLength * sizeof(float), handle.GetStream());
+ hipLaunchKernelGGL(tensor_min_pln1_hip,
+ dim3(gridDim_x, gridDim_y, gridDim_z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ partialMinArr,
+ roiTensorPtrSrc);
+ hipStreamSynchronize(handle.GetStream());
+ hipLaunchKernelGGL(tensor_min_grid_result_hip,
+ dim3(1, 1, gridDim_z),
+ dim3(256, 1, 1),
+ 0,
+ handle.GetStream(),
+ partialMinArr,
+ gridDim_x * gridDim_y,
+ minArr);
+ }
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32u partialMinArrLength = gridDim_x * gridDim_y * gridDim_z * 3;
+ float *partialMinArr;
+ partialMinArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem;
+ hipMemsetAsync(partialMinArr, maximum, partialMinArrLength * sizeof(float), handle.GetStream());
+ hipLaunchKernelGGL(tensor_min_pln3_hip,
+ dim3(gridDim_x, gridDim_y, gridDim_z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
+ partialMinArr,
+ roiTensorPtrSrc);
+ hipStreamSynchronize(handle.GetStream());
+ hipLaunchKernelGGL(tensor_min_grid_3channel_result_hip,
+ dim3(1, 1, gridDim_z),
+ dim3(256, 1, 1),
+ 0,
+ handle.GetStream(),
+ partialMinArr,
+ gridDim_x * gridDim_y,
+ minArr);
+ }
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32u partialMinArrLength = gridDim_x * gridDim_y * gridDim_z * 3;
+ float *partialMinArr;
+ partialMinArr = handle.GetInitHandle()->mem.mgpu.maskArr.floatmem;
+ hipMemsetAsync(partialMinArr, maximum, partialMinArrLength * sizeof(float), handle.GetStream());
+ hipLaunchKernelGGL(tensor_min_pkd3_hip,
+ dim3(gridDim_x, gridDim_y, gridDim_z),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ partialMinArr,
+ roiTensorPtrSrc);
+ hipStreamSynchronize(handle.GetStream());
+ hipLaunchKernelGGL(tensor_min_grid_3channel_result_hip,
+ dim3(1, 1, gridDim_z),
+ dim3(256, 1, 1),
+ 0,
+ handle.GetStream(),
+ partialMinArr,
+ gridDim_x * gridDim_y,
+ minArr);
+ }
+
+ return RPP_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/modules/rppt_tensor_statistical_operations.cpp b/src/modules/rppt_tensor_statistical_operations.cpp
index f17028e5e..28313a88f 100644
--- a/src/modules/rppt_tensor_statistical_operations.cpp
+++ b/src/modules/rppt_tensor_statistical_operations.cpp
@@ -107,6 +107,140 @@ RppStatus rppt_tensor_sum_host(RppPtr_t srcPtr,
return RPP_SUCCESS;
}
+/******************** tensor_min ********************/
+
+RppStatus rppt_tensor_min_host(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t minArr,
+ Rpp32u minArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+ if (srcDescPtr->c == 1)
+ {
+ if (minArrLength < srcDescPtr->n) // 1 min for each image
+ return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH;
+ }
+ else if (srcDescPtr->c == 3)
+ {
+ if (minArrLength < srcDescPtr->n * 4) // min of each channel, and min of all 3 channels
+ return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH;
+ }
+
+ RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c);
+
+ if (srcDescPtr->dataType == RpptDataType::U8)
+ {
+ tensor_min_u8_u8_host(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8u*>(minArr),
+ minArrLength,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams);
+ }
+ else if (srcDescPtr->dataType == RpptDataType::F16)
+ {
+ tensor_min_f16_f16_host((Rpp16f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ static_cast<Rpp16f*>(minArr),
+ minArrLength,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams);
+ }
+ else if (srcDescPtr->dataType == RpptDataType::F32)
+ {
+ tensor_min_f32_f32_host((Rpp32f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ static_cast<Rpp32f*>(minArr),
+ minArrLength,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams);
+ }
+ else if (srcDescPtr->dataType == RpptDataType::I8)
+ {
+ tensor_min_i8_i8_host(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8s*>(minArr),
+ minArrLength,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams);
+ }
+
+ return RPP_SUCCESS;
+}
+
+/******************** tensor_max ********************/
+
+RppStatus rppt_tensor_max_host(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t maxArr,
+ Rpp32u maxArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+ if (srcDescPtr->c == 1)
+ {
+ if (maxArrLength < srcDescPtr->n) // 1 max for each image
+ return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH;
+ }
+ else if (srcDescPtr->c == 3)
+ {
+ if (maxArrLength < srcDescPtr->n * 4) // max of each channel, and max of all 3 channels
+ return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH;
+ }
+
+ RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c);
+
+ if (srcDescPtr->dataType == RpptDataType::U8)
+ {
+ tensor_max_u8_u8_host(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8u*>(maxArr),
+ maxArrLength,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams);
+ }
+ else if (srcDescPtr->dataType == RpptDataType::F16)
+ {
+ tensor_max_f16_f16_host((Rpp16f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ static_cast<Rpp16f*>(maxArr),
+ maxArrLength,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams);
+ }
+ else if (srcDescPtr->dataType == RpptDataType::F32)
+ {
+ tensor_max_f32_f32_host((Rpp32f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ static_cast<Rpp32f*>(maxArr),
+ maxArrLength,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams);
+ }
+ else if (srcDescPtr->dataType == RpptDataType::I8)
+ {
+ tensor_max_i8_i8_host(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8s*>(maxArr),
+ maxArrLength,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams);
+ }
+
+ return RPP_SUCCESS;
+}
+
/********************************************************************************************************************/
/*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/
@@ -184,4 +318,126 @@ RppStatus rppt_tensor_sum_gpu(RppPtr_t srcPtr,
return RPP_SUCCESS;
}
+
+/******************** tensor_min ********************/
+
+RppStatus rppt_tensor_min_gpu(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t imageMinArr,
+ Rpp32u imageMinArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+ if (srcDescPtr->c == 1)
+ {
+ if (imageMinArrLength < srcDescPtr->n) // min of single channel
+ return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH;
+ }
+ else if (srcDescPtr->c == 3)
+ {
+ if (imageMinArrLength < srcDescPtr->n * 4) // min of each channel, and overall min of all 3 channels
+ return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH;
+ }
+
+ if (srcDescPtr->dataType == RpptDataType::U8)
+ {
+ hip_exec_tensor_min(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8u*>(imageMinArr),
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if (srcDescPtr->dataType == RpptDataType::F16)
+ {
+ hip_exec_tensor_min((half*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ static_cast<half*>(imageMinArr),
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if (srcDescPtr->dataType == RpptDataType::F32)
+ {
+ hip_exec_tensor_min((Rpp32f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ static_cast<Rpp32f*>(imageMinArr),
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if (srcDescPtr->dataType == RpptDataType::I8)
+ {
+ hip_exec_tensor_min(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8s*>(imageMinArr),
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+}
+
+/******************** tensor_max ********************/
+
+RppStatus rppt_tensor_max_gpu(RppPtr_t srcPtr,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t imageMaxArr,
+ Rpp32u imageMaxArrLength,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+ if (srcDescPtr->c == 1)
+ {
+ if (imageMaxArrLength < srcDescPtr->n) // max of single channel
+ return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH;
+ }
+ else if (srcDescPtr->c == 3)
+ {
+ if (imageMaxArrLength < srcDescPtr->n * 4) // max of each channel, and overall max of all 3 channels
+ return RPP_ERROR_INSUFFICIENT_DST_BUFFER_LENGTH;
+ }
+
+ if (srcDescPtr->dataType == RpptDataType::U8)
+ {
+ hip_exec_tensor_max(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8u*>(imageMaxArr),
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if (srcDescPtr->dataType == RpptDataType::F16)
+ {
+ hip_exec_tensor_max((half*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ static_cast<half*>(imageMaxArr),
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if (srcDescPtr->dataType == RpptDataType::F32)
+ {
+ hip_exec_tensor_max((Rpp32f*) (static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offsetInBytes),
+ srcDescPtr,
+ static_cast<Rpp32f*>(imageMaxArr),
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if (srcDescPtr->dataType == RpptDataType::I8)
+ {
+ hip_exec_tensor_max(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offsetInBytes,
+ srcDescPtr,
+ static_cast<Rpp8s*>(imageMaxArr),
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+}
#endif // backend
diff --git a/utilities/test_suite/HIP/Tensor_hip.cpp b/utilities/test_suite/HIP/Tensor_hip.cpp
index 48537de29..7bd46b39e 100644
--- a/utilities/test_suite/HIP/Tensor_hip.cpp
+++ b/utilities/test_suite/HIP/Tensor_hip.cpp
@@ -68,9 +68,9 @@ int main(int argc, char **argv)
bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 61 || testCase == 63);
bool randomOutputCase = (testCase == 84 || testCase == 49 || testCase == 54);
bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24);
+ bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89);
bool noiseTypeCase = (testCase == 8);
bool pln1OutTypeCase = (testCase == 86);
- bool reductionTypeCase = (testCase == 87);
unsigned int verbosity = atoi(argv[11]);
unsigned int additionalParam = additionalParamCase ? atoi(argv[7]) : 1;
@@ -323,23 +323,20 @@ int main(int argc, char **argv)
double wallTime;
string testCaseName;
- // Initialize buffers for any reductionType functions
+ // Initialize buffers for any reductionType functions (for testCase 87, tensor_sum cannot return its final sum as 8u/8s due to overflow; 8u inputs return 64u sums, and 8s inputs return 64s sums)
void *reductionFuncResultArr;
Rpp32u reductionFuncResultArrLength = srcDescPtr->n * 4;
-
- if(reductionTypeCase)
+ if (reductionTypeCase)
{
- if(dstDescPtr->dataType == RpptDataType::U8)
- CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * sizeof(Rpp64u)));
- else if(dstDescPtr->dataType == RpptDataType::F16)
- CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * sizeof(Rpp32f)));
- else if(dstDescPtr->dataType == RpptDataType::F32)
- CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * sizeof(Rpp32f)));
- else if(dstDescPtr->dataType == RpptDataType::I8)
- CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * sizeof(Rpp64s)));
+ int bitDepthByteSize = 0;
+ if ((dstDescPtr->dataType == RpptDataType::U8) || (dstDescPtr->dataType == RpptDataType::I8))
+ bitDepthByteSize = (testCase == 87) ? sizeof(Rpp64u) : sizeof(Rpp8u);
+ else if ((dstDescPtr->dataType == RpptDataType::F16) || (dstDescPtr->dataType == RpptDataType::F32))
+ bitDepthByteSize = sizeof(Rpp32f); // using 32f outputs for 16f and 32f
+ CHECK(hipHostMalloc(&reductionFuncResultArr, reductionFuncResultArrLength * bitDepthByteSize));
}
- //Allocate hip memory for src/dst
+ // Allocate hip memory for src/dst
CHECK(hipMalloc(&d_input, inputBufferSize));
CHECK(hipMalloc(&d_output, outputBufferSize));
if(dualInputCase)
@@ -1044,6 +1041,30 @@ int main(int argc, char **argv)
break;
}
+ case 88:
+ {
+ testCaseName = "tensor_min";
+
+ startWallTime = omp_get_wtime();
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_tensor_min_gpu(d_input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
+ case 89:
+ {
+ testCaseName = "tensor_max";
+
+ startWallTime = omp_get_wtime();
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_tensor_max_gpu(d_input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
default:
missingFuncFlag = 1;
break;
@@ -1071,33 +1092,41 @@ int main(int argc, char **argv)
if(srcDescPtr->c == 3)
printf("\nReduction result (Batch of 3 channel images produces 4 results per image in batch): ");
else if(srcDescPtr->c == 1)
+ {
printf("\nReduction result (Batch of 1 channel images produces 1 result per image in batch): ");
+ reductionFuncResultArrLength = srcDescPtr->n;
+ }
- if(dstDescPtr->dataType == RpptDataType::U8)
+ // print reduction functions output array based on different bit depths, and precision desired
+ int precision = ((dstDescPtr->dataType == RpptDataType::F32) || (dstDescPtr->dataType == RpptDataType::F16)) ? 3 : 0;
+ if (dstDescPtr->dataType == RpptDataType::U8)
{
- Rpp64u *reductionOutPtr = static_cast<Rpp64u*>(reductionFuncResultArr);
- for (int i = 0; i < reductionFuncResultArrLength; i++)
- printf(" %llu ", reductionOutPtr[i]);
+ if (testCase == 87)
+ print_array(static_cast<Rpp64u*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
+ else
+ print_array(static_cast<Rpp8u*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
}
- else if(dstDescPtr->dataType == RpptDataType::F16)
+ else if (dstDescPtr->dataType == RpptDataType::F16)
{
- Rpp32f *reductionOutPtr = static_cast<Rpp32f*>(reductionFuncResultArr);
- for (int i = 0; i < reductionFuncResultArrLength; i++)
- printf(" %0.3f ", (float)reductionOutPtr[i]);
+ if (testCase == 87)
+ print_array(static_cast<Rpp32f*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
+ else
+ print_array(static_cast<Rpp32f*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
}
- else if(dstDescPtr->dataType == RpptDataType::F32)
+ else if (dstDescPtr->dataType == RpptDataType::F32)
{
- Rpp32f *reductionOutPtr = static_cast<Rpp32f*>(reductionFuncResultArr);
- for (int i = 0; i < reductionFuncResultArrLength; i++)
- printf(" %0.3f ", (float)reductionOutPtr[i]);
+ if (testCase == 87)
+ print_array(static_cast<Rpp32f*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
+ else
+ print_array(static_cast<Rpp32f*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
}
- else if(dstDescPtr->dataType == RpptDataType::I8)
+ else if (dstDescPtr->dataType == RpptDataType::I8)
{
- Rpp64s *reductionOutPtr = static_cast<Rpp64s*>(reductionFuncResultArr);
- for (int i = 0; i < reductionFuncResultArrLength; i++)
- printf(" %lld ", reductionOutPtr[i]);
+ if (testCase == 87)
+ print_array(static_cast<Rpp64s*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
+ else
+ print_array(static_cast<Rpp8s*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
}
-
printf("\n");
/*Compare the output of the function with golden outputs only if
@@ -1105,7 +1134,12 @@ int main(int argc, char **argv)
2.input bit depth 0 (U8)
3.source and destination layout are the same*/
if(qaFlag && inputBitDepth == 0 && (srcDescPtr->layout == dstDescPtr->layout) && !(randomOutputCase))
- compare_reduction_output(static_cast<Rpp64u*>(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath);
+ {
+ if (testCase == 87)
+ compare_reduction_output(static_cast<Rpp64u*>(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath);
+ else
+ compare_reduction_output(static_cast<Rpp8u*>(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath);
+ }
+ }
}
else
{
diff --git a/utilities/test_suite/HIP/runTests.py b/utilities/test_suite/HIP/runTests.py
index cabc4015f..2e8054332 100644
--- a/utilities/test_suite/HIP/runTests.py
+++ b/utilities/test_suite/HIP/runTests.py
@@ -315,11 +315,11 @@ def rpp_test_suite_parser_and_validator():
parser = argparse.ArgumentParser()
parser.add_argument("--input_path1", type = str, default = inFilePath1, help = "Path to the input folder 1")
parser.add_argument("--input_path2", type = str, default = inFilePath2, help = "Path to the input folder 2")
- parser.add_argument("--case_start", type = int, default = 0, help = "Testing range starting case # - (0:87)")
- parser.add_argument("--case_end", type = int, default = 87, help = "Testing range ending case # - (0:87)")
- parser.add_argument('--test_type', type = int, default = 0, help = "Type of Test - (0 = Unit tests / 1 = Performance tests)")
- parser.add_argument('--case_list', nargs = "+", help = "List of case numbers to list", required = False)
- parser.add_argument('--profiling', type = str , default = 'NO', help = 'Run with profiler? - (YES/NO)', required = False)
+ parser.add_argument("--case_start", type = int, default = 0, help="Testing range starting case # - (0:90)")
+ parser.add_argument("--case_end", type = int, default = 90, help="Testing range ending case # - (0:90)")
+ parser.add_argument('--test_type', type = int, default = 0, help="Type of Test - (0 = Unit tests / 1 = Performance tests)")
+ parser.add_argument('--case_list', nargs = "+", help="List of case numbers to list", required=False)
+ parser.add_argument('--profiling', type = str , default='NO', help='Run with profiler? - (YES/NO)', required=False)
parser.add_argument('--qa_mode', type = int, default = 0, help = "Run with qa_mode? Output images from tests will be compared with golden outputs - (0 / 1)", required = False)
parser.add_argument('--decoder_type', type = int, default = 0, help = "Type of Decoder to decode the input data - (0 = TurboJPEG / 1 = OpenCV)")
parser.add_argument('--num_runs', type = int, default = 1, help = "Specifies the number of runs for running the performance tests")
@@ -334,8 +334,8 @@ def rpp_test_suite_parser_and_validator():
validate_path(qaInputFile)
# validate the parameters passed by user
- if ((args.case_start < 0 or args.case_start > 87) or (args.case_end < 0 or args.case_end > 87)):
- print("Starting case# and Ending case# must be in the 0:87 range. Aborting!")
+ if ((args.case_start < 0 or args.case_start > 89) or (args.case_end < 0 or args.case_end > 89)):
+ print("Starting case# and Ending case# must be in the 0:89 range. Aborting!")
exit(0)
elif args.case_end < args.case_start:
print("Ending case# must be greater than starting case#. Aborting!")
@@ -349,7 +349,7 @@ def rpp_test_suite_parser_and_validator():
elif args.decoder_type < 0 or args.decoder_type > 1:
print("Decoder Type must be in the 0/1 (0 = OpenCV / 1 = TurboJPEG). Aborting")
exit(0)
- elif args.case_list is not None and args.case_start > 0 and args.case_end < 87:
+ elif args.case_list is not None and args.case_start > 0 and args.case_end < 89:
print("Invalid input! Please provide only 1 option between case_list, case_start and case_end")
exit(0)
elif args.num_runs <= 0:
@@ -376,8 +376,8 @@ def rpp_test_suite_parser_and_validator():
args.case_list = [str(x) for x in args.case_list]
else:
for case in args.case_list:
- if int(case) < 0 or int(case) > 87:
- print("The case# must be in the 0:87 range!")
+ if int(case) < 0 or int(case) > 89:
+ print("The case# must be in the 0:89 range!")
exit(0)
return args
@@ -458,8 +458,8 @@ def rpp_test_suite_parser_and_validator():
if qaMode == 1 and case != "82":
srcPath1 = inFilePath1
srcPath2 = inFilePath2
- if int(case) < 0 or int(case) > 87:
- print(f"Invalid case number {case}. Case number must be in the range of 0 to 87!")
+ if int(case) < 0 or int(case) > 89:
+ print(f"Invalid case number {case}. Case number must be in the range of 0 to 89!")
continue
for layout in range(3):
dstPathTemp, log_file_layout = process_layout(layout, qaMode, case, dstPath)
@@ -476,8 +476,8 @@ def rpp_test_suite_parser_and_validator():
else:
if (testType == 1 and profilingOption == "NO"):
for case in caseList:
- if int(case) < 0 or int(case) > 87:
- print(f"Invalid case number {case}. Case number must be in the range of 0 to 87!")
+ if int(case) < 0 or int(case) > 89:
+ print(f"Invalid case number {case}. Case number must be in the range of 0 to 89!")
continue
if case == "82" and "--input_path1" not in sys.argv and "--input_path2" not in sys.argv:
srcPath1 = ricapInFilePath
@@ -491,8 +491,8 @@ def rpp_test_suite_parser_and_validator():
NEW_FUNC_GROUP_LIST = [0, 15, 20, 29, 36, 40, 42, 49, 56, 65, 69]
for case in caseList:
- if int(case) < 0 or int(case) > 87:
- print(f"Invalid case number {case}. Case number must be in the range of 0 to 87!")
+ if int(case) < 0 or int(case) > 89:
+ print(f"Invalid case number {case}. Case number must be in the range of 0 to 89!")
continue
if case == "82" and "--input_path1" not in sys.argv and "--input_path2" not in sys.argv:
srcPath1 = ricapInFilePath
@@ -696,7 +696,7 @@ def rpp_test_suite_parser_and_validator():
f.close()
# print the results of qa tests
-supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '45', '54', '61', '63', '70', '80', '82', '83', '84', '85', '86', '87']
+supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '45', '54', '61', '63', '70', '80', '82', '83', '84', '85', '86', '87', '88', '89']
nonQACaseList = ['8', '24', '54', '84'] # Add cases present in supportedCaseList, but without QA support
if qaMode and testType == 0:
diff --git a/utilities/test_suite/HOST/Tensor_host.cpp b/utilities/test_suite/HOST/Tensor_host.cpp
index fd198ba23..b698a2def 100644
--- a/utilities/test_suite/HOST/Tensor_host.cpp
+++ b/utilities/test_suite/HOST/Tensor_host.cpp
@@ -68,11 +68,12 @@ int main(int argc, char **argv)
bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 61 || testCase == 63);
bool randomOutputCase = (testCase == 84);
bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24);
+ bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89);
bool noiseTypeCase = (testCase == 8);
bool pln1OutTypeCase = (testCase == 86);
+
unsigned int verbosity = atoi(argv[11]);
unsigned int additionalParam = additionalParamCase ? atoi(argv[7]) : 1;
- bool reductionTypeCase = (testCase == 87);
int roiList[4] = {atoi(argv[15]), atoi(argv[16]), atoi(argv[17]), atoi(argv[18])};
string scriptPath = argv[19];
@@ -140,6 +141,11 @@ int main(int argc, char **argv)
std::cerr << "\n Batchsize should be less than or equal to "<< MAX_BATCH_SIZE << " Aborting!";
exit(0);
}
+ else if(testCase == 82 && batchSize < 2)
+ {
+ std::cerr << "\n RICAP only works with BatchSize > 1";
+ exit(0);
+ }
// Get function name
string funcName = augmentationMap[testCase];
@@ -310,6 +316,24 @@ int main(int argc, char **argv)
input_second = static_cast(calloc(inputBufferSize, 1));
output = static_cast(calloc(outputBufferSize, 1));
+ // Initialize buffers for any reductionType functions (for testCase 87, tensor_sum cannot return its final sum as 8u/8s due to overflow; 8u inputs return 64u sums, and 8s inputs return 64s sums)
+ void *reductionFuncResultArr;
+ Rpp32u reductionFuncResultArrLength = srcDescPtr->n * 4;
+ if (reductionTypeCase)
+ {
+ int bitDepthByteSize = 0;
+ if ((dstDescPtr->dataType == RpptDataType::U8) || (dstDescPtr->dataType == RpptDataType::I8))
+ {
+ bitDepthByteSize = (testCase == 87) ? sizeof(Rpp64u) : sizeof(Rpp8u);
+ reductionFuncResultArr = static_cast<void*>(calloc(reductionFuncResultArrLength, bitDepthByteSize));
+ }
+ else if ((dstDescPtr->dataType == RpptDataType::F16) || (dstDescPtr->dataType == RpptDataType::F32))
+ {
+ bitDepthByteSize = sizeof(Rpp32f); // using 32f outputs for 16f and 32f
+ reductionFuncResultArr = static_cast<void*>(calloc(reductionFuncResultArrLength, bitDepthByteSize));
+ }
+ }
+
// Set the number of threads to be used by OpenMP pragma for RPP batch processing on host.
// If numThreads value passed is 0, number of OpenMP threads used by RPP will be set to batch size
Rpp32u numThreads = 0;
@@ -321,21 +345,6 @@ int main(int argc, char **argv)
double cpuTime, wallTime;
string testCaseName;
- // Initialize buffers for any reductionType functions
- void *reductionFuncResultArr;
- Rpp32u reductionFuncResultArrLength = srcDescPtr->n * 4;
- if(reductionTypeCase)
- {
- if(dstDescPtr->dataType == RpptDataType::U8)
- reductionFuncResultArr = static_cast<Rpp64u*>(calloc(reductionFuncResultArrLength, sizeof(Rpp64u)));
- else if(dstDescPtr->dataType == RpptDataType::F16)
- reductionFuncResultArr = static_cast<Rpp32f*>(calloc(reductionFuncResultArrLength, sizeof(Rpp32f)));
- else if(dstDescPtr->dataType == RpptDataType::F32)
- reductionFuncResultArr = static_cast<Rpp32f*>(calloc(reductionFuncResultArrLength, sizeof(Rpp32f)));
- else if(dstDescPtr->dataType == RpptDataType::I8)
- reductionFuncResultArr = static_cast<Rpp64s*>(calloc(reductionFuncResultArrLength, sizeof(Rpp64s)));
- }
-
// case-wise RPP API and measure time script for Unit and Performance test
printf("\nRunning %s %d times (each time with a batch size of %d images) and computing mean statistics...", func.c_str(), numRuns, batchSize);
for (int perfRunCount = 0; perfRunCount < numRuns; perfRunCount++)
@@ -1050,6 +1059,40 @@ int main(int argc, char **argv)
break;
}
+ case 88:
+ {
+ testCaseName = "tensor_min";
+
+ if(srcDescPtr->c == 1)
+ reductionFuncResultArrLength = srcDescPtr->n;
+
+ startWallTime = omp_get_wtime();
+ startCpuTime = clock();
+
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_tensor_min_host(input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
+ case 89:
+ {
+ testCaseName = "tensor_max";
+
+ if(srcDescPtr->c == 1)
+ reductionFuncResultArrLength = srcDescPtr->n;
+
+ startWallTime = omp_get_wtime();
+ startCpuTime = clock();
+
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_tensor_max_host(input, srcDescPtr, reductionFuncResultArr, reductionFuncResultArrLength, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
default:
missingFuncFlag = 1;
break;
@@ -1082,33 +1125,41 @@ int main(int argc, char **argv)
if(srcDescPtr->c == 3)
printf("\nReduction result (Batch of 3 channel images produces 4 results per image in batch): ");
else if(srcDescPtr->c == 1)
+ {
printf("\nReduction result (Batch of 1 channel images produces 1 result per image in batch): ");
+ reductionFuncResultArrLength = srcDescPtr->n;
+ }
- if(dstDescPtr->dataType == RpptDataType::U8)
+ // print reduction functions output array based on different bit depths, and precision desired
+ int precision = ((dstDescPtr->dataType == RpptDataType::F32) || (dstDescPtr->dataType == RpptDataType::F16)) ? 3 : 0;
+ if (dstDescPtr->dataType == RpptDataType::U8)
{
- Rpp64u *reductionOutPtr = static_cast<Rpp64u*>(reductionFuncResultArr);
- for (int i = 0; i < reductionFuncResultArrLength; i++)
- printf(" %llu ", reductionOutPtr[i]);
+ if (testCase == 87)
+ print_array(static_cast<Rpp64u*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
+ else
+ print_array(static_cast<Rpp8u*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
}
- else if(dstDescPtr->dataType == RpptDataType::F16)
+ else if (dstDescPtr->dataType == RpptDataType::F16)
{
- Rpp32f *reductionOutPtr = static_cast<Rpp32f*>(reductionFuncResultArr);
- for (int i = 0; i < reductionFuncResultArrLength; i++)
- printf(" %0.3f ", (float)reductionOutPtr[i]);
+ if (testCase == 87)
+ print_array(static_cast<Rpp32f*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
+ else
+ print_array(static_cast<Rpp32f*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
}
- else if(dstDescPtr->dataType == RpptDataType::F32)
+ else if (dstDescPtr->dataType == RpptDataType::F32)
{
- Rpp32f *reductionOutPtr = static_cast<Rpp32f*>(reductionFuncResultArr);
- for (int i = 0; i < reductionFuncResultArrLength; i++)
- printf(" %0.3f ", (float)reductionOutPtr[i]);
+ if (testCase == 87)
+ print_array(static_cast<Rpp32f*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
+ else
+ print_array(static_cast<Rpp32f*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
}
- else if(dstDescPtr->dataType == RpptDataType::I8)
+ else if (dstDescPtr->dataType == RpptDataType::I8)
{
- Rpp64s *reductionOutPtr = static_cast<Rpp64s*>(reductionFuncResultArr);
- for (int i = 0; i < reductionFuncResultArrLength; i++)
- printf(" %lld ", reductionOutPtr[i]);
+ if (testCase == 87)
+ print_array(static_cast<Rpp64s*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
+ else
+ print_array(static_cast<Rpp8s*>(reductionFuncResultArr), reductionFuncResultArrLength, precision);
}
-
printf("\n");
/*Compare the output of the function with golden outputs only if
@@ -1116,7 +1167,12 @@ int main(int argc, char **argv)
2.input bit depth 0 (U8)
3.source and destination layout are the same*/
if(qaFlag && inputBitDepth == 0 && (srcDescPtr->layout == dstDescPtr->layout) && !(randomOutputCase))
- compare_reduction_output(static_cast<Rpp64u*>(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath);
+ {
+ if (testCase == 87)
+ compare_reduction_output(static_cast<Rpp64u*>(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath);
+ else
+ compare_reduction_output(static_cast<Rpp8u*>(reductionFuncResultArr), testCaseName, srcDescPtr, testCase, dst, scriptPath);
+ }
+ }
}
else
{
diff --git a/utilities/test_suite/HOST/runTests.py b/utilities/test_suite/HOST/runTests.py
index b40a0bf01..b08c4d5e8 100644
--- a/utilities/test_suite/HOST/runTests.py
+++ b/utilities/test_suite/HOST/runTests.py
@@ -244,8 +244,8 @@ def rpp_test_suite_parser_and_validator():
parser = argparse.ArgumentParser()
parser.add_argument("--input_path1", type = str, default = inFilePath1, help = "Path to the input folder 1")
parser.add_argument("--input_path2", type = str, default = inFilePath2, help = "Path to the input folder 2")
- parser.add_argument("--case_start", type = int, default = 0, help = "Testing range starting case # - (0:87)")
- parser.add_argument("--case_end", type = int, default = 87, help = "Testing range ending case # - (0:87)")
+ parser.add_argument("--case_start", type = int, default = 0, help = "Testing range starting case # - (0:89)")
+ parser.add_argument("--case_end", type = int, default = 89, help = "Testing range ending case # - (0:89)")
parser.add_argument('--test_type', type = int, default = 0, help = "Type of Test - (0 = Unit tests / 1 = Performance tests)")
parser.add_argument('--case_list', nargs = "+", help = "List of case numbers to list", required = False)
parser.add_argument('--qa_mode', type = int, default = 0, help = "Run with qa_mode? Output images from tests will be compared with golden outputs - (0 / 1)", required = False)
@@ -263,8 +263,8 @@ def rpp_test_suite_parser_and_validator():
validate_path(perfQaInputFile)
# validate the parameters passed by user
- if ((args.case_start < 0 or args.case_start > 87) or (args.case_end < 0 or args.case_end > 87)):
- print("Starting case# and Ending case# must be in the 0:87 range. Aborting!")
+ if ((args.case_start < 0 or args.case_start > 89) or (args.case_end < 0 or args.case_end > 89)):
+ print("Starting case# and Ending case# must be in the 0:89 range. Aborting!")
exit(0)
elif args.case_end < args.case_start:
print("Ending case# must be greater than starting case#. Aborting!")
@@ -278,7 +278,7 @@ def rpp_test_suite_parser_and_validator():
elif args.decoder_type < 0 or args.decoder_type > 1:
print("Decoder Type must be in the 0/1 (0 = OpenCV / 1 = TurboJPEG). Aborting")
exit(0)
- elif args.case_list is not None and args.case_start > 0 and args.case_end < 87:
+ elif args.case_list is not None and args.case_start > 0 and args.case_end < 89:
print("Invalid input! Please provide only 1 option between case_list, case_start and case_end")
exit(0)
elif args.num_runs <= 0:
@@ -302,8 +302,8 @@ def rpp_test_suite_parser_and_validator():
args.case_list = [str(x) for x in args.case_list]
else:
for case in args.case_list:
- if int(case) < 0 or int(case) > 87:
- print("The case# must be in the 0:87 range!")
+ if int(case) < 0 or int(case) > 89:
+ print("The case# must be in the 0:89 range!")
exit(0)
return args
@@ -381,8 +381,8 @@ def rpp_test_suite_parser_and_validator():
if qaMode == 1 and case != "82":
srcPath1 = inFilePath1
srcPath2 = inFilePath2
- if int(case) < 0 or int(case) > 87:
- print(f"Invalid case number {case}. Case number must be in the range of 0 to 86!")
+ if int(case) < 0 or int(case) > 89:
+ print(f"Invalid case number {case}. Case number must be in the range of 0 to 89!")
continue
for layout in range(3):
dstPathTemp, log_file_layout = process_layout(layout, qaMode, case, dstPath)
@@ -397,8 +397,8 @@ def rpp_test_suite_parser_and_validator():
create_layout_directories(dstPath, layoutDict)
else:
for case in caseList:
- if int(case) < 0 or int(case) > 87:
- print(f"Invalid case number {case}. Case number must be in the range of 0 to 86!")
+ if int(case) < 0 or int(case) > 89:
+ print(f"Invalid case number {case}. Case number must be in the range of 0 to 89!")
continue
# if QA mode is enabled overwrite the input folders with the folders used for generating golden outputs
if qaMode == 1 and case != "82":
@@ -412,7 +412,7 @@ def rpp_test_suite_parser_and_validator():
run_performance_test(loggingFolder, log_file_layout, srcPath1, srcPath2, dstPath, case, numRuns, testType, layout, qaMode, decoderType, batchSize, roiList)
# print the results of qa tests
-supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '45', '54', '61', '63', '70', '80', '81', '82', '83', '84', '85', '86', '87']
+supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '45', '54', '61', '63', '70', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89']
nonQACaseList = ['8', '24', '54', '84'] # Add cases present in supportedCaseList, but without QA support
if qaMode and testType == 0:
diff --git a/utilities/test_suite/README.md b/utilities/test_suite/README.md
index 76ecd9551..067bedb1d 100644
--- a/utilities/test_suite/README.md
+++ b/utilities/test_suite/README.md
@@ -94,8 +94,8 @@ The image test suite can be executed under 2 backend scenarios - (HOST/HIP):
The image test suite accepts the following command line arguments:
- input_path1: The path to the input folder 1. Default is $cwd/../TEST_IMAGES/three_images_mixed_src1
- input_path2: The path to the input folder 2. Default is $cwd/../TEST_IMAGES/three_images_mixed_src2
-- case_start: The starting case number for the test range (0-87). Default is 0
-- case_end: The ending case number for the test range (0-87). Default is 87
+- case_start: The starting case number for the test range (0-89). Default is 0
+- case_end: The ending case number for the test range (0-89). Default is 89
- test_type: The type of test to run (0 = Unit tests, 1 = Performance tests). Default is 0
- case_list: A list of specific case numbers to run. Must be used in conjunction with --test_type
- profiling: Run the tests with a profiler (YES/NO). Default is NO. This option is only available with HIP backend
@@ -121,7 +121,7 @@ python runTests.py --input_path1 <input_path1> --input_path2 <input_path2> --cas
- QA mode (Unit tests) - Tolerance based PASS/FAIL tests for RPP HIP/HOST functionalities checking pixelwise match between C/SSE/AVX/HIP versions after comparison to preset golden outputs. Please note that QA mode is only supported with a batch size of 3.
Note: QA mode is not supported for case 84 due to run-to-run variation of outputs.
``` python
-python runTests.py --case_start 0 --case_end 87 --test_type 0 --qa_mode 1 --batch_size 3
+python runTests.py --case_start 0 --case_end 89 --test_type 0 --qa_mode 1 --batch_size 3
```
- QA mode (Performance tests) - Tolerance based PASS/FAIL tests for RPP HIP/HOST functionalities checking achieved improvement in performance percentage over BatchPD versions after comparison to a threshold percentage of improvement
``` python
@@ -131,13 +131,13 @@ python runTests.py --case_list 21 36 63 --test_type 1 --qa_mode 1 --batch_size 8
Note: For testcase 82(RICAP) Please use images of same resolution and Batchsize > 1
RICAP dataset path: rpp/utilities/test_suite/TEST_IMAGES/three_images_150x150_src1
``` python
-python runTests.py --case_start 0 --case_end 87 --test_type 0 --qa_mode 0
+python runTests.py --case_start 0 --case_end 89 --test_type 0 --qa_mode 0
```
- Performance test mode - Performance tests that execute the desired functionality and variant 100 times by default, and report max/min/avg RPP execution wall time, or optionally, AMD rocprof kernel profiler max/min/avg time for HIP backend variants.
Note: For testcase 82(RICAP) Please use images of same resolution and Batchsize > 1
RICAP dataset path: rpp/utilities/test_suite/TEST_IMAGES/three_images_150x150_src1
``` python
-python runTests.py --case_start 0 --case_end 87 --test_type 1
+python runTests.py --case_start 0 --case_end 89 --test_type 1
```
To run the unit tests / performance tests for specific case numbers. please case use case_list parameter. Example as below
diff --git a/utilities/test_suite/rpp_test_suite_common.h b/utilities/test_suite/rpp_test_suite_common.h
index 2bc914af9..58fee0c5d 100644
--- a/utilities/test_suite/rpp_test_suite_common.h
+++ b/utilities/test_suite/rpp_test_suite_common.h
@@ -99,11 +99,27 @@ std::map<int, string> augmentationMap =
{84, "spatter"},
{85, "swap_channels"},
{86, "color_to_greyscale"},
- {87, "tensor_sum"}
+ {87, "tensor_sum"},
+ {88, "tensor_min"},
+ {89, "tensor_max"},
+};
+
+// Golden outputs for Tensor min Kernel
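+// (3-channel QA batches store {Rmin, Gmin, Bmin, overall min} per image; 1-channel batches store one value per image)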
+std::map<int, std::vector<int>> TensorMinReferenceOutputs =
+{
+ {1, {1, 1, 7}},
+ {3, {0, 0, 0, 0, 2, 0, 0, 0, 7, 9, 0, 0}}
+};
+
+// Golden outputs for Tensor max Kernel
+std::map<int, std::vector<int>> TensorMaxReferenceOutputs =
+{
+ {1, {239, 245, 255}},
+ {3, {255, 240, 236, 255, 255, 242, 241, 255, 253, 255, 255, 255}}
};
// Golden outputs for Tensor sum Kernel
-std::map<int, std::vector<Rpp64u>> TensorSumReferenceOutputs =
+std::map<int, std::vector<int>> TensorSumReferenceOutputs =
{
{1, {334225, 813471, 2631125}},
{3, {348380, 340992, 262616, 951988, 1056552, 749506, 507441, 2313499, 2170646, 2732368, 3320699, 8223713}}
@@ -1118,11 +1134,19 @@ inline void compare_reduction_output(T* output, string funcName, RpptDescPtr src
int matched_values = 0;
T *refOutput;
+ refOutput = (T *)calloc(srcDescPtr->n * 4, sizeof(T));
int numChannels = (srcDescPtr->c == 1) ? 1 : 3;
int numOutputs = (srcDescPtr->c == 1) ? srcDescPtr->n : srcDescPtr->n * 4;
- std::vector<Rpp64u> ref;
- if(testCase == 87)
- refOutput = TensorSumReferenceOutputs[numChannels].data();
+ std::vector<int> ref;
+ if(testCase == 88)
+ ref = TensorMinReferenceOutputs[numChannels];
+ else if(testCase == 89)
+ ref = TensorMaxReferenceOutputs[numChannels];
+ else if(testCase == 87)
+ ref = TensorSumReferenceOutputs[numChannels];
+
+ for (int i = 0; i < numOutputs; i++)
+ refOutput[i] = (T)ref[i];
if(srcDescPtr->c == 1)
{
@@ -1148,6 +1172,7 @@ inline void compare_reduction_output(T* output, string funcName, RpptDescPtr src
fileMatch++;
}
}
+ free(refOutput);
std::cout << std::endl << "Results for " << func << " :" << std::endl;
std::string status = func + ": ";
@@ -1172,6 +1197,14 @@ inline void compare_reduction_output(T* output, string funcName, RpptDescPtr src
}
}
+// print array of any bit depth for specified length
+template <typename T>
+inline void print_array(T *src, Rpp32u length, Rpp32u precision)
+{
+ for (int i = 0; i < length; i++)
+ std::cout << " " << std::fixed << std::setprecision(precision) << static_cast(src[i]) << " ";
+}
+
// Used to randomly swap values present in array of size n
inline void randomize(unsigned int arr[], unsigned int n)
{