diff --git a/.Doxyfile b/.Doxyfile
index 77e66d9e9..066a53c02 100644
--- a/.Doxyfile
+++ b/.Doxyfile
@@ -967,8 +967,9 @@ INPUT = README.md \
include/rppt_tensor_geometric_augmentations.h \
include/rppt_tensor_morphological_operations.h \
include/rppt_tensor_statistical_operations.h \
- include/rppt_tensor_arithmetic_operations.h
- include/rppt_tensor_audio_augmentations.h
+ include/rppt_tensor_arithmetic_operations.h \
+ include/rppt_tensor_audio_augmentations.h \
+ include/rppt_tensor_logical_operations.h
# This tag can be used to specify the character encoding of the source files
diff --git a/docs/data/doxygenOutputs/logical_operations_bitwise_and_img150x150.png b/docs/data/doxygenOutputs/logical_operations_bitwise_and_img150x150.png
new file mode 100644
index 000000000..8ff97bd58
Binary files /dev/null and b/docs/data/doxygenOutputs/logical_operations_bitwise_and_img150x150.png differ
diff --git a/docs/data/doxygenOutputs/logical_operations_bitwise_or_img150x150.png b/docs/data/doxygenOutputs/logical_operations_bitwise_or_img150x150.png
new file mode 100644
index 000000000..d9dbccabb
Binary files /dev/null and b/docs/data/doxygenOutputs/logical_operations_bitwise_or_img150x150.png differ
diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile
index f77dd11b2..2f633152e 100644
--- a/docs/doxygen/Doxyfile
+++ b/docs/doxygen/Doxyfile
@@ -966,7 +966,8 @@ INPUT = ../../README.md \
../../include/rppt_tensor_filter_augmentations.h \
../../include/rppt_tensor_geometric_augmentations.h \
../../include/rppt_tensor_morphological_operations.h \
- ../../include/rppt_tensor_statistical_operations.h
+ ../../include/rppt_tensor_statistical_operations.h \
+ ../../include/rppt_tensor_logical_operations.h
# This tag can be used to specify the character encoding of the source files
diff --git a/include/rppdefs.h b/include/rppdefs.h
index b12fcda78..c1eb322f8 100644
--- a/include/rppdefs.h
+++ b/include/rppdefs.h
@@ -311,7 +311,7 @@ typedef struct
int y;
} RppiPoint;
-/*! \brief RPPI Image 2D Rectangle (XYWH format) type struct
+/*! \brief RPPI Image 3D point type struct
* \ingroup group_rppdefs
*/
typedef struct
@@ -321,6 +321,9 @@ typedef struct
int z;
} RppiPoint3D;
+/*! \brief RPPI Image 2D Rectangle (XYWH format) type struct
+ * \ingroup group_rppdefs
+ */
typedef struct
{
int x;
@@ -373,7 +376,7 @@ typedef enum
XYWH // X-Y-Width-Height
} RpptRoiType;
-/*! \brief RPPT Tensor subpixel layout type enum
+/*! \brief RPPT Tensor 3D ROI type enum
* \ingroup group_rppdefs
*/
typedef enum
@@ -382,6 +385,9 @@ typedef enum
XYZWHD // X-Y-Z-Width-Height-Depth
} RpptRoi3DType;
+/*! \brief RPPT Tensor subpixel layout type enum
+ * \ingroup group_rppdefs
+ */
typedef enum
{
RGBtype,
@@ -493,7 +499,7 @@ typedef struct
RpptLayout layout;
} RpptDesc, *RpptDescPtr;
-/*! \brief RPPT Tensor 8-bit uchar RGB type struct
+/*! \brief RPPT Tensor Generic descriptor type struct
* \ingroup group_rppdefs
*/
typedef struct
@@ -506,6 +512,9 @@ typedef struct
RpptLayout layout;
} RpptGenericDesc, *RpptGenericDescPtr;
+/*! \brief RPPT Tensor 8-bit uchar RGB type struct
+ * \ingroup group_rppdefs
+ */
typedef struct
{
Rpp8u R;
diff --git a/include/rppt.h b/include/rppt.h
index 0a20921d8..b466fa373 100644
--- a/include/rppt.h
+++ b/include/rppt.h
@@ -46,6 +46,7 @@ extern "C" {
#include "rppt_tensor_arithmetic_operations.h"
#include "rppt_tensor_statistical_operations.h"
#include "rppt_tensor_audio_augmentations.h"
+#include "rppt_tensor_logical_operations.h"
#ifdef __cplusplus
}
diff --git a/include/rppt_tensor_arithmetic_operations.h b/include/rppt_tensor_arithmetic_operations.h
index c03ee5de0..943a08b34 100644
--- a/include/rppt_tensor_arithmetic_operations.h
+++ b/include/rppt_tensor_arithmetic_operations.h
@@ -244,21 +244,21 @@ RppStatus rppt_multiply_scalar_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGeneri
RppStatus rppt_magnitude_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#ifdef GPU_SUPPORT
-/*! \brief Magnitude computation on HOST backend for a NCHW/NHWC layout tensor
+/*! \brief Magnitude computation on HIP backend for a NCHW/NHWC layout tensor
* \details This function computes magnitude of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
* srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
* dstPtr depth ranges - Will be same depth as srcPtr.
* \image html img150x150.png Sample Input1
* \image html img150x150_2.png Sample Input2
* \image html arithmetic_operations_magnitude_img150x150.png Sample Output
- * \param [in] srcPtr1 source1 tensor in HOST memory
- * \param [in] srcPtr2 source2 tensor in HOST memory
+ * \param [in] srcPtr1 source1 tensor in HIP memory
+ * \param [in] srcPtr2 source2 tensor in HIP memory
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
- * \param [out] dstPtr destination tensor in HOST memory
+ * \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
- * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
* \return A \ref RppStatus enumeration.
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
@@ -272,4 +272,4 @@ RppStatus rppt_magnitude_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr src
#ifdef __cplusplus
}
#endif
-#endif // RPPT_TENSOR_ARITHMETIC_OPERATIONS_H
\ No newline at end of file
+#endif // RPPT_TENSOR_ARITHMETIC_OPERATIONS_H
diff --git a/include/rppt_tensor_logical_operations.h b/include/rppt_tensor_logical_operations.h
new file mode 100644
index 000000000..29eefa466
--- /dev/null
+++ b/include/rppt_tensor_logical_operations.h
@@ -0,0 +1,139 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#ifndef RPPT_TENSOR_LOGICAL_OPERATIONS_H
+#define RPPT_TENSOR_LOGICAL_OPERATIONS_H
+
+#include "rpp.h"
+#include "rppdefs.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ * \file
+ * \brief RPPT Tensor Operations - Logical Operations.
+ * \defgroup group_rppt_tensor_logical_operations RPPT Tensor Operations - Logical Operations.
+ * \brief RPPT Tensor Operations - Logical Operations.
+ */
+
+/*! \addtogroup group_rppt_tensor_logical_operations
+ * @{
+ */
+
+/*! \brief Bitwise AND computation on HOST backend for a NCHW/NHWC layout tensor
+ * \details This function computes bitwise AND of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * dstPtr depth ranges - Will be same depth as srcPtr.
+ * \image html img150x150.png Sample Input1
+ * \image html img150x150_2.png Sample Input2
+ * \image html logical_operations_bitwise_and_img150x150.png Sample Output
+ * \param [in] srcPtr1 source1 tensor in HOST memory
+ * \param [in] srcPtr2 source2 tensor in HOST memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HOST memory
+ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
+ * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
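+ * \par Example
+ * A minimal illustrative call sketch; allocation of the buffers, descriptor setup, and
+ * handle creation are assumed to have been done beforehand:
+ * \code
+ * // srcPtr1, srcPtr2, dstPtr: pre-allocated HOST buffers matching srcDescPtr/dstDescPtr
+ * RppStatus status = rppt_bitwise_and_host(srcPtr1, srcPtr2, srcDescPtr, dstPtr, dstDescPtr,
+ *                                          roiTensorPtrSrc, RpptRoiType::XYWH, rppHandle);
+ * \endcode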
+ */
+RppStatus rppt_bitwise_and_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+
+#ifdef GPU_SUPPORT
+/*! \brief Bitwise AND computation on HIP backend for a NCHW/NHWC layout tensor
+ * \details This function computes bitwise AND of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * dstPtr depth ranges - Will be same depth as srcPtr.
+ * \image html img150x150.png Sample Input1
+ * \image html img150x150_2.png Sample Input2
+ * \image html logical_operations_bitwise_and_img150x150.png Sample Output
+ * \param [in] srcPtr1 source1 tensor in HIP memory
+ * \param [in] srcPtr2 source2 tensor in HIP memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HIP memory
+ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
+ * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
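+ * \par Example
+ * A minimal illustrative call sketch; the HIP device buffers, descriptors, and the
+ * stream-based handle are assumed to have been allocated and initialized beforehand:
+ * \code
+ * // srcPtr1, srcPtr2, dstPtr: pre-allocated HIP device buffers matching srcDescPtr/dstDescPtr
+ * RppStatus status = rppt_bitwise_and_gpu(srcPtr1, srcPtr2, srcDescPtr, dstPtr, dstDescPtr,
+ *                                         roiTensorPtrSrc, RpptRoiType::XYWH, rppHandle);
+ * \endcode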
+ */
+RppStatus rppt_bitwise_and_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
+
+/*! \brief Bitwise OR computation on HOST backend for a NCHW/NHWC layout tensor
+ * \details This function computes bitwise OR of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * dstPtr depth ranges - Will be same depth as srcPtr.
+ * \image html img150x150.png Sample Input1
+ * \image html img150x150_2.png Sample Input2
+ * \image html logical_operations_bitwise_or_img150x150.png Sample Output
+ * \param [in] srcPtr1 source1 tensor in HOST memory
+ * \param [in] srcPtr2 source2 tensor in HOST memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HOST memory
+ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
+ * \param [in] roiTensorSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
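+ * \par Example
+ * The call shape mirrors \ref rppt_bitwise_and_host; an illustrative sketch, with buffers,
+ * descriptors, and the handle assumed to be set up beforehand:
+ * \code
+ * RppStatus status = rppt_bitwise_or_host(srcPtr1, srcPtr2, srcDescPtr, dstPtr, dstDescPtr,
+ *                                         roiTensorPtrSrc, RpptRoiType::XYWH, rppHandle);
+ * \endcode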
+ */
+RppStatus rppt_bitwise_or_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+
+#ifdef GPU_SUPPORT
+/*! \brief Bitwise OR computation on HIP backend for a NCHW/NHWC layout tensor
+ * \details This function computes bitwise OR of corresponding pixels for a batch of RGB(3 channel) / greyscale(1 channel) images with an NHWC/NCHW tensor layout.
+ * srcPtr depth ranges - Rpp8u (0 to 255), Rpp16f (0 to 1), Rpp32f (0 to 1), Rpp8s (-128 to 127).
+ * dstPtr depth ranges - Will be same depth as srcPtr.
+ * \image html img150x150.png Sample Input1
+ * \image html img150x150_2.png Sample Input2
+ * \image html logical_operations_bitwise_or_img150x150.png Sample Output
+ * \param [in] srcPtr1 source1 tensor in HIP memory
+ * \param [in] srcPtr2 source2 tensor in HIP memory
+ * \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
+ * \param [out] dstPtr destination tensor in HIP memory
+ * \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
+ * \param [in] roiTensorSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_bitwise_or_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
+
+/*! @}
+ */
+
+#ifdef __cplusplus
+}
+#endif
+#endif // RPPT_TENSOR_LOGICAL_OPERATIONS_H
diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp
index d03ec0e79..19121957b 100644
--- a/src/include/cpu/rpp_cpu_simd.hpp
+++ b/src/include/cpu/rpp_cpu_simd.hpp
@@ -719,6 +719,29 @@ inline void rpp_load48_i8pkd3_to_i8pln3(Rpp8s *srcPtr, __m128i *px)
px[2] = _mm_shuffle_epi8(_mm_unpacklo_epi8(pxSrc[6], pxSrc[7]), pxMaskRGB); /* unpack 8 lo-pixels of pxSrc[6] and pxSrc[7] to get B01-16 */
}
+inline void rpp_load48_i8pkd3_to_u8pln3(Rpp8s *srcPtr, __m128i *px)
+{
+ __m128i pxSrc[8];
+ __m128i pxMask = _mm_setr_epi8(0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 12, 13, 14, 15);
+ __m128i pxMaskRGB = _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15);
+
+ pxSrc[0] = _mm_loadu_si128((__m128i *)srcPtr); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need RGB 01-04 */
+ pxSrc[1] = _mm_loadu_si128((__m128i *)(srcPtr + 12)); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need RGB 05-08 */
+ pxSrc[2] = _mm_loadu_si128((__m128i *)(srcPtr + 24)); /* load [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|R13|G13|B13|R14] - Need RGB 09-12 */
+ pxSrc[3] = _mm_loadu_si128((__m128i *)(srcPtr + 36)); /* load [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|R17|G17|B17|R18] - Need RGB 13-16 */
+ pxSrc[0] = _mm_shuffle_epi8(pxSrc[0], pxMask); /* shuffle to get [R01|R02|R03|R04|G01|G02|G03|G04 || B01|B02|B03|B04|R05|G05|B05|R06] - Need R01-04, G01-04, B01-04 */
+ pxSrc[1] = _mm_shuffle_epi8(pxSrc[1], pxMask); /* shuffle to get [R05|R06|R07|R08|G05|G06|G07|G08 || B05|B06|B07|B08|R09|G09|B09|R10] - Need R05-08, G05-08, B05-08 */
+ pxSrc[2] = _mm_shuffle_epi8(pxSrc[2], pxMask); /* shuffle to get [R09|R10|R11|R12|G09|G10|G11|G12 || B09|B10|B11|B12|R13|G13|B13|R14] - Need R09-12, G09-12, B09-12 */
+ pxSrc[3] = _mm_shuffle_epi8(pxSrc[3], pxMask); /* shuffle to get [R13|R14|R15|R16|G13|G14|G15|G16 || B13|B14|B15|B16|R17|G17|B17|R18] - Need R13-16, G13-16, B13-16 */
+ pxSrc[4] = _mm_unpacklo_epi8(pxSrc[0], pxSrc[1]); /* unpack 8 lo-pixels of pxSrc[0] and pxSrc[1] */
+ pxSrc[5] = _mm_unpacklo_epi8(pxSrc[2], pxSrc[3]); /* unpack 8 lo-pixels of pxSrc[2] and pxSrc[3] */
+ pxSrc[6] = _mm_unpackhi_epi8(pxSrc[0], pxSrc[1]); /* unpack 8 hi-pixels of pxSrc[0] and pxSrc[1] */
+ pxSrc[7] = _mm_unpackhi_epi8(pxSrc[2], pxSrc[3]); /* unpack 8 hi-pixels of pxSrc[2] and pxSrc[3] */
+ px[0] = _mm_add_epi8(xmm_pxConvertI8, _mm_shuffle_epi8(_mm_unpacklo_epi8(pxSrc[4], pxSrc[5]), pxMaskRGB)); /* unpack 8 lo-pixels of pxSrc[4] and pxSrc[5] to get R01-16 and add 128 to get u8 from i8 */
+ px[1] = _mm_add_epi8(xmm_pxConvertI8, _mm_shuffle_epi8(_mm_unpackhi_epi8(pxSrc[4], pxSrc[5]), pxMaskRGB)); /* unpack 8 hi-pixels of pxSrc[4] and pxSrc[5] to get G01-16 and add 128 to get u8 from i8 */
+ px[2] = _mm_add_epi8(xmm_pxConvertI8, _mm_shuffle_epi8(_mm_unpacklo_epi8(pxSrc[6], pxSrc[7]), pxMaskRGB)); /* unpack 8 lo-pixels of pxSrc[6] and pxSrc[7] to get B01-16 and add 128 to get u8 from i8 */
+}
+
inline void rpp_store48_i8pln3_to_i8pln3(Rpp8s *dstPtrR, Rpp8s *dstPtrG, Rpp8s *dstPtrB, __m128i *px)
{
_mm_storeu_si128((__m128i *)dstPtrR, px[0]); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */
@@ -726,6 +749,13 @@ inline void rpp_store48_i8pln3_to_i8pln3(Rpp8s *dstPtrR, Rpp8s *dstPtrG, Rpp8s *
_mm_storeu_si128((__m128i *)dstPtrB, px[2]); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */
}
+inline void rpp_store48_u8pln3_to_i8pln3(Rpp8s *dstPtrR, Rpp8s *dstPtrG, Rpp8s *dstPtrB, __m128i *px)
+{
+ _mm_storeu_si128((__m128i *)dstPtrR, _mm_sub_epi8(px[0], xmm_pxConvertI8)); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */
+ _mm_storeu_si128((__m128i *)dstPtrG, _mm_sub_epi8(px[1], xmm_pxConvertI8)); /* store [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */
+ _mm_storeu_si128((__m128i *)dstPtrB, _mm_sub_epi8(px[2], xmm_pxConvertI8)); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */
+}
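+
+// Illustrative note on the i8 <-> u8 conversion used by these load/store helpers: adding 128
+// (xmm_pxConvertI8) maps the i8 range [-128, 127] onto the u8 range [0, 255] so bitwise ops can
+// run in unsigned space, and subtracting 128 on store maps the result back. For example, the i8
+// value -96 (bit pattern 0xA0) biases to u8 32 (0x20) before the operation is applied.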
+
inline void rpp_load48_i8pkd3_to_i32pln3_avx(Rpp8s *srcPtr, __m256i *p)
{
__m128i pxSrc[8];
@@ -759,6 +789,13 @@ inline void rpp_load48_i8pln3_to_i8pln3(Rpp8s *srcPtrR, Rpp8s *srcPtrG, Rpp8s *s
px[2] = _mm_loadu_si128((__m128i *)srcPtrB); /* load [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */
}
+inline void rpp_load48_i8pln3_to_u8pln3(Rpp8s *srcPtrR, Rpp8s *srcPtrG, Rpp8s *srcPtrB, __m128i *px)
+{
+ px[0] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtrR)); /* load and convert to u8 [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */
+ px[1] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtrG)); /* load and convert to u8 [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */
+ px[2] = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtrB)); /* load and convert to u8 [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */
+}
+
inline void rpp_store48_i8pln3_to_i8pkd3(Rpp8s *dstPtr, __m128i *px)
{
__m128i pxDst[4];
@@ -774,6 +811,21 @@ inline void rpp_store48_i8pln3_to_i8pkd3(Rpp8s *dstPtr, __m128i *px)
_mm_storeu_si128((__m128i *)(dstPtr + 36), _mm_shuffle_epi8(_mm_unpackhi_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB)); /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */
}
+inline void rpp_store48_u8pln3_to_i8pkd3(Rpp8s *dstPtr, __m128i *px)
+{
+ __m128i pxDst[4];
+ __m128i pxZero = _mm_setzero_si128();
+ __m128i pxMaskRGBAtoRGB = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15);
+ pxDst[0] = _mm_unpacklo_epi8(px[1], pxZero); /* unpack 8 lo-pixels of px[1] and pxZero */
+ pxDst[1] = _mm_unpackhi_epi8(px[1], pxZero); /* unpack 8 hi-pixels of px[1] and pxZero */
+ pxDst[2] = _mm_unpacklo_epi8(px[0], px[2]); /* unpack 8 lo-pixels of px[0] and px[2] */
+ pxDst[3] = _mm_unpackhi_epi8(px[0], px[2]); /* unpack 8 hi-pixels of px[0] and px[2] */
+ _mm_storeu_si128((__m128i *)dstPtr, _mm_sub_epi8(_mm_shuffle_epi8(_mm_unpacklo_epi8(pxDst[2], pxDst[0]), pxMaskRGBAtoRGB), xmm_pxConvertI8)); /* store [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|00|00|00|00] */
+ _mm_storeu_si128((__m128i *)(dstPtr + 12), _mm_sub_epi8(_mm_shuffle_epi8(_mm_unpackhi_epi8(pxDst[2], pxDst[0]), pxMaskRGBAtoRGB), xmm_pxConvertI8)); /* store [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|00|00|00|00] */
+ _mm_storeu_si128((__m128i *)(dstPtr + 24), _mm_sub_epi8(_mm_shuffle_epi8(_mm_unpacklo_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB), xmm_pxConvertI8)); /* store [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|00|00|00|00] */
+ _mm_storeu_si128((__m128i *)(dstPtr + 36), _mm_sub_epi8(_mm_shuffle_epi8(_mm_unpackhi_epi8(pxDst[3], pxDst[1]), pxMaskRGBAtoRGB), xmm_pxConvertI8)); /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */
+}
+
inline void rpp_load16_i8_to_f32(Rpp8s *srcPtr, __m128 *p)
{
__m128i px = _mm_loadu_si128((__m128i *)srcPtr); /* load pixels 0-15 */
diff --git a/src/include/hip/rpp_hip_common.hpp b/src/include/hip/rpp_hip_common.hpp
index d894d2efd..e2e7df9f6 100644
--- a/src/include/hip/rpp_hip_common.hpp
+++ b/src/include/hip/rpp_hip_common.hpp
@@ -1727,6 +1727,34 @@ __device__ __forceinline__ void rpp_hip_math_multiply24_const(d_float24 *src_f24
dst_f24->f4[5] = src_f24->f4[5] * multiplier_f4;
}
+// d_float8 bitwiseAND
+
+__device__ __forceinline__ void rpp_hip_math_bitwiseAnd8(d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8)
+{
+ dst_f8->f1[0] = (float)((uchar)(src1_f8->f1[0]) & (uchar)(src2_f8->f1[0]));
+ dst_f8->f1[1] = (float)((uchar)(src1_f8->f1[1]) & (uchar)(src2_f8->f1[1]));
+ dst_f8->f1[2] = (float)((uchar)(src1_f8->f1[2]) & (uchar)(src2_f8->f1[2]));
+ dst_f8->f1[3] = (float)((uchar)(src1_f8->f1[3]) & (uchar)(src2_f8->f1[3]));
+ dst_f8->f1[4] = (float)((uchar)(src1_f8->f1[4]) & (uchar)(src2_f8->f1[4]));
+ dst_f8->f1[5] = (float)((uchar)(src1_f8->f1[5]) & (uchar)(src2_f8->f1[5]));
+ dst_f8->f1[6] = (float)((uchar)(src1_f8->f1[6]) & (uchar)(src2_f8->f1[6]));
+ dst_f8->f1[7] = (float)((uchar)(src1_f8->f1[7]) & (uchar)(src2_f8->f1[7]));
+}
+
+// d_float8 bitwiseOR
+
+__device__ __forceinline__ void rpp_hip_math_bitwiseOr8(d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8)
+{
+ dst_f8->f1[0] = (float)((uchar)(src1_f8->f1[0]) | (uchar)(src2_f8->f1[0]));
+ dst_f8->f1[1] = (float)((uchar)(src1_f8->f1[1]) | (uchar)(src2_f8->f1[1]));
+ dst_f8->f1[2] = (float)((uchar)(src1_f8->f1[2]) | (uchar)(src2_f8->f1[2]));
+ dst_f8->f1[3] = (float)((uchar)(src1_f8->f1[3]) | (uchar)(src2_f8->f1[3]));
+ dst_f8->f1[4] = (float)((uchar)(src1_f8->f1[4]) | (uchar)(src2_f8->f1[4]));
+ dst_f8->f1[5] = (float)((uchar)(src1_f8->f1[5]) | (uchar)(src2_f8->f1[5]));
+ dst_f8->f1[6] = (float)((uchar)(src1_f8->f1[6]) | (uchar)(src2_f8->f1[6]));
+ dst_f8->f1[7] = (float)((uchar)(src1_f8->f1[7]) | (uchar)(src2_f8->f1[7]));
+}
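+
+// Illustrative note: both helpers truncate each float lane to uchar before the bitwise op, so
+// callers are expected to pass pixel values already in the 0-255 range. For example, lanes
+// holding 170.0f and 204.0f truncate to 0xAA and 0xCC, and 0xAA & 0xCC = 0x88, stored as 136.0f.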
+
__device__ __forceinline__ float rpp_hip_math_inverse_sqrt1(float x)
{
float xHalf = 0.5f * x;
diff --git a/src/modules/cpu/host_tensor_logical_operations.hpp b/src/modules/cpu/host_tensor_logical_operations.hpp
new file mode 100644
index 000000000..0fb3fe5eb
--- /dev/null
+++ b/src/modules/cpu/host_tensor_logical_operations.hpp
@@ -0,0 +1,31 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#ifndef HOST_TENSOR_LOGICAL_OPERATIONS_HPP
+#define HOST_TENSOR_LOGICAL_OPERATIONS_HPP
+
+#include "kernel/bitwise_and.hpp"
+#include "kernel/bitwise_or.hpp"
+
+#endif // HOST_TENSOR_LOGICAL_OPERATIONS_HPP
diff --git a/src/modules/cpu/kernel/bitwise_and.hpp b/src/modules/cpu/kernel/bitwise_and.hpp
new file mode 100644
index 000000000..6f1caf1d3
--- /dev/null
+++ b/src/modules/cpu/kernel/bitwise_and.hpp
@@ -0,0 +1,965 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+
+RppStatus bitwise_and_u8_u8_host_tensor(Rpp8u *srcPtr1,
+ Rpp8u *srcPtr2,
+ RpptDescPtr srcDescPtr,
+ Rpp8u *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp8u *srcPtr1Image, *srcPtr2Image, *dstPtrImage;
+ srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride;
+ srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp8u *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel;
+ srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+ Rpp32u alignedLength = (bufferLength / 48) * 48;
+ Rpp32u vectorIncrement = 48;
+ Rpp32u vectorIncrementPerChannel = 16;
+
+ // Bitwise AND with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m128i p1[3], p2[3];
+
+ rpp_simd_load(rpp_load48_u8pkd3_to_u8pln3, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load48_u8pkd3_to_u8pln3, srcPtr2Temp, p2); // simd loads
+ p1[0] = _mm_and_si128(p1[0], p2[0]); // bitwise_and computation
+ p1[1] = _mm_and_si128(p1[1], p2[1]); // bitwise_and computation
+ p1[2] = _mm_and_si128(p1[2], p2[2]); // bitwise_and computation
+ rpp_simd_store(rpp_store48_u8pln3_to_u8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrement;
+ srcPtr2Temp += vectorIncrement;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ *dstPtrTempR++ = srcPtr1Temp[0] & srcPtr2Temp[0];
+ *dstPtrTempG++ = srcPtr1Temp[1] & srcPtr2Temp[1];
+ *dstPtrTempB++ = srcPtr1Temp[2] & srcPtr2Temp[2];
+
+ srcPtr1Temp += 3;
+ srcPtr2Temp += 3;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Bitwise AND with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp8u *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow;
+ srcPtr1RowR = srcPtr1Channel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2RowR = srcPtr2Channel;
+ srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+ srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2TempR = srcPtr2RowR;
+ srcPtr2TempG = srcPtr2RowG;
+ srcPtr2TempB = srcPtr2RowB;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m128i p1[3], p2[3];
+
+ rpp_simd_load(rpp_load48_u8pln3_to_u8pln3, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads
+ rpp_simd_load(rpp_load48_u8pln3_to_u8pln3, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads
+ p1[0] = _mm_and_si128(p1[0], p2[0]); // bitwise_and computation
+ p1[1] = _mm_and_si128(p1[1], p2[1]); // bitwise_and computation
+ p1[2] = _mm_and_si128(p1[2], p2[2]); // bitwise_and computation
+ rpp_simd_store(rpp_store48_u8pln3_to_u8pkd3, dstPtrTemp, p1); // simd stores
+
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2TempR += vectorIncrementPerChannel;
+ srcPtr2TempG += vectorIncrementPerChannel;
+ srcPtr2TempB += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrement;
+ }
+
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ dstPtrTemp[0] = *srcPtr1TempR & *srcPtr2TempR;
+ dstPtrTemp[1] = *srcPtr1TempG & *srcPtr2TempG;
+ dstPtrTemp[2] = *srcPtr1TempB & *srcPtr2TempB;
+
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ srcPtr2TempR++;
+ srcPtr2TempG++;
+ srcPtr2TempB++;
+ dstPtrTemp += 3;
+ }
+
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2RowR += srcDescPtr->strides.hStride;
+ srcPtr2RowG += srcDescPtr->strides.hStride;
+ srcPtr2RowB += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Bitwise AND without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW)
+ else
+ {
+ alignedLength = bufferLength & ~15;
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m128i p1, p2;
+
+ p1 = _mm_loadu_si128((__m128i *)srcPtr1Temp); // simd loads
+ p2 = _mm_loadu_si128((__m128i *)srcPtr2Temp); // simd loads
+ p1 = _mm_and_si128(p1, p2); // bitwise_and computation
+ _mm_storeu_si128((__m128i *)dstPtrTemp, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrementPerChannel;
+ }
+
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTemp++ = *srcPtr1Temp & *srcPtr2Temp;
+
+ srcPtr1Temp++;
+ srcPtr2Temp++;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+
+ srcPtr1Channel += srcDescPtr->strides.cStride;
+ srcPtr2Channel += srcDescPtr->strides.cStride;
+ dstPtrChannel += dstDescPtr->strides.cStride;
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+
+/* BitwiseAND is a bitwise logical operation defined only for U8/I8 types.
+   For a Rpp32f precision image (pixel values from 0-1), the BitwiseAND is applied on a 0-255
+   range-translated approximation of the original 0-1 decimal-range image.
+   Link: https://stackoverflow.com/questions/1723575/how-to-perform-a-bitwise-operation-on-floating-point-numbers */
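+/* Illustrative example: for source pixels 0.5 and 0.25, the kernel computes
+   (uint)(0.5 * 255) & (uint)(0.25 * 255) = 127 & 63 = 63, and returns 63 / 255 ~= 0.247. */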
+RppStatus bitwise_and_f32_f32_host_tensor(Rpp32f *srcPtr1,
+ Rpp32f *srcPtr2,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32f *srcPtr1Image, *srcPtr2Image, *dstPtrImage;
+ srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride;
+ srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp32f *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel;
+ srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+#if __AVX2__
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+#endif
+
+ // Bitwise AND with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p1[3], p2[3];
+
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr2Temp, p2); // simd loads
+ p1[0] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // bitwise_and computation
+ p1[1] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // bitwise_and computation
+ p1[2] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // bitwise_and computation
+ p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+ p1[1] = _mm256_mul_ps(p1[1], avx_p1op255);
+ p1[2] = _mm256_mul_ps(p1[2], avx_p1op255);
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrement;
+ srcPtr2Temp += vectorIncrement;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ *dstPtrTempR++ = RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[0] * 255) & (uint)(srcPtr2Temp[0] * 255)) / 255);
+ *dstPtrTempG++ = RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[1] * 255) & (uint)(srcPtr2Temp[1] * 255)) / 255);
+ *dstPtrTempB++ = RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[2] * 255) & (uint)(srcPtr2Temp[2] * 255)) / 255);
+
+ srcPtr1Temp += 3;
+ srcPtr2Temp += 3;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Bitwise AND with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow;
+ srcPtr1RowR = srcPtr1Channel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2RowR = srcPtr2Channel;
+ srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+ srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2TempR = srcPtr2RowR;
+ srcPtr2TempG = srcPtr2RowG;
+ srcPtr2TempB = srcPtr2RowB;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[3], p2[3];
+
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads
+ p1[0] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // bitwise_and computation
+ p1[1] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // bitwise_and computation
+ p1[2] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // bitwise_and computation
+ p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+ p1[1] = _mm256_mul_ps(p1[1], avx_p1op255);
+ p1[2] = _mm256_mul_ps(p1[2], avx_p1op255);
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, p1); // simd stores
+
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2TempR += vectorIncrementPerChannel;
+ srcPtr2TempG += vectorIncrementPerChannel;
+ srcPtr2TempB += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ dstPtrTemp[0] = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempR * 255) & (uint)(*srcPtr2TempR * 255)) / 255);
+ dstPtrTemp[1] = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempG * 255) & (uint)(*srcPtr2TempG * 255)) / 255);
+ dstPtrTemp[2] = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempB * 255) & (uint)(*srcPtr2TempB * 255)) / 255);
+
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ srcPtr2TempR++;
+ srcPtr2TempG++;
+ srcPtr2TempB++;
+ dstPtrTemp += 3;
+ }
+
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2RowR += srcDescPtr->strides.hStride;
+ srcPtr2RowG += srcDescPtr->strides.hStride;
+ srcPtr2RowB += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Bitwise AND without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW)
+ else
+ {
+#if __AVX2__
+ alignedLength = bufferLength & ~7;
+#endif
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[1], p2[1];
+
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp, p2); // simd loads
+ p1[0] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // bitwise_and computation
+ p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTemp++ = RPPPIXELCHECKF32((float)((uint)(*srcPtr1Temp * 255) & (uint)(*srcPtr2Temp * 255)) / 255);
+
+ srcPtr1Temp++;
+ srcPtr2Temp++;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+
+ srcPtr1Channel += srcDescPtr->strides.cStride;
+ srcPtr2Channel += srcDescPtr->strides.cStride;
+ dstPtrChannel += dstDescPtr->strides.cStride;
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+
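+/* The F16 variant below stages pixels through Rpp32f scratch arrays so the same AVX2 integer
+   path as the F32 kernel can be reused; the bitwise operation itself is identical. */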
+RppStatus bitwise_and_f16_f16_host_tensor(Rpp16f *srcPtr1,
+ Rpp16f *srcPtr2,
+ RpptDescPtr srcDescPtr,
+ Rpp16f *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp16f *srcPtr1Image, *srcPtr2Image, *dstPtrImage;
+ srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride;
+ srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp16f *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel;
+ srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+#if __AVX2__
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+#endif
+
+ // Bitwise AND with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ Rpp32f srcPtr1Temp_ps[24], srcPtr2Temp_ps[24];
+
+ for(int cnt = 0; cnt < vectorIncrement; cnt++)
+ {
+                        srcPtr1Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr1Temp[cnt]);
+                        srcPtr2Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr2Temp[cnt]);
+ }
+
+ __m256 p1[3], p2[3];
+
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr1Temp_ps, p1); // simd loads
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr2Temp_ps, p2); // simd loads
+ p1[0] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // bitwise_and computation
+ p1[1] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // bitwise_and computation
+ p1[2] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // bitwise_and computation
+ p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+ p1[1] = _mm256_mul_ps(p1[1], avx_p1op255);
+ p1[2] = _mm256_mul_ps(p1[2], avx_p1op255);
+ rpp_simd_store(rpp_store24_f32pln3_to_f16pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrement;
+ srcPtr2Temp += vectorIncrement;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+                    *dstPtrTempR++ = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[0] * 255) & (uint)(srcPtr2Temp[0] * 255)) / 255));
+                    *dstPtrTempG++ = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[1] * 255) & (uint)(srcPtr2Temp[1] * 255)) / 255));
+                    *dstPtrTempB++ = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[2] * 255) & (uint)(srcPtr2Temp[2] * 255)) / 255));
+
+ srcPtr1Temp += 3;
+ srcPtr2Temp += 3;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Bitwise AND with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp16f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow;
+ srcPtr1RowR = srcPtr1Channel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2RowR = srcPtr2Channel;
+ srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+ srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2TempR = srcPtr2RowR;
+ srcPtr2TempG = srcPtr2RowG;
+ srcPtr2TempB = srcPtr2RowB;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ Rpp32f srcPtr1Temp_ps[24], srcPtr2Temp_ps[24];
+
+ for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++)
+ {
+                        srcPtr1Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr1TempR[cnt]);
+                        srcPtr1Temp_ps[cnt + 8] = static_cast<Rpp32f>(srcPtr1TempG[cnt]);
+                        srcPtr1Temp_ps[cnt + 16] = static_cast<Rpp32f>(srcPtr1TempB[cnt]);
+
+                        srcPtr2Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr2TempR[cnt]);
+                        srcPtr2Temp_ps[cnt + 8] = static_cast<Rpp32f>(srcPtr2TempG[cnt]);
+                        srcPtr2Temp_ps[cnt + 16] = static_cast<Rpp32f>(srcPtr2TempB[cnt]);
+ }
+
+ __m256 p1[4], p2[4];
+
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1Temp_ps, srcPtr1Temp_ps + 8, srcPtr1Temp_ps + 16, p1); // simd loads
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr2Temp_ps, srcPtr2Temp_ps + 8, srcPtr2Temp_ps + 16, p2); // simd loads
+ p1[0] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // bitwise_and computation
+ p1[1] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // bitwise_and computation
+ p1[2] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // bitwise_and computation
+ p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+ p1[1] = _mm256_mul_ps(p1[1], avx_p1op255);
+ p1[2] = _mm256_mul_ps(p1[2], avx_p1op255);
+ rpp_simd_store(rpp_store24_f32pln3_to_f16pkd3_avx, dstPtrTemp, p1); // simd stores
+
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2TempR += vectorIncrementPerChannel;
+ srcPtr2TempG += vectorIncrementPerChannel;
+ srcPtr2TempB += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+                    dstPtrTemp[0] = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempR * 255) & (uint)(*srcPtr2TempR * 255)) / 255));
+                    dstPtrTemp[1] = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempG * 255) & (uint)(*srcPtr2TempG * 255)) / 255));
+                    dstPtrTemp[2] = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempB * 255) & (uint)(*srcPtr2TempB * 255)) / 255));
+
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ srcPtr2TempR++;
+ srcPtr2TempG++;
+ srcPtr2TempB++;
+ dstPtrTemp += 3;
+ }
+
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2RowR += srcDescPtr->strides.hStride;
+ srcPtr2RowG += srcDescPtr->strides.hStride;
+ srcPtr2RowB += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Bitwise AND without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW)
+ else
+ {
+#if __AVX2__
+ alignedLength = bufferLength & ~7;
+#endif
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ Rpp32f srcPtr1Temp_ps[8], srcPtr2Temp_ps[8];
+
+ for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++)
+ {
+                            srcPtr1Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr1Temp[cnt]);
+                            srcPtr2Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr2Temp[cnt]);
+ }
+
+ __m256 p1[1], p2[1];
+
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr1Temp_ps, p1); // simd loads
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp_ps, p2); // simd loads
+ p1[0] = _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // bitwise_and computation
+ p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+ rpp_simd_store(rpp_store8_f32_to_f16_avx, dstPtrTemp, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+                        *dstPtrTemp++ = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(*srcPtr1Temp * 255) & (uint)(*srcPtr2Temp * 255)) / 255));
+
+ srcPtr1Temp++;
+ srcPtr2Temp++;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+
+ srcPtr1Channel += srcDescPtr->strides.cStride;
+ srcPtr2Channel += srcDescPtr->strides.cStride;
+ dstPtrChannel += dstDescPtr->strides.cStride;
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+
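+/* The I8 variant biases each value by +128 into u8 space before the bitwise AND and shifts the
+   result back by -128, matching the i8 <-> u8 SIMD load/store helpers in rpp_cpu_simd.hpp. */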
+RppStatus bitwise_and_i8_i8_host_tensor(Rpp8s *srcPtr1,
+ Rpp8s *srcPtr2,
+ RpptDescPtr srcDescPtr,
+ Rpp8s *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp8s *srcPtr1Image, *srcPtr2Image, *dstPtrImage;
+ srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride;
+ srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp8s *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel;
+ srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+ Rpp32u alignedLength = (bufferLength / 48) * 48;
+ Rpp32u vectorIncrement = 48;
+ Rpp32u vectorIncrementPerChannel = 16;
+
+ // Bitwise AND with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m128i p1[3], p2[3];
+
+ rpp_simd_load(rpp_load48_i8pkd3_to_u8pln3, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load48_i8pkd3_to_u8pln3, srcPtr2Temp, p2); // simd loads
+ p1[0] = _mm_and_si128(p1[0], p2[0]); // bitwise_and computation
+ p1[1] = _mm_and_si128(p1[1], p2[1]); // bitwise_and computation
+ p1[2] = _mm_and_si128(p1[2], p2[2]); // bitwise_and computation
+ rpp_simd_store(rpp_store48_u8pln3_to_i8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrement;
+ srcPtr2Temp += vectorIncrement;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ *dstPtrTempR++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(((srcPtr1Temp[0] + 128) & (srcPtr2Temp[0] + 128)) - 128));
+ *dstPtrTempG++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(((srcPtr1Temp[1] + 128) & (srcPtr2Temp[1] + 128)) - 128));
+ *dstPtrTempB++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(((srcPtr1Temp[2] + 128) & (srcPtr2Temp[2] + 128)) - 128));
+
+ srcPtr1Temp += 3;
+ srcPtr2Temp += 3;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Bitwise AND with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp8s *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow;
+ srcPtr1RowR = srcPtr1Channel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2RowR = srcPtr2Channel;
+ srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+ srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2TempR = srcPtr2RowR;
+ srcPtr2TempG = srcPtr2RowG;
+ srcPtr2TempB = srcPtr2RowB;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m128i p1[3], p2[3];
+
+ rpp_simd_load(rpp_load48_i8pln3_to_u8pln3, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads
+ rpp_simd_load(rpp_load48_i8pln3_to_u8pln3, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads
+ p1[0] = _mm_and_si128(p1[0], p2[0]); // bitwise_and computation
+ p1[1] = _mm_and_si128(p1[1], p2[1]); // bitwise_and computation
+ p1[2] = _mm_and_si128(p1[2], p2[2]); // bitwise_and computation
+ rpp_simd_store(rpp_store48_u8pln3_to_i8pkd3, dstPtrTemp, p1); // simd stores
+
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2TempR += vectorIncrementPerChannel;
+ srcPtr2TempG += vectorIncrementPerChannel;
+ srcPtr2TempB += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrement;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ dstPtrTemp[0] = static_cast<Rpp8s>(RPPPIXELCHECKI8((static_cast<Rpp8u>(*srcPtr1TempR + 128) & static_cast<Rpp8u>(*srcPtr2TempR + 128)) - 128));
+ dstPtrTemp[1] = static_cast<Rpp8s>(RPPPIXELCHECKI8((static_cast<Rpp8u>(*srcPtr1TempG + 128) & static_cast<Rpp8u>(*srcPtr2TempG + 128)) - 128));
+ dstPtrTemp[2] = static_cast<Rpp8s>(RPPPIXELCHECKI8((static_cast<Rpp8u>(*srcPtr1TempB + 128) & static_cast<Rpp8u>(*srcPtr2TempB + 128)) - 128));
+
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ srcPtr2TempR++;
+ srcPtr2TempG++;
+ srcPtr2TempB++;
+ dstPtrTemp += 3;
+ }
+
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2RowR += srcDescPtr->strides.hStride;
+ srcPtr2RowG += srcDescPtr->strides.hStride;
+ srcPtr2RowB += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Bitwise AND without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW)
+ else
+ {
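+ // masking with ~15 rounds bufferLength down to a multiple of 16, the number of 8-bit pixels processed per SSE iteration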
+ alignedLength = bufferLength & ~15;
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m128i p1, p2;
+
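+ // adding xmm_pxConvertI8 (+128) maps the i8 range [-128, 127] onto u8 [0, 255] so the AND operates on unsigned bit patterns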
+ p1 = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtr1Temp)); // simd loads
+ p2 = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtr2Temp)); // simd loads
+ p1 = _mm_and_si128(p1, p2); // bitwise_and computation
+ _mm_storeu_si128((__m128i *)dstPtrTemp, _mm_sub_epi8(p1, xmm_pxConvertI8)); // simd stores
+
+ srcPtr1Temp += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrementPerChannel;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTemp++ = static_cast<Rpp8s>(RPPPIXELCHECKI8((static_cast<Rpp8u>(*srcPtr1Temp + 128) & static_cast<Rpp8u>(*srcPtr2Temp + 128)) - 128));
+
+ srcPtr1Temp++;
+ srcPtr2Temp++;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+
+ srcPtr1Channel += srcDescPtr->strides.cStride;
+ srcPtr2Channel += srcDescPtr->strides.cStride;
+ dstPtrChannel += dstDescPtr->strides.cStride;
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
diff --git a/src/modules/cpu/kernel/bitwise_or.hpp b/src/modules/cpu/kernel/bitwise_or.hpp
new file mode 100644
index 000000000..40803e933
--- /dev/null
+++ b/src/modules/cpu/kernel/bitwise_or.hpp
@@ -0,0 +1,965 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_simd.hpp"
+#include "rpp_cpu_common.hpp"
+
+RppStatus bitwise_or_u8_u8_host_tensor(Rpp8u *srcPtr1,
+ Rpp8u *srcPtr2,
+ RpptDescPtr srcDescPtr,
+ Rpp8u *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp8u *srcPtr1Image, *srcPtr2Image, *dstPtrImage;
+ srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride;
+ srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp8u *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel;
+ srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+ Rpp32u alignedLength = (bufferLength / 48) * 48;
+ Rpp32u vectorIncrement = 48;
+ Rpp32u vectorIncrementPerChannel = 16;
+
+ // Bitwise OR with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m128i p1[3], p2[3];
+
+ rpp_simd_load(rpp_load48_u8pkd3_to_u8pln3, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load48_u8pkd3_to_u8pln3, srcPtr2Temp, p2); // simd loads
+ p1[0] = _mm_or_si128(p1[0], p2[0]); // bitwise_or computation
+ p1[1] = _mm_or_si128(p1[1], p2[1]); // bitwise_or computation
+ p1[2] = _mm_or_si128(p1[2], p2[2]); // bitwise_or computation
+ rpp_simd_store(rpp_store48_u8pln3_to_u8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrement;
+ srcPtr2Temp += vectorIncrement;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ *dstPtrTempR++ = srcPtr1Temp[0] | srcPtr2Temp[0];
+ *dstPtrTempG++ = srcPtr1Temp[1] | srcPtr2Temp[1];
+ *dstPtrTempB++ = srcPtr1Temp[2] | srcPtr2Temp[2];
+
+ srcPtr1Temp += 3;
+ srcPtr2Temp += 3;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Bitwise OR with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp8u *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow;
+ srcPtr1RowR = srcPtr1Channel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2RowR = srcPtr2Channel;
+ srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+ srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2TempR = srcPtr2RowR;
+ srcPtr2TempG = srcPtr2RowG;
+ srcPtr2TempB = srcPtr2RowB;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m128i p1[3], p2[3];
+
+ rpp_simd_load(rpp_load48_u8pln3_to_u8pln3, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads
+ rpp_simd_load(rpp_load48_u8pln3_to_u8pln3, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads
+ p1[0] = _mm_or_si128(p1[0], p2[0]); // bitwise_or computation
+ p1[1] = _mm_or_si128(p1[1], p2[1]); // bitwise_or computation
+ p1[2] = _mm_or_si128(p1[2], p2[2]); // bitwise_or computation
+ rpp_simd_store(rpp_store48_u8pln3_to_u8pkd3, dstPtrTemp, p1); // simd stores
+
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2TempR += vectorIncrementPerChannel;
+ srcPtr2TempG += vectorIncrementPerChannel;
+ srcPtr2TempB += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrement;
+ }
+
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ dstPtrTemp[0] = *srcPtr1TempR | *srcPtr2TempR;
+ dstPtrTemp[1] = *srcPtr1TempG | *srcPtr2TempG;
+ dstPtrTemp[2] = *srcPtr1TempB | *srcPtr2TempB;
+
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ srcPtr2TempR++;
+ srcPtr2TempG++;
+ srcPtr2TempB++;
+ dstPtrTemp += 3;
+ }
+
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2RowR += srcDescPtr->strides.hStride;
+ srcPtr2RowG += srcDescPtr->strides.hStride;
+ srcPtr2RowB += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Bitwise OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW)
+ else
+ {
+ alignedLength = bufferLength & ~15;
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp8u *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8u *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m128i p1, p2;
+
+ p1 = _mm_loadu_si128((__m128i *)srcPtr1Temp); // simd loads
+ p2 = _mm_loadu_si128((__m128i *)srcPtr2Temp); // simd loads
+ p1 = _mm_or_si128(p1, p2); // bitwise_or computation
+ _mm_storeu_si128((__m128i *)dstPtrTemp, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrementPerChannel;
+ }
+
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTemp++ = *srcPtr1Temp | *srcPtr2Temp;
+
+ srcPtr1Temp++;
+ srcPtr2Temp++;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+
+ srcPtr1Channel += srcDescPtr->strides.cStride;
+ srcPtr2Channel += srcDescPtr->strides.cStride;
+ dstPtrChannel += dstDescPtr->strides.cStride;
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+
+/* BitwiseOR is a logical operation defined only on U8/I8 types.
+ For a Rpp32f precision image (pixel values from 0-1), the BitwiseOR is applied on a 0-255
+ range-translated approximation of the original 0-1 decimal-range image.
+ Link: https://stackoverflow.com/questions/1723575/how-to-perform-a-bitwise-operation-on-floating-point-numbers */
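+/* Illustrative example: for normalized inputs 0.5 and 0.25, (uint)(0.5 * 255) = 127 (0x7F)
+ and (uint)(0.25 * 255) = 63 (0x3F); 127 | 63 = 127, so the stored result is 127 / 255 ≈ 0.498.
+ The round trip quantizes to 8-bit precision, hence "approximation" above. */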
+RppStatus bitwise_or_f32_f32_host_tensor(Rpp32f *srcPtr1,
+ Rpp32f *srcPtr2,
+ RpptDescPtr srcDescPtr,
+ Rpp32f *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp32f *srcPtr1Image, *srcPtr2Image, *dstPtrImage;
+ srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride;
+ srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp32f *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel;
+ srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+#if __AVX2__
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+#endif
+
+ // Bitwise OR with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p1[3], p2[3];
+
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr2Temp, p2); // simd loads
+ p1[0] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // bitwise_or computation
+ p1[1] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // bitwise_or computation
+ p1[2] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // bitwise_or computation
+ p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+ p1[1] = _mm256_mul_ps(p1[1], avx_p1op255);
+ p1[2] = _mm256_mul_ps(p1[2], avx_p1op255);
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrement;
+ srcPtr2Temp += vectorIncrement;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ *dstPtrTempR++ = RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[0] * 255) | (uint)(srcPtr2Temp[0] * 255)) / 255);
+ *dstPtrTempG++ = RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[1] * 255) | (uint)(srcPtr2Temp[1] * 255)) / 255);
+ *dstPtrTempB++ = RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[2] * 255) | (uint)(srcPtr2Temp[2] * 255)) / 255);
+
+ srcPtr1Temp += 3;
+ srcPtr2Temp += 3;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Bitwise OR with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp32f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow;
+ srcPtr1RowR = srcPtr1Channel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2RowR = srcPtr2Channel;
+ srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+ srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2TempR = srcPtr2RowR;
+ srcPtr2TempG = srcPtr2RowG;
+ srcPtr2TempB = srcPtr2RowB;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[3], p2[3];
+
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads
+ p1[0] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // bitwise_or computation
+ p1[1] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // bitwise_or computation
+ p1[2] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // bitwise_or computation
+ p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+ p1[1] = _mm256_mul_ps(p1[1], avx_p1op255);
+ p1[2] = _mm256_mul_ps(p1[2], avx_p1op255);
+ rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, p1); // simd stores
+
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2TempR += vectorIncrementPerChannel;
+ srcPtr2TempG += vectorIncrementPerChannel;
+ srcPtr2TempB += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ dstPtrTemp[0] = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempR * 255) | (uint)(*srcPtr2TempR * 255)) / 255);
+ dstPtrTemp[1] = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempG * 255) | (uint)(*srcPtr2TempG * 255)) / 255);
+ dstPtrTemp[2] = RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempB * 255) | (uint)(*srcPtr2TempB * 255)) / 255);
+
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ srcPtr2TempR++;
+ srcPtr2TempG++;
+ srcPtr2TempB++;
+ dstPtrTemp += 3;
+ }
+
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2RowR += srcDescPtr->strides.hStride;
+ srcPtr2RowG += srcDescPtr->strides.hStride;
+ srcPtr2RowB += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Bitwise OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW)
+ else
+ {
+#if __AVX2__
+ alignedLength = bufferLength & ~7;
+#endif
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp32f *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp32f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m256 p1[1], p2[1];
+
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp, p2); // simd loads
+ p1[0] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // bitwise_or computation
+ p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+ rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTemp++ = RPPPIXELCHECKF32((float)((uint)(*srcPtr1Temp * 255) | (uint)(*srcPtr2Temp * 255)) / 255);
+
+ srcPtr1Temp++;
+ srcPtr2Temp++;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+
+ srcPtr1Channel += srcDescPtr->strides.cStride;
+ srcPtr2Channel += srcDescPtr->strides.cStride;
+ dstPtrChannel += dstDescPtr->strides.cStride;
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+
+RppStatus bitwise_or_f16_f16_host_tensor(Rpp16f *srcPtr1,
+ Rpp16f *srcPtr2,
+ RpptDescPtr srcDescPtr,
+ Rpp16f *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp16f *srcPtr1Image, *srcPtr2Image, *dstPtrImage;
+ srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride;
+ srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp16f *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel;
+ srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+#if __AVX2__
+ Rpp32u alignedLength = (bufferLength / 24) * 24;
+ Rpp32u vectorIncrement = 24;
+ Rpp32u vectorIncrementPerChannel = 8;
+#endif
+
+ // Bitwise OR with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ Rpp32f srcPtr1Temp_ps[24], srcPtr2Temp_ps[24];
+
+ for(int cnt = 0; cnt < vectorIncrement; cnt++)
+ {
+ srcPtr1Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr1Temp[cnt]);
+ srcPtr2Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr2Temp[cnt]);
+ }
+
+ __m256 p1[3], p2[3];
+
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr1Temp_ps, p1); // simd loads
+ rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtr2Temp_ps, p2); // simd loads
+ p1[0] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // bitwise_or computation
+ p1[1] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // bitwise_or computation
+ p1[2] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // bitwise_or computation
+ p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+ p1[1] = _mm256_mul_ps(p1[1], avx_p1op255);
+ p1[2] = _mm256_mul_ps(p1[2], avx_p1op255);
+ rpp_simd_store(rpp_store24_f32pln3_to_f16pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrement;
+ srcPtr2Temp += vectorIncrement;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ *dstPtrTempR++ = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[0] * 255) | (uint)(srcPtr2Temp[0] * 255)) / 255));
+ *dstPtrTempG++ = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[1] * 255) | (uint)(srcPtr2Temp[1] * 255)) / 255));
+ *dstPtrTempB++ = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(srcPtr1Temp[2] * 255) | (uint)(srcPtr2Temp[2] * 255)) / 255));
+
+ srcPtr1Temp += 3;
+ srcPtr2Temp += 3;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Bitwise OR with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp16f *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow;
+ srcPtr1RowR = srcPtr1Channel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2RowR = srcPtr2Channel;
+ srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+ srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2TempR = srcPtr2RowR;
+ srcPtr2TempG = srcPtr2RowG;
+ srcPtr2TempB = srcPtr2RowB;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ Rpp32f srcPtr1Temp_ps[24], srcPtr2Temp_ps[24];
+
+ for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++)
+ {
+ srcPtr1Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr1TempR[cnt]);
+ srcPtr1Temp_ps[cnt + 8] = static_cast<Rpp32f>(srcPtr1TempG[cnt]);
+ srcPtr1Temp_ps[cnt + 16] = static_cast<Rpp32f>(srcPtr1TempB[cnt]);
+
+ srcPtr2Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr2TempR[cnt]);
+ srcPtr2Temp_ps[cnt + 8] = static_cast<Rpp32f>(srcPtr2TempG[cnt]);
+ srcPtr2Temp_ps[cnt + 16] = static_cast<Rpp32f>(srcPtr2TempB[cnt]);
+ }
+
+ __m256 p1[4], p2[4];
+
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr1Temp_ps, srcPtr1Temp_ps + 8, srcPtr1Temp_ps + 16, p1); // simd loads
+ rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtr2Temp_ps, srcPtr2Temp_ps + 8, srcPtr2Temp_ps + 16, p2); // simd loads
+ p1[0] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // bitwise_or computation
+ p1[1] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[1], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[1], avx_p255)))); // bitwise_or computation
+ p1[2] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[2], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[2], avx_p255)))); // bitwise_or computation
+ p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+ p1[1] = _mm256_mul_ps(p1[1], avx_p1op255);
+ p1[2] = _mm256_mul_ps(p1[2], avx_p1op255);
+ rpp_simd_store(rpp_store24_f32pln3_to_f16pkd3_avx, dstPtrTemp, p1); // simd stores
+
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2TempR += vectorIncrementPerChannel;
+ srcPtr2TempG += vectorIncrementPerChannel;
+ srcPtr2TempB += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ dstPtrTemp[0] = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempR * 255) | (uint)(*srcPtr2TempR * 255)) / 255));
+ dstPtrTemp[1] = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempG * 255) | (uint)(*srcPtr2TempG * 255)) / 255));
+ dstPtrTemp[2] = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(*srcPtr1TempB * 255) | (uint)(*srcPtr2TempB * 255)) / 255));
+
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ srcPtr2TempR++;
+ srcPtr2TempG++;
+ srcPtr2TempB++;
+ dstPtrTemp += 3;
+ }
+
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2RowR += srcDescPtr->strides.hStride;
+ srcPtr2RowG += srcDescPtr->strides.hStride;
+ srcPtr2RowB += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Bitwise OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW)
+ else
+ {
+#if __AVX2__
+ alignedLength = bufferLength & ~7;
+#endif
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp16f *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp16f *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ Rpp32f srcPtr1Temp_ps[8], srcPtr2Temp_ps[8];
+
+ for(int cnt = 0; cnt < vectorIncrementPerChannel; cnt++)
+ {
+ srcPtr1Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr1Temp[cnt]);
+ srcPtr2Temp_ps[cnt] = static_cast<Rpp32f>(srcPtr2Temp[cnt]);
+ }
+
+ __m256 p1[1], p2[1];
+
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr1Temp_ps, p1); // simd loads
+ rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtr2Temp_ps, p2); // simd loads
+ p1[0] = _mm256_cvtepi32_ps(_mm256_or_si256(_mm256_cvttps_epi32(_mm256_mul_ps(p1[0], avx_p255)), _mm256_cvttps_epi32(_mm256_mul_ps(p2[0], avx_p255)))); // bitwise_or computation
+ p1[0] = _mm256_mul_ps(p1[0], avx_p1op255);
+ rpp_simd_store(rpp_store8_f32_to_f16_avx, dstPtrTemp, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrementPerChannel;
+ }
+#endif
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTemp++ = static_cast<Rpp16f>(RPPPIXELCHECKF32((float)((uint)(*srcPtr1Temp * 255) | (uint)(*srcPtr2Temp * 255)) / 255));
+
+ srcPtr1Temp++;
+ srcPtr2Temp++;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+
+ srcPtr1Channel += srcDescPtr->strides.cStride;
+ srcPtr2Channel += srcDescPtr->strides.cStride;
+ dstPtrChannel += dstDescPtr->strides.cStride;
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
+
+RppStatus bitwise_or_i8_i8_host_tensor(Rpp8s *srcPtr1,
+ Rpp8s *srcPtr2,
+ RpptDescPtr srcDescPtr,
+ Rpp8s *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ RppLayoutParams layoutParams,
+ rpp::Handle& handle)
+{
+ RpptROI roiDefault = {0, 0, (Rpp32s)srcDescPtr->w, (Rpp32s)srcDescPtr->h};
+ Rpp32u numThreads = handle.GetNumThreads();
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < dstDescPtr->n; batchCount++)
+ {
+ RpptROI roi;
+ RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+ compute_roi_validation_host(roiPtrInput, &roi, &roiDefault, roiType);
+
+ Rpp8s *srcPtr1Image, *srcPtr2Image, *dstPtrImage;
+ srcPtr1Image = srcPtr1 + batchCount * srcDescPtr->strides.nStride;
+ srcPtr2Image = srcPtr2 + batchCount * srcDescPtr->strides.nStride;
+ dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+ Rpp32u bufferLength = roi.xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+ Rpp8s *srcPtr1Channel, *srcPtr2Channel, *dstPtrChannel;
+ srcPtr1Channel = srcPtr1Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ srcPtr2Channel = srcPtr2Image + (roi.xywhROI.xy.y * srcDescPtr->strides.hStride) + (roi.xywhROI.xy.x * layoutParams.bufferMultiplier);
+ dstPtrChannel = dstPtrImage;
+
+ Rpp32u alignedLength = (bufferLength / 48) * 48;
+ Rpp32u vectorIncrement = 48;
+ Rpp32u vectorIncrementPerChannel = 16;
+
+ // Bitwise OR with fused output-layout toggle (NHWC -> NCHW)
+ if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRowR = dstPtrChannel;
+ dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+ dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTempR = dstPtrRowR;
+ dstPtrTempG = dstPtrRowG;
+ dstPtrTempB = dstPtrRowB;
+
+ int vectorLoopCount = 0;
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m128i p1[3], p2[3];
+
+ rpp_simd_load(rpp_load48_i8pkd3_to_u8pln3, srcPtr1Temp, p1); // simd loads
+ rpp_simd_load(rpp_load48_i8pkd3_to_u8pln3, srcPtr2Temp, p2); // simd loads
+ p1[0] = _mm_or_si128(p1[0], p2[0]); // bitwise_or computation
+ p1[1] = _mm_or_si128(p1[1], p2[1]); // bitwise_or computation
+ p1[2] = _mm_or_si128(p1[2], p2[2]); // bitwise_or computation
+ rpp_simd_store(rpp_store48_u8pln3_to_i8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores
+
+ srcPtr1Temp += vectorIncrement;
+ srcPtr2Temp += vectorIncrement;
+ dstPtrTempR += vectorIncrementPerChannel;
+ dstPtrTempG += vectorIncrementPerChannel;
+ dstPtrTempB += vectorIncrementPerChannel;
+ }
+
+ for (; vectorLoopCount < bufferLength; vectorLoopCount += 3)
+ {
+ *dstPtrTempR++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(((srcPtr1Temp[0] + 128) | (srcPtr2Temp[0] + 128)) - 128));
+ *dstPtrTempG++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(((srcPtr1Temp[1] + 128) | (srcPtr2Temp[1] + 128)) - 128));
+ *dstPtrTempB++ = static_cast<Rpp8s>(RPPPIXELCHECKI8(((srcPtr1Temp[2] + 128) | (srcPtr2Temp[2] + 128)) - 128));
+
+ srcPtr1Temp += 3;
+ srcPtr2Temp += 3;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRowR += dstDescPtr->strides.hStride;
+ dstPtrRowG += dstDescPtr->strides.hStride;
+ dstPtrRowB += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Bitwise OR with fused output-layout toggle (NCHW -> NHWC)
+ else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ Rpp8s *srcPtr1RowR, *srcPtr1RowG, *srcPtr1RowB, *srcPtr2RowR, *srcPtr2RowG, *srcPtr2RowB, *dstPtrRow;
+ srcPtr1RowR = srcPtr1Channel;
+ srcPtr1RowG = srcPtr1RowR + srcDescPtr->strides.cStride;
+ srcPtr1RowB = srcPtr1RowG + srcDescPtr->strides.cStride;
+ srcPtr2RowR = srcPtr2Channel;
+ srcPtr2RowG = srcPtr2RowR + srcDescPtr->strides.cStride;
+ srcPtr2RowB = srcPtr2RowG + srcDescPtr->strides.cStride;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtr1TempR, *srcPtr1TempG, *srcPtr1TempB, *srcPtr2TempR, *srcPtr2TempG, *srcPtr2TempB, *dstPtrTemp;
+ srcPtr1TempR = srcPtr1RowR;
+ srcPtr1TempG = srcPtr1RowG;
+ srcPtr1TempB = srcPtr1RowB;
+ srcPtr2TempR = srcPtr2RowR;
+ srcPtr2TempG = srcPtr2RowG;
+ srcPtr2TempB = srcPtr2RowB;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m128i p1[3], p2[3];
+
+ rpp_simd_load(rpp_load48_i8pln3_to_u8pln3, srcPtr1TempR, srcPtr1TempG, srcPtr1TempB, p1); // simd loads
+ rpp_simd_load(rpp_load48_i8pln3_to_u8pln3, srcPtr2TempR, srcPtr2TempG, srcPtr2TempB, p2); // simd loads
+ p1[0] = _mm_or_si128(p1[0], p2[0]); // bitwise_or computation
+ p1[1] = _mm_or_si128(p1[1], p2[1]); // bitwise_or computation
+ p1[2] = _mm_or_si128(p1[2], p2[2]); // bitwise_or computation
+ rpp_simd_store(rpp_store48_u8pln3_to_i8pkd3, dstPtrTemp, p1); // simd stores
+
+ srcPtr1TempR += vectorIncrementPerChannel;
+ srcPtr1TempG += vectorIncrementPerChannel;
+ srcPtr1TempB += vectorIncrementPerChannel;
+ srcPtr2TempR += vectorIncrementPerChannel;
+ srcPtr2TempG += vectorIncrementPerChannel;
+ srcPtr2TempB += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrement;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ dstPtrTemp[0] = static_cast<Rpp8s>(RPPPIXELCHECKI8((static_cast<Rpp8u>(*srcPtr1TempR + 128) | static_cast<Rpp8u>(*srcPtr2TempR + 128)) - 128));
+ dstPtrTemp[1] = static_cast<Rpp8s>(RPPPIXELCHECKI8((static_cast<Rpp8u>(*srcPtr1TempG + 128) | static_cast<Rpp8u>(*srcPtr2TempG + 128)) - 128));
+ dstPtrTemp[2] = static_cast<Rpp8s>(RPPPIXELCHECKI8((static_cast<Rpp8u>(*srcPtr1TempB + 128) | static_cast<Rpp8u>(*srcPtr2TempB + 128)) - 128));
+
+ srcPtr1TempR++;
+ srcPtr1TempG++;
+ srcPtr1TempB++;
+ srcPtr2TempR++;
+ srcPtr2TempG++;
+ srcPtr2TempB++;
+ dstPtrTemp += 3;
+ }
+
+ srcPtr1RowR += srcDescPtr->strides.hStride;
+ srcPtr1RowG += srcDescPtr->strides.hStride;
+ srcPtr1RowB += srcDescPtr->strides.hStride;
+ srcPtr2RowR += srcDescPtr->strides.hStride;
+ srcPtr2RowG += srcDescPtr->strides.hStride;
+ srcPtr2RowB += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+ }
+
+ // Bitwise OR without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW)
+ else
+ {
+ alignedLength = bufferLength & ~15;
+
+ for(int c = 0; c < layoutParams.channelParam; c++)
+ {
+ Rpp8s *srcPtr1Row, *srcPtr2Row, *dstPtrRow;
+ srcPtr1Row = srcPtr1Channel;
+ srcPtr2Row = srcPtr2Channel;
+ dstPtrRow = dstPtrChannel;
+
+ for(int i = 0; i < roi.xywhROI.roiHeight; i++)
+ {
+ Rpp8s *srcPtr1Temp, *srcPtr2Temp, *dstPtrTemp;
+ srcPtr1Temp = srcPtr1Row;
+ srcPtr2Temp = srcPtr2Row;
+ dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrementPerChannel)
+ {
+ __m128i p1, p2;
+
+ p1 = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtr1Temp)); // simd loads
+ p2 = _mm_add_epi8(xmm_pxConvertI8, _mm_loadu_si128((__m128i *)srcPtr2Temp)); // simd loads
+ p1 = _mm_or_si128(p1, p2); // bitwise_or computation
+ _mm_storeu_si128((__m128i *)dstPtrTemp, _mm_sub_epi8(p1, xmm_pxConvertI8)); // simd stores
+
+ srcPtr1Temp += vectorIncrementPerChannel;
+ srcPtr2Temp += vectorIncrementPerChannel;
+ dstPtrTemp += vectorIncrementPerChannel;
+ }
+ for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+ {
+ *dstPtrTemp++ = static_cast<Rpp8s>(RPPPIXELCHECKI8((static_cast<Rpp8u>(*srcPtr1Temp + 128) | static_cast<Rpp8u>(*srcPtr2Temp + 128)) - 128));
+
+ srcPtr1Temp++;
+ srcPtr2Temp++;
+ }
+
+ srcPtr1Row += srcDescPtr->strides.hStride;
+ srcPtr2Row += srcDescPtr->strides.hStride;
+ dstPtrRow += dstDescPtr->strides.hStride;
+ }
+
+ srcPtr1Channel += srcDescPtr->strides.cStride;
+ srcPtr2Channel += srcDescPtr->strides.cStride;
+ dstPtrChannel += dstDescPtr->strides.cStride;
+ }
+ }
+ }
+
+ return RPP_SUCCESS;
+}
diff --git a/src/modules/hip/hip_tensor_logical_operations.hpp b/src/modules/hip/hip_tensor_logical_operations.hpp
new file mode 100644
index 000000000..636789246
--- /dev/null
+++ b/src/modules/hip/hip_tensor_logical_operations.hpp
@@ -0,0 +1,31 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#ifndef HIP_TENSOR_LOGICAL_OPERATIONS_HPP
+#define HIP_TENSOR_LOGICAL_OPERATIONS_HPP
+
+#include "kernel/bitwise_and.hpp"
+#include "kernel/bitwise_or.hpp"
+
+#endif // HIP_TENSOR_LOGICAL_OPERATIONS_HPP
\ No newline at end of file
diff --git a/src/modules/hip/kernel/bitwise_and.hpp b/src/modules/hip/kernel/bitwise_and.hpp
new file mode 100644
index 000000000..ca9f30c11
--- /dev/null
+++ b/src/modules/hip/kernel/bitwise_and.hpp
@@ -0,0 +1,247 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+
+/* BitwiseAND is a logical operation defined only on U8/I8 types.
+ For a Rpp32f precision image (pixel values from 0-1), the BitwiseAND is applied on a 0-255
+ range-translated approximation of the original 0-1 decimal-range image.
+ Link: https://stackoverflow.com/questions/1723575/how-to-perform-a-bitwise-operation-on-floating-point-numbers */
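+/* Illustrative example (assuming truncation when forming the 8-bit operands):
+ 0.5 -> 127 (0x7F) and 0.25 -> 63 (0x3F); 127 & 63 = 63, so the stored result
+ is 63 * ONE_OVER_255 ≈ 0.247. */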
+template <typename T>
+__device__ void bitwise_and_hip_compute(T *srcPtr, d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8)
+{
+ if constexpr ((std::is_same<T, float>::value) || (std::is_same<T, half>::value))
+ {
+ rpp_hip_math_multiply8_const(src1_f8, src1_f8, (float4)255);
+ rpp_hip_math_multiply8_const(src2_f8, src2_f8, (float4)255);
+ rpp_hip_math_bitwiseAnd8(src1_f8, src2_f8, dst_f8);
+ rpp_hip_math_multiply8_const(dst_f8, dst_f8, (float4)ONE_OVER_255);
+ }
+ else if constexpr (std::is_same<T, schar>::value)
+ {
+ rpp_hip_math_add8_const(src1_f8, src1_f8, (float4)128);
+ rpp_hip_math_add8_const(src2_f8, src2_f8, (float4)128);
+ rpp_hip_math_bitwiseAnd8(src1_f8, src2_f8, dst_f8);
+ rpp_hip_math_subtract8_const(dst_f8, dst_f8, (float4)128);
+ }
+ else
+ rpp_hip_math_bitwiseAnd8(src1_f8, src2_f8, dst_f8);
+}
+
+template <typename T>
+__global__ void bitwise_and_pkd_hip_tensor(T *srcPtr1,
+ T *srcPtr2,
+ uint2 srcStridesNH,
+ T *dstPtr,
+ uint2 dstStridesNH,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3;
+ uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3;
+
+ d_float24 src1_f24, src2_f24, dst_f24;
+
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx, &src1_f24);
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr2 + srcIdx, &src2_f24);
+ bitwise_and_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]);
+ bitwise_and_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]);
+ bitwise_and_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]);
+ rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24);
+}
+
+template <typename T>
+__global__ void bitwise_and_pln_hip_tensor(T *srcPtr1,
+ T *srcPtr2,
+ uint3 srcStridesNCH,
+ T *dstPtr,
+ uint3 dstStridesNCH,
+ int channelsDst,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+ uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x;
+
+ d_float8 src1_f8, src2_f8, dst_f8;
+
+ rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8);
+ rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8);
+ bitwise_and_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+
+ if (channelsDst == 3)
+ {
+ srcIdx += srcStridesNCH.y;
+ dstIdx += dstStridesNCH.y;
+
+ rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8);
+ rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8);
+ bitwise_and_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+
+ srcIdx += srcStridesNCH.y;
+ dstIdx += dstStridesNCH.y;
+
+ rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8);
+ rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8);
+ bitwise_and_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+ }
+}
+
+template <typename T>
+__global__ void bitwise_and_pkd3_pln3_hip_tensor(T *srcPtr1,
+ T *srcPtr2,
+ uint2 srcStridesNH,
+ T *dstPtr,
+ uint3 dstStridesNCH,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3);
+ uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x;
+
+ d_float24 src1_f24, src2_f24, dst_f24;
+
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx, &src1_f24);
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr2 + srcIdx, &src2_f24);
+ bitwise_and_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]);
+ bitwise_and_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]);
+ bitwise_and_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]);
+ rpp_hip_pack_float24_pln3_and_store24_pln3(dstPtr + dstIdx, dstStridesNCH.y, &dst_f24);
+}
+
+template <typename T>
+__global__ void bitwise_and_pln3_pkd3_hip_tensor(T *srcPtr1,
+ T *srcPtr2,
+ uint3 srcStridesNCH,
+ T *dstPtr,
+ uint2 dstStridesNH,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+ uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3;
+
+ d_float24 src1_f24, src2_f24, dst_f24;
+
+ rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(srcPtr1 + srcIdx, srcStridesNCH.y, &src1_f24);
+ rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(srcPtr2 + srcIdx, srcStridesNCH.y, &src2_f24);
+ bitwise_and_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]);
+ bitwise_and_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]);
+ bitwise_and_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]);
+ rpp_hip_pack_float24_pkd3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24);
+}
+
+template <typename T>
+RppStatus hip_exec_bitwise_and_tensor(T *srcPtr1,
+ T *srcPtr2,
+ RpptDescPtr srcDescPtr,
+ T *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rpp::Handle& handle)
+{
+ if (roiType == RpptRoiType::LTRB)
+ hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle);
+
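+ // each thread covers 8 pixels along x, so the x work-size is ceil(w / 8), i.e. (w + 7) >> 3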
+ int globalThreads_x = (dstDescPtr->w + 7) >> 3;
+ int globalThreads_y = dstDescPtr->h;
+ int globalThreads_z = handle.GetBatchSize();
+
+ if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ hipLaunchKernelGGL(bitwise_and_pkd_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr1,
+ srcPtr2,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride),
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ hipLaunchKernelGGL(bitwise_and_pln_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr1,
+ srcPtr2,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride),
+ dstDescPtr->c,
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->c == 3) && (dstDescPtr->c == 3))
+ {
+ if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ hipLaunchKernelGGL(bitwise_and_pkd3_pln3_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr1,
+ srcPtr2,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride),
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ globalThreads_x = (srcDescPtr->strides.hStride + 7) >> 3;
+ hipLaunchKernelGGL(bitwise_and_pln3_pkd3_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr1,
+ srcPtr2,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride),
+ roiTensorPtrSrc);
+ }
+ }
+
+ return RPP_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/modules/hip/kernel/bitwise_or.hpp b/src/modules/hip/kernel/bitwise_or.hpp
new file mode 100644
index 000000000..ab0c962ef
--- /dev/null
+++ b/src/modules/hip/kernel/bitwise_or.hpp
@@ -0,0 +1,247 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+
+/* BitwiseOR is a logical operation defined only on U8/I8 types.
+ For a Rpp32f precision image (pixel values from 0-1), the BitwiseOR is applied on a 0-255
+ range-translated approximation of the original 0-1 decimal-range image.
+ Link: https://stackoverflow.com/questions/1723575/how-to-perform-a-bitwise-operation-on-floating-point-numbers */
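+/* For I8 inputs the helper below offsets [-128, 127] to [0, 255], ORs the unsigned bit
+ patterns, then subtracts 128: e.g. -112 -> 16 (0x10) and 3 -> 131 (0x83) give
+ 16 | 131 = 147 (0x93), stored as 147 - 128 = 19. */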
+template <typename T>
+__device__ void bitwise_or_hip_compute(T *srcPtr, d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8)
+{
+ if constexpr ((std::is_same<T, float>::value) || (std::is_same<T, half>::value))
+ {
+ rpp_hip_math_multiply8_const(src1_f8, src1_f8, (float4)255);
+ rpp_hip_math_multiply8_const(src2_f8, src2_f8, (float4)255);
+ rpp_hip_math_bitwiseOr8(src1_f8, src2_f8, dst_f8);
+ rpp_hip_math_multiply8_const(dst_f8, dst_f8, (float4)ONE_OVER_255);
+ }
+    else if constexpr (std::is_same<T, schar>::value)
+ {
+ rpp_hip_math_add8_const(src1_f8, src1_f8, (float4)128);
+ rpp_hip_math_add8_const(src2_f8, src2_f8, (float4)128);
+ rpp_hip_math_bitwiseOr8(src1_f8, src2_f8, dst_f8);
+ rpp_hip_math_subtract8_const(dst_f8, dst_f8, (float4)128);
+ }
+ else
+ rpp_hip_math_bitwiseOr8(src1_f8, src2_f8, dst_f8);
+}
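+
+// Added commentary: worked examples of the range translation above, assuming the
+// bitwiseOr8 helper truncates each float lane to an integer before OR-ing:
+//   U8:  178 | 99 = 243 (plain integer OR, no translation needed)
+//   F32: 0.2 | 0.6 -> 51 | 153 = 187 -> stored back as 187 * (1/255) ~= 0.733
+//   I8:  -100 | 50 -> (+128) 28 | 178 = 190 -> (-128) stored back as 62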
+
+template <typename T>
+__global__ void bitwise_or_pkd_hip_tensor(T *srcPtr1,
+ T *srcPtr2,
+ uint2 srcStridesNH,
+ T *dstPtr,
+ uint2 dstStridesNH,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
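+    // Each thread computes 8 packed RGB pixels (24 values) per iteration; the ROI x/y
+    // offsets shift the source index into the valid region, while writes start at (0,0).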
+ uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3;
+ uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3;
+
+ d_float24 src1_f24, src2_f24, dst_f24;
+
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx, &src1_f24);
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr2 + srcIdx, &src2_f24);
+ bitwise_or_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]);
+ bitwise_or_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]);
+ bitwise_or_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]);
+ rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24);
+}
+
+template <typename T>
+__global__ void bitwise_or_pln_hip_tensor(T *srcPtr1,
+ T *srcPtr2,
+ uint3 srcStridesNCH,
+ T *dstPtr,
+ uint3 dstStridesNCH,
+ int channelsDst,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+ uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x;
+
+ d_float8 src1_f8, src2_f8, dst_f8;
+
+ rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8);
+ rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8);
+ bitwise_or_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+
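+    // For 3-channel planar tensors, repeat the same 8-pixel OR on the remaining two
+    // channel planes by advancing srcIdx/dstIdx one channel stride (cStride) at a time.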
+ if (channelsDst == 3)
+ {
+ srcIdx += srcStridesNCH.y;
+ dstIdx += dstStridesNCH.y;
+
+ rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8);
+ rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8);
+ bitwise_or_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+
+ srcIdx += srcStridesNCH.y;
+ dstIdx += dstStridesNCH.y;
+
+ rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8);
+ rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8);
+ bitwise_or_hip_compute(srcPtr1, &src1_f8, &src2_f8, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+ }
+}
+
+template <typename T>
+__global__ void bitwise_or_pkd3_pln3_hip_tensor(T *srcPtr1,
+ T *srcPtr2,
+ uint2 srcStridesNH,
+ T *dstPtr,
+ uint3 dstStridesNCH,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3);
+ uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x;
+
+ d_float24 src1_f24, src2_f24, dst_f24;
+
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx, &src1_f24);
+ rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr2 + srcIdx, &src2_f24);
+ bitwise_or_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]);
+ bitwise_or_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]);
+ bitwise_or_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]);
+ rpp_hip_pack_float24_pln3_and_store24_pln3(dstPtr + dstIdx, dstStridesNCH.y, &dst_f24);
+}
+
+template <typename T>
+__global__ void bitwise_or_pln3_pkd3_hip_tensor(T *srcPtr1,
+ T *srcPtr2,
+ uint3 srcStridesNCH,
+ T *dstPtr,
+ uint2 dstStridesNH,
+ RpptROIPtr roiTensorPtrSrc)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+ if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+ {
+ return;
+ }
+
+ uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+ uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3;
+
+ d_float24 src1_f24, src2_f24, dst_f24;
+
+ rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(srcPtr1 + srcIdx, srcStridesNCH.y, &src1_f24);
+ rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(srcPtr2 + srcIdx, srcStridesNCH.y, &src2_f24);
+ bitwise_or_hip_compute(srcPtr1, &src1_f24.f8[0], &src2_f24.f8[0], &dst_f24.f8[0]);
+ bitwise_or_hip_compute(srcPtr1, &src1_f24.f8[1], &src2_f24.f8[1], &dst_f24.f8[1]);
+ bitwise_or_hip_compute(srcPtr1, &src1_f24.f8[2], &src2_f24.f8[2], &dst_f24.f8[2]);
+ rpp_hip_pack_float24_pkd3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24);
+}
+
+template <typename T>
+RppStatus hip_exec_bitwise_or_tensor(T *srcPtr1,
+ T *srcPtr2,
+ RpptDescPtr srcDescPtr,
+ T *dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rpp::Handle& handle)
+{
+ if (roiType == RpptRoiType::LTRB)
+ hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle);
+
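+    // Grid x is ceil(width / 8): every kernel thread consumes 8 pixels along a row.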
+ int globalThreads_x = (dstDescPtr->w + 7) >> 3;
+ int globalThreads_y = dstDescPtr->h;
+ int globalThreads_z = handle.GetBatchSize();
+
+ if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
+ hipLaunchKernelGGL(bitwise_or_pkd_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr1,
+ srcPtr2,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride),
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ hipLaunchKernelGGL(bitwise_or_pln_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr1,
+ srcPtr2,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride),
+ dstDescPtr->c,
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->c == 3) && (dstDescPtr->c == 3))
+ {
+ if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+ {
+ hipLaunchKernelGGL(bitwise_or_pkd3_pln3_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr1,
+ srcPtr2,
+ make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride),
+ roiTensorPtrSrc);
+ }
+ else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+ {
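+            // PLN3 -> PKD3: size grid x from the row stride (hStride) rather than the
+            // ROI width so the 8-pixel vector loads cover the full (possibly padded) rows.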
+ globalThreads_x = (srcDescPtr->strides.hStride + 7) >> 3;
+ hipLaunchKernelGGL(bitwise_or_pln3_pkd3_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr1,
+ srcPtr2,
+ make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
+ dstPtr,
+ make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride),
+ roiTensorPtrSrc);
+ }
+ }
+
+ return RPP_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/modules/rppt_tensor_logical_operations.cpp b/src/modules/rppt_tensor_logical_operations.cpp
new file mode 100644
index 000000000..7d28fe96b
--- /dev/null
+++ b/src/modules/rppt_tensor_logical_operations.cpp
@@ -0,0 +1,300 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rppi_validate.hpp"
+#include "rppt_tensor_logical_operations.h"
+#include "cpu/host_tensor_logical_operations.hpp"
+
+#ifdef HIP_COMPILE
+    #include <hip/hip_fp16.h>
+ #include "hip/hip_tensor_logical_operations.hpp"
+#endif // HIP_COMPILE
+
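+// A minimal call sketch for the entry points below (buffer allocation, descriptor
+// setup, and handle creation follow the usual RPP tensor workflow and are assumed):
+//     rppt_bitwise_and_host(src1, src2, srcDescPtr, dst, dstDescPtr,
+//                           roiTensorPtrSrc, RpptRoiType::XYWH, handle);
+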
+/******************** bitwise AND ********************/
+
+RppStatus rppt_bitwise_and_host(RppPtr_t srcPtr1,
+ RppPtr_t srcPtr2,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+ RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c);
+
+ if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
+ {
+        bitwise_and_u8_u8_host_tensor(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes,
+                                      static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes,
+                                      srcDescPtr,
+                                      static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
+ {
+        bitwise_and_f16_f16_host_tensor(reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
+                                        reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
+                                        srcDescPtr,
+                                        reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+ {
+        bitwise_and_f32_f32_host_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
+                                        reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
+                                        srcDescPtr,
+                                        reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
+ {
+        bitwise_and_i8_i8_host_tensor(static_cast<Rpp8s*>(srcPtr1) + srcDescPtr->offsetInBytes,
+                                      static_cast<Rpp8s*>(srcPtr2) + srcDescPtr->offsetInBytes,
+                                      srcDescPtr,
+                                      static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+}
+
+/******************** bitwise OR ********************/
+
+RppStatus rppt_bitwise_or_host(RppPtr_t srcPtr1,
+ RppPtr_t srcPtr2,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+ RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c);
+
+ if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
+ {
+        bitwise_or_u8_u8_host_tensor(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes,
+                                     static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes,
+                                     srcDescPtr,
+                                     static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
+ {
+        bitwise_or_f16_f16_host_tensor(reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
+                                       reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
+                                       srcDescPtr,
+                                       reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+ {
+        bitwise_or_f32_f32_host_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
+                                       reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
+                                       srcDescPtr,
+                                       reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
+ {
+        bitwise_or_i8_i8_host_tensor(static_cast<Rpp8s*>(srcPtr1) + srcDescPtr->offsetInBytes,
+                                     static_cast<Rpp8s*>(srcPtr2) + srcDescPtr->offsetInBytes,
+                                     srcDescPtr,
+                                     static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ layoutParams,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+}
+
+
+/********************************************************************************************************************/
+/*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/
+/********************************************************************************************************************/
+
+#ifdef GPU_SUPPORT
+
+/******************** bitwise AND ********************/
+
+RppStatus rppt_bitwise_and_gpu(RppPtr_t srcPtr1,
+ RppPtr_t srcPtr2,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+
+ if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
+ {
+        hip_exec_bitwise_and_tensor(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes,
+                                    static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes,
+                                    srcDescPtr,
+                                    static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
+ {
+        hip_exec_bitwise_and_tensor(reinterpret_cast<half*>(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
+                                    reinterpret_cast<half*>(static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
+                                    srcDescPtr,
+                                    reinterpret_cast<half*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+ {
+        hip_exec_bitwise_and_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
+                                    reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
+                                    srcDescPtr,
+                                    reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
+ {
+        hip_exec_bitwise_and_tensor(static_cast<Rpp8s*>(srcPtr1) + srcDescPtr->offsetInBytes,
+                                    static_cast<Rpp8s*>(srcPtr2) + srcDescPtr->offsetInBytes,
+                                    srcDescPtr,
+                                    static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
+/******************** bitwise OR ********************/
+
+RppStatus rppt_bitwise_or_gpu(RppPtr_t srcPtr1,
+ RppPtr_t srcPtr2,
+ RpptDescPtr srcDescPtr,
+ RppPtr_t dstPtr,
+ RpptDescPtr dstDescPtr,
+ RpptROIPtr roiTensorPtrSrc,
+ RpptRoiType roiType,
+ rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+
+ if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
+ {
+        hip_exec_bitwise_or_tensor(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes,
+                                   static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes,
+                                   srcDescPtr,
+                                   static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
+ {
+        hip_exec_bitwise_or_tensor(reinterpret_cast<half*>(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
+                                   reinterpret_cast<half*>(static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
+                                   srcDescPtr,
+                                   reinterpret_cast<half*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
+ {
+        hip_exec_bitwise_or_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
+                                   reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
+                                   srcDescPtr,
+                                   reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
+ {
+        hip_exec_bitwise_or_tensor(static_cast<Rpp8s*>(srcPtr1) + srcDescPtr->offsetInBytes,
+                                   static_cast<Rpp8s*>(srcPtr2) + srcDescPtr->offsetInBytes,
+                                   srcDescPtr,
+                                   static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
+ dstDescPtr,
+ roiTensorPtrSrc,
+ roiType,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
+#endif // GPU_SUPPORT
diff --git a/utilities/test_suite/HIP/Tensor_hip.cpp b/utilities/test_suite/HIP/Tensor_hip.cpp
index c2f7665f2..2743c4e1f 100644
--- a/utilities/test_suite/HIP/Tensor_hip.cpp
+++ b/utilities/test_suite/HIP/Tensor_hip.cpp
@@ -65,7 +65,7 @@ int main(int argc, char **argv)
bool additionalParamCase = (testCase == 8 || testCase == 21 || testCase == 23|| testCase == 24 || testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54);
bool kernelSizeCase = (testCase == 40 || testCase == 41 || testCase == 49 || testCase == 54);
- bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 61 || testCase == 63);
+ bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 68);
bool randomOutputCase = (testCase == 84 || testCase == 49 || testCase == 54);
bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24);
bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89);
@@ -903,6 +903,30 @@ int main(int argc, char **argv)
break;
}
+ case 65:
+ {
+ testCaseName = "bitwise_and";
+
+ startWallTime = omp_get_wtime();
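+            // bit depths 0/1/2/5 correspond to U8/F16/F32/I8, the types this op supports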
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_bitwise_and_gpu(d_input, d_input_second, srcDescPtr, d_output, dstDescPtr, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
+ case 68:
+ {
+ testCaseName = "bitwise_or";
+
+ startWallTime = omp_get_wtime();
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_bitwise_or_gpu(d_input, d_input_second, srcDescPtr, d_output, dstDescPtr, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
case 70:
{
testCaseName = "copy";
diff --git a/utilities/test_suite/HIP/runTests.py b/utilities/test_suite/HIP/runTests.py
index 4dc9023f7..14fb0270a 100644
--- a/utilities/test_suite/HIP/runTests.py
+++ b/utilities/test_suite/HIP/runTests.py
@@ -63,8 +63,10 @@ def func_group_finder(case_number):
return "filter_augmentations"
elif case_number < 40:
return "geometric_augmentations"
- elif case_number == 61:
+ elif case_number < 62:
return "arithmetic_operations"
+ elif case_number < 69:
+ return "logical_operations"
elif case_number < 87:
return "data_exchange_operations"
elif case_number < 88:
@@ -325,7 +327,7 @@ def rpp_test_suite_parser_and_validator():
subprocess.run(["make", "-j16"], cwd=".") # nosec
# List of cases supported
-supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '45', '46', '54', '61', '63', '70', '80', '82', '83', '84', '85', '86', '87', '88', '89']
+supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '80', '82', '83', '84', '85', '86', '87', '88', '89']
# Create folders based on testType and profilingOption
if testType == 1 and profilingOption == "YES":
@@ -517,6 +519,7 @@ def rpp_test_suite_parser_and_validator():
"geometric_augmentations",
"morphological_operations",
"arithmetic_operations",
+ "logical_operations",
"statistical_operations"
]
for log_file in log_file_list:
diff --git a/utilities/test_suite/HOST/Tensor_host.cpp b/utilities/test_suite/HOST/Tensor_host.cpp
index ffc9c0d3a..5c9d3c7dc 100644
--- a/utilities/test_suite/HOST/Tensor_host.cpp
+++ b/utilities/test_suite/HOST/Tensor_host.cpp
@@ -65,7 +65,7 @@ int main(int argc, char **argv)
int batchSize = atoi(argv[14]);
bool additionalParamCase = (testCase == 8 || testCase == 21 || testCase == 23 || testCase == 24);
- bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 61 || testCase == 63);
+ bool dualInputCase = (testCase == 2 || testCase == 30 || testCase == 61 || testCase == 63 || testCase == 65 || testCase == 68);
bool randomOutputCase = (testCase == 84);
bool interpolationTypeCase = (testCase == 21 || testCase == 23 || testCase == 24);
bool reductionTypeCase = (testCase == 87 || testCase == 88 || testCase == 89);
@@ -875,6 +875,32 @@ int main(int argc, char **argv)
break;
}
+ case 65:
+ {
+ testCaseName = "bitwise_and";
+
+ startWallTime = omp_get_wtime();
+ startCpuTime = clock();
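+            // bit depths 0/1/2/5 correspond to U8/F16/F32/I8, the types this op supports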
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_bitwise_and_host(input, input_second, srcDescPtr, output, dstDescPtr, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
+ case 68:
+ {
+ testCaseName = "bitwise_or";
+
+ startWallTime = omp_get_wtime();
+ startCpuTime = clock();
+ if (inputBitDepth == 0 || inputBitDepth == 1 || inputBitDepth == 2 || inputBitDepth == 5)
+ rppt_bitwise_or_host(input, input_second, srcDescPtr, output, dstDescPtr, roiTensorPtrSrc, roiTypeSrc, handle);
+ else
+ missingFuncFlag = 1;
+
+ break;
+ }
case 70:
{
testCaseName = "copy";
diff --git a/utilities/test_suite/HOST/runTests.py b/utilities/test_suite/HOST/runTests.py
index f106a9649..00ae0a48e 100644
--- a/utilities/test_suite/HOST/runTests.py
+++ b/utilities/test_suite/HOST/runTests.py
@@ -56,10 +56,12 @@ def func_group_finder(case_number):
return "color_augmentations"
elif case_number == 8 or case_number == 30 or case_number == 82 or case_number == 83 or case_number == 84:
return "effects_augmentations"
- elif case_number < 40:
+ elif case_number < 40 or case_number == 63:
return "geometric_augmentations"
- elif case_number == 61:
+ elif case_number < 62:
return "arithmetic_operations"
+ elif case_number < 69:
+ return "logical_operations"
elif case_number < 87:
return "data_exchange_operations"
elif case_number < 88:
@@ -281,7 +283,7 @@ def rpp_test_suite_parser_and_validator():
subprocess.run(["make", "-j16"], cwd=".") # nosec
# List of cases supported
-supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '45', '46', '54', '61', '63', '70', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89']
+supportedCaseList = ['0', '1', '2', '4', '8', '13', '20', '21', '23', '29', '30', '31', '34', '36', '37', '38', '39', '45', '46', '54', '61', '63', '65', '68', '70', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89']
print("\n\n\n\n\n")
print("##########################################################################################")
diff --git a/utilities/test_suite/REFERENCE_OUTPUT/bitwise_and/bitwise_and_u8_Tensor.bin b/utilities/test_suite/REFERENCE_OUTPUT/bitwise_and/bitwise_and_u8_Tensor.bin
new file mode 100644
index 000000000..952959dc5
Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUT/bitwise_and/bitwise_and_u8_Tensor.bin differ
diff --git a/utilities/test_suite/REFERENCE_OUTPUT/bitwise_or/bitwise_or_u8_Tensor.bin b/utilities/test_suite/REFERENCE_OUTPUT/bitwise_or/bitwise_or_u8_Tensor.bin
new file mode 100644
index 000000000..2d1076614
Binary files /dev/null and b/utilities/test_suite/REFERENCE_OUTPUT/bitwise_or/bitwise_or_u8_Tensor.bin differ
diff --git a/utilities/test_suite/rpp_test_suite_common.h b/utilities/test_suite/rpp_test_suite_common.h
index 55fc90abf..b1b12fdd4 100644
--- a/utilities/test_suite/rpp_test_suite_common.h
+++ b/utilities/test_suite/rpp_test_suite_common.h
@@ -92,6 +92,8 @@ std::map<int, string> augmentationMap =
{54, "gaussian_filter"},
{61, "magnitude"},
{63, "phase"},
+ {65, "bitwise_and"},
+ {68, "bitwise_or"},
{70, "copy"},
{80, "resize_mirror_normalize"},
{81, "color_jitter"},