From ac1907fbd9294b952b7a798deacab87e380d6907 Mon Sep 17 00:00:00 2001 From: Abishek <52214183+r-abishek@users.noreply.github.com> Date: Thu, 23 Sep 2021 13:50:30 -0700 Subject: [PATCH] RPP Tensor Support (#70) * add definitions for rpp tensor api * Initial commit * Initial commit - pln1/pln3 tensor testsuite * Mods for tensor test suite * Mods for brightness tensor host * arrangementParams to layoutParams * Rename to tensor_augmentations * Fix tensor host test suites * Modify host tensor support for brightness * Initial commit for tensor hip test suite * Multiple of 8 stride option * Add initial tensor support for hip * Tensor test suite support for hip pln * Fixes for GPU tensor support * Add host ROI null check * Initial commit for perf tests * Perf tests for RPP tensor support * Add gpu support for ltrb to xywh, remove roiType, fix pln3 brightness methods * Remove method1 for pln3 gpu, keep method2 * Fix hip tensor unittests * Add support for fused layout conversion on host * Add tensor unittest suite support for layout toggle * Add tensor perf tests for host - initial commit * Add tensor host test suite for perf tests * Add support for NHWC-NCHW toggle in HIP * Add test suite support for layout toggle * Reset hip unittests script * Unroll pln3 kernel * Add initial multi-bitDepth host support, remove templates * Move SSE code to macros in rpp_cpu_simd * Add support for f32 in brightness * Macro changes, Add support for f16 brightness * Add support for tensor i8 * Enable multi-bitDepth support in host perf tests * Add initial multi-bitDepth support for HIP * Add support for load24s in hip common, toggle layouts * Enable perf tests for multi-bitdepth in hip test suite * Fix bug in perf tests for tensor hip suite * Add mods to use d_float8, d_float24 and d_uint6 * Add f16 support in hip * Add f16 support in perf tests * Reduce loads and stores * Typecast to float4 mod * Modify RPPMAX2/MIN2 to std::max/min * Pass all arguments to sse macros * Reduce scope of time 
vars * Add omp_time_used * Change host to hip in folder name and help * Change error enums to negative * Avoid pointer or index increment by collating loads * Use variadic funcitons and pack templating to handle loads/stores * Fix i8 blank image issue in hip * Combine loads in f16/f32 and organize rpp_hip_common file * Fix I8 store issue - trials * Fix I8 store issue * Add manual typecast to float4 * Use int4 to read roiTensorPtrSrc * rppi_validate cleanup * Test suite build fix Co-authored-by: rrawther --- include/rpp.h | 1 + include/rppdefs.h | 393 ++++-- include/rppt.h | 14 + include/rppt_tensor_augmentations.h | 72 ++ src/include/cpu/rpp_cpu_common.hpp | 25 +- src/include/cpu/rpp_cpu_simd.hpp | 908 ++++++++++---- src/include/hip/rpp_hip_common.hpp | 548 ++++++++- src/modules/CMakeLists.txt | 4 +- src/modules/cpu/host_tensor_augmentations.hpp | 1068 +++++++++++++++++ src/modules/hip/hip_tensor_augmentations.hpp | 30 + src/modules/hip/kernel/brightness.hpp | 288 +++++ src/modules/hip/kernel/roi_conversion.hpp | 32 + src/modules/hip/kernel/rpp_hip_host_decls.hpp | 4 + src/modules/rppi_validate.hpp | 403 +++---- src/modules/rppt_tensor_augmentations.cpp | 182 +++ .../HIP_NEW/CMakeLists.txt | 6 + .../HIP_NEW/Tensor_hip_pkd3.cpp | 635 ++++++++++ .../HIP_NEW/Tensor_hip_pln1.cpp | 632 ++++++++++ .../HIP_NEW/Tensor_hip_pln3.cpp | 710 +++++++++++ .../HIP_NEW/generatePerformanceLogs.py | 42 +- .../HIP_NEW/rawLogsGenScript.sh | 63 +- .../HOST_NEW/CMakeLists.txt | 10 +- .../HOST_NEW/Tensor_host_pkd3.cpp | 519 ++++++++ .../HOST_NEW/Tensor_host_pln1.cpp | 517 ++++++++ .../HOST_NEW/Tensor_host_pln3.cpp | 595 +++++++++ .../HOST_NEW/generatePerformanceLogs.py | 5 +- .../HOST_NEW/rawLogsGenScript.sh | 12 + .../rpp-unittests/HIP_NEW/CMakeLists.txt | 6 + .../rpp-unittests/HIP_NEW/Tensor_hip_pkd3.cpp | 831 +++++++++++++ .../rpp-unittests/HIP_NEW/Tensor_hip_pln1.cpp | 789 ++++++++++++ .../rpp-unittests/HIP_NEW/Tensor_hip_pln3.cpp | 907 ++++++++++++++ 
.../rpp-unittests/HIP_NEW/testAllScript.sh | 12 + .../rpp-unittests/HOST_NEW/CMakeLists.txt | 8 +- .../HOST_NEW/Tensor_host_pkd3.cpp | 710 +++++++++++ .../HOST_NEW/Tensor_host_pln1.cpp | 669 +++++++++++ .../HOST_NEW/Tensor_host_pln3.cpp | 787 ++++++++++++ .../rpp-unittests/HOST_NEW/testAllScript.sh | 14 +- .../HOST_NEW/uniqueFunctionalities_host.cpp | 113 +- 38 files changed, 11886 insertions(+), 678 deletions(-) create mode 100644 include/rppt.h create mode 100644 include/rppt_tensor_augmentations.h create mode 100644 src/modules/cpu/host_tensor_augmentations.hpp create mode 100644 src/modules/hip/hip_tensor_augmentations.hpp create mode 100644 src/modules/hip/kernel/brightness.hpp create mode 100644 src/modules/hip/kernel/roi_conversion.hpp create mode 100644 src/modules/rppt_tensor_augmentations.cpp create mode 100644 utilities/rpp-performancetests/HIP_NEW/Tensor_hip_pkd3.cpp create mode 100644 utilities/rpp-performancetests/HIP_NEW/Tensor_hip_pln1.cpp create mode 100644 utilities/rpp-performancetests/HIP_NEW/Tensor_hip_pln3.cpp create mode 100644 utilities/rpp-performancetests/HOST_NEW/Tensor_host_pkd3.cpp create mode 100644 utilities/rpp-performancetests/HOST_NEW/Tensor_host_pln1.cpp create mode 100644 utilities/rpp-performancetests/HOST_NEW/Tensor_host_pln3.cpp create mode 100644 utilities/rpp-unittests/HIP_NEW/Tensor_hip_pkd3.cpp create mode 100644 utilities/rpp-unittests/HIP_NEW/Tensor_hip_pln1.cpp create mode 100644 utilities/rpp-unittests/HIP_NEW/Tensor_hip_pln3.cpp create mode 100644 utilities/rpp-unittests/HOST_NEW/Tensor_host_pkd3.cpp create mode 100644 utilities/rpp-unittests/HOST_NEW/Tensor_host_pln1.cpp create mode 100644 utilities/rpp-unittests/HOST_NEW/Tensor_host_pln3.cpp diff --git a/include/rpp.h b/include/rpp.h index daf6ed8f3..f8d5c092c 100644 --- a/include/rpp.h +++ b/include/rpp.h @@ -49,6 +49,7 @@ extern "C" { #include "rppcore.h" #include "rppdefs.h" #include "rppi.h" +#include "rppt.h" #include "rppversion.h" diff --git 
a/include/rppdefs.h b/include/rppdefs.h index 9efd4d26a..7ac0e5bd2 100644 --- a/include/rppdefs.h +++ b/include/rppdefs.h @@ -2,16 +2,23 @@ MulticoreWare Inc. */ -#ifndef RPPIDEFS_H -#define RPPIDEFS_H +#ifndef RPPDEFS_H +#define RPPDEFS_H #ifdef __cplusplus extern "C" { #endif +#include #ifdef OCL_COMPILE #include #endif + + + + +/******************** RPP typedefs ********************/ + typedef unsigned char Rpp8u; typedef signed char Rpp8s; typedef unsigned short Rpp16u; @@ -22,28 +29,28 @@ typedef unsigned long long Rpp64u; typedef long long Rpp64s; typedef float Rpp32f; typedef double Rpp64f; -typedef void* RppPtr_t; +typedef void* RppPtr_t; +typedef size_t RppSize_t; typedef enum { - RPP_SUCCESS = 0, - RPP_ERROR = 1, + RPP_SUCCESS = 0, + RPP_ERROR = -1, } RppStatus; - typedef enum - { - rppStatusSuccess = 0, - rppStatusBadParm = 1, - rppStatusUnknownError = 2, - rppStatusNotInitialized = 3, - rppStatusInvalidValue = 4, - rppStatusAllocFailed = 5, - rppStatusInternalError = 6, - rppStatusNotImplemented = 7, - rppStatusUnsupportedOp = 8, +typedef enum +{ + rppStatusSuccess = 0, + rppStatusBadParm = -1, + rppStatusUnknownError = -2, + rppStatusNotInitialized = -3, + rppStatusInvalidValue = -4, + rppStatusAllocFailed = -5, + rppStatusInternalError = -6, + rppStatusNotImplemented = -7, + rppStatusUnsupportedOp = -8, } rppStatus_t; - typedef enum { RPPI_HORIZONTAL_AXIS, @@ -53,28 +60,52 @@ typedef enum typedef enum { - RPP_SCALAR_OP_AND = 1, - RPP_SCALAR_OP_OR, - RPP_SCALAR_OP_XOR, - RPP_SCALAR_OP_NAND, - RPP_SCALAR_OP_EQUAL, - RPP_SCALAR_OP_NOTEQUAL, - RPP_SCALAR_OP_LESS, - RPP_SCALAR_OP_LESSEQ, - RPP_SCALAR_OP_GREATER, - RPP_SCALAR_OP_GREATEREQ, - RPP_SCALAR_OP_ADD, - RPP_SCALAR_OP_SUBTRACT, - RPP_SCALAR_OP_MULTIPLY, - RPP_SCALAR_OP_DIVIDE, - RPP_SCALAR_OP_MODULUS, - RPP_SCALAR_OP_MIN, - RPP_SCALAR_OP_MAX, + RPP_SCALAR_OP_AND = 1, + RPP_SCALAR_OP_OR, + RPP_SCALAR_OP_XOR, + RPP_SCALAR_OP_NAND, + RPP_SCALAR_OP_EQUAL, + RPP_SCALAR_OP_NOTEQUAL, + RPP_SCALAR_OP_LESS, 
+ RPP_SCALAR_OP_LESSEQ, + RPP_SCALAR_OP_GREATER, + RPP_SCALAR_OP_GREATEREQ, + RPP_SCALAR_OP_ADD, + RPP_SCALAR_OP_SUBTRACT, + RPP_SCALAR_OP_MULTIPLY, + RPP_SCALAR_OP_DIVIDE, + RPP_SCALAR_OP_MODULUS, + RPP_SCALAR_OP_MIN, + RPP_SCALAR_OP_MAX, } RppOp; typedef enum { - RGB_HSV = 1, + U8_S8, + S8_U8, +} RppConvertBitDepthMode; + +typedef struct +{ + Rpp32f rho; + Rpp32f theta; +} RppPointPolar; + +typedef struct +{ + Rpp32u channelParam; + Rpp32u bufferMultiplier; +} RppLayoutParams; + + + + + +/******************** RPPI typedefs ********************/ + +typedef enum +{ + RGB_HSV = 1, HSV_RGB } RppiColorConvertMode; @@ -91,33 +122,36 @@ typedef enum RPPI_CHN_PACKED } RppiChnFormat; -typedef struct { +typedef struct +{ unsigned int width; unsigned int height; - } RppiSize; +} RppiSize; -typedef struct{ - int x; - int y; +typedef struct +{ + int x; + int y; } RppiPoint; typedef struct - { - int x; - int y; - int width; - int height; - } RppiRect; +{ + int x; + int y; + int width; + int height; +} RppiRect; -// roiHeight & roiWidth needs to be changed to xend & yend -typedef struct { +typedef struct +{ unsigned int x; unsigned int y; unsigned int roiWidth; unsigned int roiHeight; - } RppiROI; +} RppiROI; -typedef enum{ +typedef enum +{ GAUSS3, GAUSS5, GAUSS3x1, @@ -126,62 +160,132 @@ typedef enum{ AVG5 } RppiBlur; -typedef enum{ +typedef enum +{ ZEROPAD, NOPAD } RppiPad; -typedef enum{ - U8_S8, - S8_U8, -} RppConvertBitDepthMode; - -typedef enum{ +typedef enum +{ RGB, HSV } RppiFormat; -typedef struct { - Rpp32f rho; - Rpp32f theta; - } RppPointPolar; -typedef struct{ + + + +/******************** RPPT typedefs ********************/ + +typedef enum +{ + U8, + F32, + F16, + I8 +} RpptDataType; + +typedef enum +{ + NCHW, + NHWC +} RpptLayout; + +typedef enum +{ + LTRB, + XYWH + +} RpptRoiType; + +typedef struct +{ + RppiPoint lt, rb; + +} RpptRoiLtrb; + +typedef struct +{ + RppiPoint xy; + int roiWidth, roiHeight; + +} RpptRoiXywh; + +typedef union +{ + RpptRoiLtrb ltrbROI; + 
RpptRoiXywh xywhROI; + +} RpptROI, *RpptROIPtr; + +typedef struct +{ + Rpp32u nStride; + Rpp32u cStride; + Rpp32u hStride; + Rpp32u wStride; +} RpptStrides; + +typedef struct +{ + RppSize_t numDims; + Rpp32u offset; + RpptDataType dataType; + RpptLayout layout; + Rpp32u n, c, h, w; + RpptStrides strides; +} RpptDesc, *RpptDescPtr; + + + + + +/******************** HOST memory typedefs ********************/ + +typedef struct +{ Rpp64f *doublemem; -}memRpp64f; +} memRpp64f; -typedef struct{ +typedef struct +{ Rpp32f *floatmem; -}memRpp32f; +} memRpp32f; -typedef struct{ +typedef struct +{ Rpp32u *uintmem; -}memRpp32u; +} memRpp32u; -typedef struct{ +typedef struct +{ Rpp32s *intmem; -}memRpp32s; +} memRpp32s; -typedef struct{ +typedef struct +{ Rpp8u *ucharmem; -}memRpp8u; +} memRpp8u; -typedef struct{ +typedef struct +{ Rpp8s *charmem; -}memRpp8s; +} memRpp8s; -typedef struct{ +typedef struct +{ Rpp32u *height; Rpp32u *width; -}memSize; +} memSize; -// roiHeight & roiWidth needs to be changed to xend & yend -typedef struct{ +typedef struct +{ Rpp32u *x; Rpp32u *y; Rpp32u *roiHeight; Rpp32u *roiWidth; -}memRoi; +} memROI; typedef struct { RppiSize *srcSize; @@ -199,57 +303,73 @@ typedef struct { Rpp64u *dstBatchIndex; Rpp32u *inc; Rpp32u *dstInc; -}memCPU; +} memCPU; + + + + #ifdef OCL_COMPILE -typedef struct{ +/******************** OCL memory typedefs ********************/ + +typedef struct +{ cl_mem floatmem; -}clmemRpp32f; +} clmemRpp32f; -typedef struct{ +typedef struct +{ cl_mem doublemem; -}clmemRpp64f; +} clmemRpp64f; -typedef struct{ +typedef struct +{ cl_mem uintmem; -}clmemRpp32u; +} clmemRpp32u; -typedef struct{ +typedef struct +{ cl_mem intmem; -}clmemRpp32s; +} clmemRpp32s; -typedef struct{ +typedef struct +{ cl_mem ucharmem; -}clmemRpp8u; +} clmemRpp8u; -typedef struct{ +typedef struct +{ cl_mem charmem; -}clmemRpp8s; +} clmemRpp8s; -typedef struct{ +typedef struct +{ cl_mem height; cl_mem width; -}clmemSize; +} clmemSize; -typedef struct{ +typedef 
struct +{ cl_mem x; cl_mem y; cl_mem roiHeight; cl_mem roiWidth; -}clmemRoi; -typedef struct{ +} clmemROI; + +typedef struct +{ memSize csrcSize; memSize cdstSize; memSize cmaxSrcSize; memSize cmaxDstSize; - memRoi croiPoints; + memROI croiPoints; clmemSize srcSize; clmemSize dstSize; clmemSize maxSrcSize; clmemSize maxDstSize; - clmemRoi roiPoints; + clmemROI roiPoints; clmemRpp32f floatArr[10]; clmemRpp64f doubleArr[10]; clmemRpp32u uintArr[10]; @@ -263,54 +383,69 @@ typedef struct{ } memGPU; -#else -typedef struct{ + + + +#elif defined(HIP_COMPILE) + +/******************** HIP memory typedefs ********************/ + +typedef struct +{ Rpp32f* floatmem; -}hipMemRpp32f; +} hipMemRpp32f; -typedef struct{ +typedef struct +{ Rpp64f* doublemem; -}hipMemRpp64f; +} hipMemRpp64f; -typedef struct{ +typedef struct +{ Rpp32u* uintmem; -}hipMemRpp32u; +} hipMemRpp32u; -typedef struct{ +typedef struct +{ Rpp32s* intmem; -}hipMemRpp32s; +} hipMemRpp32s; -typedef struct{ +typedef struct +{ Rpp8u* ucharmem; -}hipMemRpp8u; +} hipMemRpp8u; -typedef struct{ +typedef struct +{ Rpp8s* charmem; -}hipMemRpp8s; +} hipMemRpp8s; -typedef struct{ +typedef struct +{ Rpp32u* height; Rpp32u* width; -}hipMemSize; +} hipMemSize; -// roiHeight & roiWidth needs to be changed to xend & yend -typedef struct{ +typedef struct +{ Rpp32u* x; Rpp32u* y; Rpp32u* roiHeight; Rpp32u* roiWidth; -}hipMemRoi; -typedef struct{ +} hipMemROI; + +typedef struct +{ memSize csrcSize; memSize cdstSize; memSize cmaxSrcSize; memSize cmaxDstSize; - memRoi croiPoints; + memROI croiPoints; hipMemSize srcSize; hipMemSize dstSize; hipMemSize maxSrcSize; hipMemSize maxDstSize; - hipMemRoi roiPoints; + hipMemROI roiPoints; hipMemRpp32f floatArr[10]; hipMemRpp64f doubleArr[10]; hipMemRpp32u uintArr[10]; @@ -323,14 +458,22 @@ typedef struct{ Rpp32u* dstInc; } memGPU; -#endif +#endif //BACKEND -typedef struct{ - memCPU mcpu; - memGPU mgpu; + + + + +/******************** Memory management and handle typedefs ********************/ 
+ +typedef struct +{ + memCPU mcpu; + memGPU mgpu; } memMgmt; -typedef struct{ +typedef struct +{ RppPtr_t cpuHandle; Rpp32u nbatchSize; memMgmt mem; @@ -344,4 +487,4 @@ typedef struct{ #ifdef __cplusplus } #endif -#endif /* RPPIDEFS_H */ +#endif /* RPPDEFS_H */ diff --git a/include/rppt.h b/include/rppt.h new file mode 100644 index 000000000..3f1b54686 --- /dev/null +++ b/include/rppt.h @@ -0,0 +1,14 @@ +#ifndef RPPT_H +#define RPPT_H + +#include "rpp.h" +#ifdef __cplusplus +extern "C" { +#endif + +#include "rppt_tensor_augmentations.h" + +#ifdef __cplusplus +} +#endif +#endif /* RPPT_H */ diff --git a/include/rppt_tensor_augmentations.h b/include/rppt_tensor_augmentations.h new file mode 100644 index 000000000..31678594a --- /dev/null +++ b/include/rppt_tensor_augmentations.h @@ -0,0 +1,72 @@ +/* +Copyright (c) 2019 - 2021 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#ifndef RPPT_TENSOR_AUGMENTATIONS_H +#define RPPT_TENSOR_AUGMENTATIONS_H +#include "rpp.h" +#include "rppdefs.h" +#ifdef __cplusplus +extern "C" { +#endif + +// ---------------------------------------- +// CPU brightness functions declaration +// ---------------------------------------- +/* Brightness augmentation for a NCHW/NHWC layout tensor +*param[in] srcPtr source tensor memory +*param[in] srcDescPtr source tensor descriptor +*param[out] dstPtr destination tensor memory +*param[in] dstDescPtr destination tensor descriptor +*param[in] alphaTensor alpha values for brightness calculation (1D tensor of size batchSize with 0 <= alpha <= 20 for each image in batch) +*param[in] betaTensor beta values for brightness calculation (1D tensor of size batchSize with 0 <= beta <= 255 for each image in batch) +*param[in] roiTensorPtrSrc ROI data for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) +*param[in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) +*returns a RppStatus enumeration. 
+*retval RPP_SUCCESS : successful completion +*retval RPP_ERROR : Error +*/ +RppStatus +rppt_brightness_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *alphaTensor, Rpp32f *betaTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +// ---------------------------------------- +// GPU brightness functions declaration +// ---------------------------------------- +/* Brightness augmentation for a NCHW/NHWC layout tensor +*param[in] srcPtr source tensor memory +*param[in] srcDescPtr source tensor descriptor +*param[out] dstPtr destination tensor memory +*param[in] dstDescPtr destination tensor descriptor +*param[in] alphaTensor alpha values for brightness calculation (1D tensor of size batchSize with 0 <= alpha <= 20 for each image in batch) +*param[in] betaTensor beta values for brightness calculation (1D tensor of size batchSize with 0 <= beta <= 255 for each image in batch) +*param[in] roiTensorPtrSrc ROI data for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y)) +*param[in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB) +*returns a RppStatus enumeration. 
+*retval RPP_SUCCESS : successful completion +*retval RPP_ERROR : Error +*/ +RppStatus +rppt_brightness_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *alphaTensor, Rpp32f *betaTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); + +#ifdef __cplusplus +} +#endif +#endif \ No newline at end of file diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp index f320757b3..66a3d07f9 100644 --- a/src/include/cpu/rpp_cpu_common.hpp +++ b/src/include/cpu/rpp_cpu_common.hpp @@ -8,8 +8,8 @@ #include #include #include -using half_float::half; -typedef half Rpp16f; +using halfhpp = half_float::half; +typedef halfhpp Rpp16f; #include "rpp_cpu_simd.hpp" #define PI 3.14159265 @@ -25,6 +25,7 @@ typedef half Rpp16f; #define RPPCEIL(a) ((int) (a + 1.0)) #define RPPISEVEN(a) ((a % 2 == 0) ? 1 : 0) #define RPPPIXELCHECK(pixel) (pixel < (Rpp32f) 0) ? ((Rpp32f) 0) : ((pixel < (Rpp32f) 255) ? pixel : ((Rpp32f) 255)) +#define RPPPIXELCHECKF32(pixel) (pixel < (Rpp32f) 0) ? ((Rpp32f) 0) : ((pixel < (Rpp32f) 1) ? pixel : ((Rpp32f) 1)) #define RPPPIXELCHECKI8(pixel) (pixel < (Rpp32f) -128) ? ((Rpp32f) -128) : ((pixel < (Rpp32f) 127) ? pixel : ((Rpp32f) 127)) #define RPPISGREATER(pixel, value) ((pixel > value) ? 1 : 0) #define RPPISLESSER(pixel, value) ((pixel < value) ? 
1 : 0) @@ -2078,6 +2079,26 @@ inline RppStatus custom_convolve_image_host(T* srcPtr, RppiSize srcSize, U* dstP // Compute Functions +inline RppStatus compute_xywh_from_ltrb_host(RpptROIPtr roiPtrInput, RpptROIPtr roiPtrImage) +{ + roiPtrImage->xywhROI.xy.x = roiPtrInput->ltrbROI.lt.x; + roiPtrImage->xywhROI.xy.y = roiPtrInput->ltrbROI.lt.y; + roiPtrImage->xywhROI.roiWidth = roiPtrInput->ltrbROI.rb.x - roiPtrInput->ltrbROI.lt.x + 1; + roiPtrImage->xywhROI.roiHeight = roiPtrInput->ltrbROI.rb.y - roiPtrInput->ltrbROI.lt.y + 1; + + return RPP_SUCCESS; +} + +inline RppStatus compute_roi_boundary_check_host(RpptROIPtr roiPtrImage, RpptROIPtr roiPtr, RpptROIPtr roiPtrDefault) +{ + roiPtr->xywhROI.xy.x = std::max(roiPtrDefault->xywhROI.xy.x, roiPtrImage->xywhROI.xy.x); + roiPtr->xywhROI.xy.y = std::max(roiPtrDefault->xywhROI.xy.y, roiPtrImage->xywhROI.xy.y); + roiPtr->xywhROI.roiWidth = std::min(roiPtrDefault->xywhROI.roiWidth - roiPtrImage->xywhROI.xy.x, roiPtrImage->xywhROI.roiWidth); + roiPtr->xywhROI.roiHeight = std::min(roiPtrDefault->xywhROI.roiHeight - roiPtrImage->xywhROI.xy.y, roiPtrImage->xywhROI.roiHeight); + + return RPP_SUCCESS; +} + template inline RppStatus compute_subimage_location_host(T* ptr, T** ptrSubImage, RppiSize size, RppiSize *sizeSubImage, diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp index 0209e7512..054ae1194 100644 --- a/src/include/cpu/rpp_cpu_simd.hpp +++ b/src/include/cpu/rpp_cpu_simd.hpp @@ -1,6 +1,6 @@ #ifndef AMD_RPP_RPP_CPU_SIMD_HPP #define AMD_RPP_RPP_CPU_SIMD_HPP -#if 1 + #if _WIN32 #include #else @@ -10,45 +10,491 @@ #endif #define __AVX2__ 1 +#define __SSE4_1__ 1 #define M256I(m256i_register) (*((_m256i_union*)&m256i_register)) -typedef union { - char m256i_i8[32]; - short m256i_i16[16]; - int m256i_i32[8]; - long long m256i_i64[4]; - __m128i m256i_i128[2]; -}_m256i_union; +typedef union +{ + char m256i_i8[32]; + short m256i_i16[16]; + int m256i_i32[8]; + long long m256i_i64[4]; + __m128i 
m256i_i128[2]; +} _m256i_union; #if defined(_MSC_VER) #define SIMD_ALIGN_VAR(type, name, alignment) \ - __declspec(align(alignment)) type name + __declspec(align(alignment)) type name #else #define SIMD_ALIGN_VAR(type, name, alignment) \ - type __attribute__((__aligned__(alignment))) name + type __attribute__((__aligned__(alignment))) name #endif // _MSC_VER #define SIMD_CONST_PI(name, val0, val1, val2, val3) \ - SIMD_ALIGN_VAR(static const int, _xmm_const_##name[4], 16) = { \ - static_cast(val3), \ - static_cast(val2), \ - static_cast(val1), \ - static_cast(val0) \ - } + SIMD_ALIGN_VAR(static const int, _xmm_const_##name[4], 16) = { \ + static_cast(val3), \ + static_cast(val2), \ + static_cast(val1), \ + static_cast(val0) \ + } #define SIMD_CONST_PS(name, val0, val1, val2, val3) \ - SIMD_ALIGN_VAR(static const float, _xmm_const_##name[4], 16) = { \ - static_cast(val3), \ - static_cast(val2), \ - static_cast(val1), \ - static_cast(val0) \ - } + SIMD_ALIGN_VAR(static const float, _xmm_const_##name[4], 16) = { \ + static_cast(val3), \ + static_cast(val2), \ + static_cast(val1), \ + static_cast(val0) \ + } #define SIMD_GET_PS(name) (*(const __m128 *)_xmm_const_##name) +inline RppStatus rpp_load48_u8pkd3_to_f32pln3(Rpp8u *srcPtr, __m128 *p) +{ + __m128i px[8]; + __m128i pxMask = _mm_setr_epi8(0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 12, 13, 14, 15); + __m128i pxZero = _mm_setzero_si128(); + + px[0] = _mm_loadu_si128((__m128i *)srcPtr); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need RGB 01-04 */ + px[1] = _mm_loadu_si128((__m128i *)(srcPtr + 12)); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need RGB 05-08 */ + px[2] = _mm_loadu_si128((__m128i *)(srcPtr + 24)); /* load [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|R13|G13|B13|R14] - Need RGB 09-12 \ */ + px[3] = _mm_loadu_si128((__m128i *)(srcPtr + 36)); /* load [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|R17|G17|B17|R18] - Need RGB 13-16 \ */ + px[0] 
= _mm_shuffle_epi8(px[0], pxMask); /* shuffle to get [R01|R02|R03|R04|G01|G02|G03|G04 || B01|B02|B03|B04|R05|G05|B05|R06] - Need R01-04, G01-04, B01-04 */ + px[1] = _mm_shuffle_epi8(px[1], pxMask); /* shuffle to get [R05|R06|R07|R08|G05|G06|G07|G08 || B05|B06|B07|B08|R09|G09|B09|R10] - Need R05-08, G05-08, B05-08 */ + px[2] = _mm_shuffle_epi8(px[2], pxMask); /* shuffle to get [R09|R10|R11|R12|G09|G10|G11|G12 || B09|B10|B11|B12|R13|G13|B13|R14] - Need R09-12, G09-12, B09-12 */ + px[3] = _mm_shuffle_epi8(px[3], pxMask); /* shuffle to get [R13|R14|R15|R16|G13|G14|G15|G16 || B13|B14|B15|B16|R17|G17|B17|R18] - Need R13-16, G13-16, B13-16 */ + px[4] = _mm_unpackhi_epi8(px[0], pxZero); /* unpack 8 hi-pixels of px[0] */ + px[5] = _mm_unpackhi_epi8(px[1], pxZero); /* unpack 8 hi-pixels of px[1] */ + px[6] = _mm_unpackhi_epi8(px[2], pxZero); /* unpack 8 hi-pixels of px[2] */ + px[7] = _mm_unpackhi_epi8(px[3], pxZero); /* unpack 8 hi-pixels of px[3] */ + px[0] = _mm_unpacklo_epi8(px[0], pxZero); /* unpack 8 lo-pixels of px[0] */ + px[1] = _mm_unpacklo_epi8(px[1], pxZero); /* unpack 8 lo-pixels of px[1] */ + px[2] = _mm_unpacklo_epi8(px[2], pxZero); /* unpack 8 lo-pixels of px[2] */ + px[3] = _mm_unpacklo_epi8(px[3], pxZero); /* unpack 8 lo-pixels of px[3] */ + p[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[0], pxZero)); /* unpack 4 lo-pixels of px[0] - Contains R01-04 */ + p[1] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[1], pxZero)); /* unpack 4 lo-pixels of px[1] - Contains R05-08 */ + p[2] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[2], pxZero)); /* unpack 4 lo-pixels of px[2] - Contains R09-12 */ + p[3] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[3], pxZero)); /* unpack 4 lo-pixels of px[3] - Contains R13-16 */ + p[4] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[0], pxZero)); /* unpack 4 hi-pixels of px[0] - Contains G01-04 */ + p[5] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[1], pxZero)); /* unpack 4 hi-pixels of px[1] - Contains G05-08 */ + p[6] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[2], 
pxZero)); /* unpack 4 hi-pixels of px[2] - Contains G09-12 */ + p[7] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[3], pxZero)); /* unpack 4 hi-pixels of px[3] - Contains G13-16 */ + p[8] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[4], pxZero)); /* unpack 4 lo-pixels of px[4] - Contains B01-04 */ + p[9] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[5], pxZero)); /* unpack 4 lo-pixels of px[5] - Contains B05-08 */ + p[10] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[6], pxZero)); /* unpack 4 lo-pixels of px[6] - Contains B09-12 */ + p[11] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[7], pxZero)); /* unpack 4 lo-pixels of px[7] - Contains B13-16 */ + + return RPP_SUCCESS; +} + +inline RppStatus rpp_store48_f32pln3_to_u8pln3(Rpp8u *dstPtrR, Rpp8u *dstPtrG, Rpp8u *dstPtrB, __m128 *p) +{ + __m128i px[8]; + + px[4] = _mm_cvtps_epi32(p[0]); /* convert to int32 for R */ + px[5] = _mm_cvtps_epi32(p[1]); /* convert to int32 for R */ + px[6] = _mm_cvtps_epi32(p[2]); /* convert to int32 for R */ + px[7] = _mm_cvtps_epi32(p[3]); /* convert to int32 for R */ + px[4] = _mm_packus_epi32(px[4], px[5]); /* pack pixels 0-7 for R */ + px[5] = _mm_packus_epi32(px[6], px[7]); /* pack pixels 8-15 for R */ + px[0] = _mm_packus_epi16(px[4], px[5]); /* pack pixels 0-15 for R */ + px[4] = _mm_cvtps_epi32(p[4]); /* convert to int32 for G */ + px[5] = _mm_cvtps_epi32(p[5]); /* convert to int32 for G */ + px[6] = _mm_cvtps_epi32(p[6]); /* convert to int32 for G */ + px[7] = _mm_cvtps_epi32(p[7]); /* convert to int32 for G */ + px[4] = _mm_packus_epi32(px[4], px[5]); /* pack pixels 0-7 for G */ + px[5] = _mm_packus_epi32(px[6], px[7]); /* pack pixels 8-15 for G */ + px[1] = _mm_packus_epi16(px[4], px[5]); /* pack pixels 0-15 for G */ + px[4] = _mm_cvtps_epi32(p[8]); /* convert to int32 for B */ + px[5] = _mm_cvtps_epi32(p[9]); /* convert to int32 for B */ + px[6] = _mm_cvtps_epi32(p[10]); /* convert to int32 for B */ + px[7] = _mm_cvtps_epi32(p[11]); /* convert to int32 for B */ + px[4] = _mm_packus_epi32(px[4], px[5]); 
/* pack pixels 0-7 for B */ + px[5] = _mm_packus_epi32(px[6], px[7]); /* pack pixels 8-15 for B */ + px[2] = _mm_packus_epi16(px[4], px[5]); /* pack pixels 0-15 for B */ + _mm_storeu_si128((__m128i *)dstPtrR, px[0]); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ + _mm_storeu_si128((__m128i *)dstPtrG, px[1]); /* store [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */ + _mm_storeu_si128((__m128i *)dstPtrB, px[2]); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ + + return RPP_SUCCESS; +} + +inline RppStatus rpp_load48_u8pln3_to_f32pln3(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *srcPtrB, __m128 *p) +{ + __m128i px[6]; + __m128i pxZero = _mm_setzero_si128(); + + px[0] = _mm_loadu_si128((__m128i *)srcPtrR); /* load [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ + px[1] = _mm_loadu_si128((__m128i *)srcPtrG); /* load [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */ + px[2] = _mm_loadu_si128((__m128i *)srcPtrB); /* load [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ + px[3] = _mm_unpackhi_epi8(px[0], pxZero); /* unpack 8 hi-pixels of px[0] */ + px[4] = _mm_unpackhi_epi8(px[1], pxZero); /* unpack 8 hi-pixels of px[1] */ + px[5] = _mm_unpackhi_epi8(px[2], pxZero); /* unpack 8 hi-pixels of px[2] */ + px[0] = _mm_unpacklo_epi8(px[0], pxZero); /* unpack 8 lo-pixels of px[0] */ + px[1] = _mm_unpacklo_epi8(px[1], pxZero); /* unpack 8 lo-pixels of px[1] */ + px[2] = _mm_unpacklo_epi8(px[2], pxZero); /* unpack 8 lo-pixels of px[2] */ + p[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[0], pxZero)); /* pixels 0-3 of original px[0] containing 16 R values */ + p[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[0], pxZero)); /* pixels 4-7 of original px[0] containing 16 R values */ + p[2] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[3], pxZero)); /* pixels 8-11 of original px[0] containing 16 R values */ + p[3] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[3], 
pxZero)); /* pixels 12-15 of original px[0] containing 16 R values */ + p[4] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[1], pxZero)); /* pixels 0-3 of original px[1] containing 16 G values */ + p[5] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[1], pxZero)); /* pixels 4-7 of original px[1] containing 16 G values */ + p[6] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[4], pxZero)); /* pixels 8-11 of original px[1] containing 16 G values */ + p[7] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[4], pxZero)); /* pixels 12-15 of original px[1] containing 16 G values */ + p[8] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[2], pxZero)); /* pixels 0-3 of original px[2] containing 16 B values */ + p[9] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[2], pxZero)); /* pixels 4-7 of original px[2] containing 16 B values */ + p[10] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[5], pxZero)); /* pixels 8-11 of original px[2] containing 16 B values */ + p[11] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[5], pxZero)); /* pixels 12-15 of original px[2] containing 16 B values */ + + return RPP_SUCCESS; +} + +inline RppStatus rpp_store48_f32pln3_to_u8pkd3(Rpp8u *dstPtr, __m128 *p) +{ + __m128i px[7]; + __m128i pxMask = _mm_setr_epi8(0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11, 12, 13, 14, 15); + __m128 pZero = _mm_setzero_ps(); + + px[4] = _mm_cvtps_epi32(p[0]); /* convert to int32 for R01-04 */ + px[5] = _mm_cvtps_epi32(p[4]); /* convert to int32 for G01-04 */ + px[6] = _mm_cvtps_epi32(p[8]); /* convert to int32 for B01-04 */ + px[4] = _mm_packus_epi32(px[4], px[5]); /* pack pixels 0-7 as R01-04|G01-04 */ + px[5] = _mm_packus_epi32(px[6], pZero); /* pack pixels 8-15 as B01-04|X01-04 */ + px[0] = _mm_packus_epi16(px[4], px[5]); /* pack pixels 0-15 as [R01|R02|R03|R04|G01|G02|G03|G04|B01|B02|B03|B04|00|00|00|00] */ + px[4] = _mm_cvtps_epi32(p[1]); /* convert to int32 for R05-08 */ + px[5] = _mm_cvtps_epi32(p[5]); /* convert to int32 for G05-08 */ + px[6] = _mm_cvtps_epi32(p[9]); /* convert to int32 for B05-08 */ + px[4] = 
_mm_packus_epi32(px[4], px[5]); /* pack pixels 0-7 as R05-08|G05-08 */ + px[5] = _mm_packus_epi32(px[6], pZero); /* pack pixels 8-15 as B05-08|X01-04 */ + px[1] = _mm_packus_epi16(px[4], px[5]); /* pack pixels 0-15 as [R05|R06|R07|R08|G05|G06|G07|G08|B05|B06|B07|B08|00|00|00|00] */ + px[4] = _mm_cvtps_epi32(p[2]); /* convert to int32 for R09-12 */ + px[5] = _mm_cvtps_epi32(p[6]); /* convert to int32 for G09-12 */ + px[6] = _mm_cvtps_epi32(p[10]); /* convert to int32 for B09-12 */ + px[4] = _mm_packus_epi32(px[4], px[5]); /* pack pixels 0-7 as R09-12|G09-12 */ + px[5] = _mm_packus_epi32(px[6], pZero); /* pack pixels 8-15 as B09-12|X01-04 */ + px[2] = _mm_packus_epi16(px[4], px[5]); /* pack pixels 0-15 as [R09|R10|R11|R12|G09|G10|G11|G12|B09|B10|B11|B12|00|00|00|00] */ + px[4] = _mm_cvtps_epi32(p[3]); /* convert to int32 for R13-16 */ + px[5] = _mm_cvtps_epi32(p[7]); /* convert to int32 for G13-16 */ + px[6] = _mm_cvtps_epi32(p[11]); /* convert to int32 for B13-16 */ + px[4] = _mm_packus_epi32(px[4], px[5]); /* pack pixels 0-7 as R13-16|G13-16 */ + px[5] = _mm_packus_epi32(px[6], pZero); /* pack pixels 8-15 as B13-16|X01-04 */ + px[3] = _mm_packus_epi16(px[4], px[5]); /* pack pixels 0-15 as [R13|R14|R15|R16|G13|G14|G15|G16|B13|B14|B15|B16|00|00|00|00] */ + px[0] = _mm_shuffle_epi8(px[0], pxMask); /* shuffle to get [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|00|00|00|00] */ + px[1] = _mm_shuffle_epi8(px[1], pxMask); /* shuffle to get [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|00|00|00|00] */ + px[2] = _mm_shuffle_epi8(px[2], pxMask); /* shuffle to get [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|00|00|00|00] */ + px[3] = _mm_shuffle_epi8(px[3], pxMask); /* shuffle to get [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */ + _mm_storeu_si128((__m128i *)dstPtr, px[0]); /* store [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 12), px[1]); /* store 
[R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 24), px[2]); /* store [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 36), px[3]); /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */ + + return RPP_SUCCESS; +} + +inline RppStatus rpp_load16_u8_to_f32(Rpp8u *srcPtr, __m128 *p) +{ + __m128i px[2]; + __m128i pxZero = _mm_setzero_si128(); + + px[0] = _mm_loadu_si128((__m128i *)srcPtr); /* load pixels 0-15 */ + px[1] = _mm_unpackhi_epi8(px[0], pxZero); /* pixels 8-15 */ + px[0] = _mm_unpacklo_epi8(px[0], pxZero); /* pixels 0-7 */ + p[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[0], pxZero)); /* pixels 0-3 */ + p[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[0], pxZero)); /* pixels 4-7 */ + p[2] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[1], pxZero)); /* pixels 8-11 */ + p[3] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[1], pxZero)); /* pixels 12-15 */ + + return RPP_SUCCESS; +} + +inline RppStatus rpp_store16_f32_to_u8(Rpp8u *dstPtr, __m128 *p) +{ + __m128i px[4]; + + px[0] = _mm_cvtps_epi32(p[0]); /* pixels 0-3 */ + px[1] = _mm_cvtps_epi32(p[1]); /* pixels 4-7 */ + px[2] = _mm_cvtps_epi32(p[2]); /* pixels 8-11 */ + px[3] = _mm_cvtps_epi32(p[3]); /* pixels 12-15 */ + px[0] = _mm_packus_epi32(px[0], px[1]); /* pixels 0-7 */ + px[1] = _mm_packus_epi32(px[2], px[3]); /* pixels 8-15 */ + px[0] = _mm_packus_epi16(px[0], px[1]); /* pixels 0-15 */ + _mm_storeu_si128((__m128i *)dstPtr, px[0]); /* store pixels 0-15 */ + + return RPP_SUCCESS; +} + +inline RppStatus rpp_load12_f32pkd3_to_f32pln3(Rpp32f *srcPtr, __m128 *p) +{ + p[0] = _mm_loadu_ps(srcPtr); + p[1] = _mm_loadu_ps(srcPtr + 3); + p[2] = _mm_loadu_ps(srcPtr + 6); + p[3] = _mm_loadu_ps(srcPtr + 9); + _MM_TRANSPOSE4_PS(p[0], p[1], p[2], p[3]); + + return RPP_SUCCESS; +} + +inline RppStatus rpp_store12_f32pln3_to_f32pln3(Rpp32f *dstPtrR, Rpp32f *dstPtrG, Rpp32f *dstPtrB, __m128 *p) +{ + 
_mm_storeu_ps(dstPtrR, p[0]); + _mm_storeu_ps(dstPtrG, p[1]); + _mm_storeu_ps(dstPtrB, p[2]); + + return RPP_SUCCESS; +} + +inline RppStatus rpp_load12_f32pln3_to_f32pln3(Rpp32f *srcPtrR, Rpp32f *srcPtrG, Rpp32f *srcPtrB, __m128 *p) +{ + p[0] = _mm_loadu_ps(srcPtrR); + p[1] = _mm_loadu_ps(srcPtrG); + p[2] = _mm_loadu_ps(srcPtrB); + + return RPP_SUCCESS; +} + +inline RppStatus rpp_store12_f32pln3_to_f32pkd3(Rpp32f *dstPtr, __m128 *p) +{ + _MM_TRANSPOSE4_PS(p[0], p[1], p[2], p[3]); + _mm_storeu_ps(dstPtr, p[0]); + _mm_storeu_ps(dstPtr + 3, p[1]); + _mm_storeu_ps(dstPtr + 6, p[2]); + _mm_storeu_ps(dstPtr + 9, p[3]); + + return RPP_SUCCESS; +} + +inline RppStatus rpp_load4_f32_to_f32(Rpp32f *srcPtr, __m128 *p) +{ + p[0] = _mm_loadu_ps(srcPtr); + + return RPP_SUCCESS; +} + +inline RppStatus rpp_store4_f32_to_f32(Rpp32f *dstPtr, __m128 *p) +{ + _mm_storeu_ps(dstPtr, p[0]); + + return RPP_SUCCESS; +} + +inline RppStatus rpp_load48_i8pkd3_to_f32pln3(Rpp8s *srcPtr, __m128 *p) +{ + __m128i px[8]; + __m128i pxMask = _mm_setr_epi8(0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 12, 13, 14, 15); + __m128i pxZero = _mm_setzero_si128(); + __m128i pxConvertI8 = _mm_set1_epi8((char)128); + + px[0] = _mm_loadu_si128((__m128i *)srcPtr); /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need RGB 01-04 */ + px[1] = _mm_loadu_si128((__m128i *)(srcPtr + 12)); /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need RGB 05-08 */ + px[2] = _mm_loadu_si128((__m128i *)(srcPtr + 24)); /* load [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|R13|G13|B13|R14] - Need RGB 09-12 \ */ + px[3] = _mm_loadu_si128((__m128i *)(srcPtr + 36)); /* load [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|R17|G17|B17|R18] - Need RGB 13-16 \ */ + px[0] = _mm_shuffle_epi8(px[0], pxMask); /* shuffle to get [R01|R02|R03|R04|G01|G02|G03|G04 || B01|B02|B03|B04|R05|G05|B05|R06] - Need R01-04, G01-04, B01-04 */ + px[1] = _mm_shuffle_epi8(px[1], pxMask); /* shuffle to get 
[R05|R06|R07|R08|G05|G06|G07|G08 || B05|B06|B07|B08|R09|G09|B09|R10] - Need R05-08, G05-08, B05-08 */ + px[2] = _mm_shuffle_epi8(px[2], pxMask); /* shuffle to get [R09|R10|R11|R12|G09|G10|G11|G12 || B09|B10|B11|B12|R13|G13|B13|R14] - Need R09-12, G09-12, B09-12 */ + px[3] = _mm_shuffle_epi8(px[3], pxMask); /* shuffle to get [R13|R14|R15|R16|G13|G14|G15|G16 || B13|B14|B15|B16|R17|G17|B17|R18] - Need R13-16, G13-16, B13-16 */ + px[0] = _mm_add_epi8(px[0], pxConvertI8); /* convert to u8 for px0 compute */ + px[1] = _mm_add_epi8(px[1], pxConvertI8); /* convert to u8 for px1 compute */ + px[2] = _mm_add_epi8(px[2], pxConvertI8); /* convert to u8 for px2 compute */ + px[3] = _mm_add_epi8(px[3], pxConvertI8); /* convert to u8 for px3 compute */ + px[4] = _mm_unpackhi_epi8(px[0], pxZero); /* unpack 8 hi-pixels of px[0] */ + px[5] = _mm_unpackhi_epi8(px[1], pxZero); /* unpack 8 hi-pixels of px[1] */ + px[6] = _mm_unpackhi_epi8(px[2], pxZero); /* unpack 8 hi-pixels of px[2] */ + px[7] = _mm_unpackhi_epi8(px[3], pxZero); /* unpack 8 hi-pixels of px[3] */ + px[0] = _mm_unpacklo_epi8(px[0], pxZero); /* unpack 8 lo-pixels of px[0] */ + px[1] = _mm_unpacklo_epi8(px[1], pxZero); /* unpack 8 lo-pixels of px[1] */ + px[2] = _mm_unpacklo_epi8(px[2], pxZero); /* unpack 8 lo-pixels of px[2] */ + px[3] = _mm_unpacklo_epi8(px[3], pxZero); /* unpack 8 lo-pixels of px[3] */ + p[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[0], pxZero)); /* unpack 4 lo-pixels of px[0] - Contains R01-04 */ + p[1] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[1], pxZero)); /* unpack 4 lo-pixels of px[1] - Contains R05-08 */ + p[2] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[2], pxZero)); /* unpack 4 lo-pixels of px[2] - Contains R09-12 */ + p[3] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[3], pxZero)); /* unpack 4 lo-pixels of px[3] - Contains R13-16 */ + p[4] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[0], pxZero)); /* unpack 4 hi-pixels of px[0] - Contains G01-04 */ + p[5] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[1], pxZero)); 
/* unpack 4 hi-pixels of px[1] - Contains G05-08 */ + p[6] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[2], pxZero)); /* unpack 4 hi-pixels of px[2] - Contains G09-12 */ + p[7] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[3], pxZero)); /* unpack 4 hi-pixels of px[3] - Contains G13-16 */ + p[8] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[4], pxZero)); /* unpack 4 lo-pixels of px[4] - Contains B01-04 */ + p[9] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[5], pxZero)); /* unpack 4 lo-pixels of px[5] - Contains B05-08 */ + p[10] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[6], pxZero)); /* unpack 4 lo-pixels of px[6] - Contains B09-12 */ + p[11] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[7], pxZero)); /* unpack 4 lo-pixels of px[7] - Contains B13-16 */ + + return RPP_SUCCESS; +} + +inline RppStatus rpp_store48_f32pln3_to_i8pln3(Rpp8s *dstPtrR, Rpp8s *dstPtrG, Rpp8s *dstPtrB, __m128 *p) +{ + __m128i px[8]; + __m128i pxConvertI8 = _mm_set1_epi8((char)128); + + px[4] = _mm_cvtps_epi32(p[0]); /* convert to int32 for R */ + px[5] = _mm_cvtps_epi32(p[1]); /* convert to int32 for R */ + px[6] = _mm_cvtps_epi32(p[2]); /* convert to int32 for R */ + px[7] = _mm_cvtps_epi32(p[3]); /* convert to int32 for R */ + px[4] = _mm_packus_epi32(px[4], px[5]); /* pack pixels 0-7 for R */ + px[5] = _mm_packus_epi32(px[6], px[7]); /* pack pixels 8-15 for R */ + px[0] = _mm_packus_epi16(px[4], px[5]); /* pack pixels 0-15 for R */ + px[4] = _mm_cvtps_epi32(p[4]); /* convert to int32 for G */ + px[5] = _mm_cvtps_epi32(p[5]); /* convert to int32 for G */ + px[6] = _mm_cvtps_epi32(p[6]); /* convert to int32 for G */ + px[7] = _mm_cvtps_epi32(p[7]); /* convert to int32 for G */ + px[4] = _mm_packus_epi32(px[4], px[5]); /* pack pixels 0-7 for G */ + px[5] = _mm_packus_epi32(px[6], px[7]); /* pack pixels 8-15 for G */ + px[1] = _mm_packus_epi16(px[4], px[5]); /* pack pixels 0-15 for G */ + px[4] = _mm_cvtps_epi32(p[8]); /* convert to int32 for B */ + px[5] = _mm_cvtps_epi32(p[9]); /* convert to int32 for B */ + px[6] = 
_mm_cvtps_epi32(p[10]); /* convert to int32 for B */ + px[7] = _mm_cvtps_epi32(p[11]); /* convert to int32 for B */ + px[4] = _mm_packus_epi32(px[4], px[5]); /* pack pixels 0-7 for B */ + px[5] = _mm_packus_epi32(px[6], px[7]); /* pack pixels 8-15 for B */ + px[2] = _mm_packus_epi16(px[4], px[5]); /* pack pixels 0-15 for B */ + px[0] = _mm_sub_epi8(px[0], pxConvertI8); /* convert back to i8 for px0 store */ + px[1] = _mm_sub_epi8(px[1], pxConvertI8); /* convert back to i8 for px1 store */ + px[2] = _mm_sub_epi8(px[2], pxConvertI8); /* convert back to i8 for px2 store */ + _mm_storeu_si128((__m128i *)dstPtrR, px[0]); /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ + _mm_storeu_si128((__m128i *)dstPtrG, px[1]); /* store [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */ + _mm_storeu_si128((__m128i *)dstPtrB, px[2]); /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ + + return RPP_SUCCESS; +} + +inline RppStatus rpp_load48_i8pln3_to_f32pln3(Rpp8s *srcPtrR, Rpp8s *srcPtrG, Rpp8s *srcPtrB, __m128 *p) +{ + __m128i px[6]; + __m128i pxZero = _mm_setzero_si128(); + __m128i pxConvertI8 = _mm_set1_epi8((char)128); + + px[0] = _mm_loadu_si128((__m128i *)srcPtrR); /* load [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */ + px[1] = _mm_loadu_si128((__m128i *)srcPtrG); /* load [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */ + px[2] = _mm_loadu_si128((__m128i *)srcPtrB); /* load [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */ + px[0] = _mm_add_epi8(px[0], pxConvertI8); /* convert to u8 for px0 compute */ + px[1] = _mm_add_epi8(px[1], pxConvertI8); /* convert to u8 for px1 compute */ + px[2] = _mm_add_epi8(px[2], pxConvertI8); /* convert to u8 for px2 compute */ + px[3] = _mm_unpackhi_epi8(px[0], pxZero); /* unpack 8 hi-pixels of px[0] */ + px[4] = _mm_unpackhi_epi8(px[1], pxZero); /* unpack 8 hi-pixels of px[1] */ + px[5] = 
_mm_unpackhi_epi8(px[2], pxZero); /* unpack 8 hi-pixels of px[2] */ + px[0] = _mm_unpacklo_epi8(px[0], pxZero); /* unpack 8 lo-pixels of px[0] */ + px[1] = _mm_unpacklo_epi8(px[1], pxZero); /* unpack 8 lo-pixels of px[1] */ + px[2] = _mm_unpacklo_epi8(px[2], pxZero); /* unpack 8 lo-pixels of px[2] */ + p[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[0], pxZero)); /* pixels 0-3 of original px[0] containing 16 R values */ + p[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[0], pxZero)); /* pixels 4-7 of original px[0] containing 16 R values */ + p[2] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[3], pxZero)); /* pixels 8-11 of original px[0] containing 16 R values */ + p[3] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[3], pxZero)); /* pixels 12-15 of original px[0] containing 16 R values */ + p[4] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[1], pxZero)); /* pixels 0-3 of original px[1] containing 16 G values */ + p[5] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[1], pxZero)); /* pixels 4-7 of original px[1] containing 16 G values */ + p[6] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[4], pxZero)); /* pixels 8-11 of original px[1] containing 16 G values */ + p[7] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[4], pxZero)); /* pixels 12-15 of original px[1] containing 16 G values */ + p[8] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[2], pxZero)); /* pixels 0-3 of original px[1] containing 16 B values */ + p[9] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[2], pxZero)); /* pixels 4-7 of original px[1] containing 16 B values */ + p[10] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[5], pxZero)); /* pixels 8-11 of original px[1] containing 16 B values */ + p[11] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[5], pxZero)); /* pixels 12-15 of original px[1] containing 16 B values */ + + return RPP_SUCCESS; +} + +inline RppStatus rpp_store48_f32pln3_to_i8pkd3(Rpp8s *dstPtr, __m128 *p) +{ + __m128i px[7]; + __m128i pxMask = _mm_setr_epi8(0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11, 12, 13, 14, 15); + __m128i pxConvertI8 = 
_mm_set1_epi8((char)128);
+    // _mm_packus_epi32 takes __m128i operands: use an integer zero vector (not a float one)
+    // so the call does not rely on lax vector conversions. The bit pattern is identical.
+    __m128i pxZero = _mm_setzero_si128();
+
+    px[4] = _mm_cvtps_epi32(p[0]); /* convert to int32 for R01-04 */
+    px[5] = _mm_cvtps_epi32(p[4]); /* convert to int32 for G01-04 */
+    px[6] = _mm_cvtps_epi32(p[8]); /* convert to int32 for B01-04 */
+    px[4] = _mm_packus_epi32(px[4], px[5]); /* pack pixels 0-7 as R01-04|G01-04 */
+    px[5] = _mm_packus_epi32(px[6], pxZero); /* pack pixels 8-15 as B01-04|X01-04 */
+    px[0] = _mm_packus_epi16(px[4], px[5]); /* pack pixels 0-15 as [R01|R02|R03|R04|G01|G02|G03|G04|B01|B02|B03|B04|00|00|00|00] */
+    px[4] = _mm_cvtps_epi32(p[1]); /* convert to int32 for R05-08 */
+    px[5] = _mm_cvtps_epi32(p[5]); /* convert to int32 for G05-08 */
+    px[6] = _mm_cvtps_epi32(p[9]); /* convert to int32 for B05-08 */
+    px[4] = _mm_packus_epi32(px[4], px[5]); /* pack pixels 0-7 as R05-08|G05-08 */
+    px[5] = _mm_packus_epi32(px[6], pxZero); /* pack pixels 8-15 as B05-08|X01-04 */
+    px[1] = _mm_packus_epi16(px[4], px[5]); /* pack pixels 0-15 as [R05|R06|R07|R08|G05|G06|G07|G08|B05|B06|B07|B08|00|00|00|00] */
+    px[4] = _mm_cvtps_epi32(p[2]); /* convert to int32 for R09-12 */
+    px[5] = _mm_cvtps_epi32(p[6]); /* convert to int32 for G09-12 */
+    px[6] = _mm_cvtps_epi32(p[10]); /* convert to int32 for B09-12 */
+    px[4] = _mm_packus_epi32(px[4], px[5]); /* pack pixels 0-7 as R09-12|G09-12 */
+    px[5] = _mm_packus_epi32(px[6], pxZero); /* pack pixels 8-15 as B09-12|X01-04 */
+    px[2] = _mm_packus_epi16(px[4], px[5]); /* pack pixels 0-15 as [R09|R10|R11|R12|G09|G10|G11|G12|B09|B10|B11|B12|00|00|00|00] */
+    px[4] = _mm_cvtps_epi32(p[3]); /* convert to int32 for R13-16 */
+    px[5] = _mm_cvtps_epi32(p[7]); /* convert to int32 for G13-16 */
+    px[6] = _mm_cvtps_epi32(p[11]); /* convert to int32 for B13-16 */
+    px[4] = _mm_packus_epi32(px[4], px[5]); /* pack pixels 0-7 as R13-16|G13-16 */
+    px[5] = _mm_packus_epi32(px[6], pxZero); /* pack pixels 8-15 as B13-16|X01-04 */
+    px[3] = _mm_packus_epi16(px[4], px[5]); /* pack pixels 0-15 as
[R13|R14|R15|R16|G13|G14|G15|G16|B13|B14|B15|B16|00|00|00|00] */ + px[0] = _mm_sub_epi8(px[0], pxConvertI8); /* convert back to i8 for px0 store */ + px[1] = _mm_sub_epi8(px[1], pxConvertI8); /* convert back to i8 for px1 store */ + px[2] = _mm_sub_epi8(px[2], pxConvertI8); /* convert back to i8 for px2 store */ + px[3] = _mm_sub_epi8(px[3], pxConvertI8); /* convert back to i8 for px3 store */ + px[0] = _mm_shuffle_epi8(px[0], pxMask); /* shuffle to get [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|00|00|00|00] */ + px[1] = _mm_shuffle_epi8(px[1], pxMask); /* shuffle to get [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|00|00|00|00] */ + px[2] = _mm_shuffle_epi8(px[2], pxMask); /* shuffle to get [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|00|00|00|00] */ + px[3] = _mm_shuffle_epi8(px[3], pxMask); /* shuffle to get [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */ + _mm_storeu_si128((__m128i *)dstPtr, px[0]); /* store [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 12), px[1]); /* store [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 24), px[2]); /* store [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|00|00|00|00] */ + _mm_storeu_si128((__m128i *)(dstPtr + 36), px[3]); /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */ + + return RPP_SUCCESS; +} + +inline RppStatus rpp_load16_i8_to_f32(Rpp8s *srcPtr, __m128 *p) +{ + __m128i px[2]; + __m128i pxZero = _mm_setzero_si128(); + __m128i pxConvertI8 = _mm_set1_epi8((char)128); + + px[0] = _mm_loadu_si128((__m128i *)srcPtr); /* load pixels 0-15 */ + px[0] = _mm_add_epi8(px[0], pxConvertI8); /* convert to u8 for px0 compute */ + px[1] = _mm_unpackhi_epi8(px[0], pxZero); /* pixels 8-15 */ + px[0] = _mm_unpacklo_epi8(px[0], pxZero); /* pixels 0-7 */ + p[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[0], pxZero)); /* pixels 0-3 */ + p[1] = 
_mm_cvtepi32_ps(_mm_unpackhi_epi16(px[0], pxZero)); /* pixels 4-7 */ + p[2] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[1], pxZero)); /* pixels 8-11 */ + p[3] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[1], pxZero)); /* pixels 12-15 */ + + return RPP_SUCCESS; +} + +inline RppStatus rpp_store16_f32_to_i8(Rpp8s *dstPtr, __m128 *p) +{ + __m128i px[4]; + __m128i pxConvertI8 = _mm_set1_epi8((char)128); + + px[0] = _mm_cvtps_epi32(p[0]); /* pixels 0-3 */ + px[1] = _mm_cvtps_epi32(p[1]); /* pixels 4-7 */ + px[2] = _mm_cvtps_epi32(p[2]); /* pixels 8-11 */ + px[3] = _mm_cvtps_epi32(p[3]); /* pixels 12-15 */ + px[0] = _mm_packus_epi32(px[0], px[1]); /* pixels 0-7 */ + px[1] = _mm_packus_epi32(px[2], px[3]); /* pixels 8-15 */ + px[0] = _mm_packus_epi16(px[0], px[1]); /* pixels 0-15 */ + px[0] = _mm_sub_epi8(px[0], pxConvertI8); /* convert back to i8 for px0 store */ + _mm_storeu_si128((__m128i *)dstPtr, px[0]); /* store pixels 0-15 */ + + return RPP_SUCCESS; +} + +template +inline RppStatus rpp_simd_load(FuncType &&rpp_simd_load_routine, ArgTypes&&... args) +{ + std::forward(rpp_simd_load_routine)(std::forward(args)...); + + return RPP_SUCCESS; +} + +template +inline RppStatus rpp_simd_store(FuncType &&rpp_simd_store_routine, ArgTypes&&... args) +{ + std::forward(rpp_simd_store_routine)(std::forward(args)...); + + return RPP_SUCCESS; +} + // Shuffle floats in `src` by using SSE2 `pshufd` instead of `shufps`, if possible. 
#define SIMD_SHUFFLE_PS(src, imm) \ - _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(src), imm)) + _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(src), imm)) + +#define CHECK_SIMD 0 +#define FP_BITS 16 +#define FP_MUL (1< 2.414213562373095 */ - __m128 cmp0 = _mm_cmpgt_ps( x, _ps_atanrange_hi ); - /* x > 0.4142135623730950 */ - __m128 cmp1 = _mm_cmpgt_ps( x, _ps_atanrange_lo ); - - /* x > 0.4142135623730950 && !( x > 2.414213562373095 ) */ - __m128 cmp2 = _mm_andnot_ps( cmp0, cmp1 ); - - /* -( 1.0/x ) */ - __m128 y0 = _mm_and_ps( cmp0, _ps_cephes_PIO2F ); - __m128 x0 = _mm_div_ps( _ps_1, x ); - x0 = _mm_xor_ps( x0, _ps_sign_mask ); - - __m128 y1 = _mm_and_ps( cmp2, _ps_cephes_PIO4F ); - /* (x-1.0)/(x+1.0) */ - __m128 x1_o = _mm_sub_ps( x, _ps_1 ); - __m128 x1_u = _mm_add_ps( x, _ps_1 ); - __m128 x1 = _mm_div_ps( x1_o, x1_u ); - - __m128 x2 = _mm_and_ps( cmp2, x1 ); - x0 = _mm_and_ps( cmp0, x0 ); - x2 = _mm_or_ps( x2, x0 ); - cmp1 = _mm_or_ps( cmp0, cmp2 ); - x2 = _mm_and_ps( cmp1, x2 ); - x = _mm_andnot_ps( cmp1, x ); - x = _mm_or_ps( x2, x ); - - y = _mm_or_ps( y0, y1 ); - - __m128 zz = _mm_mul_ps( x, x ); - __m128 acc = _ps_atancof_p0; - acc = _mm_mul_ps( acc, zz ); - acc = _mm_sub_ps( acc, _ps_atancof_p1 ); - acc = _mm_mul_ps( acc, zz ); - acc = _mm_add_ps( acc, _ps_atancof_p2 ); - acc = _mm_mul_ps( acc, zz ); - acc = _mm_sub_ps( acc, _ps_atancof_p3 ); - acc = _mm_mul_ps( acc, zz ); - acc = _mm_mul_ps( acc, x ); - acc = _mm_add_ps( acc, x ); - y = _mm_add_ps( y, acc ); - - /* update the sign */ - y = _mm_xor_ps( y, sign_bit ); - - return y; + __m128 sign_bit, y; + + sign_bit = x; + // Take the absolute value + x = _mm_and_ps( x, _ps_inv_sign_mask ); + // Extract the sign bit (upper one) + sign_bit = _mm_and_ps( sign_bit, _ps_sign_mask ); + + // Range reduction, init x and y depending on range + + // x > 2.414213562373095 + __m128 cmp0 = _mm_cmpgt_ps( x, _ps_atanrange_hi ); + // x > 0.4142135623730950 + __m128 cmp1 = _mm_cmpgt_ps( x, _ps_atanrange_lo ); 
+ + // x > 0.4142135623730950 && !( x > 2.414213562373095 ) + __m128 cmp2 = _mm_andnot_ps( cmp0, cmp1 ); + + // -( 1.0/x ) + __m128 y0 = _mm_and_ps( cmp0, _ps_cephes_PIO2F ); + __m128 x0 = _mm_div_ps( _ps_1, x ); + x0 = _mm_xor_ps( x0, _ps_sign_mask ); + + __m128 y1 = _mm_and_ps( cmp2, _ps_cephes_PIO4F ); + // (x-1.0)/(x+1.0) + __m128 x1_o = _mm_sub_ps( x, _ps_1 ); + __m128 x1_u = _mm_add_ps( x, _ps_1 ); + __m128 x1 = _mm_div_ps( x1_o, x1_u ); + + __m128 x2 = _mm_and_ps( cmp2, x1 ); + x0 = _mm_and_ps( cmp0, x0 ); + x2 = _mm_or_ps( x2, x0 ); + cmp1 = _mm_or_ps( cmp0, cmp2 ); + x2 = _mm_and_ps( cmp1, x2 ); + x = _mm_andnot_ps( cmp1, x ); + x = _mm_or_ps( x2, x ); + + y = _mm_or_ps( y0, y1 ); + + __m128 zz = _mm_mul_ps( x, x ); + __m128 acc = _ps_atancof_p0; + acc = _mm_mul_ps( acc, zz ); + acc = _mm_sub_ps( acc, _ps_atancof_p1 ); + acc = _mm_mul_ps( acc, zz ); + acc = _mm_add_ps( acc, _ps_atancof_p2 ); + acc = _mm_mul_ps( acc, zz ); + acc = _mm_sub_ps( acc, _ps_atancof_p3 ); + acc = _mm_mul_ps( acc, zz ); + acc = _mm_mul_ps( acc, x ); + acc = _mm_add_ps( acc, x ); + y = _mm_add_ps( y, acc ); + + // Update the sign + y = _mm_xor_ps( y, sign_bit ); + + return y; } static inline __m128 atan2_ps( __m128 y, __m128 x ) { - __m128 x_eq_0 = _mm_cmpeq_ps( x, _ps_0 ); - __m128 x_gt_0 = _mm_cmpgt_ps( x, _ps_0 ); - __m128 x_le_0 = _mm_cmple_ps( x, _ps_0 ); - __m128 y_eq_0 = _mm_cmpeq_ps( y, _ps_0 ); - __m128 x_lt_0 = _mm_cmplt_ps( x, _ps_0 ); - __m128 y_lt_0 = _mm_cmplt_ps( y, _ps_0 ); - - __m128 zero_mask = _mm_and_ps( x_eq_0, y_eq_0 ); - __m128 zero_mask_other_case = _mm_and_ps( y_eq_0, x_gt_0 ); - zero_mask = _mm_or_ps( zero_mask, zero_mask_other_case ); - - __m128 pio2_mask = _mm_andnot_ps( y_eq_0, x_eq_0 ); - __m128 pio2_mask_sign = _mm_and_ps( y_lt_0, _ps_sign_mask ); - __m128 pio2_result = _ps_cephes_PIO2F; - pio2_result = _mm_xor_ps( pio2_result, pio2_mask_sign ); - pio2_result = _mm_and_ps( pio2_mask, pio2_result ); - - __m128 pi_mask = _mm_and_ps( y_eq_0, x_le_0 ); - 
__m128 pi = _ps_cephes_PIF; - __m128 pi_result = _mm_and_ps( pi_mask, pi ); - - __m128 swap_sign_mask_offset = _mm_and_ps( x_lt_0, y_lt_0 ); - swap_sign_mask_offset = _mm_and_ps( swap_sign_mask_offset, _ps_sign_mask ); - - __m128 offset0 = _mm_setzero_ps(); - __m128 offset1 = _ps_cephes_PIF; - offset1 = _mm_xor_ps( offset1, swap_sign_mask_offset ); - - __m128 offset = _mm_andnot_ps( x_lt_0, offset0 ); - offset = _mm_and_ps( x_lt_0, offset1 ); - - __m128 arg = _mm_div_ps( y, x ); - __m128 atan_result = atan_ps( arg ); - atan_result = _mm_add_ps( atan_result, offset ); - - /* select between zero_result, pio2_result and atan_result */ - - __m128 result = _mm_andnot_ps( zero_mask, pio2_result ); - atan_result = _mm_andnot_ps( pio2_mask, atan_result ); - atan_result = _mm_andnot_ps( pio2_mask, atan_result); - result = _mm_or_ps( result, atan_result ); - result = _mm_or_ps( result, pi_result ); - - return result; + __m128 x_eq_0 = _mm_cmpeq_ps( x, _ps_0 ); + __m128 x_gt_0 = _mm_cmpgt_ps( x, _ps_0 ); + __m128 x_le_0 = _mm_cmple_ps( x, _ps_0 ); + __m128 y_eq_0 = _mm_cmpeq_ps( y, _ps_0 ); + __m128 x_lt_0 = _mm_cmplt_ps( x, _ps_0 ); + __m128 y_lt_0 = _mm_cmplt_ps( y, _ps_0 ); + + __m128 zero_mask = _mm_and_ps( x_eq_0, y_eq_0 ); + __m128 zero_mask_other_case = _mm_and_ps( y_eq_0, x_gt_0 ); + zero_mask = _mm_or_ps( zero_mask, zero_mask_other_case ); + + __m128 pio2_mask = _mm_andnot_ps( y_eq_0, x_eq_0 ); + __m128 pio2_mask_sign = _mm_and_ps( y_lt_0, _ps_sign_mask ); + __m128 pio2_result = _ps_cephes_PIO2F; + pio2_result = _mm_xor_ps( pio2_result, pio2_mask_sign ); + pio2_result = _mm_and_ps( pio2_mask, pio2_result ); + + __m128 pi_mask = _mm_and_ps( y_eq_0, x_le_0 ); + __m128 pi = _ps_cephes_PIF; + __m128 pi_result = _mm_and_ps( pi_mask, pi ); + + __m128 swap_sign_mask_offset = _mm_and_ps( x_lt_0, y_lt_0 ); + swap_sign_mask_offset = _mm_and_ps( swap_sign_mask_offset, _ps_sign_mask ); + + __m128 offset0 = _mm_setzero_ps(); + __m128 offset1 = _ps_cephes_PIF; + offset1 = 
_mm_xor_ps( offset1, swap_sign_mask_offset ); + + __m128 offset = _mm_andnot_ps( x_lt_0, offset0 ); + offset = _mm_and_ps( x_lt_0, offset1 ); + + __m128 arg = _mm_div_ps( y, x ); + __m128 atan_result = atan_ps( arg ); + atan_result = _mm_add_ps( atan_result, offset ); + + // Select between zero_result, pio2_result and atan_result + + __m128 result = _mm_andnot_ps( zero_mask, pio2_result ); + atan_result = _mm_andnot_ps( pio2_mask, atan_result ); + atan_result = _mm_andnot_ps( pio2_mask, atan_result); + result = _mm_or_ps( result, atan_result ); + result = _mm_or_ps( result, pi_result ); + + return result; } -#endif #endif //AMD_RPP_RPP_CPU_SIMD_HPP \ No newline at end of file diff --git a/src/include/hip/rpp_hip_common.hpp b/src/include/hip/rpp_hip_common.hpp index 0e6fbe06e..8394f0747 100644 --- a/src/include/hip/rpp_hip_common.hpp +++ b/src/include/hip/rpp_hip_common.hpp @@ -5,11 +5,58 @@ #include #include #include +#include #include #include #include -using half_float::half; -typedef half Rpp16f; +using halfhpp = half_float::half; +typedef halfhpp Rpp16f; + +typedef struct d_float8 +{ + float4 x; + float4 y; +} d_float8; + +typedef struct d_float24 +{ + d_float8 x; + d_float8 y; + d_float8 z; +} d_float24; + +typedef struct d_uint6 +{ + uint2 x; + uint2 y; + uint2 z; +} d_uint6; + +typedef struct d_int6 +{ + int2 x; + int2 y; + int2 z; +} d_int6; + +typedef struct d_half4 +{ + half2 x; + half2 y; +} d_half4; + +typedef struct d_half8 +{ + d_half4 x; + d_half4 y; +} d_half8; + +typedef struct d_half24 +{ + d_half8 x; + d_half8 y; + d_half8 z; +} d_half24; enum class RPPTensorDataType { @@ -44,6 +91,19 @@ struct RPPTensorFunctionMetaData } }; +#define BYTE_TO_BINARY_PATTERN "%c%c%c%c%c%c%c%c" +#define BYTE_TO_BINARY(byte) \ + (byte & 0x80 ? '1' : '0'), \ + (byte & 0x40 ? '1' : '0'), \ + (byte & 0x20 ? '1' : '0'), \ + (byte & 0x10 ? '1' : '0'), \ + (byte & 0x08 ? '1' : '0'), \ + (byte & 0x04 ? '1' : '0'), \ + (byte & 0x02 ? '1' : '0'), \ + (byte & 0x01 ? 
'1' : '0') + +/******************** HOST FUNCTIONS ********************/ + inline int getplnpkdind(RppiChnFormat &format) { return format == RPPI_CHN_PLANAR ? 1 : 3; @@ -72,4 +132,488 @@ inline RppStatus generate_gaussian_kernel_gpu(Rpp32f stdDev, Rpp32f* kernel, Rpp return RPP_SUCCESS; } + +/******************** DEVICE FUNCTIONS ********************/ + +// -------------------- Set 1 - Packing -------------------- + +// Packing to U8s + +__device__ __forceinline__ uint rpp_hip_pack(float4 src) +{ + return __builtin_amdgcn_cvt_pk_u8_f32(src.w, 3, + __builtin_amdgcn_cvt_pk_u8_f32(src.z, 2, + __builtin_amdgcn_cvt_pk_u8_f32(src.y, 1, + __builtin_amdgcn_cvt_pk_u8_f32(src.x, 0, 0)))); +} + +// Packing to I8s + +__device__ __forceinline__ uint rpp_hip_pack_i8(float4 src) +{ + char4 dst_c4; + dst_c4.w = (signed char)(src.w); + dst_c4.z = (signed char)(src.z); + dst_c4.y = (signed char)(src.y); + dst_c4.x = (signed char)(src.x); + + return *(uint *)&dst_c4; +} + +// -------------------- Set 2 - Un-Packing -------------------- + +// Un-Packing from U8s + +__device__ __forceinline__ float rpp_hip_unpack0(uint src) +{ + return (float)(src & 0xFF); +} + +__device__ __forceinline__ float rpp_hip_unpack1(uint src) +{ + return (float)((src >> 8) & 0xFF); +} + +__device__ __forceinline__ float rpp_hip_unpack2(uint src) +{ + return (float)((src >> 16) & 0xFF); +} + +__device__ __forceinline__ float rpp_hip_unpack3(uint src) +{ + return (float)((src >> 24) & 0xFF); +} + +__device__ __forceinline__ float4 rpp_hip_unpack(uint src) +{ + return make_float4(rpp_hip_unpack0(src), rpp_hip_unpack1(src), rpp_hip_unpack2(src), rpp_hip_unpack3(src)); +} + +// Un-Packing from I8s + +__device__ __forceinline__ float rpp_hip_unpack0(int src) +{ + return (float)(signed char)(src & 0xFF); +} + +__device__ __forceinline__ float rpp_hip_unpack1(int src) +{ + return (float)(signed char)((src >> 8) & 0xFF); +} + +__device__ __forceinline__ float rpp_hip_unpack2(int src) +{ + return (float)(signed 
char)((src >> 16) & 0xFF); +} + +__device__ __forceinline__ float rpp_hip_unpack3(int src) +{ + return (float)(signed char)((src >> 24) & 0xFF); +} + +__device__ __forceinline__ float4 rpp_hip_unpack_from_i8(int src) +{ + return make_float4(rpp_hip_unpack0(src), rpp_hip_unpack1(src), rpp_hip_unpack2(src), rpp_hip_unpack3(src)); +} + +// -------------------- Set 3 - Loads -------------------- + +// U8 loads without layout toggle (8 U8 pixels) + +__device__ __forceinline__ void rpp_hip_load8_and_unpack_to_float8(uchar *srcPtr, uint srcIdx, d_float8 *src_f8) +{ + uint2 src = *((uint2 *)(&srcPtr[srcIdx])); + src_f8->x = rpp_hip_unpack(src.x); + src_f8->y = rpp_hip_unpack(src.y); +} + +// F32 loads without layout toggle (8 F32 pixels) + +__device__ __forceinline__ void rpp_hip_load8_and_unpack_to_float8(float *srcPtr, uint srcIdx, d_float8 *src_f8) +{ + *src_f8 = *((d_float8 *)(&srcPtr[srcIdx])); +} + +// I8 loads without layout toggle (8 I8 pixels) + +__device__ __forceinline__ void rpp_hip_load8_and_unpack_to_float8(signed char *srcPtr, uint srcIdx, d_float8 *src_f8) +{ + int2 src = *((int2 *)(&srcPtr[srcIdx])); + src_f8->x = rpp_hip_unpack_from_i8(src.x); + src_f8->y = rpp_hip_unpack_from_i8(src.y); +} + +// F16 loads without layout toggle (8 F16 pixels) + +__device__ __forceinline__ void rpp_hip_load8_and_unpack_to_float8(half *srcPtr, uint srcIdx, d_float8 *src_f8) +{ + d_half8 src_h8; + src_h8 = *((d_half8 *)(&srcPtr[srcIdx])); + + float2 src1_f2, src2_f2; + + src1_f2 = __half22float2(src_h8.x.x); + src2_f2 = __half22float2(src_h8.x.y); + src_f8->x = make_float4(src1_f2.x, src1_f2.y, src2_f2.x, src2_f2.y); + + src1_f2 = __half22float2(src_h8.y.x); + src2_f2 = __half22float2(src_h8.y.y); + src_f8->y = make_float4(src1_f2.x, src1_f2.y, src2_f2.x, src2_f2.y); +} + +// U8 loads with layout toggle PKD3 to PLN3 (24 U8 pixels) + +__device__ __forceinline__ void rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(uchar *srcPtr, uint srcIdx, d_float24 *src_f24) +{ + d_uint6 src 
= *((d_uint6 *)(&srcPtr[srcIdx])); + + src_f24->x.x = make_float4(rpp_hip_unpack0(src.x.x), rpp_hip_unpack3(src.x.x), rpp_hip_unpack2(src.x.y), rpp_hip_unpack1(src.y.x)); + src_f24->x.y = make_float4(rpp_hip_unpack0(src.y.y), rpp_hip_unpack3(src.y.y), rpp_hip_unpack2(src.z.x), rpp_hip_unpack1(src.z.y)); + + src_f24->y.x = make_float4(rpp_hip_unpack1(src.x.x), rpp_hip_unpack0(src.x.y), rpp_hip_unpack3(src.x.y), rpp_hip_unpack2(src.y.x)); + src_f24->y.y = make_float4(rpp_hip_unpack1(src.y.y), rpp_hip_unpack0(src.z.x), rpp_hip_unpack3(src.z.x), rpp_hip_unpack2(src.z.y)); + + src_f24->z.x = make_float4(rpp_hip_unpack2(src.x.x), rpp_hip_unpack1(src.x.y), rpp_hip_unpack0(src.y.x), rpp_hip_unpack3(src.y.x)); + src_f24->z.y = make_float4(rpp_hip_unpack2(src.y.y), rpp_hip_unpack1(src.z.x), rpp_hip_unpack0(src.z.y), rpp_hip_unpack3(src.z.y)); +} + +// F32 loads with layout toggle PKD3 to PLN3 (24 F32 pixels) + +__device__ __forceinline__ void rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(float *srcPtr, uint srcIdx, d_float24 *src_f24) +{ + d_float24 *srcPtr_f24; + srcPtr_f24 = (d_float24 *)&srcPtr[srcIdx]; + + src_f24->x.x.x = srcPtr_f24->x.x.x; + src_f24->y.x.x = srcPtr_f24->x.x.y; + src_f24->z.x.x = srcPtr_f24->x.x.z; + src_f24->x.x.y = srcPtr_f24->x.x.w; + src_f24->y.x.y = srcPtr_f24->x.y.x; + src_f24->z.x.y = srcPtr_f24->x.y.y; + src_f24->x.x.z = srcPtr_f24->x.y.z; + src_f24->y.x.z = srcPtr_f24->x.y.w; + + src_f24->z.x.z = srcPtr_f24->y.x.x; + src_f24->x.x.w = srcPtr_f24->y.x.y; + src_f24->y.x.w = srcPtr_f24->y.x.z; + src_f24->z.x.w = srcPtr_f24->y.x.w; + src_f24->x.y.x = srcPtr_f24->y.y.x; + src_f24->y.y.x = srcPtr_f24->y.y.y; + src_f24->z.y.x = srcPtr_f24->y.y.z; + src_f24->x.y.y = srcPtr_f24->y.y.w; + + src_f24->y.y.y = srcPtr_f24->z.x.x; + src_f24->z.y.y = srcPtr_f24->z.x.y; + src_f24->x.y.z = srcPtr_f24->z.x.z; + src_f24->y.y.z = srcPtr_f24->z.x.w; + src_f24->z.y.z = srcPtr_f24->z.y.x; + src_f24->x.y.w = srcPtr_f24->z.y.y; + src_f24->y.y.w = srcPtr_f24->z.y.z; + 
src_f24->z.y.w = srcPtr_f24->z.y.w; +} + +// I8 loads with layout toggle PKD3 to PLN3 (24 I8 pixels) + +__device__ __forceinline__ void rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(signed char *srcPtr, uint srcIdx, d_float24 *src_f24) +{ + d_int6 src = *((d_int6 *)(&srcPtr[srcIdx])); + + src_f24->x.x = make_float4(rpp_hip_unpack0(src.x.x), rpp_hip_unpack3(src.x.x), rpp_hip_unpack2(src.x.y), rpp_hip_unpack1(src.y.x)); + src_f24->x.y = make_float4(rpp_hip_unpack0(src.y.y), rpp_hip_unpack3(src.y.y), rpp_hip_unpack2(src.z.x), rpp_hip_unpack1(src.z.y)); + + src_f24->y.x = make_float4(rpp_hip_unpack1(src.x.x), rpp_hip_unpack0(src.x.y), rpp_hip_unpack3(src.x.y), rpp_hip_unpack2(src.y.x)); + src_f24->y.y = make_float4(rpp_hip_unpack1(src.y.y), rpp_hip_unpack0(src.z.x), rpp_hip_unpack3(src.z.x), rpp_hip_unpack2(src.z.y)); + + src_f24->z.x = make_float4(rpp_hip_unpack2(src.x.x), rpp_hip_unpack1(src.x.y), rpp_hip_unpack0(src.y.x), rpp_hip_unpack3(src.y.x)); + src_f24->z.y = make_float4(rpp_hip_unpack2(src.y.y), rpp_hip_unpack1(src.z.x), rpp_hip_unpack0(src.z.y), rpp_hip_unpack3(src.z.y)); +} + +// F16 loads with layout toggle PKD3 to PLN3 (24 F16 pixels) + +__device__ __forceinline__ void rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(half *srcPtr, uint srcIdx, d_float24 *src_f24) +{ + d_half24 *src_h24; + src_h24 = (d_half24 *)&srcPtr[srcIdx]; + + src_f24->x.x.x = __half2float(__low2half(src_h24->x.x.x)); + src_f24->x.x.y = __half2float(__high2half(src_h24->x.x.y)); + src_f24->x.x.z = __half2float(__low2half(src_h24->x.y.y)); + src_f24->x.x.w = __half2float(__high2half(src_h24->y.x.x)); + src_f24->x.y.x = __half2float(__low2half(src_h24->y.y.x)); + src_f24->x.y.y = __half2float(__high2half(src_h24->y.y.y)); + src_f24->x.y.z = __half2float(__low2half(src_h24->z.x.y)); + src_f24->x.y.w = __half2float(__high2half(src_h24->z.y.x)); + + src_f24->y.x.x = __half2float(__high2half(src_h24->x.x.x)); + src_f24->y.x.y = __half2float(__low2half(src_h24->x.y.x)); + src_f24->y.x.z = 
__half2float(__high2half(src_h24->x.y.y)); + src_f24->y.x.w = __half2float(__low2half(src_h24->y.x.y)); + src_f24->y.y.x = __half2float(__high2half(src_h24->y.y.x)); + src_f24->y.y.y = __half2float(__low2half(src_h24->z.x.x)); + src_f24->y.y.z = __half2float(__high2half(src_h24->z.x.y)); + src_f24->y.y.w = __half2float(__low2half(src_h24->z.y.y)); + + src_f24->z.x.x = __half2float(__low2half(src_h24->x.x.y)); + src_f24->z.x.y = __half2float(__high2half(src_h24->x.y.x)); + src_f24->z.x.z = __half2float(__low2half(src_h24->y.x.x)); + src_f24->z.x.w = __half2float(__high2half(src_h24->y.x.y)); + src_f24->z.y.x = __half2float(__low2half(src_h24->y.y.y)); + src_f24->z.y.y = __half2float(__high2half(src_h24->z.x.x)); + src_f24->z.y.z = __half2float(__low2half(src_h24->z.y.x)); + src_f24->z.y.w = __half2float(__high2half(src_h24->z.y.y)); +} + +// U8 loads with layout toggle PLN3 to PKD3 (24 U8 pixels) + +__device__ __forceinline__ void rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(uchar *srcPtr, uint srcIdx, uint increment, d_float24 *src_f24) +{ + d_uint6 src; + + src.x = *((uint2 *)(&srcPtr[srcIdx])); + srcIdx += increment; + src.y = *((uint2 *)(&srcPtr[srcIdx])); + srcIdx += increment; + src.z = *((uint2 *)(&srcPtr[srcIdx])); + + src_f24->x.x = make_float4(rpp_hip_unpack0(src.x.x), rpp_hip_unpack0(src.y.x), rpp_hip_unpack0(src.z.x), rpp_hip_unpack1(src.x.x)); + src_f24->x.y = make_float4(rpp_hip_unpack1(src.y.x), rpp_hip_unpack1(src.z.x), rpp_hip_unpack2(src.x.x), rpp_hip_unpack2(src.y.x)); + src_f24->y.x = make_float4(rpp_hip_unpack2(src.z.x), rpp_hip_unpack3(src.x.x), rpp_hip_unpack3(src.y.x), rpp_hip_unpack3(src.z.x)); + src_f24->y.y = make_float4(rpp_hip_unpack0(src.x.y), rpp_hip_unpack0(src.y.y), rpp_hip_unpack0(src.z.y), rpp_hip_unpack1(src.x.y)); + src_f24->z.x = make_float4(rpp_hip_unpack1(src.y.y), rpp_hip_unpack1(src.z.y), rpp_hip_unpack2(src.x.y), rpp_hip_unpack2(src.y.y)); + src_f24->z.y = make_float4(rpp_hip_unpack2(src.z.y), rpp_hip_unpack3(src.x.y), 
rpp_hip_unpack3(src.y.y), rpp_hip_unpack3(src.z.y)); +} + +// F32 loads with layout toggle PLN3 to PKD3 (24 F32 pixels) + +__device__ __forceinline__ void rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(float *srcPtr, uint srcIdx, uint increment, d_float24 *src_f24) +{ + float *srcPtrR, *srcPtrG, *srcPtrB; + srcPtrR = srcPtr + srcIdx; + srcPtrG = srcPtrR + increment; + srcPtrB = srcPtrG + increment; + + d_float8 *srcPtrR_f8, *srcPtrG_f8, *srcPtrB_f8; + + srcPtrR_f8 = (d_float8 *)srcPtrR; + srcPtrG_f8 = (d_float8 *)srcPtrG; + srcPtrB_f8 = (d_float8 *)srcPtrB; + + src_f24->x.x.x = srcPtrR_f8->x.x; + src_f24->x.x.y = srcPtrG_f8->x.x; + src_f24->x.x.z = srcPtrB_f8->x.x; + + src_f24->x.x.w = srcPtrR_f8->x.y; + src_f24->x.y.x = srcPtrG_f8->x.y; + src_f24->x.y.y = srcPtrB_f8->x.y; + + src_f24->x.y.z = srcPtrR_f8->x.z; + src_f24->x.y.w = srcPtrG_f8->x.z; + src_f24->y.x.x = srcPtrB_f8->x.z; + + src_f24->y.x.y = srcPtrR_f8->x.w; + src_f24->y.x.z = srcPtrG_f8->x.w; + src_f24->y.x.w = srcPtrB_f8->x.w; + + src_f24->y.y.x = srcPtrR_f8->y.x; + src_f24->y.y.y = srcPtrG_f8->y.x; + src_f24->y.y.z = srcPtrB_f8->y.x; + + src_f24->y.y.w = srcPtrR_f8->y.y; + src_f24->z.x.x = srcPtrG_f8->y.y; + src_f24->z.x.y = srcPtrB_f8->y.y; + + src_f24->z.x.z = srcPtrR_f8->y.z; + src_f24->z.x.w = srcPtrG_f8->y.z; + src_f24->z.y.x = srcPtrB_f8->y.z; + + src_f24->z.y.y = srcPtrR_f8->y.w; + src_f24->z.y.z = srcPtrG_f8->y.w; + src_f24->z.y.w = srcPtrB_f8->y.w; +} + +// I8 loads with layout toggle PLN3 to PKD3 (24 I8 pixels) + +__device__ __forceinline__ void rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(signed char *srcPtr, uint srcIdx, uint increment, d_float24 *src_f24) +{ + d_int6 src; + + src.x = *((int2 *)(&srcPtr[srcIdx])); + srcIdx += increment; + src.y = *((int2 *)(&srcPtr[srcIdx])); + srcIdx += increment; + src.z = *((int2 *)(&srcPtr[srcIdx])); + + src_f24->x.x = make_float4(rpp_hip_unpack0(src.x.x), rpp_hip_unpack0(src.y.x), rpp_hip_unpack0(src.z.x), rpp_hip_unpack1(src.x.x)); + src_f24->x.y = 
make_float4(rpp_hip_unpack1(src.y.x), rpp_hip_unpack1(src.z.x), rpp_hip_unpack2(src.x.x), rpp_hip_unpack2(src.y.x)); + src_f24->y.x = make_float4(rpp_hip_unpack2(src.z.x), rpp_hip_unpack3(src.x.x), rpp_hip_unpack3(src.y.x), rpp_hip_unpack3(src.z.x)); + src_f24->y.y = make_float4(rpp_hip_unpack0(src.x.y), rpp_hip_unpack0(src.y.y), rpp_hip_unpack0(src.z.y), rpp_hip_unpack1(src.x.y)); + src_f24->z.x = make_float4(rpp_hip_unpack1(src.y.y), rpp_hip_unpack1(src.z.y), rpp_hip_unpack2(src.x.y), rpp_hip_unpack2(src.y.y)); + src_f24->z.y = make_float4(rpp_hip_unpack2(src.z.y), rpp_hip_unpack3(src.x.y), rpp_hip_unpack3(src.y.y), rpp_hip_unpack3(src.z.y)); +} + +// F16 loads with layout toggle PLN3 to PKD3 (24 F16 pixels) + +__device__ __forceinline__ void rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(half *srcPtr, uint srcIdx, uint increment, d_float24 *src_f24) +{ + half *srcPtrR, *srcPtrG, *srcPtrB; + srcPtrR = srcPtr + srcIdx; + srcPtrG = srcPtrR + increment; + srcPtrB = srcPtrG + increment; + + d_half8 *srcR_h8, *srcG_h8, *srcB_h8; + srcR_h8 = (d_half8 *)srcPtrR; + srcG_h8 = (d_half8 *)srcPtrG; + srcB_h8 = (d_half8 *)srcPtrB; + + src_f24->x.x.x = __half2float(__low2half(srcR_h8->x.x)); + src_f24->x.x.y = __half2float(__low2half(srcG_h8->x.x)); + src_f24->x.x.z = __half2float(__low2half(srcB_h8->x.x)); + + src_f24->x.x.w = __half2float(__high2half(srcR_h8->x.x)); + src_f24->x.y.x = __half2float(__high2half(srcG_h8->x.x)); + src_f24->x.y.y = __half2float(__high2half(srcB_h8->x.x)); + + src_f24->x.y.z = __half2float(__low2half(srcR_h8->x.y)); + src_f24->x.y.w = __half2float(__low2half(srcG_h8->x.y)); + src_f24->y.x.x = __half2float(__low2half(srcB_h8->x.y)); + + src_f24->y.x.y = __half2float(__high2half(srcR_h8->x.y)); + src_f24->y.x.z = __half2float(__high2half(srcG_h8->x.y)); + src_f24->y.x.w = __half2float(__high2half(srcB_h8->x.y)); + + src_f24->y.y.x = __half2float(__low2half(srcR_h8->y.x)); + src_f24->y.y.y = __half2float(__low2half(srcG_h8->y.x)); + src_f24->y.y.z = 
__half2float(__low2half(srcB_h8->y.x)); + + src_f24->y.y.w = __half2float(__high2half(srcR_h8->y.x)); + src_f24->z.x.x = __half2float(__high2half(srcG_h8->y.x)); + src_f24->z.x.y = __half2float(__high2half(srcB_h8->y.x)); + + src_f24->z.x.z = __half2float(__low2half(srcR_h8->y.y)); + src_f24->z.x.w = __half2float(__low2half(srcG_h8->y.y)); + src_f24->z.y.x = __half2float(__low2half(srcB_h8->y.y)); + + src_f24->z.y.y = __half2float(__high2half(srcR_h8->y.y)); + src_f24->z.y.z = __half2float(__high2half(srcG_h8->y.y)); + src_f24->z.y.w = __half2float(__high2half(srcB_h8->y.y)); +} + +// -------------------- Set 4 - Stores -------------------- + +// U8 stores without layout toggle (8 U8 pixels) + +__device__ __forceinline__ void rpp_hip_pack_float8_and_store8(uchar *dstPtr, uint dstIdx, d_float8 *dst_f8) +{ + uint2 dst; + dst.x = rpp_hip_pack(dst_f8->x); + dst.y = rpp_hip_pack(dst_f8->y); + *((uint2 *)(&dstPtr[dstIdx])) = dst; +} + +// F32 stores without layout toggle (8 F32 pixels) + +__device__ __forceinline__ void rpp_hip_pack_float8_and_store8(float *dstPtr, uint dstIdx, d_float8 *dst_f8) +{ + *((d_float8 *)(&dstPtr[dstIdx])) = *dst_f8; +} + +// I8 stores without layout toggle (8 I8 pixels) + +__device__ __forceinline__ void rpp_hip_pack_float8_and_store8(signed char *dstPtr, uint dstIdx, d_float8 *dst_f8) +{ + uint2 dst; + dst.x = rpp_hip_pack_i8(dst_f8->x); + dst.y = rpp_hip_pack_i8(dst_f8->y); + *((uint2 *)(&dstPtr[dstIdx])) = dst; +} + +// F16 stores without layout toggle (8 F16 pixels) + +__device__ __forceinline__ void rpp_hip_pack_float8_and_store8(half *dstPtr, uint dstIdx, d_float8 *dst_f8) +{ + d_half8 dst_h8; + + dst_h8.x.x = __float22half2_rn(make_float2(dst_f8->x.x, dst_f8->x.y)); + dst_h8.x.y = __float22half2_rn(make_float2(dst_f8->x.z, dst_f8->x.w)); + dst_h8.y.x = __float22half2_rn(make_float2(dst_f8->y.x, dst_f8->y.y)); + dst_h8.y.y = __float22half2_rn(make_float2(dst_f8->y.z, dst_f8->y.w)); + + *((d_half8 *)(&dstPtr[dstIdx])) = dst_h8; +} + +// 
U8 stores without layout toggle (24 U8 pixels) + +__device__ __forceinline__ void rpp_hip_pack_float24_and_store24(uchar *dstPtr, uint dstIdx, d_float24 *dst_f24) +{ + d_uint6 dst; + + dst.x.x = rpp_hip_pack(dst_f24->x.x); + dst.x.y = rpp_hip_pack(dst_f24->x.y); + dst.y.x = rpp_hip_pack(dst_f24->y.x); + dst.y.y = rpp_hip_pack(dst_f24->y.y); + dst.z.x = rpp_hip_pack(dst_f24->z.x); + dst.z.y = rpp_hip_pack(dst_f24->z.y); + + *((d_uint6 *)(&dstPtr[dstIdx])) = dst; +} + +// F32 stores without layout toggle (24 F32 pixels) + +__device__ __forceinline__ void rpp_hip_pack_float24_and_store24(float *dstPtr, uint dstIdx, d_float24 *dst_f24) +{ + *((d_float24 *)(&dstPtr[dstIdx])) = *dst_f24; +} + +// I8 stores without layout toggle (24 I8 pixels) + +__device__ __forceinline__ void rpp_hip_pack_float24_and_store24(signed char *dstPtr, uint dstIdx, d_float24 *dst_f24) +{ + d_uint6 dst; + + dst.x.x = rpp_hip_pack_i8(dst_f24->x.x); + dst.x.y = rpp_hip_pack_i8(dst_f24->x.y); + dst.y.x = rpp_hip_pack_i8(dst_f24->y.x); + dst.y.y = rpp_hip_pack_i8(dst_f24->y.y); + dst.z.x = rpp_hip_pack_i8(dst_f24->z.x); + dst.z.y = rpp_hip_pack_i8(dst_f24->z.y); + + *((d_uint6 *)(&dstPtr[dstIdx])) = dst; +} + +// F16 stores without layout toggle (24 F16 pixels) + +__device__ __forceinline__ void rpp_hip_pack_float24_and_store24(half *dstPtr, uint dstIdx, d_float24 *dst_f24) +{ + d_half24 dst_h24; + + dst_h24.x.x.x = __float22half2_rn(make_float2(dst_f24->x.x.x, dst_f24->x.x.y)); + dst_h24.x.x.y = __float22half2_rn(make_float2(dst_f24->x.x.z, dst_f24->x.x.w)); + dst_h24.x.y.x = __float22half2_rn(make_float2(dst_f24->x.y.x, dst_f24->x.y.y)); + dst_h24.x.y.y = __float22half2_rn(make_float2(dst_f24->x.y.z, dst_f24->x.y.w)); + + dst_h24.y.x.x = __float22half2_rn(make_float2(dst_f24->y.x.x, dst_f24->y.x.y)); + dst_h24.y.x.y = __float22half2_rn(make_float2(dst_f24->y.x.z, dst_f24->y.x.w)); + dst_h24.y.y.x = __float22half2_rn(make_float2(dst_f24->y.y.x, dst_f24->y.y.y)); + dst_h24.y.y.y = 
__float22half2_rn(make_float2(dst_f24->y.y.z, dst_f24->y.y.w)); + + dst_h24.z.x.x = __float22half2_rn(make_float2(dst_f24->z.x.x, dst_f24->z.x.y)); + dst_h24.z.x.y = __float22half2_rn(make_float2(dst_f24->z.x.z, dst_f24->z.x.w)); + dst_h24.z.y.x = __float22half2_rn(make_float2(dst_f24->z.y.x, dst_f24->z.y.y)); + dst_h24.z.y.y = __float22half2_rn(make_float2(dst_f24->z.y.z, dst_f24->z.y.w)); + + *((d_half24 *)(&dstPtr[dstIdx])) = dst_h24; +} + +// -------------------- Set 5 - Other -------------------- + +// float4 pixel check for 0-255 range + +__device__ __forceinline__ float4 rpp_hip_pixel_check(float4 src_f4) +{ + return make_float4(fminf(fmaxf(src_f4.x, 0), 255), + fminf(fmaxf(src_f4.y, 0), 255), + fminf(fmaxf(src_f4.z, 0), 255), + fminf(fmaxf(src_f4.w, 0), 255)); +} + #endif //RPP_HIP_COMMON_H \ No newline at end of file diff --git a/src/modules/CMakeLists.txt b/src/modules/CMakeLists.txt index 812af6f86..3446c93b7 100644 --- a/src/modules/CMakeLists.txt +++ b/src/modules/CMakeLists.txt @@ -74,7 +74,7 @@ if( "${BACKEND}" STREQUAL "HIP") # Set HIP compiler and flags set(CMAKE_CXX_COMPILER ${COMPILER_FOR_HIP}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_HIPCC_FLAGS} -fopenmp -Ofast -msse4.2 -msse4.1 -mssse3 -mavx2 -g3 -std=c++14") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_HIPCC_FLAGS} -fopenmp -Ofast -msse4.2 -msse4.1 -mssse3 -mavx2 -mfma -g3 -std=c++14") # Add HIP specific preprocessor flags add_definitions(-DHIP_COMPILE) @@ -94,7 +94,7 @@ elseif( "${BACKEND}" STREQUAL "OCL") # Set OpenCL compiler and flags set(CMAKE_CXX_COMPILER ${COMPILER_FOR_OPENCL}) # GCC and G++ donst work for creating .so file - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -Ofast -msse4.2 -msse4.1 -mssse3 -mavx2 -g3 -std=c++14 -Wno-deprecated-declarations") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -Ofast -msse4.2 -msse4.1 -mssse3 -mavx2 -mfma -g3 -std=c++14 -Wno-deprecated-declarations") # Add OpenCL specific preprocessor flags add_definitions(-DOCL_COMPILE) diff 
--git a/src/modules/cpu/host_tensor_augmentations.hpp b/src/modules/cpu/host_tensor_augmentations.hpp new file mode 100644 index 000000000..1831b9463 --- /dev/null +++ b/src/modules/cpu/host_tensor_augmentations.hpp @@ -0,0 +1,1068 @@ +/* +Copyright (c) 2019 - 2021 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#ifndef HOST_TENSOR_AUGMENTATIONS_HPP +#define HOST_TENSOR_AUGMENTATIONS_HPP + +#include "cpu/rpp_cpu_simd.hpp" +#include +#include +#include +#include + +/************ brightness ************/ + +RppStatus brightness_u8_u8_host_tensor(Rpp8u *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8u *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32f *alphaTensor, + Rpp32f *betaTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault; + RpptROIPtr roiPtrDefault; + roiPtrDefault = &roiDefault; + roiPtrDefault->xywhROI.xy.x = 0; + roiPtrDefault->xywhROI.xy.y = 0; + roiPtrDefault->xywhROI.roiWidth = srcDescPtr->w; + roiPtrDefault->xywhROI.roiHeight = srcDescPtr->h; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtr; + + if (&roiTensorPtrSrc[batchCount] == NULL) + { + roiPtr = roiPtrDefault; + } + else + { + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + + RpptROI roiImage; + RpptROIPtr roiPtrImage; + + if (roiType == RpptRoiType::LTRB) + { + roiPtrImage = &roiImage; + compute_xywh_from_ltrb_host(roiPtrInput, roiPtrImage); + } + else if (roiType == RpptRoiType::XYWH) + { + roiPtrImage = roiPtrInput; + } + + roiPtr = &roi; + compute_roi_boundary_check_host(roiPtrImage, roiPtr, roiPtrDefault); + } + + Rpp32f alpha = alphaTensor[batchCount]; + Rpp32f beta = betaTensor[batchCount]; + + Rpp8u *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roiPtr->xywhROI.roiWidth * layoutParams.bufferMultiplier; + + __m128 pMul = _mm_set1_ps(alpha); + __m128 pAdd = _mm_set1_ps(beta); + + Rpp8u *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roiPtr->xywhROI.xy.y * srcDescPtr->strides.hStride) + (roiPtr->xywhROI.xy.x * layoutParams.bufferMultiplier); + 
dstPtrChannel = dstPtrImage; + + // Brightness with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u alignedLength = bufferLength & ~47; + + Rpp8u *srcPtrRow, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtrRow = srcPtrChannel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roiPtr->xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtrTemp = srcPtrRow; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount+=48) + { + __m128 p[12]; + + rpp_simd_load(rpp_load48_u8pkd3_to_f32pln3, srcPtrTemp, p); + + p[0] = _mm_fmadd_ps(p[0], pMul, pAdd); // brightness adjustment + p[1] = _mm_fmadd_ps(p[1], pMul, pAdd); // brightness adjustment + p[2] = _mm_fmadd_ps(p[2], pMul, pAdd); // brightness adjustment + p[3] = _mm_fmadd_ps(p[3], pMul, pAdd); // brightness adjustment + + p[4] = _mm_fmadd_ps(p[4], pMul, pAdd); // brightness adjustment + p[5] = _mm_fmadd_ps(p[5], pMul, pAdd); // brightness adjustment + p[6] = _mm_fmadd_ps(p[6], pMul, pAdd); // brightness adjustment + p[7] = _mm_fmadd_ps(p[7], pMul, pAdd); // brightness adjustment + + p[8] = _mm_fmadd_ps(p[8], pMul, pAdd); // brightness adjustment + p[9] = _mm_fmadd_ps(p[9], pMul, pAdd); // brightness adjustment + p[10] = _mm_fmadd_ps(p[10], pMul, pAdd); // brightness adjustment + p[11] = _mm_fmadd_ps(p[11], pMul, pAdd); // brightness adjustment + + rpp_simd_store(rpp_store48_f32pln3_to_u8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p); + + srcPtrTemp += 48; + dstPtrTempR += 16; + dstPtrTempG += 16; + dstPtrTempB += 16; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount+=3) + { + *dstPtrTempR = (Rpp8u) RPPPIXELCHECK((((Rpp32f) 
(*srcPtrTemp)) * alpha) + beta); + dstPtrTempR++; + srcPtrTemp++; + + *dstPtrTempG = (Rpp8u) RPPPIXELCHECK((((Rpp32f) (*srcPtrTemp)) * alpha) + beta); + dstPtrTempG++; + srcPtrTemp++; + + *dstPtrTempB = (Rpp8u) RPPPIXELCHECK((((Rpp32f) (*srcPtrTemp)) * alpha) + beta); + dstPtrTempB++; + srcPtrTemp++; + } + + srcPtrRow += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Brightness with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u alignedLength = bufferLength & ~47; + + Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roiPtr->xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTemp; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount+=16) + { + __m128 p[12]; + + rpp_simd_load(rpp_load48_u8pln3_to_f32pln3, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + + p[0] = _mm_fmadd_ps(p[0], pMul, pAdd); // brightness adjustment + p[1] = _mm_fmadd_ps(p[1], pMul, pAdd); // brightness adjustment + p[2] = _mm_fmadd_ps(p[2], pMul, pAdd); // brightness adjustment + p[3] = _mm_fmadd_ps(p[3], pMul, pAdd); // brightness adjustment + + p[4] = _mm_fmadd_ps(p[4], pMul, pAdd); // brightness adjustment + p[5] = _mm_fmadd_ps(p[5], pMul, pAdd); // brightness adjustment + p[6] = _mm_fmadd_ps(p[6], pMul, pAdd); // brightness adjustment + p[7] = _mm_fmadd_ps(p[7], pMul, pAdd); // brightness adjustment + + p[8] = _mm_fmadd_ps(p[8], pMul, pAdd); // brightness adjustment 
+ p[9] = _mm_fmadd_ps(p[9], pMul, pAdd); // brightness adjustment + p[10] = _mm_fmadd_ps(p[10], pMul, pAdd); // brightness adjustment + p[11] = _mm_fmadd_ps(p[11], pMul, pAdd); // brightness adjustment + + rpp_simd_store(rpp_store48_f32pln3_to_u8pkd3, dstPtrTemp, p); + + srcPtrTempR += 16; + srcPtrTempG += 16; + srcPtrTempB += 16; + dstPtrTemp += 48; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp = (Rpp8u) RPPPIXELCHECK((((Rpp32f) (*srcPtrTempR)) * alpha) + beta); + dstPtrTemp++; + srcPtrTempR++; + + *dstPtrTemp = (Rpp8u) RPPPIXELCHECK((((Rpp32f) (*srcPtrTempG)) * alpha) + beta); + dstPtrTemp++; + srcPtrTempG++; + + *dstPtrTemp = (Rpp8u) RPPPIXELCHECK((((Rpp32f) (*srcPtrTempB)) * alpha) + beta); + dstPtrTemp++; + srcPtrTempB++; + } + + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Brightness without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) + else + { + Rpp32u alignedLength = bufferLength & ~15; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8u *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrChannel; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roiPtr->xywhROI.roiHeight; i++) + { + Rpp8u *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount+=16) + { + __m128 p[4]; + + rpp_simd_load(rpp_load16_u8_to_f32, srcPtrTemp, p); + + p[0] = _mm_fmadd_ps(p[0], pMul, pAdd); // brightness adjustment + p[1] = _mm_fmadd_ps(p[1], pMul, pAdd); // brightness adjustment + p[2] = _mm_fmadd_ps(p[2], pMul, pAdd); // brightness adjustment + p[3] = _mm_fmadd_ps(p[3], pMul, pAdd); // brightness adjustment + + rpp_simd_store(rpp_store16_f32_to_u8, dstPtrTemp, p); + + srcPtrTemp +=16; + dstPtrTemp +=16; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { 
+ *dstPtrTemp = (Rpp8u) RPPPIXELCHECK((((Rpp32f) (*srcPtrTemp)) * alpha) + beta); + + dstPtrTemp++; + srcPtrTemp++; + } + + srcPtrRow += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + + srcPtrChannel += srcDescPtr->strides.cStride; + dstPtrChannel += dstDescPtr->strides.cStride; + } + } + } + + return RPP_SUCCESS; +} + +RppStatus brightness_f32_f32_host_tensor(Rpp32f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp32f *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32f *alphaTensor, + Rpp32f *betaTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault; + RpptROIPtr roiPtrDefault; + roiPtrDefault = &roiDefault; + roiPtrDefault->xywhROI.xy.x = 0; + roiPtrDefault->xywhROI.xy.y = 0; + roiPtrDefault->xywhROI.roiWidth = srcDescPtr->w; + roiPtrDefault->xywhROI.roiHeight = srcDescPtr->h; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtr; + + if (&roiTensorPtrSrc[batchCount] == NULL) + { + roiPtr = roiPtrDefault; + } + else + { + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + + RpptROI roiImage; + RpptROIPtr roiPtrImage; + + if (roiType == RpptRoiType::LTRB) + { + roiPtrImage = &roiImage; + compute_xywh_from_ltrb_host(roiPtrInput, roiPtrImage); + } + else if (roiType == RpptRoiType::XYWH) + { + roiPtrImage = roiPtrInput; + } + + roiPtr = &roi; + compute_roi_boundary_check_host(roiPtrImage, roiPtr, roiPtrDefault); + } + + Rpp32f alpha = alphaTensor[batchCount]; + Rpp32f beta = betaTensor[batchCount] * 0.0039216; // 1/255 + + Rpp32f *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roiPtr->xywhROI.roiWidth * layoutParams.bufferMultiplier; + + __m128 pMul = _mm_set1_ps(alpha); + __m128 pAdd = _mm_set1_ps(beta); + + Rpp32f 
*srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roiPtr->xywhROI.xy.y * srcDescPtr->strides.hStride) + (roiPtr->xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + // Brightness with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u alignedLength = bufferLength & ~11; + + Rpp32f *srcPtrRow, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtrRow = srcPtrChannel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roiPtr->xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtrTemp = srcPtrRow; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount+=12) + { + __m128 p[4]; + + rpp_simd_load(rpp_load12_f32pkd3_to_f32pln3, srcPtrTemp, p); + + p[0] = _mm_fmadd_ps(p[0], pMul, pAdd); // brightness adjustment + p[1] = _mm_fmadd_ps(p[1], pMul, pAdd); // brightness adjustment + p[2] = _mm_fmadd_ps(p[2], pMul, pAdd); // brightness adjustment + + rpp_simd_store(rpp_store12_f32pln3_to_f32pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p); + + srcPtrTemp += 12; + dstPtrTempR += 4; + dstPtrTempG += 4; + dstPtrTempB += 4; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount+=3) + { + *dstPtrTempR = RPPPIXELCHECKF32(*srcPtrTemp * alpha + beta); + dstPtrTempR++; + srcPtrTemp++; + + *dstPtrTempG = RPPPIXELCHECKF32(*srcPtrTemp * alpha + beta); + dstPtrTempG++; + srcPtrTemp++; + + *dstPtrTempB = RPPPIXELCHECKF32(*srcPtrTemp * alpha + beta); + dstPtrTempB++; + srcPtrTemp++; + } + + srcPtrRow += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += 
dstDescPtr->strides.hStride; + } + } + + // Brightness with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u alignedLength = bufferLength & ~11; + + Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roiPtr->xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTemp; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount+=4) + { + __m128 p[4]; + + rpp_simd_load(rpp_load12_f32pln3_to_f32pln3, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + + p[0] = _mm_fmadd_ps(p[0], pMul, pAdd); // brightness adjustment + p[1] = _mm_fmadd_ps(p[1], pMul, pAdd); // brightness adjustment + p[2] = _mm_fmadd_ps(p[2], pMul, pAdd); // brightness adjustment + + rpp_simd_store(rpp_store12_f32pln3_to_f32pkd3, dstPtrTemp, p); + + srcPtrTempR += 4; + srcPtrTempG += 4; + srcPtrTempB += 4; + dstPtrTemp += 12; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp = RPPPIXELCHECKF32(*srcPtrTempR * alpha + beta); + dstPtrTemp++; + srcPtrTempR++; + + *dstPtrTemp = RPPPIXELCHECKF32(*srcPtrTempG * alpha + beta); + dstPtrTemp++; + srcPtrTempG++; + + *dstPtrTemp = RPPPIXELCHECKF32(*srcPtrTempB * alpha + beta); + dstPtrTemp++; + srcPtrTempB++; + } + + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Brightness without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) + else + { + Rpp32u alignedLength = bufferLength & ~3; + + for(int c = 
0; c < layoutParams.channelParam; c++) + { + Rpp32f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrChannel; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roiPtr->xywhROI.roiHeight; i++) + { + Rpp32f *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount+=4) + { + __m128 p[1]; + + rpp_simd_load(rpp_load4_f32_to_f32, srcPtrTemp, p); + + p[0] = _mm_fmadd_ps(p[0], pMul, pAdd); // brightness adjustment + + rpp_simd_store(rpp_store4_f32_to_f32, dstPtrTemp, p); + + srcPtrTemp += 4; + dstPtrTemp += 4; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp = RPPPIXELCHECKF32(*srcPtrTemp * alpha + beta); + + dstPtrTemp++; + srcPtrTemp++; + } + + srcPtrRow += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + + srcPtrChannel += srcDescPtr->strides.cStride; + dstPtrChannel += dstDescPtr->strides.cStride; + } + } + } + + return RPP_SUCCESS; +} + +RppStatus brightness_f16_f16_host_tensor(Rpp16f *srcPtr, + RpptDescPtr srcDescPtr, + Rpp16f *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32f *alphaTensor, + Rpp32f *betaTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault; + RpptROIPtr roiPtrDefault; + roiPtrDefault = &roiDefault; + roiPtrDefault->xywhROI.xy.x = 0; + roiPtrDefault->xywhROI.xy.y = 0; + roiPtrDefault->xywhROI.roiWidth = srcDescPtr->w; + roiPtrDefault->xywhROI.roiHeight = srcDescPtr->h; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtr; + + if (&roiTensorPtrSrc[batchCount] == NULL) + { + roiPtr = roiPtrDefault; + } + else + { + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + + RpptROI roiImage; + RpptROIPtr roiPtrImage; + + if (roiType == RpptRoiType::LTRB) + { + roiPtrImage = &roiImage; + 
compute_xywh_from_ltrb_host(roiPtrInput, roiPtrImage); + } + else if (roiType == RpptRoiType::XYWH) + { + roiPtrImage = roiPtrInput; + } + + roiPtr = &roi; + compute_roi_boundary_check_host(roiPtrImage, roiPtr, roiPtrDefault); + } + + Rpp32f alpha = alphaTensor[batchCount]; + Rpp32f beta = betaTensor[batchCount] * 0.0039216; // 1/255 + + Rpp16f *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roiPtr->xywhROI.roiWidth * layoutParams.bufferMultiplier; + + __m128 pMul = _mm_set1_ps(alpha); + __m128 pAdd = _mm_set1_ps(beta); + + Rpp16f *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roiPtr->xywhROI.xy.y * srcDescPtr->strides.hStride) + (roiPtr->xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + // Brightness with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u alignedLength = bufferLength & ~11; + + Rpp16f *srcPtrRow, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtrRow = srcPtrChannel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roiPtr->xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtrTemp = srcPtrRow; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount+=12) + { + Rpp32f srcPtrTemp_ps[12], dstPtrTemp_ps[12]; + + for(int cnt = 0; cnt < 12; cnt++) + { + *(srcPtrTemp_ps + cnt) = (Rpp32f) *(srcPtrTemp + cnt); + } + + __m128 p[4]; + + rpp_simd_load(rpp_load12_f32pkd3_to_f32pln3, srcPtrTemp_ps, p); + + p[0] = _mm_fmadd_ps(p[0], pMul, pAdd); // brightness adjustment + p[1] = 
_mm_fmadd_ps(p[1], pMul, pAdd); // brightness adjustment + p[2] = _mm_fmadd_ps(p[2], pMul, pAdd); // brightness adjustment + + rpp_simd_store(rpp_store12_f32pln3_to_f32pln3, dstPtrTemp_ps, dstPtrTemp_ps + 4, dstPtrTemp_ps + 8, p); + + for(int cnt = 0; cnt < 4; cnt++) + { + *(dstPtrTempR + cnt) = (Rpp16f) *(dstPtrTemp_ps + cnt); + *(dstPtrTempG + cnt) = (Rpp16f) *(dstPtrTemp_ps + 4 + cnt); + *(dstPtrTempB + cnt) = (Rpp16f) *(dstPtrTemp_ps + 8 + cnt); + } + + srcPtrTemp += 12; + dstPtrTempR += 4; + dstPtrTempG += 4; + dstPtrTempB += 4; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount+=3) + { + *dstPtrTempR = (Rpp16f) RPPPIXELCHECKF32((Rpp32f)*srcPtrTemp * alpha + beta); + dstPtrTempR++; + srcPtrTemp++; + + *dstPtrTempG = (Rpp16f) RPPPIXELCHECKF32((Rpp32f)*srcPtrTemp * alpha + beta); + dstPtrTempG++; + srcPtrTemp++; + + *dstPtrTempB = (Rpp16f) RPPPIXELCHECKF32((Rpp32f)*srcPtrTemp * alpha + beta); + dstPtrTempB++; + srcPtrTemp++; + } + + srcPtrRow += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Brightness with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u alignedLength = bufferLength & ~11; + + Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roiPtr->xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTemp; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount+=4) + { + Rpp32f srcPtrTemp_ps[12], dstPtrTemp_ps[13]; + + 
for(int cnt = 0; cnt < 4; cnt++) + { + *(srcPtrTemp_ps + cnt) = (Rpp32f) *(srcPtrTempR + cnt); + *(srcPtrTemp_ps + 4 + cnt) = (Rpp32f) *(srcPtrTempG + cnt); + *(srcPtrTemp_ps + 8 + cnt) = (Rpp32f) *(srcPtrTempB + cnt); + } + + __m128 p[4]; + + rpp_simd_load(rpp_load12_f32pln3_to_f32pln3, srcPtrTemp_ps, srcPtrTemp_ps + 4, srcPtrTemp_ps + 8, p); + + p[0] = _mm_fmadd_ps(p[0], pMul, pAdd); // brightness adjustment + p[1] = _mm_fmadd_ps(p[1], pMul, pAdd); // brightness adjustment + p[2] = _mm_fmadd_ps(p[2], pMul, pAdd); // brightness adjustment + + rpp_simd_store(rpp_store12_f32pln3_to_f32pkd3, dstPtrTemp_ps, p); + + for(int cnt = 0; cnt < 12; cnt++) + { + *(dstPtrTemp + cnt) = (Rpp16f) *(dstPtrTemp_ps + cnt); + } + + srcPtrTempR += 4; + srcPtrTempG += 4; + srcPtrTempB += 4; + dstPtrTemp += 12; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp = (Rpp16f) RPPPIXELCHECKF32((Rpp32f)*srcPtrTempR * alpha + beta); + dstPtrTemp++; + srcPtrTempR++; + + *dstPtrTemp = (Rpp16f) RPPPIXELCHECKF32((Rpp32f)*srcPtrTempG * alpha + beta); + dstPtrTemp++; + srcPtrTempG++; + + *dstPtrTemp = (Rpp16f) RPPPIXELCHECKF32((Rpp32f)*srcPtrTempB * alpha + beta); + dstPtrTemp++; + srcPtrTempB++; + } + + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Brightness without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) + else + { + Rpp32u alignedLength = bufferLength & ~3; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp16f *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrChannel; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roiPtr->xywhROI.roiHeight; i++) + { + Rpp16f *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount+=4) + { + Rpp32f srcPtrTemp_ps[4], dstPtrTemp_ps[4]; + + for(int cnt = 
0; cnt < 4; cnt++) + { + *(srcPtrTemp_ps + cnt) = (Rpp16f) *(srcPtrTemp + cnt); + } + + __m128 p[1]; + + rpp_simd_load(rpp_load4_f32_to_f32, srcPtrTemp_ps, p); + + p[0] = _mm_fmadd_ps(p[0], pMul, pAdd); // brightness adjustment + + rpp_simd_store(rpp_store4_f32_to_f32, dstPtrTemp_ps, p); + + for(int cnt = 0; cnt < 4; cnt++) + { + *(dstPtrTemp + cnt) = (Rpp16f) *(dstPtrTemp_ps + cnt); + } + + srcPtrTemp += 4; + dstPtrTemp += 4; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp = (Rpp16f) RPPPIXELCHECKF32((Rpp32f)*srcPtrTemp * alpha + beta); + + dstPtrTemp++; + srcPtrTemp++; + } + + srcPtrRow += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + + srcPtrChannel += srcDescPtr->strides.cStride; + dstPtrChannel += dstDescPtr->strides.cStride; + } + } + } + + return RPP_SUCCESS; +} + +RppStatus brightness_i8_i8_host_tensor(Rpp8s *srcPtr, + RpptDescPtr srcDescPtr, + Rpp8s *dstPtr, + RpptDescPtr dstDescPtr, + Rpp32f *alphaTensor, + Rpp32f *betaTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + RppLayoutParams layoutParams) +{ + RpptROI roiDefault; + RpptROIPtr roiPtrDefault; + roiPtrDefault = &roiDefault; + roiPtrDefault->xywhROI.xy.x = 0; + roiPtrDefault->xywhROI.xy.y = 0; + roiPtrDefault->xywhROI.roiWidth = srcDescPtr->w; + roiPtrDefault->xywhROI.roiHeight = srcDescPtr->h; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(srcDescPtr->n) + for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++) + { + RpptROI roi; + RpptROIPtr roiPtr; + + if (&roiTensorPtrSrc[batchCount] == NULL) + { + roiPtr = roiPtrDefault; + } + else + { + RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount]; + + RpptROI roiImage; + RpptROIPtr roiPtrImage; + + if (roiType == RpptRoiType::LTRB) + { + roiPtrImage = &roiImage; + compute_xywh_from_ltrb_host(roiPtrInput, roiPtrImage); + } + else if (roiType == RpptRoiType::XYWH) + { + roiPtrImage = roiPtrInput; + } + + roiPtr = &roi; + 
compute_roi_boundary_check_host(roiPtrImage, roiPtr, roiPtrDefault); + } + + Rpp32f alpha = alphaTensor[batchCount]; + Rpp32f beta = betaTensor[batchCount]; + + Rpp8s *srcPtrImage, *dstPtrImage; + srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride; + dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride; + + Rpp32u bufferLength = roiPtr->xywhROI.roiWidth * layoutParams.bufferMultiplier; + + __m128 pMul = _mm_set1_ps(alpha); + __m128 pAdd = _mm_set1_ps(beta); + + Rpp8s *srcPtrChannel, *dstPtrChannel; + srcPtrChannel = srcPtrImage + (roiPtr->xywhROI.xy.y * srcDescPtr->strides.hStride) + (roiPtr->xywhROI.xy.x * layoutParams.bufferMultiplier); + dstPtrChannel = dstPtrImage; + + // Brightness with fused output-layout toggle (NHWC -> NCHW) + if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW)) + { + Rpp32u alignedLength = bufferLength & ~47; + + Rpp8s *srcPtrRow, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB; + srcPtrRow = srcPtrChannel; + dstPtrRowR = dstPtrChannel; + dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride; + dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride; + + for(int i = 0; i < roiPtr->xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB; + srcPtrTemp = srcPtrRow; + dstPtrTempR = dstPtrRowR; + dstPtrTempG = dstPtrRowG; + dstPtrTempB = dstPtrRowB; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount+=48) + { + __m128 p[12]; + + rpp_simd_load(rpp_load48_i8pkd3_to_f32pln3, srcPtrTemp, p); + + p[0] = _mm_fmadd_ps(p[0], pMul, pAdd); // brightness adjustment + p[1] = _mm_fmadd_ps(p[1], pMul, pAdd); // brightness adjustment + p[2] = _mm_fmadd_ps(p[2], pMul, pAdd); // brightness adjustment + p[3] = _mm_fmadd_ps(p[3], pMul, pAdd); // brightness adjustment + + p[4] = _mm_fmadd_ps(p[4], pMul, pAdd); // brightness adjustment + p[5] = _mm_fmadd_ps(p[5], pMul, pAdd); // brightness adjustment + p[6] = 
_mm_fmadd_ps(p[6], pMul, pAdd); // brightness adjustment + p[7] = _mm_fmadd_ps(p[7], pMul, pAdd); // brightness adjustment + + p[8] = _mm_fmadd_ps(p[8], pMul, pAdd); // brightness adjustment + p[9] = _mm_fmadd_ps(p[9], pMul, pAdd); // brightness adjustment + p[10] = _mm_fmadd_ps(p[10], pMul, pAdd); // brightness adjustment + p[11] = _mm_fmadd_ps(p[11], pMul, pAdd); // brightness adjustment + + rpp_simd_store(rpp_store48_f32pln3_to_i8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p); + + srcPtrTemp += 48; + dstPtrTempR += 16; + dstPtrTempG += 16; + dstPtrTempB += 16; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount+=3) + { + *dstPtrTempR = (Rpp8s) RPPPIXELCHECKI8((((Rpp32f) (*srcPtrTemp) + 128) * alpha) + beta - 128); + dstPtrTempR++; + srcPtrTemp++; + + *dstPtrTempG = (Rpp8s) RPPPIXELCHECKI8((((Rpp32f) (*srcPtrTemp) + 128) * alpha) + beta - 128); + dstPtrTempG++; + srcPtrTemp++; + + *dstPtrTempB = (Rpp8s) RPPPIXELCHECKI8((((Rpp32f) (*srcPtrTemp) + 128) * alpha) + beta - 128); + dstPtrTempB++; + srcPtrTemp++; + } + + srcPtrRow += srcDescPtr->strides.hStride; + dstPtrRowR += dstDescPtr->strides.hStride; + dstPtrRowG += dstDescPtr->strides.hStride; + dstPtrRowB += dstDescPtr->strides.hStride; + } + } + + // Brightness with fused output-layout toggle (NCHW -> NHWC) + else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC)) + { + Rpp32u alignedLength = bufferLength & ~47; + + Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow; + srcPtrRowR = srcPtrChannel; + srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride; + srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride; + dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roiPtr->xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTemp; + srcPtrTempR = srcPtrRowR; + srcPtrTempG = srcPtrRowG; + srcPtrTempB = srcPtrRowB; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; 
vectorLoopCount+=16) + { + __m128 p[12]; + + rpp_simd_load(rpp_load48_i8pln3_to_f32pln3, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); + + p[0] = _mm_fmadd_ps(p[0], pMul, pAdd); // brightness adjustment + p[1] = _mm_fmadd_ps(p[1], pMul, pAdd); // brightness adjustment + p[2] = _mm_fmadd_ps(p[2], pMul, pAdd); // brightness adjustment + p[3] = _mm_fmadd_ps(p[3], pMul, pAdd); // brightness adjustment + + p[4] = _mm_fmadd_ps(p[4], pMul, pAdd); // brightness adjustment + p[5] = _mm_fmadd_ps(p[5], pMul, pAdd); // brightness adjustment + p[6] = _mm_fmadd_ps(p[6], pMul, pAdd); // brightness adjustment + p[7] = _mm_fmadd_ps(p[7], pMul, pAdd); // brightness adjustment + + p[8] = _mm_fmadd_ps(p[8], pMul, pAdd); // brightness adjustment + p[9] = _mm_fmadd_ps(p[9], pMul, pAdd); // brightness adjustment + p[10] = _mm_fmadd_ps(p[10], pMul, pAdd); // brightness adjustment + p[11] = _mm_fmadd_ps(p[11], pMul, pAdd); // brightness adjustment + + rpp_simd_store(rpp_store48_f32pln3_to_i8pkd3, dstPtrTemp, p); + + srcPtrTempR += 16; + srcPtrTempG += 16; + srcPtrTempB += 16; + dstPtrTemp += 48; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp = (Rpp8s) RPPPIXELCHECKI8((((Rpp32f) (*srcPtrTempR) + 128) * alpha) + beta - 128); + dstPtrTemp++; + srcPtrTempR++; + + *dstPtrTemp = (Rpp8s) RPPPIXELCHECKI8((((Rpp32f) (*srcPtrTempG) + 128) * alpha) + beta - 128); + dstPtrTemp++; + srcPtrTempG++; + + *dstPtrTemp = (Rpp8s) RPPPIXELCHECKI8((((Rpp32f) (*srcPtrTempB) + 128) * alpha) + beta - 128); + dstPtrTemp++; + srcPtrTempB++; + } + + srcPtrRowR += srcDescPtr->strides.hStride; + srcPtrRowG += srcDescPtr->strides.hStride; + srcPtrRowB += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + } + + // Brightness without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) + else + { + Rpp32u alignedLength = bufferLength & ~15; + + for(int c = 0; c < layoutParams.channelParam; c++) + { + Rpp8s *srcPtrRow, *dstPtrRow; + srcPtrRow = srcPtrChannel; 
+ dstPtrRow = dstPtrChannel; + + for(int i = 0; i < roiPtr->xywhROI.roiHeight; i++) + { + Rpp8s *srcPtrTemp, *dstPtrTemp; + srcPtrTemp = srcPtrRow; + dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; + for (; vectorLoopCount < alignedLength; vectorLoopCount+=16) + { + __m128 p[4]; + + rpp_simd_load(rpp_load16_i8_to_f32, srcPtrTemp, p); + + p[0] = _mm_fmadd_ps(p[0], pMul, pAdd); // brightness adjustment + p[1] = _mm_fmadd_ps(p[1], pMul, pAdd); // brightness adjustment + p[2] = _mm_fmadd_ps(p[2], pMul, pAdd); // brightness adjustment + p[3] = _mm_fmadd_ps(p[3], pMul, pAdd); // brightness adjustment + + rpp_simd_store(rpp_store16_f32_to_i8, dstPtrTemp, p); + + srcPtrTemp +=16; + dstPtrTemp +=16; + } + for (; vectorLoopCount < bufferLength; vectorLoopCount++) + { + *dstPtrTemp = (Rpp8s) RPPPIXELCHECKI8((((Rpp32f) (*srcPtrTemp) + 128) * alpha) + beta - 128); + + dstPtrTemp++; + srcPtrTemp++; + } + + srcPtrRow += srcDescPtr->strides.hStride; + dstPtrRow += dstDescPtr->strides.hStride; + } + + srcPtrChannel += srcDescPtr->strides.cStride; + dstPtrChannel += dstDescPtr->strides.cStride; + } + } + } + + return RPP_SUCCESS; +} + +#endif // HOST_TENSOR_AUGMENTATIONS_HPP diff --git a/src/modules/hip/hip_tensor_augmentations.hpp b/src/modules/hip/hip_tensor_augmentations.hpp new file mode 100644 index 000000000..d91f0281e --- /dev/null +++ b/src/modules/hip/hip_tensor_augmentations.hpp @@ -0,0 +1,30 @@ +#include "hip/hip_runtime_api.h" +#include "kernel/brightness.hpp" +#include "kernel/roi_conversion.hpp" + +/******************** brightness ********************/ + +template +RppStatus brightness_hip_tensor(T *srcPtr, + RpptDescPtr srcDescPtr, + T *dstPtr, + RpptDescPtr dstDescPtr, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rpp::Handle& handle) +{ + if (roiType == RpptRoiType::LTRB) + { + hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, + handle); + } + + hip_exec_brightness_tensor(srcPtr, + srcDescPtr, + dstPtr, + dstDescPtr, + roiTensorPtrSrc, + handle); + 
+    return RPP_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/modules/hip/kernel/brightness.hpp b/src/modules/hip/kernel/brightness.hpp
new file mode 100644
index 000000000..6426dc039
--- /dev/null
+++ b/src/modules/hip/kernel/brightness.hpp
@@ -0,0 +1,288 @@
+#include  // NOTE(review): the header name is missing in this copy of the patch (angle-bracket text appears stripped) - restore from the repository
+#include "hip/rpp_hip_common.hpp"
+ // brightness_hip_compute overloads: srcPtr is never read, its type only selects the per-bit-depth formula applied to both float4 halves (x, y) of a d_float8.
+__device__ void brightness_hip_compute(uchar *srcPtr, d_float8 *src_f8, d_float8 *dst_f8, float4 *alpha_f4, float4 *beta_f4) // uchar: dst = src * alpha + beta, no clamping here
+{
+    dst_f8->x = src_f8->x * *alpha_f4 + *beta_f4;
+    dst_f8->y = src_f8->y * *alpha_f4 + *beta_f4;
+}
+
+__device__ void brightness_hip_compute(float *srcPtr, d_float8 *src_f8, d_float8 *dst_f8, float4 *alpha_f4, float4 *beta_f4) // float: beta is scaled by 0.0039216 (about 1/255) before being added
+{
+    dst_f8->x = src_f8->x * *alpha_f4 + *beta_f4 * (float4)0.0039216;
+    dst_f8->y = src_f8->y * *alpha_f4 + *beta_f4 * (float4)0.0039216;
+}
+
+__device__ void brightness_hip_compute(signed char *srcPtr, d_float8 *src_f8, d_float8 *dst_f8, float4 *alpha_f4, float4 *beta_f4) // signed char: shift by +128, apply, rpp_hip_pixel_check, shift back by -128
+{
+    dst_f8->x = rpp_hip_pixel_check((src_f8->x + (float4)128) * *alpha_f4 + *beta_f4) - (float4)128; // NOTE(review): rpp_hip_pixel_check presumably clamps to the 8-bit range - confirm in rpp_hip_common.hpp
+    dst_f8->y = rpp_hip_pixel_check((src_f8->y + (float4)128) * *alpha_f4 + *beta_f4) - (float4)128;
+}
+
+__device__ void brightness_hip_compute(half *srcPtr, d_float8 *src_f8, d_float8 *dst_f8, float4 *alpha_f4, float4 *beta_f4) // half: same formula as the float overload
+{
+    dst_f8->x = src_f8->x * *alpha_f4 + *beta_f4 * (float4)0.0039216;
+    dst_f8->y = src_f8->y * *alpha_f4 + *beta_f4 * (float4)0.0039216;
+}
+ // brightness_pkd_tensor: same-layout kernel for interleaved 3-channel rows; each thread handles 8 consecutive values of one row of one image.
+template  // NOTE(review): the template parameter list (T is used below) is missing in this copy of the patch
+__global__ void brightness_pkd_tensor(T *srcPtr,
+                                      int nStrideSrc,
+                                      int hStrideSrc,
+                                      T *dstPtr,
+                                      int nStrideDst,
+                                      int hStrideDst,
+                                      float *alpha,
+                                      float *beta,
+                                      RpptROIPtr roiTensorPtrSrc)
+{
+    int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // first of the 8 values this thread processes
+    int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // row inside the ROI
+    int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // image index; also indexes alpha, beta and the ROI array
+
+    if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth * 3)) // row length is roiWidth * 3 interleaved values
+    {
+        return;
+    }
+
+    uint srcIdx = (id_z * nStrideSrc) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * hStrideSrc) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x * 3); // source is read at the ROI offset
+    uint dstIdx = (id_z * nStrideDst) + (id_y * hStrideDst) + id_x; // destination is written from the image origin
+
+    float4 alpha_f4 = (float4)alpha[id_z];
+    float4 beta_f4 = (float4)beta[id_z];
+
+    d_float8 src_f8, dst_f8;
+
+    rpp_hip_load8_and_unpack_to_float8(srcPtr, srcIdx, &src_f8); // NOTE(review): the guard only checks the first of the 8 values, so the last thread of a row may touch up to 7 values past the ROI - presumably covered by stride padding, confirm
+    brightness_hip_compute(srcPtr, &src_f8, &dst_f8, &alpha_f4, &beta_f4);
+    rpp_hip_pack_float8_and_store8(dstPtr, dstIdx, &dst_f8);
+}
+ // brightness_pln_tensor: same-layout kernel for planar data; processes plane 0 and, when channelsDst == 3, the two following planes cStride apart.
+template
+__global__ void brightness_pln_tensor(T *srcPtr,
+                                      int nStrideSrc,
+                                      int cStrideSrc,
+                                      int hStrideSrc,
+                                      T *dstPtr,
+                                      int nStrideDst,
+                                      int cStrideDst,
+                                      int hStrideDst,
+                                      int channelsDst,
+                                      float *alpha,
+                                      float *beta,
+                                      RpptROIPtr roiTensorPtrSrc)
+{
+    int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+    int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+    if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+    {
+        return;
+    }
+
+    uint srcIdx = (id_z * nStrideSrc) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * hStrideSrc) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+    uint dstIdx = (id_z * nStrideDst) + (id_y * hStrideDst) + id_x;
+
+    float4 alpha_f4 = (float4)(alpha[id_z]);
+    float4 beta_f4 = (float4)(beta[id_z]);
+
+    d_float8 src_f8, dst_f8;
+
+    rpp_hip_load8_and_unpack_to_float8(srcPtr, srcIdx, &src_f8);
+    brightness_hip_compute(srcPtr, &src_f8, &dst_f8, &alpha_f4, &beta_f4);
+    rpp_hip_pack_float8_and_store8(dstPtr, dstIdx, &dst_f8);
+
+    if (channelsDst == 3) // any other channel count processes the first plane only
+    {
+        srcIdx += cStrideSrc;
+        dstIdx += cStrideDst;
+
+        rpp_hip_load8_and_unpack_to_float8(srcPtr, srcIdx, &src_f8);
+        brightness_hip_compute(srcPtr, &src_f8, &dst_f8, &alpha_f4, &beta_f4);
+        rpp_hip_pack_float8_and_store8(dstPtr, dstIdx, &dst_f8);
+
+        srcIdx += cStrideSrc;
+        dstIdx += cStrideDst;
+
+        rpp_hip_load8_and_unpack_to_float8(srcPtr, srcIdx, &src_f8);
+        brightness_hip_compute(srcPtr, &src_f8, &dst_f8, &alpha_f4, &beta_f4);
+        rpp_hip_pack_float8_and_store8(dstPtr, dstIdx, &dst_f8);
+    }
+}
+ // brightness_pkd3_pln3_tensor: interleaved source to planar destination; each thread loads 8 pixels (24 values) and stores 8 values into each of three planes.
+template
+__global__ void brightness_pkd3_pln3_tensor(T *srcPtr,
+                                            int nStrideSrc,
+                                            int hStrideSrc,
+                                            T *dstPtr,
+                                            int nStrideDst,
+                                            int cStrideDst,
+                                            int hStrideDst,
+                                            float *alpha,
+                                            float *beta,
+                                            RpptROIPtr roiTensorPtrSrc)
+{
+    int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // first of 8 pixels (not values) for this thread
+    int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+    if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+    {
+        return;
+    }
+
+    uint srcIdx = (id_z * nStrideSrc) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * hStrideSrc) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3); // pixel index * 3 in the interleaved source
+    uint dstIdx = (id_z * nStrideDst) + (id_y * hStrideDst) + id_x;
+
+    float4 alpha_f4 = (float4)alpha[id_z];
+    float4 beta_f4 = (float4)beta[id_z];
+
+    d_float24 src_f24, dst_f24;
+
+    rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr, srcIdx, &src_f24); // src_f24.x/.y/.z are then handled as the three planes below
+    brightness_hip_compute(srcPtr, &src_f24.x, &dst_f24.x, &alpha_f4, &beta_f4);
+    rpp_hip_pack_float8_and_store8(dstPtr, dstIdx, &dst_f24.x);
+
+    dstIdx += cStrideDst;
+
+    brightness_hip_compute(srcPtr, &src_f24.y, &dst_f24.y, &alpha_f4, &beta_f4);
+    rpp_hip_pack_float8_and_store8(dstPtr, dstIdx, &dst_f24.y);
+
+    dstIdx += cStrideDst;
+
+    brightness_hip_compute(srcPtr, &src_f24.z, &dst_f24.z, &alpha_f4, &beta_f4);
+    rpp_hip_pack_float8_and_store8(dstPtr, dstIdx, &dst_f24.z);
+}
+ // brightness_pln3_pkd3_tensor: planar source to interleaved destination; each thread loads 8 values from each of three planes and stores 24 interleaved values.
+template
+__global__ void brightness_pln3_pkd3_tensor(T *srcPtr,
+                                            int nStrideSrc,
+                                            int cStrideSrc,
+                                            int hStrideSrc,
+                                            T *dstPtr,
+                                            int nStrideDst,
+                                            int hStrideDst,
+                                            float *alpha,
+                                            float *beta,
+                                            RpptROIPtr roiTensorPtrSrc)
+{
+    int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+    int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
+    int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;
+
+    if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
+    {
+        return;
+    }
+
+    uint srcIdx = (id_z * nStrideSrc) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * hStrideSrc) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
+    uint dstIdx = (id_z * nStrideDst) + (id_y * hStrideDst) + id_x * 3; // pixel index * 3 in the interleaved destination
+
+    float4 alpha_f4 = (float4)(alpha[id_z]);
+    float4 beta_f4 = (float4)(beta[id_z]);
+
+    d_float24 src_f24, dst_f24;
+
+    rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(srcPtr, srcIdx, cStrideSrc, &src_f24);
+    brightness_hip_compute(srcPtr, &src_f24.x, &dst_f24.x, &alpha_f4, &beta_f4);
+    brightness_hip_compute(srcPtr, &src_f24.y, &dst_f24.y, &alpha_f4, &beta_f4);
+    brightness_hip_compute(srcPtr, &src_f24.z, &dst_f24.z, &alpha_f4, &beta_f4);
+    rpp_hip_pack_float24_and_store24(dstPtr, dstIdx, &dst_f24);
+}
+ // hip_exec_brightness_tensor: picks one of the four kernels from the src/dst layouts and launches it on the handle's stream; alpha and beta come from the handle's floatArr[0] and floatArr[1].
+template
+RppStatus hip_exec_brightness_tensor(T *srcPtr,
+                                     RpptDescPtr srcDescPtr,
+                                     T *dstPtr,
+                                     RpptDescPtr dstDescPtr,
+                                     RpptROIPtr roiTensorPtrSrc,
+                                     rpp::Handle& handle)
+{
+    int localThreads_x = 16;
+    int localThreads_y = 16;
+    int localThreads_z = 1;
+    int globalThreads_x = (dstDescPtr->strides.hStride + 7) >> 3; // one thread per 8 values of a destination row, rounded up
+    int globalThreads_y = dstDescPtr->h;
+    int globalThreads_z = handle.GetBatchSize();
+
+    if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
+    {
+        hipLaunchKernelGGL(brightness_pkd_tensor,
+                           dim3(ceil((float)globalThreads_x/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
+                           dim3(localThreads_x, localThreads_y, localThreads_z),
+                           0,
+                           handle.GetStream(),
+                           srcPtr,
+                           srcDescPtr->strides.nStride,
+                           srcDescPtr->strides.hStride,
+                           dstPtr,
+                           dstDescPtr->strides.nStride,
+                           dstDescPtr->strides.hStride,
+                           handle.GetInitHandle()->mem.mgpu.floatArr[0].floatmem,
+                           handle.GetInitHandle()->mem.mgpu.floatArr[1].floatmem,
+                           roiTensorPtrSrc);
+    }
+    else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
+    {
+        hipLaunchKernelGGL(brightness_pln_tensor,
+                           dim3(ceil((float)globalThreads_x/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
+                           dim3(localThreads_x, localThreads_y, localThreads_z),
+                           0,
+                           handle.GetStream(),
+                           srcPtr,
+                           srcDescPtr->strides.nStride,
+                           srcDescPtr->strides.cStride,
+                           srcDescPtr->strides.hStride,
+                           dstPtr,
+                           dstDescPtr->strides.nStride,
+                           dstDescPtr->strides.cStride,
+                           dstDescPtr->strides.hStride,
+                           dstDescPtr->c,
+                           handle.GetInitHandle()->mem.mgpu.floatArr[0].floatmem,
+                           handle.GetInitHandle()->mem.mgpu.floatArr[1].floatmem,
+                           roiTensorPtrSrc);
+    }
+    else if ((srcDescPtr->c == 3) && (dstDescPtr->c == 3)) // NOTE(review): a mixed-layout request with any other channel count launches nothing and the function still returns RPP_SUCCESS
+    {
+        if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+        {
+            hipLaunchKernelGGL(brightness_pkd3_pln3_tensor,
+                               dim3(ceil((float)globalThreads_x/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
+                               dim3(localThreads_x, localThreads_y, localThreads_z),
+                               0,
+                               handle.GetStream(),
+                               srcPtr,
+                               srcDescPtr->strides.nStride,
+                               srcDescPtr->strides.hStride,
+                               dstPtr,
+                               dstDescPtr->strides.nStride,
+                               dstDescPtr->strides.cStride,
+                               dstDescPtr->strides.hStride,
+                               handle.GetInitHandle()->mem.mgpu.floatArr[0].floatmem,
+                               handle.GetInitHandle()->mem.mgpu.floatArr[1].floatmem,
+                               roiTensorPtrSrc);
+        }
+        else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+        {
+            globalThreads_x = (srcDescPtr->strides.hStride + 7) >> 3; // size the grid from the planar source row, since the destination row holds 3 values per pixel
+            hipLaunchKernelGGL(brightness_pln3_pkd3_tensor,
+                               dim3(ceil((float)globalThreads_x/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
+                               dim3(localThreads_x, localThreads_y, localThreads_z),
+                               0,
+                               handle.GetStream(),
+                               srcPtr,
+                               srcDescPtr->strides.nStride,
+                               srcDescPtr->strides.cStride,
+                               srcDescPtr->strides.hStride,
+                               dstPtr,
+
dstDescPtr->strides.nStride, + dstDescPtr->strides.hStride, + handle.GetInitHandle()->mem.mgpu.floatArr[0].floatmem, + handle.GetInitHandle()->mem.mgpu.floatArr[1].floatmem, + roiTensorPtrSrc); + } + } + + return RPP_SUCCESS; +} diff --git a/src/modules/hip/kernel/roi_conversion.hpp b/src/modules/hip/kernel/roi_conversion.hpp new file mode 100644 index 000000000..8f61e4cc2 --- /dev/null +++ b/src/modules/hip/kernel/roi_conversion.hpp @@ -0,0 +1,32 @@ +#include + +extern "C" __global__ void roi_converison_ltrb_to_xywh(int *roiTensorPtrSrc) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 4; + + int4 *roiTensorPtrSrc_i4; + roiTensorPtrSrc_i4 = (int4 *)&roiTensorPtrSrc[id_x]; + + roiTensorPtrSrc_i4->z -= (roiTensorPtrSrc_i4->x - 1); + roiTensorPtrSrc_i4->w -= (roiTensorPtrSrc_i4->y - 1); +} + +RppStatus hip_exec_roi_converison_ltrb_to_xywh(RpptROIPtr roiTensorPtrSrc, + rpp::Handle& handle) +{ + int localThreads_x = 256; + int localThreads_y = 1; + int localThreads_z = 1; + int globalThreads_x = handle.GetBatchSize(); + int globalThreads_y = 1; + int globalThreads_z = 1; + + hipLaunchKernelGGL(roi_converison_ltrb_to_xywh, + dim3(ceil((float)globalThreads_x/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)), + dim3(localThreads_x, localThreads_y, localThreads_z), + 0, + handle.GetStream(), + (int *) roiTensorPtrSrc); + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/kernel/rpp_hip_host_decls.hpp b/src/modules/hip/kernel/rpp_hip_host_decls.hpp index 3d8f77e64..0cd13a1d6 100644 --- a/src/modules/hip/kernel/rpp_hip_host_decls.hpp +++ b/src/modules/hip/kernel/rpp_hip_host_decls.hpp @@ -187,5 +187,9 @@ RppStatus hip_exec_thresholding_batch(Rpp8u *srcPtr, Rpp8u *dstPtr, rpp::Handle& RppStatus hip_exec_min_batch(Rpp8u *srcPtr1, Rpp8u *srcPtr2, Rpp8u *dstPtr, rpp::Handle& handle, RppiChnFormat chnFormat, Rpp32u channel, Rpp32s plnpkdind, Rpp32u max_height, Rpp32u 
max_width); RppStatus hip_exec_max_batch(Rpp8u *srcPtr1, Rpp8u *srcPtr2, Rpp8u *dstPtr, rpp::Handle& handle, RppiChnFormat chnFormat, Rpp32u channel, Rpp32s plnpkdind, Rpp32u max_height, Rpp32u max_width); +// helpers + +RppStatus hip_exec_roi_converison_ltrb_to_xywh(RpptROIPtr roiTensorPtrSrc, rpp::Handle& handle); + #endif //RPP_HIP_HOST_DECLS_H \ No newline at end of file diff --git a/src/modules/rppi_validate.hpp b/src/modules/rppi_validate.hpp index 62f1f1fb5..8ec8071a9 100644 --- a/src/modules/rppi_validate.hpp +++ b/src/modules/rppi_validate.hpp @@ -12,82 +12,109 @@ #include #endif +inline RppLayoutParams get_layout_params(RpptLayout layout, Rpp32u channels) +{ + RppLayoutParams layoutParams; + if(layout == RpptLayout::NCHW) + { + if (channels == 1) // PLN1 + { + layoutParams.channelParam = 1; + layoutParams.bufferMultiplier = 1; + } + else if (channels == 3) // PLN3 + { + layoutParams.channelParam = 3; + layoutParams.bufferMultiplier = 1; + } + } + else if(layout == RpptLayout::NHWC) + { + if (channels == 3) // PKD3 + { + layoutParams.channelParam = 1; + layoutParams.bufferMultiplier = 3; + } + } + + return layoutParams; +} + inline void copy_srcSize(RppiSize srcSize, rpp::Handle& handle) { for(int i = 0; i < handle.GetBatchSize() ; i++) { - handle.GetInitHandle()->mem.mgpu.csrcSize.height[i] = srcSize.height; - handle.GetInitHandle()->mem.mgpu.csrcSize.width[i] = srcSize.width; + handle.GetInitHandle()->mem.mgpu.csrcSize.height[i] = srcSize.height; + handle.GetInitHandle()->mem.mgpu.csrcSize.width[i] = srcSize.width; } - #ifdef OCL_COMPILE +#ifdef OCL_COMPILE { clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.srcSize.height, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.csrcSize.height, 0, NULL, NULL); clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.srcSize.width, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.csrcSize.width, 0, 
NULL, NULL); } - #elif defined(HIP_COMPILE) +#elif defined(HIP_COMPILE) { hipMemcpy(handle.GetInitHandle()->mem.mgpu.srcSize.height, handle.GetInitHandle()->mem.mgpu.csrcSize.height, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice); hipMemcpy(handle.GetInitHandle()->mem.mgpu.srcSize.width, handle.GetInitHandle()->mem.mgpu.csrcSize.width, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice); } - #endif +#endif } inline void copy_srcSize(RppiSize *srcSize, rpp::Handle& handle) { for(int i = 0; i < handle.GetBatchSize() ; i++) { - handle.GetInitHandle()->mem.mgpu.csrcSize.height[i] = srcSize[i].height; - handle.GetInitHandle()->mem.mgpu.csrcSize.width[i] = srcSize[i].width; + handle.GetInitHandle()->mem.mgpu.csrcSize.height[i] = srcSize[i].height; + handle.GetInitHandle()->mem.mgpu.csrcSize.width[i] = srcSize[i].width; } - #ifdef OCL_COMPILE +#ifdef OCL_COMPILE { clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.srcSize.height, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.csrcSize.height, 0, NULL, NULL); clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.srcSize.width, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.csrcSize.width, 0, NULL, NULL); } - #elif defined(HIP_COMPILE) +#elif defined(HIP_COMPILE) { hipMemcpy(handle.GetInitHandle()->mem.mgpu.srcSize.height, handle.GetInitHandle()->mem.mgpu.csrcSize.height, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice); hipMemcpy(handle.GetInitHandle()->mem.mgpu.srcSize.width, handle.GetInitHandle()->mem.mgpu.csrcSize.width, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice); } - #endif +#endif } inline void copy_dstSize(RppiSize *dstSize, rpp::Handle& handle) { for(int i = 0; i < handle.GetBatchSize() ; i++) { - handle.GetInitHandle()->mem.mgpu.cdstSize.height[i] = dstSize[i].height; - handle.GetInitHandle()->mem.mgpu.cdstSize.width[i] = 
dstSize[i].width; + handle.GetInitHandle()->mem.mgpu.cdstSize.height[i] = dstSize[i].height; + handle.GetInitHandle()->mem.mgpu.cdstSize.width[i] = dstSize[i].width; } - #ifdef OCL_COMPILE +#ifdef OCL_COMPILE { clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.dstSize.height, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.cdstSize.height, 0, NULL, NULL); clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.dstSize.width, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.cdstSize.width, 0, NULL, NULL); } - #elif defined(HIP_COMPILE) +#elif defined(HIP_COMPILE) { hipMemcpy(handle.GetInitHandle()->mem.mgpu.dstSize.height, handle.GetInitHandle()->mem.mgpu.cdstSize.height, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice); hipMemcpy(handle.GetInitHandle()->mem.mgpu.dstSize.width, handle.GetInitHandle()->mem.mgpu.cdstSize.width, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice); } - #endif +#endif } - inline void copy_host_srcSize(RppiSize srcSize, rpp::Handle& handle) { for(int i = 0; i < handle.GetBatchSize() ; i++) { - handle.GetInitHandle()->mem.mcpu.srcSize[i].height = srcSize.height; - handle.GetInitHandle()->mem.mcpu.srcSize[i].width = srcSize.width; - } + handle.GetInitHandle()->mem.mcpu.srcSize[i].height = srcSize.height; + handle.GetInitHandle()->mem.mcpu.srcSize[i].width = srcSize.width; + } } inline void copy_host_dstSize(RppiSize dstSize, rpp::Handle& handle) { for(int i = 0; i < handle.GetBatchSize() ; i++) { - handle.GetInitHandle()->mem.mcpu.dstSize[i].height = dstSize.height; - handle.GetInitHandle()->mem.mcpu.dstSize[i].width = dstSize.width; + handle.GetInitHandle()->mem.mcpu.dstSize[i].height = dstSize.height; + handle.GetInitHandle()->mem.mcpu.dstSize[i].width = dstSize.width; } } @@ -95,38 +122,39 @@ inline void copy_host_maxSrcSize(RppiSize maxSrcSize, rpp::Handle& handle) { for(int i = 0; i < 
handle.GetBatchSize() ; i++) { - handle.GetInitHandle()->mem.mcpu.maxSrcSize[i].height = maxSrcSize.height; - handle.GetInitHandle()->mem.mcpu.maxSrcSize[i].width = maxSrcSize.width; + handle.GetInitHandle()->mem.mcpu.maxSrcSize[i].height = maxSrcSize.height; + handle.GetInitHandle()->mem.mcpu.maxSrcSize[i].width = maxSrcSize.width; } } + inline void copy_host_maxDstSize(RppiSize maxDstSize, rpp::Handle& handle) { for(int i = 0; i < handle.GetBatchSize() ; i++) { - handle.GetInitHandle()->mem.mcpu.maxDstSize[i].height = maxDstSize.height; - handle.GetInitHandle()->mem.mcpu.maxDstSize[i].width = maxDstSize.width; + handle.GetInitHandle()->mem.mcpu.maxDstSize[i].height = maxDstSize.height; + handle.GetInitHandle()->mem.mcpu.maxDstSize[i].width = maxDstSize.width; } } + inline void copy_dstSize(RppiSize dstSize, rpp::Handle& handle) { for(int i = 0; i < handle.GetBatchSize() ; i++) { - handle.GetInitHandle()->mem.mgpu.cdstSize.height[i] = dstSize.height; - handle.GetInitHandle()->mem.mgpu.cdstSize.width[i] = dstSize.width; + handle.GetInitHandle()->mem.mgpu.cdstSize.height[i] = dstSize.height; + handle.GetInitHandle()->mem.mgpu.cdstSize.width[i] = dstSize.width; } - #ifdef OCL_COMPILE +#ifdef OCL_COMPILE { clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.dstSize.height, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.cdstSize.height, 0, NULL, NULL); clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.dstSize.width, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.cdstSize.width, 0, NULL, NULL); } - // for hip - #elif defined(HIP_COMPILE) +#elif defined(HIP_COMPILE) { hipMemcpy(handle.GetInitHandle()->mem.mgpu.dstSize.height, handle.GetInitHandle()->mem.mgpu.cdstSize.height, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice); hipMemcpy(handle.GetInitHandle()->mem.mgpu.dstSize.width, handle.GetInitHandle()->mem.mgpu.cdstSize.width, 
sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice); } - #endif +#endif } inline void copy_host_roi(RppiROI roiPoints, rpp::Handle& handle) @@ -155,7 +183,7 @@ inline void copy_roi(RppiROI roiPoints, rpp::Handle& handle) { for(int i = 0; i < handle.GetBatchSize(); i++) { - #if defined(OCL_COMPILE) || defined (HIP_COMPILE) +#if defined(OCL_COMPILE) || defined (HIP_COMPILE) { if(roiPoints.roiHeight == 0 && roiPoints.roiWidth == 0) { @@ -168,24 +196,23 @@ inline void copy_roi(RppiROI roiPoints, rpp::Handle& handle) handle.GetInitHandle()->mem.mgpu.croiPoints.roiWidth[i] = roiPoints.roiWidth + roiPoints.x; } } - #else +#else { handle.GetInitHandle()->mem.mgpu.croiPoints.roiHeight[i] = roiPoints.roiHeight; handle.GetInitHandle()->mem.mgpu.croiPoints.roiWidth[i] = roiPoints.roiWidth; } - #endif +#endif handle.GetInitHandle()->mem.mgpu.croiPoints.x[i] = roiPoints.x; handle.GetInitHandle()->mem.mgpu.croiPoints.y[i] = roiPoints.y; } - #ifdef OCL_COMPILE +#ifdef OCL_COMPILE { clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.roiPoints.roiHeight, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.croiPoints.roiHeight, 0, NULL, NULL); clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.roiPoints.roiWidth, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.croiPoints.roiWidth, 0, NULL, NULL); clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.roiPoints.x, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.croiPoints.x, 0, NULL, NULL); clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.roiPoints.y, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.croiPoints.y, 0, NULL, NULL); } - // for hip - #elif defined(HIP_COMPILE) +#elif defined(HIP_COMPILE) { hipMemcpy(handle.GetInitHandle()->mem.mgpu.roiPoints.roiHeight, 
handle.GetInitHandle()->mem.mgpu.croiPoints.roiHeight, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice); hipMemcpy(handle.GetInitHandle()->mem.mgpu.roiPoints.roiWidth, handle.GetInitHandle()->mem.mgpu.croiPoints.roiWidth, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice); @@ -193,14 +220,14 @@ inline void copy_roi(RppiROI roiPoints, rpp::Handle& handle) hipMemcpy(handle.GetInitHandle()->mem.mgpu.roiPoints.y, handle.GetInitHandle()->mem.mgpu.croiPoints.y, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice); } - #endif +#endif } inline void copy_roi(RppiROI *roiPoints, rpp::Handle& handle) { for(int i = 0; i < handle.GetBatchSize(); i++) { - #if defined(OCL_COMPILE) || defined (HIP_COMPILE) +#if defined(OCL_COMPILE) || defined (HIP_COMPILE) { if(roiPoints[i].roiHeight == 0 && roiPoints[i].roiWidth == 0) { @@ -213,31 +240,30 @@ inline void copy_roi(RppiROI *roiPoints, rpp::Handle& handle) handle.GetInitHandle()->mem.mgpu.croiPoints.roiWidth[i] = roiPoints[i].roiWidth + roiPoints[i].x; } } - #else +#else { handle.GetInitHandle()->mem.mgpu.croiPoints.roiHeight[i] = roiPoints[i].roiHeight; handle.GetInitHandle()->mem.mgpu.croiPoints.roiWidth[i] = roiPoints[i].roiWidth; } - #endif +#endif handle.GetInitHandle()->mem.mgpu.croiPoints.x[i] = roiPoints[i].x; handle.GetInitHandle()->mem.mgpu.croiPoints.y[i] = roiPoints[i].y; } - #ifdef OCL_COMPILE +#ifdef OCL_COMPILE { clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.roiPoints.roiHeight, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.croiPoints.roiHeight, 0, NULL, NULL); clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.roiPoints.roiWidth, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.croiPoints.roiWidth, 0, NULL, NULL); clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.roiPoints.x, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), 
handle.GetInitHandle()->mem.mgpu.croiPoints.x, 0, NULL, NULL); clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.roiPoints.y, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.croiPoints.y, 0, NULL, NULL); } - // for hip - #elif defined(HIP_COMPILE) +#elif defined(HIP_COMPILE) { hipMemcpy(handle.GetInitHandle()->mem.mgpu.roiPoints.roiHeight, handle.GetInitHandle()->mem.mgpu.croiPoints.roiHeight, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice); hipMemcpy(handle.GetInitHandle()->mem.mgpu.roiPoints.roiWidth, handle.GetInitHandle()->mem.mgpu.croiPoints.roiWidth, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice); hipMemcpy(handle.GetInitHandle()->mem.mgpu.roiPoints.x, handle.GetInitHandle()->mem.mgpu.croiPoints.x, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice); hipMemcpy(handle.GetInitHandle()->mem.mgpu.roiPoints.y, handle.GetInitHandle()->mem.mgpu.croiPoints.y, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice); } - #endif +#endif } inline void copy_param_float(float param, rpp::Handle& handle, Rpp32u paramIndex) @@ -246,16 +272,15 @@ inline void copy_param_float(float param, rpp::Handle& handle, Rpp32u paramIndex { handle.GetInitHandle()->mem.mcpu.floatArr[paramIndex].floatmem[i] = param; } - #ifdef OCL_COMPILE +#ifdef OCL_COMPILE { clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.floatArr[paramIndex].floatmem, CL_FALSE, 0, sizeof(Rpp32f) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mcpu.floatArr[paramIndex].floatmem, 0, NULL, NULL); } - // for hip - #elif defined(HIP_COMPILE) +#elif defined(HIP_COMPILE) { hipMemcpy(handle.GetInitHandle()->mem.mgpu.floatArr[paramIndex].floatmem, handle.GetInitHandle()->mem.mcpu.floatArr[paramIndex].floatmem, sizeof(Rpp32f) * handle.GetBatchSize(), hipMemcpyHostToDevice); } - #endif +#endif } inline void copy_param_float(float *param, rpp::Handle& handle, Rpp32u paramIndex) @@ -264,16 
+289,15 @@ inline void copy_param_float(float *param, rpp::Handle& handle, Rpp32u paramInde { handle.GetInitHandle()->mem.mcpu.floatArr[paramIndex].floatmem[i] = param[i]; } - #ifdef OCL_COMPILE +#ifdef OCL_COMPILE { clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.floatArr[paramIndex].floatmem, CL_FALSE, 0, sizeof(Rpp32f) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mcpu.floatArr[paramIndex].floatmem, 0, NULL, NULL); } - // for hip - #elif defined(HIP_COMPILE) +#elif defined(HIP_COMPILE) { hipMemcpy(handle.GetInitHandle()->mem.mgpu.floatArr[paramIndex].floatmem, handle.GetInitHandle()->mem.mcpu.floatArr[paramIndex].floatmem, sizeof(Rpp32f) * handle.GetBatchSize(), hipMemcpyHostToDevice); } - #endif +#endif } inline void copy_param_uint(uint param, rpp::Handle& handle, Rpp32u paramIndex) @@ -282,16 +306,15 @@ inline void copy_param_uint(uint param, rpp::Handle& handle, Rpp32u paramIndex) { handle.GetInitHandle()->mem.mcpu.uintArr[paramIndex].uintmem[i] = param; } - #ifdef OCL_COMPILE +#ifdef OCL_COMPILE { clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.uintArr[paramIndex].uintmem, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mcpu.uintArr[paramIndex].uintmem, 0, NULL, NULL); } - // for hip - #elif defined(HIP_COMPILE) +#elif defined(HIP_COMPILE) { hipMemcpy(handle.GetInitHandle()->mem.mgpu.uintArr[paramIndex].uintmem, handle.GetInitHandle()->mem.mcpu.uintArr[paramIndex].uintmem, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice); } - #endif +#endif } inline void copy_param_uint(uint *param, rpp::Handle& handle, Rpp32u paramIndex) @@ -300,16 +323,15 @@ inline void copy_param_uint(uint *param, rpp::Handle& handle, Rpp32u paramIndex) { handle.GetInitHandle()->mem.mcpu.uintArr[paramIndex].uintmem[i] = param[i]; } - #ifdef OCL_COMPILE +#ifdef OCL_COMPILE { clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.uintArr[paramIndex].uintmem, 
CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mcpu.uintArr[paramIndex].uintmem, 0, NULL, NULL); } - // for hip - #elif defined(HIP_COMPILE) +#elif defined(HIP_COMPILE) { hipMemcpy(handle.GetInitHandle()->mem.mgpu.uintArr[paramIndex].uintmem, handle.GetInitHandle()->mem.mcpu.uintArr[paramIndex].uintmem, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice); } - #endif +#endif } inline void copy_param_int(int param, rpp::Handle& handle, Rpp32u paramIndex) @@ -318,16 +340,15 @@ inline void copy_param_int(int param, rpp::Handle& handle, Rpp32u paramIndex) { handle.GetInitHandle()->mem.mcpu.intArr[paramIndex].intmem[i] = param; } - #ifdef OCL_COMPILE +#ifdef OCL_COMPILE { clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.intArr[paramIndex].intmem, CL_FALSE, 0, sizeof(Rpp32s) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mcpu.intArr[paramIndex].intmem, 0, NULL, NULL); } - // for hip - #elif defined(HIP_COMPILE) +#elif defined(HIP_COMPILE) { hipMemcpy(handle.GetInitHandle()->mem.mgpu.intArr[paramIndex].intmem, handle.GetInitHandle()->mem.mcpu.intArr[paramIndex].intmem, sizeof(Rpp32s) * handle.GetBatchSize(), hipMemcpyHostToDevice); } - #endif +#endif } inline void copy_param_int(int *param, rpp::Handle& handle, Rpp32u paramIndex) @@ -336,16 +357,15 @@ inline void copy_param_int(int *param, rpp::Handle& handle, Rpp32u paramIndex) { handle.GetInitHandle()->mem.mcpu.intArr[paramIndex].intmem[i] = param[i]; } - #ifdef OCL_COMPILE +#ifdef OCL_COMPILE { clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.intArr[paramIndex].intmem, CL_FALSE, 0, sizeof(Rpp32s) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mcpu.intArr[paramIndex].intmem, 0, NULL, NULL); } - // for hip - #elif defined(HIP_COMPILE) +#elif defined(HIP_COMPILE) { hipMemcpy(handle.GetInitHandle()->mem.mgpu.intArr[paramIndex].intmem, handle.GetInitHandle()->mem.mcpu.intArr[paramIndex].intmem, sizeof(Rpp32s) * 
handle.GetBatchSize(), hipMemcpyHostToDevice); } - #endif +#endif } inline void copy_param_uchar(Rpp8u param, rpp::Handle& handle, Rpp32u paramIndex) @@ -354,16 +374,15 @@ inline void copy_param_uchar(Rpp8u param, rpp::Handle& handle, Rpp32u paramIndex { handle.GetInitHandle()->mem.mcpu.ucharArr[paramIndex].ucharmem[i] = param; } - #ifdef OCL_COMPILE +#ifdef OCL_COMPILE { clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.ucharArr[paramIndex].ucharmem, CL_FALSE, 0, sizeof(Rpp8u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mcpu.ucharArr[paramIndex].ucharmem, 0, NULL, NULL); } - // for hip - #elif defined(HIP_COMPILE) +#elif defined(HIP_COMPILE) { hipMemcpy(handle.GetInitHandle()->mem.mgpu.ucharArr[paramIndex].ucharmem, handle.GetInitHandle()->mem.mcpu.ucharArr[paramIndex].ucharmem, sizeof(Rpp8u) * handle.GetBatchSize(), hipMemcpyHostToDevice); } - #endif +#endif } inline void copy_param_uchar(Rpp8u *param, rpp::Handle& handle, Rpp32u paramIndex) @@ -372,16 +391,15 @@ inline void copy_param_uchar(Rpp8u *param, rpp::Handle& handle, Rpp32u paramInde { handle.GetInitHandle()->mem.mcpu.ucharArr[paramIndex].ucharmem[i] = param[i]; } - #ifdef OCL_COMPILE +#ifdef OCL_COMPILE { clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.ucharArr[paramIndex].ucharmem, CL_FALSE, 0, sizeof(Rpp8u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mcpu.ucharArr[paramIndex].ucharmem, 0, NULL, NULL); } - // for hip - #elif defined(HIP_COMPILE) +#elif defined(HIP_COMPILE) { hipMemcpy(handle.GetInitHandle()->mem.mgpu.ucharArr[paramIndex].ucharmem, handle.GetInitHandle()->mem.mcpu.ucharArr[paramIndex].ucharmem, sizeof(Rpp8u) * handle.GetBatchSize(), hipMemcpyHostToDevice); } - #endif +#endif } inline void copy_param_char(char param, rpp::Handle& handle, Rpp32u paramIndex) @@ -390,16 +408,15 @@ inline void copy_param_char(char param, rpp::Handle& handle, Rpp32u paramIndex) { handle.GetInitHandle()->mem.mcpu.charArr[paramIndex].charmem[i] 
= param; } - #ifdef OCL_COMPILE +#ifdef OCL_COMPILE { clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.charArr[paramIndex].charmem, CL_FALSE, 0, sizeof(Rpp8s) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mcpu.charArr[paramIndex].charmem, 0, NULL, NULL); } - // for hip - #elif defined(HIP_COMPILE) +#elif defined(HIP_COMPILE) { hipMemcpy(handle.GetInitHandle()->mem.mgpu.charArr[paramIndex].charmem, handle.GetInitHandle()->mem.mcpu.charArr[paramIndex].charmem, sizeof(Rpp8s) * handle.GetBatchSize(), hipMemcpyHostToDevice); } - #endif +#endif } inline void copy_param_char(char *param, rpp::Handle& handle, Rpp32u paramIndex) @@ -408,19 +425,15 @@ inline void copy_param_char(char *param, rpp::Handle& handle, Rpp32u paramIndex) { handle.GetInitHandle()->mem.mcpu.charArr[paramIndex].charmem[i] = param[i]; } - #ifdef OCL_COMPILE +#ifdef OCL_COMPILE { clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.charArr[paramIndex].charmem, CL_FALSE, 0, sizeof(Rpp8s) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mcpu.charArr[paramIndex].charmem, 0, NULL, NULL); } - // for hip - #elif defined(HIP_COMPILE) +#elif defined(HIP_COMPILE) { hipMemcpy(handle.GetInitHandle()->mem.mgpu.charArr[paramIndex].charmem, handle.GetInitHandle()->mem.mcpu.charArr[paramIndex].charmem, sizeof(Rpp8s) * handle.GetBatchSize(), hipMemcpyHostToDevice); } - #endif -} -inline void copy_host_srcMaxSize(rpp::Handle& handle){ - +#endif } inline void copy_srcMaxSize(rpp::Handle& handle) @@ -430,20 +443,17 @@ inline void copy_srcMaxSize(rpp::Handle& handle) handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.height[i] = handle.GetInitHandle()->mem.mgpu.csrcSize.height[i]; handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.width[i] = handle.GetInitHandle()->mem.mgpu.csrcSize.width[i]; } - // memcpy(handle.GetInitHandle()->mem.mcpu.maxSrcSize.height, handle.GetInitHandle()->mem.mcpu.srcSize.height, sizeof(Rpp32u) * handle.GetBatchSize()); - // 
memcpy(handle.GetInitHandle()->mem.mcpu.maxSrcSize.width, handle.GetInitHandle()->mem.mcpu.srcSize.width, sizeof(Rpp32u) * handle.GetBatchSize()); - #ifdef OCL_COMPILE +#ifdef OCL_COMPILE { clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.maxSrcSize.height, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.height, 0, NULL, NULL); clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.maxSrcSize.width, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.width, 0, NULL, NULL); } - // for hip - #elif defined(HIP_COMPILE) +#elif defined(HIP_COMPILE) { hipMemcpy(handle.GetInitHandle()->mem.mgpu.maxSrcSize.height, handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.height, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice); hipMemcpy(handle.GetInitHandle()->mem.mgpu.maxSrcSize.width, handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.width, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice); } - #endif +#endif } inline void copy_dstMaxSize(rpp::Handle& handle) @@ -453,23 +463,19 @@ inline void copy_dstMaxSize(rpp::Handle& handle) handle.GetInitHandle()->mem.mgpu.cmaxDstSize.height[i] = handle.GetInitHandle()->mem.mgpu.cdstSize.height[i]; handle.GetInitHandle()->mem.mgpu.cmaxDstSize.width[i] = handle.GetInitHandle()->mem.mgpu.cdstSize.width[i]; } - // memcpy(handle.GetInitHandle()->mem.mcpu.maxSrcSize.height, handle.GetInitHandle()->mem.mcpu.srcSize.height, sizeof(Rpp32u) * handle.GetBatchSize()); - // memcpy(handle.GetInitHandle()->mem.mcpu.maxSrcSize.width, handle.GetInitHandle()->mem.mcpu.srcSize.width, sizeof(Rpp32u) * handle.GetBatchSize()); - #ifdef OCL_COMPILE +#ifdef OCL_COMPILE { clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.maxDstSize.height, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.cmaxDstSize.height, 0, NULL, NULL); 
clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.maxDstSize.width, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.cmaxDstSize.width, 0, NULL, NULL); } - // for hip - #elif defined(HIP_COMPILE) +#elif defined(HIP_COMPILE) { hipMemcpy(handle.GetInitHandle()->mem.mgpu.maxDstSize.height, handle.GetInitHandle()->mem.mgpu.cmaxDstSize.height, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice); hipMemcpy(handle.GetInitHandle()->mem.mgpu.maxDstSize.width, handle.GetInitHandle()->mem.mgpu.cmaxDstSize.width, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice); } - #endif +#endif } - inline void copy_srcMaxSize(RppiSize maxSrcSize, rpp::Handle& handle) { for(int i = 0; i < handle.GetBatchSize(); i++) @@ -477,18 +483,17 @@ inline void copy_srcMaxSize(RppiSize maxSrcSize, rpp::Handle& handle) handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.height[i] = maxSrcSize.height; handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.width[i] = maxSrcSize.width; } - #ifdef OCL_COMPILE +#ifdef OCL_COMPILE { clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.maxSrcSize.height, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.height, 0, NULL, NULL); clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.maxSrcSize.width, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.width, 0, NULL, NULL); } - // for hip - #elif defined(HIP_COMPILE) +#elif defined(HIP_COMPILE) { hipMemcpy(handle.GetInitHandle()->mem.mgpu.maxSrcSize.height, handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.height, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice); hipMemcpy(handle.GetInitHandle()->mem.mgpu.maxSrcSize.width, handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.width, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice); } - #endif +#endif } inline void copy_dstMaxSize(RppiSize maxDstSize, 
rpp::Handle& handle) @@ -498,28 +503,26 @@ inline void copy_dstMaxSize(RppiSize maxDstSize, rpp::Handle& handle) handle.GetInitHandle()->mem.mgpu.cmaxDstSize.height[i] = maxDstSize.height; handle.GetInitHandle()->mem.mgpu.cmaxDstSize.width[i] = maxDstSize.width; } - #ifdef OCL_COMPILE +#ifdef OCL_COMPILE { clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.maxDstSize.height, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.cmaxDstSize.height, 0, NULL, NULL); clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.maxDstSize.width, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.cmaxDstSize.width, 0, NULL, NULL); } - // for hip - #elif defined(HIP_COMPILE) +#elif defined(HIP_COMPILE) { hipMemcpy(handle.GetInitHandle()->mem.mgpu.maxDstSize.height, handle.GetInitHandle()->mem.mgpu.cmaxDstSize.height, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice); hipMemcpy(handle.GetInitHandle()->mem.mgpu.maxDstSize.width, handle.GetInitHandle()->mem.mgpu.cmaxDstSize.width, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice); } - #endif +#endif } - inline void get_srcBatchIndex(rpp::Handle& handle, unsigned int channel, RppiChnFormat chnFormat, bool is_padded = true) { int i; handle.GetInitHandle()->mem.mcpu.srcBatchIndex[0] = 0; for(i =0; i < handle.GetBatchSize() - 1 ; i++) { - handle.GetInitHandle()->mem.mcpu.srcBatchIndex[i+1] = handle.GetInitHandle()->mem.mcpu.srcBatchIndex[i] + handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.height[i] * handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.width[i] * channel; + handle.GetInitHandle()->mem.mcpu.srcBatchIndex[i+1] = handle.GetInitHandle()->mem.mcpu.srcBatchIndex[i] + handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.height[i] * handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.width[i] * channel; } for(i =0; i < handle.GetBatchSize() ; i++) { @@ -535,7 +538,6 @@ inline void get_srcBatchIndex(rpp::Handle& 
handle, unsigned int channel, RppiChn handle.GetInitHandle()->mem.mcpu.inc[i] = handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.height[i] * handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.width[i]; } } - #ifdef OCL_COMPILE { @@ -572,7 +574,6 @@ inline void get_dstBatchIndex(rpp::Handle& handle, unsigned int channel, RppiChn handle.GetInitHandle()->mem.mcpu.dstInc[i] = handle.GetInitHandle()->mem.mgpu.cmaxDstSize.height[i] * handle.GetInitHandle()->mem.mgpu.cmaxDstSize.width[i]; } } - #ifdef OCL_COMPILE { clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.dstBatchIndex, CL_FALSE, 0, sizeof(Rpp64u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mcpu.dstBatchIndex, 0, NULL, NULL); @@ -586,7 +587,6 @@ inline void get_dstBatchIndex(rpp::Handle& handle, unsigned int channel, RppiChn #endif } - template inline void copy_luptr(Rpp8u *luptr,Rpp8u * batch_luptr,Rpp32u nbatchSize, int channel) { @@ -601,7 +601,6 @@ inline void copy_luptr(Rpp8u *luptr,Rpp8u * batch_luptr,Rpp32u nbatchSize, int c } } - template inline void copy_kernel(Rpp32f *kernel,Rpp32f * batch_kernel, Rpp32u nbatchSize, unsigned int size) { @@ -616,167 +615,165 @@ inline void copy_kernel(Rpp32f *kernel,Rpp32f * batch_kernel, Rpp32u nbatchSize, } } - -inline void validate_image_size(RppiSize imgSize){ - if(!(imgSize.width >= 0) || !(imgSize.height >= 0)){ - // std::cerr<<"\nImage width and height should be positive "<= 0) || !(imgSize.height >= 0)) + { exit(0); } } -inline void validate_float_range(Rpp32f min, Rpp32f max, Rpp32f *value) { - if( !(*value <= max) || !(*value >= min)){ - //std::cerr<<"\nOut of bounds: "<<*value<= min)) + { *value = max; - //std::cerr<<"\nSetting the value to "<= min)){ - //std::cerr<<"\nOut of bounds: "<<*value<= min)) + { *value = max; - //std::cerr<<"\nSetting the value to "<= min)){ - //std::cerr<<"\nOut of bounds: "<<*value<= min)) + { *value = max; - //std::cerr<<"\nSetting the value to "<= min)){ - //std::cerr<<"\nOut of bounds: "<<*value<= min)) + 
{ *value = max; - //std::cerr<<"\nSetting the value to "<= min) ){ - //std::cerr<<"\nOut of bounds: "<<*value<= min)) + { *value = min; - //std::cerr<<"\nSetting the value to "<= min) ){ - //std::cerr<<"\nOut of bounds: "<<*value<= min)) + { *value = min; - //std::cerr<<"\nSetting the value to "<= min) ){ - //std::cerr<<"\nOut of bounds: "<<*value<= min)) + { *value = min; - //std::cerr<<"\nSetting the value to "< +#include +#include "rppi_validate.hpp" + +#ifdef HIP_COMPILE + #include "hip/hip_tensor_augmentations.hpp" +#elif defined(OCL_COMPILE) + #include + #include "cl/cl_declarations.hpp" +#endif //backend + +#include +#include +#include +#include +using namespace std::chrono; + +#include "cpu/host_tensor_augmentations.hpp" + +RppStatus +rppt_brightness_gpu(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp32f *alphaTensor, + Rpp32f *betaTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ +#ifdef OCL_COMPILE + +#elif defined (HIP_COMPILE) + + Rpp32u paramIndex = 0; + copy_param_float(alphaTensor, rpp::deref(rppHandle), paramIndex++); + copy_param_float(betaTensor, rpp::deref(rppHandle), paramIndex++); + + if (srcDescPtr->dataType == RpptDataType::U8) + { + if (dstDescPtr->dataType == RpptDataType::U8) + { + brightness_hip_tensor(static_cast(srcPtr) + srcDescPtr->offset, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offset, + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + } + else if (srcDescPtr->dataType == RpptDataType::F16) + { + if (dstDescPtr->dataType == RpptDataType::F16) + { + brightness_hip_tensor(static_cast(srcPtr) + srcDescPtr->offset, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offset, + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + } + else if (srcDescPtr->dataType == RpptDataType::F32) + { + if (dstDescPtr->dataType == RpptDataType::F32) + { + brightness_hip_tensor(static_cast(srcPtr) + 
srcDescPtr->offset, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offset, + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + } + else if (srcDescPtr->dataType == RpptDataType::I8) + { + if (dstDescPtr->dataType == RpptDataType::I8) + { + brightness_hip_tensor(static_cast(srcPtr) + srcDescPtr->offset, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offset, + dstDescPtr, + roiTensorPtrSrc, + roiType, + rpp::deref(rppHandle)); + } + } + +#endif //BACKEND + + return RPP_SUCCESS; +} + +RppStatus +rppt_brightness_host(RppPtr_t srcPtr, + RpptDescPtr srcDescPtr, + RppPtr_t dstPtr, + RpptDescPtr dstDescPtr, + Rpp32f *alphaTensor, + Rpp32f *betaTensor, + RpptROIPtr roiTensorPtrSrc, + RpptRoiType roiType, + rppHandle_t rppHandle) +{ + RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c); + + if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8)) + { + brightness_u8_u8_host_tensor(static_cast(srcPtr) + srcDescPtr->offset, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offset, + dstDescPtr, + alphaTensor, + betaTensor, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16)) + { + brightness_f16_f16_host_tensor(static_cast(srcPtr) + srcDescPtr->offset, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offset, + dstDescPtr, + alphaTensor, + betaTensor, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32)) + { + brightness_f32_f32_host_tensor(static_cast(srcPtr) + srcDescPtr->offset, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offset, + dstDescPtr, + alphaTensor, + betaTensor, + roiTensorPtrSrc, + roiType, + layoutParams); + } + else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8)) + { + 
brightness_i8_i8_host_tensor(static_cast(srcPtr) + srcDescPtr->offset, + srcDescPtr, + static_cast(dstPtr) + dstDescPtr->offset, + dstDescPtr, + alphaTensor, + betaTensor, + roiTensorPtrSrc, + roiType, + layoutParams); + } + + return RPP_SUCCESS; +} diff --git a/utilities/rpp-performancetests/HIP_NEW/CMakeLists.txt b/utilities/rpp-performancetests/HIP_NEW/CMakeLists.txt index bd5254b85..917c9be34 100644 --- a/utilities/rpp-performancetests/HIP_NEW/CMakeLists.txt +++ b/utilities/rpp-performancetests/HIP_NEW/CMakeLists.txt @@ -23,12 +23,18 @@ if (hip_FOUND) add_executable(BatchPD_hip_pkd3 BatchPD_hip_pkd3.cpp) add_executable(BatchPD_hip_pln1 BatchPD_hip_pln1.cpp) add_executable(BatchPD_hip_pln3 BatchPD_hip_pln3.cpp) + add_executable(Tensor_hip_pkd3 Tensor_hip_pkd3.cpp) + add_executable(Tensor_hip_pln1 Tensor_hip_pln1.cpp) + add_executable(Tensor_hip_pln3 Tensor_hip_pln3.cpp) # add_executable(Single_hip Single_hip.cpp) add_executable(uniqueFunctionalities_hip uniqueFunctionalities_hip.cpp) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -DHIP_COMPILE=1 -DRPP_BACKEND_HIP=1 -std=c++11") target_link_libraries(BatchPD_hip_pkd3 ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host) target_link_libraries(BatchPD_hip_pln1 ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host) target_link_libraries(BatchPD_hip_pln3 ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host) + target_link_libraries(Tensor_hip_pkd3 ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host) + target_link_libraries(Tensor_hip_pln1 ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host) + target_link_libraries(Tensor_hip_pln3 ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host) # target_link_libraries(Single_hip ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system 
hip::host) target_link_libraries(uniqueFunctionalities_hip ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host) endif() \ No newline at end of file diff --git a/utilities/rpp-performancetests/HIP_NEW/Tensor_hip_pkd3.cpp b/utilities/rpp-performancetests/HIP_NEW/Tensor_hip_pkd3.cpp new file mode 100644 index 000000000..5742e925c --- /dev/null +++ b/utilities/rpp-performancetests/HIP_NEW/Tensor_hip_pkd3.cpp @@ -0,0 +1,635 @@ +#include +#include +#include +#include +#include +#include +#include +#include "/opt/rocm/rpp/include/rpp.h" +#include +#include +#include +#include +#include +#include +#include + +using namespace cv; +using namespace std; + +#define RPPPIXELCHECK(pixel) (pixel < (Rpp32f)0) ? ((Rpp32f)0) : ((pixel < (Rpp32f)255) ? pixel : ((Rpp32f)255)) +#define RPPMAX2(a,b) ((a > b) ? a : b) +#define RPPMIN2(a,b) ((a < b) ? a : b) + +int main(int argc, char **argv) +{ + // Handle inputs + + const int MIN_ARG_COUNT = 7; + + if (argc < MIN_ARG_COUNT) + { + printf("\nImproper Usage! 
Needs all arguments!\n"); + printf("\nUsage: ./Tensor_hip_pkd3 f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> pkd = 0 / pkd->pln = 1)> \n"); + return -1; + } + + if (atoi(argv[6]) == 1) + { + printf("\nInputs for this test case are:"); + printf("\nsrc1 = %s", argv[1]); + printf("\nsrc2 = %s", argv[2]); + printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[3]); + printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[4]); + printf("\ncase number (1:7) = %s", argv[5]); + } + + char *src = argv[1]; + char *src_second = argv[2]; + int ip_bitDepth = atoi(argv[3]); + unsigned int outputFormatToggle = atoi(argv[4]); + int test_case = atoi(argv[5]); + + int ip_channel = 3; + + // Set case names + + char funcType[1000] = {"Tensor_HIP_PKD3"}; + + char funcName[1000]; + switch (test_case) + { + case 0: + strcpy(funcName, "brightness"); + // outputFormatToggle = 0; + break; + } + + // Initialize tensor descriptors + + RpptDesc srcDesc, dstDesc; + RpptDescPtr srcDescPtr, dstDescPtr; + srcDescPtr = &srcDesc; + dstDescPtr = &dstDesc; + + // Set src/dst layouts in tensor descriptors + + if (outputFormatToggle == 0) + { + strcat(funcType, "_toPKD3"); + srcDescPtr->layout = RpptLayout::NHWC; + dstDescPtr->layout = RpptLayout::NHWC; + } + else if (outputFormatToggle == 1) + { + strcat(funcType, "_toPLN3"); + srcDescPtr->layout = RpptLayout::NHWC; + dstDescPtr->layout = RpptLayout::NCHW; + } + + // Set src/dst data types in tensor descriptors + + if (ip_bitDepth == 0) + { + strcat(funcName, "_u8_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::U8; + } + else if (ip_bitDepth == 1) + { + strcat(funcName, "_f16_"); + srcDescPtr->dataType = RpptDataType::F16; + dstDescPtr->dataType = RpptDataType::F16; + } + else if (ip_bitDepth == 2) + { + strcat(funcName, "_f32_"); + srcDescPtr->dataType = RpptDataType::F32; + dstDescPtr->dataType = RpptDataType::F32; + } + else if (ip_bitDepth == 3) + { + 
strcat(funcName, "_u8_f16_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::F16; + } + else if (ip_bitDepth == 4) + { + strcat(funcName, "_u8_f32_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::F32; + } + else if (ip_bitDepth == 5) + { + strcat(funcName, "_i8_"); + srcDescPtr->dataType = RpptDataType::I8; + dstDescPtr->dataType = RpptDataType::I8; + } + else if (ip_bitDepth == 6) + { + strcat(funcName, "_u8_i8_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::I8; + } + + // Other initializations + + int missingFuncFlag = 0; + int i = 0, j = 0; + int maxHeight = 0, maxWidth = 0; + int maxDstHeight = 0, maxDstWidth = 0; + unsigned long long count = 0; + unsigned long long ioBufferSize = 0; + unsigned long long oBufferSize = 0; + static int noOfImages = 0; + Mat image, image_second; + + // String ops on function name + + char func[1000]; + strcpy(func, funcName); + strcat(func, funcType); + + char src1[1000]; + strcpy(src1, src); + strcat(src1, "/"); + + char src1_second[1000]; + strcpy(src1_second, src_second); + strcat(src1_second, "/"); + + // Get number of images + + struct dirent *de; + DIR *dr = opendir(src); + while ((de = readdir(dr)) != NULL) + { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + noOfImages += 1; + } + closedir(dr); + + // Initialize ROI tensors for src/dst + + RpptROI *roiTensorPtrSrc = (RpptROI *) calloc(noOfImages, sizeof(RpptROI)); + RpptROI *roiTensorPtrDst = (RpptROI *) calloc(noOfImages, sizeof(RpptROI)); + + RpptROI *d_roiTensorPtrSrc, *d_roiTensorPtrDst; + hipMalloc(&d_roiTensorPtrSrc, noOfImages * sizeof(RpptROI)); + hipMalloc(&d_roiTensorPtrDst, noOfImages * sizeof(RpptROI)); + + // Set ROI tensors types for src/dst + + RpptRoiType roiTypeSrc, roiTypeDst; + roiTypeSrc = RpptRoiType::XYWH; + roiTypeDst = RpptRoiType::XYWH; + + // Set maxHeight, maxWidth and ROIs for src/dst + + const int 
images = noOfImages; + char imageNames[images][1000]; + + DIR *dr1 = opendir(src); + while ((de = readdir(dr1)) != NULL) + { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + strcpy(imageNames[count], de->d_name); + char temp[1000]; + strcpy(temp, src1); + strcat(temp, imageNames[count]); + + image = imread(temp, 1); + + roiTensorPtrSrc[count].xywhROI.xy.x = 0; + roiTensorPtrSrc[count].xywhROI.xy.y = 0; + roiTensorPtrSrc[count].xywhROI.roiWidth = image.cols; + roiTensorPtrSrc[count].xywhROI.roiHeight = image.rows; + + roiTensorPtrDst[count].xywhROI.xy.x = 0; + roiTensorPtrDst[count].xywhROI.xy.y = 0; + roiTensorPtrDst[count].xywhROI.roiWidth = image.cols; + roiTensorPtrDst[count].xywhROI.roiHeight = image.rows; + + maxHeight = RPPMAX2(maxHeight, roiTensorPtrSrc[count].xywhROI.roiHeight); + maxWidth = RPPMAX2(maxWidth, roiTensorPtrSrc[count].xywhROI.roiWidth); + maxDstHeight = RPPMAX2(maxDstHeight, roiTensorPtrDst[count].xywhROI.roiHeight); + maxDstWidth = RPPMAX2(maxDstWidth, roiTensorPtrDst[count].xywhROI.roiWidth); + + count++; + } + closedir(dr1); + + // Set numDims, offset, n/c/h/w values for src/dst + + srcDescPtr->numDims = 4; + dstDescPtr->numDims = 4; + + srcDescPtr->offset = 0; + dstDescPtr->offset = 0; + + srcDescPtr->n = noOfImages; + srcDescPtr->h = maxHeight; + srcDescPtr->w = maxWidth; + srcDescPtr->c = ip_channel; + + dstDescPtr->n = noOfImages; + dstDescPtr->h = maxDstHeight; + dstDescPtr->w = maxDstWidth; + dstDescPtr->c = ip_channel; + + // Optionally set w stride as a multiple of 8 for src/dst + + srcDescPtr->w = ((srcDescPtr->w / 8) * 8) + 8; + dstDescPtr->w = ((dstDescPtr->w / 8) * 8) + 8; + + // Set n/c/h/w strides for src/dst + + srcDescPtr->strides.nStride = ip_channel * srcDescPtr->w * srcDescPtr->h; + srcDescPtr->strides.hStride = ip_channel * srcDescPtr->w; + srcDescPtr->strides.wStride = ip_channel; + srcDescPtr->strides.cStride = 1; + + if (dstDescPtr->layout == RpptLayout::NHWC) + { + 
dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.hStride = ip_channel * dstDescPtr->w; + dstDescPtr->strides.wStride = ip_channel; + dstDescPtr->strides.cStride = 1; + } + else if (dstDescPtr->layout == RpptLayout::NCHW) + { + dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.cStride = dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.hStride = dstDescPtr->w; + dstDescPtr->strides.wStride = 1; + } + + // Set buffer sizes for src/dst + + ioBufferSize = (unsigned long long)srcDescPtr->h * (unsigned long long)srcDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages; + oBufferSize = (unsigned long long)dstDescPtr->h * (unsigned long long)dstDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages; + + // Initialize 8u host buffers for src/dst + + Rpp8u *input = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + Rpp8u *input_second = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + Rpp8u *output = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u)); + + // Set 8u host buffers for src/dst + + DIR *dr2 = opendir(src); + DIR *dr2_second = opendir(src_second); + count = 0; + i = 0; + + while ((de = readdir(dr2)) != NULL) + { + Rpp8u *input_temp, *input_second_temp; + input_temp = input + (i * srcDescPtr->strides.nStride); + input_second_temp = input_second + (i * srcDescPtr->strides.nStride); + + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + + char temp[1000]; + strcpy(temp, src1); + strcat(temp, de->d_name); + + char temp_second[1000]; + strcpy(temp_second, src1_second); + strcat(temp_second, de->d_name); + + image = imread(temp, 1); + image_second = imread(temp_second, 1); + + Rpp8u *ip_image = image.data; + Rpp8u *ip_image_second = image_second.data; + + Rpp32u elementsInRow = roiTensorPtrSrc[i].xywhROI.roiWidth * ip_channel; + + for (j = 0; j < roiTensorPtrSrc[i].xywhROI.roiHeight; j++) + { + memcpy(input_temp, 
ip_image, elementsInRow * sizeof (Rpp8u)); + memcpy(input_second_temp, ip_image_second, elementsInRow * sizeof (Rpp8u)); + ip_image += elementsInRow; + ip_image_second += elementsInRow; + input_temp += srcDescPtr->strides.hStride; + input_second_temp += srcDescPtr->strides.hStride; + } + i++; + count += srcDescPtr->strides.nStride; + } + closedir(dr2); + + // Convert inputs to test various other bit depths and copy to hip buffers + + half *inputf16, *inputf16_second, *outputf16; + Rpp32f *inputf32, *inputf32_second, *outputf32; + Rpp8s *inputi8, *inputi8_second, *outputi8; + int *d_input, *d_input_second, *d_inputf16, *d_inputf16_second, *d_inputf32, *d_inputf32_second, *d_inputi8, *d_inputi8_second; + int *d_output, *d_outputf16, *d_outputf32, *d_outputi8; + + if (ip_bitDepth == 0) + { + hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_output, oBufferSize * sizeof(Rpp8u)); + hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_output, output, oBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 1) + { + inputf16 = (half *)calloc(ioBufferSize, sizeof(half)); + inputf16_second = (half *)calloc(ioBufferSize, sizeof(half)); + outputf16 = (half *)calloc(oBufferSize, sizeof(half)); + + Rpp8u *inputTemp, *input_secondTemp; + half *inputf16Temp, *inputf16_secondTemp; + + inputTemp = input; + input_secondTemp = input_second; + + inputf16Temp = inputf16; + inputf16_secondTemp = inputf16_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputf16Temp = (half)(((float)*inputTemp) / 255.0); + *inputf16_secondTemp = (half)(((float)*input_secondTemp) / 255.0); + inputTemp++; + inputf16Temp++; + input_secondTemp++; + inputf16_secondTemp++; + } + + hipMalloc(&d_inputf16, ioBufferSize * sizeof(half)); + hipMalloc(&d_inputf16_second, 
ioBufferSize * sizeof(half)); + hipMalloc(&d_outputf16, oBufferSize * sizeof(half)); + hipMemcpy(d_inputf16, inputf16, ioBufferSize * sizeof(half), hipMemcpyHostToDevice); + hipMemcpy(d_inputf16_second, inputf16_second, ioBufferSize * sizeof(half), hipMemcpyHostToDevice); + hipMemcpy(d_outputf16, outputf16, oBufferSize * sizeof(half), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 2) + { + inputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f)); + inputf32_second = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f)); + outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f)); + + Rpp8u *inputTemp, *input_secondTemp; + Rpp32f *inputf32Temp, *inputf32_secondTemp; + + inputTemp = input; + input_secondTemp = input_second; + + inputf32Temp = inputf32; + inputf32_secondTemp = inputf32_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputf32Temp = ((Rpp32f)*inputTemp) / 255.0; + *inputf32_secondTemp = ((Rpp32f)*input_secondTemp) / 255.0; + inputTemp++; + inputf32Temp++; + input_secondTemp++; + inputf32_secondTemp++; + } + + hipMalloc(&d_inputf32, ioBufferSize * sizeof(Rpp32f)); + hipMalloc(&d_inputf32_second, ioBufferSize * sizeof(Rpp32f)); + hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f)); + hipMemcpy(d_inputf32, inputf32, ioBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice); + hipMemcpy(d_inputf32_second, inputf32_second, ioBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice); + hipMemcpy(d_outputf32, outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 3) + { + outputf16 = (half *)calloc(oBufferSize, sizeof(half)); + hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_outputf16, oBufferSize * sizeof(half)); + hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_outputf16, outputf16, oBufferSize * 
sizeof(half), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 4) + { + outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f)); + hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f)); + hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_outputf32, outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 5) + { + inputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s)); + inputi8_second = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s)); + outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s)); + + Rpp8u *inputTemp, *input_secondTemp; + Rpp8s *inputi8Temp, *inputi8_secondTemp; + + inputTemp = input; + input_secondTemp = input_second; + + inputi8Temp = inputi8; + inputi8_secondTemp = inputi8_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputi8Temp = (Rpp8s) (((Rpp32s) *inputTemp) - 128); + *inputi8_secondTemp = (Rpp8s) (((Rpp32s) *input_secondTemp) - 128); + inputTemp++; + inputi8Temp++; + input_secondTemp++; + inputi8_secondTemp++; + } + + hipMalloc(&d_inputi8, ioBufferSize * sizeof(Rpp8s)); + hipMalloc(&d_inputi8_second, ioBufferSize * sizeof(Rpp8s)); + hipMalloc(&d_outputi8, oBufferSize * sizeof(Rpp8s)); + hipMemcpy(d_inputi8, inputi8, ioBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice); + hipMemcpy(d_inputi8_second, inputi8_second, ioBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice); + hipMemcpy(d_outputi8, outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 6) + { + outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s)); + hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_outputi8, oBufferSize * sizeof(Rpp8s)); + hipMemcpy(d_input, input, 
ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_outputi8, outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice); + } + + // Run case-wise RPP API and measure time + + rppHandle_t handle; + hipStream_t stream; + hipStreamCreate(&stream); + rppCreateWithStreamAndBatchSize(&handle, stream, noOfImages); + + clock_t start, end; + double max_time_used = 0, min_time_used = 500, avg_time_used = 0; + + string test_case_name; + + printf("\nRunning %s 100 times (each time with a batch size of %d images) and computing mean statistics...", func, noOfImages); + + for (int perfRunCount = 0; perfRunCount < 100; perfRunCount++) + { + double gpu_time_used; + switch (test_case) + { + case 0: + { + test_case_name = "brightness"; + + Rpp32f alpha[images]; + Rpp32f beta[images]; + for (i = 0; i < images; i++) + { + alpha[i] = 1.75; + beta[i] = 50; + + // xywhROI override sample + // roiTensorPtrSrc[i].xywhROI.xy.x = 0; + // roiTensorPtrSrc[i].xywhROI.xy.y = 0; + // roiTensorPtrSrc[i].xywhROI.roiWidth = 100; + // roiTensorPtrSrc[i].xywhROI.roiHeight = 180; + + // ltrbROI override sample + // roiTensorPtrSrc[i].ltrbROI.lt.x = 50; + // roiTensorPtrSrc[i].ltrbROI.lt.y = 30; + // roiTensorPtrSrc[i].ltrbROI.rb.x = 210; + // roiTensorPtrSrc[i].ltrbROI.rb.y = 210; + } + + // Change RpptRoiType for ltrbROI override sample + // roiTypeSrc = RpptRoiType::LTRB; + // roiTypeDst = RpptRoiType::LTRB; + + + hipMemcpy(d_roiTensorPtrSrc, roiTensorPtrSrc, images * sizeof(RpptROI), hipMemcpyHostToDevice); + + start = clock(); + + if (ip_bitDepth == 0) + rppt_brightness_gpu(d_input, srcDescPtr, d_output, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 1) + rppt_brightness_gpu(d_inputf16, srcDescPtr, d_outputf16, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 2) + rppt_brightness_gpu(d_inputf32, 
srcDescPtr, d_outputf32, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 3) + missingFuncFlag = 1; + else if (ip_bitDepth == 4) + missingFuncFlag = 1; + else if (ip_bitDepth == 5) + rppt_brightness_gpu(d_inputi8, srcDescPtr, d_outputi8, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 6) + missingFuncFlag = 1; + else + missingFuncFlag = 1; + + end = clock(); + + break; + } + default: + missingFuncFlag = 1; + break; + } + + if (missingFuncFlag == 1) + { + printf("\nThe functionality %s doesn't yet exist in RPP\n", func); + return -1; + } + + // Display measured times + + gpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC; + if (gpu_time_used > max_time_used) + max_time_used = gpu_time_used; + if (gpu_time_used < min_time_used) + min_time_used = gpu_time_used; + avg_time_used += gpu_time_used; + } + + avg_time_used /= 100; + cout << fixed << "\nmax,min,avg = " << max_time_used << "," << min_time_used << "," << avg_time_used << endl; + + rppDestroyGPU(handle); + + // Free memory + + free(roiTensorPtrSrc); + free(roiTensorPtrDst); + hipFree(d_roiTensorPtrSrc); + hipFree(d_roiTensorPtrDst); + free(input); + free(input_second); + free(output); + + if (ip_bitDepth == 0) + { + hipFree(d_input); + hipFree(d_input_second); + hipFree(d_output); + } + else if (ip_bitDepth == 1) + { + free(inputf16); + free(inputf16_second); + free(outputf16); + hipFree(d_inputf16); + hipFree(d_inputf16_second); + hipFree(d_outputf16); + } + else if (ip_bitDepth == 2) + { + free(inputf32); + free(inputf32_second); + free(outputf32); + hipFree(d_inputf32); + hipFree(d_inputf32_second); + hipFree(d_outputf32); + } + else if (ip_bitDepth == 3) + { + free(outputf16); + hipFree(d_input); + hipFree(d_input_second); + hipFree(d_outputf16); + } + else if (ip_bitDepth == 4) + { + free(outputf32); + hipFree(d_input); + hipFree(d_input_second); + hipFree(d_outputf32); + } + else if (ip_bitDepth == 5) + { + 
free(inputi8);
+        free(inputi8_second);
+        free(outputi8);
+        hipFree(d_inputi8);
+        hipFree(d_inputi8_second);
+        hipFree(d_outputi8);
+    }
+    else if (ip_bitDepth == 6)
+    {
+        free(outputi8);
+        hipFree(d_input);
+        hipFree(d_input_second);
+        hipFree(d_outputi8);
+    }
+
+    return 0;
+}
diff --git a/utilities/rpp-performancetests/HIP_NEW/Tensor_hip_pln1.cpp b/utilities/rpp-performancetests/HIP_NEW/Tensor_hip_pln1.cpp
new file mode 100644
index 000000000..8b402ce2c
--- /dev/null
+++ b/utilities/rpp-performancetests/HIP_NEW/Tensor_hip_pln1.cpp
@@ -0,0 +1,580 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "/opt/rocm/rpp/include/rpp.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "helpers/testSuite_helper.hpp"
+
+using namespace cv;
+using namespace std;
+
+#define RPPPIXELCHECK(pixel) (pixel < (Rpp32f)0) ? ((Rpp32f)0) : ((pixel < (Rpp32f)255) ? pixel : ((Rpp32f)255))
+#define RPPMAX2(a,b) ((a > b) ? a : b)
+#define RPPMIN2(a,b) ((a < b) ? a : b)
+
+// Performance test driver for the RPP tensor API on the HIP backend, PLN1 (single-channel NCHW) inputs.
+// Arguments, as read below: argv[1] = src1 folder, argv[2] = src2 folder, argv[3] = bit depth (0-6),
+// argv[4] = outputFormatToggle (must be 0 for PLN1), argv[5] = case number, argv[6] = verbosity (0/1).
+// Runs the selected case 100 times and prints the max,min,avg of the measured times.
+// Returns 0 on success, -1 on bad usage, an unreadable/empty src1 folder or an unsupported case / bit depth.
+int main(int argc, char **argv)
+{
+    // Handle inputs
+
+    const int MIN_ARG_COUNT = 7;
+
+    if (argc < MIN_ARG_COUNT)
+    {
+        printf("\nImproper Usage! Needs all arguments!\n");
+        printf("\nUsage: ./Tensor_hip_pln1 <src1 folder> <src2 folder> <u8 = 0 / f16 = 1 / f32 = 2 / u8->f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> <outputFormatToggle (pkd->pkd = 0 / pkd->pln = 1)> <case number = 0> <verbosity = 0/1>\n");
+        return -1;
+    }
+
+    // outputFormatToggle is argv[4]; argv[5] is the case number and must not be tested here
+    if (atoi(argv[4]) != 0)
+    {
+        printf("\nPLN1 cases don't have outputFormatToggle! Please input outputFormatToggle = 0\n");
+        return -1;
+    }
+
+    if (atoi(argv[6]) == 1)
+    {
+        printf("\nInputs for this test case are:");
+        printf("\nsrc1 = %s", argv[1]);
+        printf("\nsrc2 = %s", argv[2]);
+        printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[3]);
+        printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[4]);
+        printf("\ncase number (1:7) = %s", argv[5]);
+    }
+
+    char *src = argv[1];
+    char *src_second = argv[2];
+    int ip_bitDepth = atoi(argv[3]);
+    unsigned int outputFormatToggle = atoi(argv[4]);
+    int test_case = atoi(argv[5]);
+
+    int ip_channel = 1;
+
+    // Set case names
+
+    char funcType[1000] = {"Tensor_HOST_PLN1_toPLN1"};
+
+    char funcName[1000] = "";
+    switch (test_case)
+    {
+        case 0:
+            strcpy(funcName, "brightness");
+            outputFormatToggle = 0;
+            break;
+        default:
+            // There is no name to build for an unknown case; stop before funcName is consumed
+            printf("\nThe functionality for case number %d doesn't yet exist in RPP\n", test_case);
+            return -1;
+    }
+
+    // Initialize tensor descriptors
+
+    RpptDesc srcDesc, dstDesc;
+    RpptDescPtr srcDescPtr, dstDescPtr;
+    srcDescPtr = &srcDesc;
+    dstDescPtr = &dstDesc;
+
+    // Set src/dst layouts in tensor descriptors
+
+    srcDescPtr->layout = RpptLayout::NCHW;
+    dstDescPtr->layout = RpptLayout::NCHW;
+
+    // Set src/dst data types in tensor descriptors
+
+    if (ip_bitDepth == 0)
+    {
+        strcat(funcName, "_u8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::U8;
+    }
+    else if (ip_bitDepth == 1)
+    {
+        strcat(funcName, "_f16_");
+        srcDescPtr->dataType = RpptDataType::F16;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 2)
+    {
+        strcat(funcName, "_f32_");
+        srcDescPtr->dataType = RpptDataType::F32;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 3)
+    {
+        strcat(funcName, "_u8_f16_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 4)
+    {
+        strcat(funcName, "_u8_f32_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 5)
+    {
+        strcat(funcName, "_i8_");
+        srcDescPtr->dataType = RpptDataType::I8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+    else if (ip_bitDepth == 6)
+    {
+        strcat(funcName, "_u8_i8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+
+    // Other initializations
+
+    int missingFuncFlag = 0;
+    int i = 0, j = 0;
+    int maxHeight = 0, maxWidth = 0;
+    int maxDstHeight = 0, maxDstWidth = 0;
+    unsigned long long count = 0;
+    unsigned long long ioBufferSize = 0;
+    unsigned long long oBufferSize = 0;
+    static int noOfImages = 0;
+    Mat image, image_second;
+
+    // String ops on function name (snprintf bounds every copy that involves a command line path)
+
+    char func[1000];
+    strcpy(func, funcName);
+    strcat(func, funcType);
+
+    char src1[1000];
+    snprintf(src1, sizeof(src1), "%s/", src);
+
+    char src1_second[1000];
+    snprintf(src1_second, sizeof(src1_second), "%s/", src_second);
+
+    // Get number of images
+
+    struct dirent *de;
+    DIR *dr = opendir(src);
+    if (dr == NULL)
+    {
+        printf("\nUnable to open the src1 folder %s\n", src);
+        return -1;
+    }
+    while ((de = readdir(dr)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        noOfImages += 1;
+    }
+    if (noOfImages == 0)
+    {
+        printf("\nNo images found in the src1 folder %s\n", src);
+        closedir(dr);
+        return -1;
+    }
+
+    // Initialize ROI tensors for src/dst
+
+    RpptROI *roiTensorPtrSrc = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+    RpptROI *roiTensorPtrDst = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+
+    RpptROI *d_roiTensorPtrSrc, *d_roiTensorPtrDst;
+    hipMalloc(&d_roiTensorPtrSrc, noOfImages * sizeof(RpptROI));
+    hipMalloc(&d_roiTensorPtrDst, noOfImages * sizeof(RpptROI));
+
+    // Set ROI tensors types for src/dst
+
+    RpptRoiType roiTypeSrc, roiTypeDst;
+    roiTypeSrc = RpptRoiType::XYWH;
+    roiTypeDst = RpptRoiType::XYWH;
+
+    // Set maxHeight, maxWidth and ROIs for src/dst
+
+    const int images = noOfImages;
+    char imageNames[images][1000];
+
+    // Second pass over the same handle; stop at 'images' entries so a folder that grew cannot overrun the arrays
+    rewinddir(dr);
+    while ((count < (unsigned long long)images) && ((de = readdir(dr)) != NULL))
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        snprintf(imageNames[count], sizeof(imageNames[count]), "%s", de->d_name);
+        char temp[1000];
+        snprintf(temp, sizeof(temp), "%s%s", src1, imageNames[count]);
+
+        image = imread(temp, 0);
+
+        roiTensorPtrSrc[count].xywhROI.xy.x = 0;
+        roiTensorPtrSrc[count].xywhROI.xy.y = 0;
+        roiTensorPtrSrc[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrSrc[count].xywhROI.roiHeight = image.rows;
+
+        roiTensorPtrDst[count].xywhROI.xy.x = 0;
+        roiTensorPtrDst[count].xywhROI.xy.y = 0;
+        roiTensorPtrDst[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrDst[count].xywhROI.roiHeight = image.rows;
+
+        maxHeight = RPPMAX2(maxHeight, roiTensorPtrSrc[count].xywhROI.roiHeight);
+        maxWidth = RPPMAX2(maxWidth, roiTensorPtrSrc[count].xywhROI.roiWidth);
+        maxDstHeight = RPPMAX2(maxDstHeight, roiTensorPtrDst[count].xywhROI.roiHeight);
+        maxDstWidth = RPPMAX2(maxDstWidth, roiTensorPtrDst[count].xywhROI.roiWidth);
+
+        count++;
+    }
+    closedir(dr);
+    const int loadedImages = (int)count;
+
+    // Set numDims, offset, n/c/h/w values, n/c/h/w strides for src/dst
+
+    srcDescPtr->numDims = 4;
+    dstDescPtr->numDims = 4;
+
+    srcDescPtr->offset = 0;
+    dstDescPtr->offset = 0;
+
+    srcDescPtr->n = noOfImages;
+    srcDescPtr->c = ip_channel;
+    srcDescPtr->h = maxHeight;
+    srcDescPtr->w = maxWidth;
+
+    dstDescPtr->n = noOfImages;
+    dstDescPtr->c = ip_channel;
+    dstDescPtr->h = maxDstHeight;
+    dstDescPtr->w = maxDstWidth;
+
+    // Optionally set w stride as a multiple of 8 for src/dst
+
+    srcDescPtr->w = ((srcDescPtr->w / 8) * 8) + 8;
+    dstDescPtr->w = ((dstDescPtr->w / 8) * 8) + 8;
+
+    // Set n/c/h/w strides for src/dst
+
+    srcDescPtr->strides.nStride = ip_channel * srcDescPtr->w * srcDescPtr->h;
+    srcDescPtr->strides.cStride = srcDescPtr->w * srcDescPtr->h;
+    srcDescPtr->strides.hStride = srcDescPtr->w;
+    srcDescPtr->strides.wStride = 1;
+
+    if (dstDescPtr->layout == RpptLayout::NHWC)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = ip_channel * dstDescPtr->w;
+        dstDescPtr->strides.wStride = ip_channel;
+        dstDescPtr->strides.cStride = 1;
+    }
+    else if (dstDescPtr->layout == RpptLayout::NCHW)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.cStride = dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = dstDescPtr->w;
+        dstDescPtr->strides.wStride = 1;
+    }
+
+    // Set buffer sizes for src/dst
+
+    ioBufferSize = (unsigned long long)srcDescPtr->h * (unsigned long long)srcDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+    oBufferSize = (unsigned long long)dstDescPtr->h * (unsigned long long)dstDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+
+    // Initialize host buffers for src/dst
+
+    Rpp8u *input = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *input_second = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *output = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u));
+
+    // Set 8u host buffers for src/dst
+    // The names recorded above are reused so that image i always pairs with ROI i, whatever order readdir returns
+
+    Rpp32u elementsInRowMax = srcDescPtr->w * ip_channel;
+
+    for (i = 0; i < loadedImages; i++)
+    {
+        Rpp8u *input_temp = input + ((unsigned long long)i * srcDescPtr->strides.nStride);
+        Rpp8u *input_second_temp = input_second + ((unsigned long long)i * srcDescPtr->strides.nStride);
+
+        char temp[1000];
+        snprintf(temp, sizeof(temp), "%s%s", src1, imageNames[i]);
+
+        char temp_second[1000];
+        snprintf(temp_second, sizeof(temp_second), "%s%s", src1_second, imageNames[i]);
+
+        image = imread(temp, 0);
+        image_second = imread(temp_second, 0);
+
+        const int roiWidth = roiTensorPtrSrc[i].xywhROI.roiWidth;
+        const int roiHeight = roiTensorPtrSrc[i].xywhROI.roiHeight;
+        Rpp32u elementsInRow = roiWidth * ip_channel;
+
+        // Copy from an image only when its size matches the recorded ROI; a missing or differently sized file leaves zeros
+        Rpp8u *ip_image = ((image.cols == roiWidth) && (image.rows == roiHeight)) ? image.data : NULL;
+        Rpp8u *ip_image_second = ((image_second.cols == roiWidth) && (image_second.rows == roiHeight)) ? image_second.data : NULL;
+
+        for (j = 0; j < roiHeight; j++)
+        {
+            if (ip_image != NULL)
+            {
+                memcpy(input_temp, ip_image, elementsInRow * sizeof (Rpp8u));
+                ip_image += elementsInRow;
+            }
+            if (ip_image_second != NULL)
+            {
+                memcpy(input_second_temp, ip_image_second, elementsInRow * sizeof (Rpp8u));
+                ip_image_second += elementsInRow;
+            }
+            input_temp += elementsInRowMax;
+            input_second_temp += elementsInRowMax;
+        }
+    }
+
+    // Convert inputs to test various other bit depths and copy to hip buffers
+    // Every pointer starts as NULL so that the cleanup at the end can release all of them unconditionally
+
+    half *inputf16 = NULL, *inputf16_second = NULL, *outputf16 = NULL;
+    Rpp32f *inputf32 = NULL, *inputf32_second = NULL, *outputf32 = NULL;
+    Rpp8s *inputi8 = NULL, *inputi8_second = NULL, *outputi8 = NULL;
+    int *d_input = NULL, *d_input_second = NULL, *d_inputf16 = NULL, *d_inputf16_second = NULL;
+    int *d_inputf32 = NULL, *d_inputf32_second = NULL, *d_inputi8 = NULL, *d_inputi8_second = NULL;
+    int *d_output = NULL, *d_outputf16 = NULL, *d_outputf32 = NULL, *d_outputi8 = NULL;
+
+    if (ip_bitDepth == 0)
+    {
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_output, oBufferSize * sizeof(Rpp8u));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_output, output, oBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 1)
+    {
+        inputf16 = (half *)calloc(ioBufferSize, sizeof(half));
+        inputf16_second = (half *)calloc(ioBufferSize, sizeof(half));
+        outputf16 = (half *)calloc(oBufferSize, sizeof(half));
+
+        for (unsigned long long idx = 0; idx < ioBufferSize; idx++)
+        {
+            inputf16[idx] = (half)(((float)input[idx]) / 255.0);
+            inputf16_second[idx] = (half)(((float)input_second[idx]) / 255.0);
+        }
+
+        hipMalloc(&d_inputf16, ioBufferSize * sizeof(half));
+        hipMalloc(&d_inputf16_second, ioBufferSize * sizeof(half));
+        hipMalloc(&d_outputf16, oBufferSize * sizeof(half));
+        hipMemcpy(d_inputf16, inputf16, ioBufferSize * sizeof(half), hipMemcpyHostToDevice);
+        hipMemcpy(d_inputf16_second, inputf16_second, ioBufferSize * sizeof(half), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf16, outputf16, oBufferSize * sizeof(half), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 2)
+    {
+        inputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+        inputf32_second = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+        outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f));
+
+        for (unsigned long long idx = 0; idx < ioBufferSize; idx++)
+        {
+            inputf32[idx] = ((Rpp32f)input[idx]) / 255.0;
+            inputf32_second[idx] = ((Rpp32f)input_second[idx]) / 255.0;
+        }
+
+        hipMalloc(&d_inputf32, ioBufferSize * sizeof(Rpp32f));
+        hipMalloc(&d_inputf32_second, ioBufferSize * sizeof(Rpp32f));
+        hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f));
+        hipMemcpy(d_inputf32, inputf32, ioBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+        hipMemcpy(d_inputf32_second, inputf32_second, ioBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf32, outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 3)
+    {
+        outputf16 = (half *)calloc(oBufferSize, sizeof(half));
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_outputf16, oBufferSize * sizeof(half));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf16, outputf16, oBufferSize * sizeof(half), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 4)
+    {
+        outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f));
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf32, outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 5)
+    {
+        inputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+        inputi8_second = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+        outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s));
+
+        for (unsigned long long idx = 0; idx < ioBufferSize; idx++)
+        {
+            inputi8[idx] = (Rpp8s) (((Rpp32s) input[idx]) - 128);
+            inputi8_second[idx] = (Rpp8s) (((Rpp32s) input_second[idx]) - 128);
+        }
+
+        hipMalloc(&d_inputi8, ioBufferSize * sizeof(Rpp8s));
+        hipMalloc(&d_inputi8_second, ioBufferSize * sizeof(Rpp8s));
+        hipMalloc(&d_outputi8, oBufferSize * sizeof(Rpp8s));
+        hipMemcpy(d_inputi8, inputi8, ioBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+        hipMemcpy(d_inputi8_second, inputi8_second, ioBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputi8, outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 6)
+    {
+        outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s));
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_outputi8, oBufferSize * sizeof(Rpp8s));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputi8, outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+    }
+
+    // Run case-wise RPP API and measure time
+
+    rppHandle_t handle;
+    hipStream_t stream;
+    hipStreamCreate(&stream);
+    rppCreateWithStreamAndBatchSize(&handle, stream, noOfImages);
+
+    clock_t start, end;
+    double max_time_used = 0, min_time_used = 500, avg_time_used = 0;
+
+    string test_case_name;
+
+    printf("\nRunning %s 100 times (each time with a batch size of %d images) and computing mean statistics...", func, noOfImages);
+
+    for (int perfRunCount = 0; perfRunCount < 100; perfRunCount++)
+    {
+        double gpu_time_used;
+        switch (test_case)
+        {
+        case 0:
+        {
+            test_case_name = "brightness";
+
+            Rpp32f alpha[images];
+            Rpp32f beta[images];
+            for (i = 0; i < images; i++)
+            {
+                alpha[i] = 1.75;
+                beta[i] = 50;
+
+                // xywhROI override sample
+                // roiTensorPtrSrc[i].xywhROI.xy.x = 0;
+                // roiTensorPtrSrc[i].xywhROI.xy.y = 0;
+                // roiTensorPtrSrc[i].xywhROI.roiWidth = 100;
+                // roiTensorPtrSrc[i].xywhROI.roiHeight = 180;
+
+                // ltrbROI override sample
+                // roiTensorPtrSrc[i].ltrbROI.lt.x = 50;
+                // roiTensorPtrSrc[i].ltrbROI.lt.y = 30;
+                // roiTensorPtrSrc[i].ltrbROI.rb.x = 210;
+                // roiTensorPtrSrc[i].ltrbROI.rb.y = 210;
+            }
+
+            // Change RpptRoiType for ltrbROI override sample
+            // roiTypeSrc = RpptRoiType::LTRB;
+            // roiTypeDst = RpptRoiType::LTRB;
+
+            hipMemcpy(d_roiTensorPtrSrc, roiTensorPtrSrc, images * sizeof(RpptROI), hipMemcpyHostToDevice);
+
+            // NOTE(review): clock() only brackets the host-side call. If rppt_brightness_gpu returns before the device
+            // work finishes, kernel time is not included - confirm whether a device synchronize belongs before 'end'.
+            start = clock();
+
+            if (ip_bitDepth == 0)
+                rppt_brightness_gpu(d_input, srcDescPtr, d_output, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 1)
+                rppt_brightness_gpu(d_inputf16, srcDescPtr, d_outputf16, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 2)
+                rppt_brightness_gpu(d_inputf32, srcDescPtr, d_outputf32, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 5)
+                rppt_brightness_gpu(d_inputi8, srcDescPtr, d_outputi8, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+            else
+                missingFuncFlag = 1; // bit depths 3, 4, 6 and anything out of range have no brightness variant here
+
+            end = clock();
+
+            break;
+        }
+        default:
+            missingFuncFlag = 1;
+            break;
+        }
+
+        // Leave the loop instead of returning so that the buffers below are still released
+        if (missingFuncFlag == 1)
+            break;
+
+        // Accumulate measured times
+
+        gpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
+        if (gpu_time_used > max_time_used)
+            max_time_used = gpu_time_used;
+        if (gpu_time_used < min_time_used)
+            min_time_used = gpu_time_used;
+        avg_time_used += gpu_time_used;
+    }
+
+    if (missingFuncFlag == 1)
+    {
+        printf("\nThe functionality %s doesn't yet exist in RPP\n", func);
+    }
+    else
+    {
+        avg_time_used /= 100;
+        cout << fixed << "\nmax,min,avg = " << max_time_used << "," << min_time_used << "," << avg_time_used << endl;
+    }
+
+    rppDestroyGPU(handle);
+
+    // Free memory (pointers that were never allocated are still NULL, which free and hipFree accept)
+
+    free(roiTensorPtrSrc);
+    free(roiTensorPtrDst);
+    hipFree(d_roiTensorPtrSrc);
+    hipFree(d_roiTensorPtrDst);
+    free(input);
+    free(input_second);
+    free(output);
+
+    free(inputf16);
+    free(inputf16_second);
+    free(outputf16);
+    free(inputf32);
+    free(inputf32_second);
+    free(outputf32);
+    free(inputi8);
+    free(inputi8_second);
+    free(outputi8);
+
+    hipFree(d_input);
+    hipFree(d_input_second);
+    hipFree(d_output);
+    hipFree(d_inputf16);
+    hipFree(d_inputf16_second);
+    hipFree(d_outputf16);
+    hipFree(d_inputf32);
+    hipFree(d_inputf32_second);
+    hipFree(d_outputf32);
+    hipFree(d_inputi8);
+    hipFree(d_inputi8_second);
+    hipFree(d_outputi8);
+
+    return (missingFuncFlag == 1) ? -1 : 0;
+}
diff --git
a/utilities/rpp-performancetests/HIP_NEW/Tensor_hip_pln3.cpp b/utilities/rpp-performancetests/HIP_NEW/Tensor_hip_pln3.cpp new file mode 100644 index 000000000..73082fcef --- /dev/null +++ b/utilities/rpp-performancetests/HIP_NEW/Tensor_hip_pln3.cpp @@ -0,0 +1,710 @@ +#include +#include +#include +#include +#include +#include +#include +#include "/opt/rocm/rpp/include/rpp.h" +#include +#include +#include +#include +#include +#include +#include + +using namespace cv; +using namespace std; + +#define RPPPIXELCHECK(pixel) (pixel < (Rpp32f)0) ? ((Rpp32f)0) : ((pixel < (Rpp32f)255) ? pixel : ((Rpp32f)255)) +#define RPPMAX2(a,b) ((a > b) ? a : b) +#define RPPMIN2(a,b) ((a < b) ? a : b) + +int main(int argc, char **argv) +{ + // Handle inputs + + const int MIN_ARG_COUNT = 7; + + if (argc < MIN_ARG_COUNT) + { + printf("\nImproper Usage! Needs all arguments!\n"); + printf("\nUsage: ./Tensor_host_pln3 f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> pkd = 0 / pkd->pln = 1)> \n"); + return -1; + } + + if (atoi(argv[6]) == 1) + { + printf("\nInputs for this test case are:"); + printf("\nsrc1 = %s", argv[1]); + printf("\nsrc2 = %s", argv[2]); + printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[3]); + printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[4]); + printf("\ncase number (1:7) = %s", argv[5]); + } + + char *src = argv[1]; + char *src_second = argv[2]; + int ip_bitDepth = atoi(argv[3]); + unsigned int outputFormatToggle = atoi(argv[4]); + int test_case = atoi(argv[5]); + + int ip_channel = 3; + + // Set case names + + char funcType[1000] = {"Tensor_HOST_PLN3"}; + + char funcName[1000]; + switch (test_case) + { + case 0: + strcpy(funcName, "brightness"); + // outputFormatToggle = 0; + break; + } + + // Initialize tensor descriptors + + RpptDesc srcDesc, dstDesc; + RpptDescPtr srcDescPtr, dstDescPtr; + srcDescPtr = &srcDesc; + dstDescPtr = &dstDesc; + + // Set src/dst layouts in tensor descriptors + + if 
(outputFormatToggle == 0) + { + strcat(funcType, "_toPLN3"); + srcDescPtr->layout = RpptLayout::NCHW; + dstDescPtr->layout = RpptLayout::NCHW; + } + else if (outputFormatToggle == 1) + { + strcat(funcType, "_toPKD3"); + srcDescPtr->layout = RpptLayout::NCHW; + dstDescPtr->layout = RpptLayout::NHWC; + } + + // Set src/dst data types in tensor descriptors + + if (ip_bitDepth == 0) + { + strcat(funcName, "_u8_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::U8; + } + else if (ip_bitDepth == 1) + { + strcat(funcName, "_f16_"); + srcDescPtr->dataType = RpptDataType::F16; + dstDescPtr->dataType = RpptDataType::F16; + } + else if (ip_bitDepth == 2) + { + strcat(funcName, "_f32_"); + srcDescPtr->dataType = RpptDataType::F32; + dstDescPtr->dataType = RpptDataType::F32; + } + else if (ip_bitDepth == 3) + { + strcat(funcName, "_u8_f16_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::F16; + } + else if (ip_bitDepth == 4) + { + strcat(funcName, "_u8_f32_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::F32; + } + else if (ip_bitDepth == 5) + { + strcat(funcName, "_i8_"); + srcDescPtr->dataType = RpptDataType::I8; + dstDescPtr->dataType = RpptDataType::I8; + } + else if (ip_bitDepth == 6) + { + strcat(funcName, "_u8_i8_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::I8; + } + + // Other initializations + + int missingFuncFlag = 0; + int i = 0, j = 0; + int maxHeight = 0, maxWidth = 0; + int maxDstHeight = 0, maxDstWidth = 0; + unsigned long long count = 0; + unsigned long long ioBufferSize = 0; + unsigned long long oBufferSize = 0; + static int noOfImages = 0; + Mat image, image_second; + + // String ops on function name + + char func[1000]; + strcpy(func, funcName); + strcat(func, funcType); + + char src1[1000]; + strcpy(src1, src); + strcat(src1, "/"); + + char src1_second[1000]; + strcpy(src1_second, src_second); + 
strcat(src1_second, "/"); + + // Get number of images + + struct dirent *de; + DIR *dr = opendir(src); + while ((de = readdir(dr)) != NULL) + { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + noOfImages += 1; + } + closedir(dr); + + // Initialize ROI tensors for src/dst + + RpptROI *roiTensorPtrSrc = (RpptROI *) calloc(noOfImages, sizeof(RpptROI)); + RpptROI *roiTensorPtrDst = (RpptROI *) calloc(noOfImages, sizeof(RpptROI)); + + RpptROI *d_roiTensorPtrSrc, *d_roiTensorPtrDst; + hipMalloc(&d_roiTensorPtrSrc, noOfImages * sizeof(RpptROI)); + hipMalloc(&d_roiTensorPtrDst, noOfImages * sizeof(RpptROI)); + + // Set ROI tensors types for src/dst + + RpptRoiType roiTypeSrc, roiTypeDst; + roiTypeSrc = RpptRoiType::XYWH; + roiTypeDst = RpptRoiType::XYWH; + + // Set maxHeight, maxWidth and ROIs for src/dst + + const int images = noOfImages; + char imageNames[images][1000]; + + DIR *dr1 = opendir(src); + while ((de = readdir(dr1)) != NULL) + { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + strcpy(imageNames[count], de->d_name); + char temp[1000]; + strcpy(temp, src1); + strcat(temp, imageNames[count]); + + image = imread(temp, 1); + + roiTensorPtrSrc[count].xywhROI.xy.x = 0; + roiTensorPtrSrc[count].xywhROI.xy.y = 0; + roiTensorPtrSrc[count].xywhROI.roiWidth = image.cols; + roiTensorPtrSrc[count].xywhROI.roiHeight = image.rows; + + roiTensorPtrDst[count].xywhROI.xy.x = 0; + roiTensorPtrDst[count].xywhROI.xy.y = 0; + roiTensorPtrDst[count].xywhROI.roiWidth = image.cols; + roiTensorPtrDst[count].xywhROI.roiHeight = image.rows; + + maxHeight = RPPMAX2(maxHeight, roiTensorPtrSrc[count].xywhROI.roiHeight); + maxWidth = RPPMAX2(maxWidth, roiTensorPtrSrc[count].xywhROI.roiWidth); + maxDstHeight = RPPMAX2(maxDstHeight, roiTensorPtrDst[count].xywhROI.roiHeight); + maxDstWidth = RPPMAX2(maxDstWidth, roiTensorPtrDst[count].xywhROI.roiWidth); + + count++; + } + closedir(dr1); + + // Set numDims, offset, n/c/h/w 
values, n/c/h/w strides for src/dst + + srcDescPtr->numDims = 4; + dstDescPtr->numDims = 4; + + srcDescPtr->offset = 0; + dstDescPtr->offset = 0; + + srcDescPtr->n = noOfImages; + srcDescPtr->c = ip_channel; + srcDescPtr->h = maxHeight; + srcDescPtr->w = maxWidth; + + dstDescPtr->n = noOfImages; + dstDescPtr->c = ip_channel; + dstDescPtr->h = maxDstHeight; + dstDescPtr->w = maxDstWidth; + + // Optionally set w stride as a multiple of 8 for src/dst + + srcDescPtr->w = ((srcDescPtr->w / 8) * 8) + 8; + dstDescPtr->w = ((dstDescPtr->w / 8) * 8) + 8; + + // Set n/c/h/w strides for src/dst + + srcDescPtr->strides.nStride = ip_channel * srcDescPtr->w * srcDescPtr->h; + srcDescPtr->strides.cStride = srcDescPtr->w * srcDescPtr->h; + srcDescPtr->strides.hStride = srcDescPtr->w; + srcDescPtr->strides.wStride = 1; + + if (dstDescPtr->layout == RpptLayout::NHWC) + { + dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.hStride = ip_channel * dstDescPtr->w; + dstDescPtr->strides.wStride = ip_channel; + dstDescPtr->strides.cStride = 1; + } + else if (dstDescPtr->layout == RpptLayout::NCHW) + { + dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.cStride = dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.hStride = dstDescPtr->w; + dstDescPtr->strides.wStride = 1; + } + + // Set buffer sizes for src/dst + + ioBufferSize = (unsigned long long)srcDescPtr->h * (unsigned long long)srcDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages; + oBufferSize = (unsigned long long)dstDescPtr->h * (unsigned long long)dstDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages; + + // Initialize host buffers for src/dst + + Rpp8u *input = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + Rpp8u *input_second = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + Rpp8u *output = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u)); + + // Set 8u host buffers for src/dst + + DIR 
*dr2 = opendir(src); + DIR *dr2_second = opendir(src_second); + count = 0; + i = 0; + + Rpp32u elementsInRowMax = srcDescPtr->w * ip_channel; + + while ((de = readdir(dr2)) != NULL) + { + Rpp8u *input_temp, *input_second_temp; + input_temp = input + (i * srcDescPtr->strides.nStride); + input_second_temp = input_second + (i * srcDescPtr->strides.nStride); + + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + + char temp[1000]; + strcpy(temp, src1); + strcat(temp, de->d_name); + + char temp_second[1000]; + strcpy(temp_second, src1_second); + strcat(temp_second, de->d_name); + + image = imread(temp, 1); + image_second = imread(temp_second, 1); + + Rpp8u *ip_image = image.data; + Rpp8u *ip_image_second = image_second.data; + + Rpp32u elementsInRow = roiTensorPtrSrc[i].xywhROI.roiWidth * ip_channel; + + for (j = 0; j < roiTensorPtrSrc[i].xywhROI.roiHeight; j++) + { + memcpy(input_temp, ip_image, elementsInRow * sizeof (Rpp8u)); + memcpy(input_second_temp, ip_image_second, elementsInRow * sizeof (Rpp8u)); + ip_image += elementsInRow; + ip_image_second += elementsInRow; + input_temp += elementsInRowMax; + input_second_temp += elementsInRowMax; + } + i++; + count += srcDescPtr->strides.nStride; + } + closedir(dr2); + + // Convert default OpenCV PKD3 to PLN3 for first input batch + + Rpp8u *inputCopy = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + memcpy(inputCopy, input, ioBufferSize * sizeof(Rpp8u)); + + Rpp8u *inputTemp, *inputCopyTemp; + inputTemp = input; + inputCopyTemp = inputCopy; + + for (int count = 0; count < noOfImages; count++) + { + Rpp8u *inputTempR, *inputTempG, *inputTempB; + inputTempR = inputTemp; + inputTempG = inputTempR + srcDescPtr->strides.cStride; + inputTempB = inputTempG + srcDescPtr->strides.cStride; + + for (int i = 0; i < srcDescPtr->h; i++) + { + for (int j = 0; j < srcDescPtr->w; j++) + { + *inputTempR = *inputCopyTemp; + inputCopyTemp++; + inputTempR++; + *inputTempG = *inputCopyTemp; + inputCopyTemp++; + 
inputTempG++; + *inputTempB = *inputCopyTemp; + inputCopyTemp++; + inputTempB++; + } + } + + inputTemp += srcDescPtr->strides.nStride; + } + + free(inputCopy); + + // Convert default OpenCV PKD3 to PLN3 for second input batch + + Rpp8u *inputSecondCopy = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + memcpy(inputSecondCopy, input_second, ioBufferSize * sizeof(Rpp8u)); + + Rpp8u *inputSecondTemp, *inputSecondCopyTemp; + inputSecondTemp = input_second; + inputSecondCopyTemp = inputSecondCopy; + + for (int count = 0; count < noOfImages; count++) + { + Rpp8u *inputSecondTempR, *inputSecondTempG, *inputSecondTempB; + inputSecondTempR = inputSecondTemp; + inputSecondTempG = inputSecondTempR + srcDescPtr->strides.cStride; + inputSecondTempB = inputSecondTempG + srcDescPtr->strides.cStride; + + for (int i = 0; i < srcDescPtr->h; i++) + { + for (int j = 0; j < srcDescPtr->w; j++) + { + *inputSecondTempR = *inputSecondCopyTemp; + inputSecondCopyTemp++; + inputSecondTempR++; + *inputSecondTempG = *inputSecondCopyTemp; + inputSecondCopyTemp++; + inputSecondTempG++; + *inputSecondTempB = *inputSecondCopyTemp; + inputSecondCopyTemp++; + inputSecondTempB++; + } + } + + inputSecondTemp += srcDescPtr->strides.nStride; + } + + free(inputSecondCopy); + + // Convert inputs to test various other bit depths and copy to hip buffers + + half *inputf16, *inputf16_second, *outputf16; + Rpp32f *inputf32, *inputf32_second, *outputf32; + Rpp8s *inputi8, *inputi8_second, *outputi8; + int *d_input, *d_input_second, *d_inputf16, *d_inputf16_second, *d_inputf32, *d_inputf32_second, *d_inputi8, *d_inputi8_second; + int *d_output, *d_outputf16, *d_outputf32, *d_outputi8; + + if (ip_bitDepth == 0) + { + hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_output, oBufferSize * sizeof(Rpp8u)); + hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_input_second, input_second, ioBufferSize 
* sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_output, output, oBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 1) + { + inputf16 = (half *)calloc(ioBufferSize, sizeof(half)); + inputf16_second = (half *)calloc(ioBufferSize, sizeof(half)); + outputf16 = (half *)calloc(oBufferSize, sizeof(half)); + + Rpp8u *inputTemp, *input_secondTemp; + half *inputf16Temp, *inputf16_secondTemp; + + inputTemp = input; + input_secondTemp = input_second; + + inputf16Temp = inputf16; + inputf16_secondTemp = inputf16_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputf16Temp = (half)(((float)*inputTemp) / 255.0); + *inputf16_secondTemp = (half)(((float)*input_secondTemp) / 255.0); + inputTemp++; + inputf16Temp++; + input_secondTemp++; + inputf16_secondTemp++; + } + + hipMalloc(&d_inputf16, ioBufferSize * sizeof(half)); + hipMalloc(&d_inputf16_second, ioBufferSize * sizeof(half)); + hipMalloc(&d_outputf16, oBufferSize * sizeof(half)); + hipMemcpy(d_inputf16, inputf16, ioBufferSize * sizeof(half), hipMemcpyHostToDevice); + hipMemcpy(d_inputf16_second, inputf16_second, ioBufferSize * sizeof(half), hipMemcpyHostToDevice); + hipMemcpy(d_outputf16, outputf16, oBufferSize * sizeof(half), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 2) + { + inputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f)); + inputf32_second = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f)); + outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f)); + + Rpp8u *inputTemp, *input_secondTemp; + Rpp32f *inputf32Temp, *inputf32_secondTemp; + + inputTemp = input; + input_secondTemp = input_second; + + inputf32Temp = inputf32; + inputf32_secondTemp = inputf32_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputf32Temp = ((Rpp32f)*inputTemp) / 255.0; + *inputf32_secondTemp = ((Rpp32f)*input_secondTemp) / 255.0; + inputTemp++; + inputf32Temp++; + input_secondTemp++; + inputf32_secondTemp++; + } + + hipMalloc(&d_inputf32, ioBufferSize * sizeof(Rpp32f)); 
+ hipMalloc(&d_inputf32_second, ioBufferSize * sizeof(Rpp32f)); + hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f)); + hipMemcpy(d_inputf32, inputf32, ioBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice); + hipMemcpy(d_inputf32_second, inputf32_second, ioBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice); + hipMemcpy(d_outputf32, outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 3) + { + outputf16 = (half *)calloc(oBufferSize, sizeof(half)); + hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_outputf16, oBufferSize * sizeof(half)); + hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_outputf16, outputf16, oBufferSize * sizeof(half), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 4) + { + outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f)); + hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f)); + hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_outputf32, outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 5) + { + inputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s)); + inputi8_second = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s)); + outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s)); + + Rpp8u *inputTemp, *input_secondTemp; + Rpp8s *inputi8Temp, *inputi8_secondTemp; + + inputTemp = input; + input_secondTemp = input_second; + + inputi8Temp = inputi8; + inputi8_secondTemp = inputi8_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputi8Temp = (Rpp8s) (((Rpp32s) *inputTemp) - 128); + 
*inputi8_secondTemp = (Rpp8s) (((Rpp32s) *input_secondTemp) - 128); + inputTemp++; + inputi8Temp++; + input_secondTemp++; + inputi8_secondTemp++; + } + + hipMalloc(&d_inputi8, ioBufferSize * sizeof(Rpp8s)); + hipMalloc(&d_inputi8_second, ioBufferSize * sizeof(Rpp8s)); + hipMalloc(&d_outputi8, oBufferSize * sizeof(Rpp8s)); + hipMemcpy(d_inputi8, inputi8, ioBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice); + hipMemcpy(d_inputi8_second, inputi8_second, ioBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice); + hipMemcpy(d_outputi8, outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 6) + { + outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s)); + hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_outputi8, oBufferSize * sizeof(Rpp8s)); + hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_outputi8, outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice); + } + + // Run case-wise RPP API and measure time + + rppHandle_t handle; + hipStream_t stream; + hipStreamCreate(&stream); + rppCreateWithStreamAndBatchSize(&handle, stream, noOfImages); + + clock_t start, end; + double max_time_used = 0, min_time_used = 500, avg_time_used = 0; + + string test_case_name; + + printf("\nRunning %s 100 times (each time with a batch size of %d images) and computing mean statistics...", func, noOfImages); + + for (int perfRunCount = 0; perfRunCount < 100; perfRunCount++) + { + double gpu_time_used; + switch (test_case) + { + case 0: + { + test_case_name = "brightness"; + + Rpp32f alpha[images]; + Rpp32f beta[images]; + for (i = 0; i < images; i++) + { + alpha[i] = 1.75; + beta[i] = 50; + + // xywhROI override sample + // roiTensorPtrSrc[i].xywhROI.xy.x = 0; + // roiTensorPtrSrc[i].xywhROI.xy.y = 0; + // 
roiTensorPtrSrc[i].xywhROI.roiWidth = 100; + // roiTensorPtrSrc[i].xywhROI.roiHeight = 180; + + // ltrbROI override sample + // roiTensorPtrSrc[i].ltrbROI.lt.x = 50; + // roiTensorPtrSrc[i].ltrbROI.lt.y = 30; + // roiTensorPtrSrc[i].ltrbROI.rb.x = 210; + // roiTensorPtrSrc[i].ltrbROI.rb.y = 210; + } + + // Change RpptRoiType for ltrbROI override sample + // roiTypeSrc = RpptRoiType::LTRB; + // roiTypeDst = RpptRoiType::LTRB; + + hipMemcpy(d_roiTensorPtrSrc, roiTensorPtrSrc, images * sizeof(RpptROI), hipMemcpyHostToDevice); + + start = clock(); + + if (ip_bitDepth == 0) + rppt_brightness_gpu(d_input, srcDescPtr, d_output, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 1) + rppt_brightness_gpu(d_inputf16, srcDescPtr, d_outputf16, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 2) + rppt_brightness_gpu(d_inputf32, srcDescPtr, d_outputf32, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 3) + missingFuncFlag = 1; + else if (ip_bitDepth == 4) + missingFuncFlag = 1; + else if (ip_bitDepth == 5) + rppt_brightness_gpu(d_inputi8, srcDescPtr, d_outputi8, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 6) + missingFuncFlag = 1; + else + missingFuncFlag = 1; + + end = clock(); + + break; + } + default: + missingFuncFlag = 1; + break; + } + + if (missingFuncFlag == 1) + { + printf("\nThe functionality %s doesn't yet exist in RPP\n", func); + return -1; + } + + // Display measured times + + gpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC; + if (gpu_time_used > max_time_used) + max_time_used = gpu_time_used; + if (gpu_time_used < min_time_used) + min_time_used = gpu_time_used; + avg_time_used += gpu_time_used; + } + + avg_time_used /= 100; + cout << fixed << "\nmax,min,avg = " << max_time_used << "," << min_time_used << "," << avg_time_used << endl; + + rppDestroyGPU(handle); + + // Free memory + + 
free(roiTensorPtrSrc); + free(roiTensorPtrDst); + hipFree(d_roiTensorPtrSrc); + hipFree(d_roiTensorPtrDst); + free(input); + free(input_second); + free(output); + + if (ip_bitDepth == 0) + { + hipFree(d_input); + hipFree(d_input_second); + hipFree(d_output); + } + else if (ip_bitDepth == 1) + { + free(inputf16); + free(inputf16_second); + free(outputf16); + hipFree(d_inputf16); + hipFree(d_inputf16_second); + hipFree(d_outputf16); + } + else if (ip_bitDepth == 2) + { + free(inputf32); + free(inputf32_second); + free(outputf32); + hipFree(d_inputf32); + hipFree(d_inputf32_second); + hipFree(d_outputf32); + } + else if (ip_bitDepth == 3) + { + free(outputf16); + hipFree(d_input); + hipFree(d_input_second); + hipFree(d_outputf16); + } + else if (ip_bitDepth == 4) + { + free(outputf32); + hipFree(d_input); + hipFree(d_input_second); + hipFree(d_outputf32); + } + else if (ip_bitDepth == 5) + { + free(inputi8); + free(inputi8_second); + free(outputi8); + hipFree(d_inputi8); + hipFree(d_inputi8_second); + hipFree(d_outputi8); + } + else if (ip_bitDepth == 6) + { + free(outputi8); + hipFree(d_input); + hipFree(d_input_second); + hipFree(d_outputi8); + } + + return 0; +} diff --git a/utilities/rpp-performancetests/HIP_NEW/generatePerformanceLogs.py b/utilities/rpp-performancetests/HIP_NEW/generatePerformanceLogs.py index 698de4166..4026a0e7c 100644 --- a/utilities/rpp-performancetests/HIP_NEW/generatePerformanceLogs.py +++ b/utilities/rpp-performancetests/HIP_NEW/generatePerformanceLogs.py @@ -31,7 +31,10 @@ log_file_list = [ "../OUTPUT_PERFORMANCE_LOGS_HIP_NEW/BatchPD_hip_pkd3_hip_raw_performance_log.txt", "../OUTPUT_PERFORMANCE_LOGS_HIP_NEW/BatchPD_hip_pln3_hip_raw_performance_log.txt", - "../OUTPUT_PERFORMANCE_LOGS_HIP_NEW/BatchPD_hip_pln1_hip_raw_performance_log.txt" + "../OUTPUT_PERFORMANCE_LOGS_HIP_NEW/BatchPD_hip_pln1_hip_raw_performance_log.txt", + "../OUTPUT_PERFORMANCE_LOGS_HIP_NEW/Tensor_hip_pkd3_hip_raw_performance_log.txt", + 
"../OUTPUT_PERFORMANCE_LOGS_HIP_NEW/Tensor_hip_pln3_hip_raw_performance_log.txt", + "../OUTPUT_PERFORMANCE_LOGS_HIP_NEW/Tensor_hip_pln1_hip_raw_performance_log.txt" ] functionality_group_list = [ @@ -137,15 +140,20 @@ def func_group_finder(case_number): RESULTS_DIR = "../OUTPUT_PERFORMANCE_LOGS_HIP_NEW" print("RESULTS_DIR = " + RESULTS_DIR) - CONSOLIDATED_FILE_PKD3 = RESULTS_DIR + "/consolidated_results_pkd3.stats.csv" - CONSOLIDATED_FILE_PLN1 = RESULTS_DIR + "/consolidated_results_pln1.stats.csv" - CONSOLIDATED_FILE_PLN3 = RESULTS_DIR + "/consolidated_results_pln3.stats.csv" - - TYPE_LIST = ["PKD3", "PLN1", "PLN3"] + CONSOLIDATED_FILE_BATCHPD_PKD3 = RESULTS_DIR + "/consolidated_results_BatchPD_PKD3.stats.csv" + CONSOLIDATED_FILE_BATCHPD_PLN1 = RESULTS_DIR + "/consolidated_results_BatchPD_PLN1.stats.csv" + CONSOLIDATED_FILE_BATCHPD_PLN3 = RESULTS_DIR + "/consolidated_results_BatchPD_PLN3.stats.csv" + CONSOLIDATED_FILE_TENSOR_PKD3 = RESULTS_DIR + "/consolidated_results_Tensor_PKD3.stats.csv" + CONSOLIDATED_FILE_TENSOR_PLN1 = RESULTS_DIR + "/consolidated_results_Tensor_PLN1.stats.csv" + CONSOLIDATED_FILE_TENSOR_PLN3 = RESULTS_DIR + "/consolidated_results_Tensor_PLN3.stats.csv" + + TYPE_LIST = ["BatchPD_PKD3", "BatchPD_PLN1", "BatchPD_PLN3", "Tensor_PKD3", "Tensor_PLN1", "Tensor_PLN3"] + BATCHPD_TYPE_LIST = ["BatchPD_PKD3", "BatchPD_PLN1", "BatchPD_PLN3"] + TENSOR_TYPE_LIST = ["Tensor_PKD3", "Tensor_PLN1", "Tensor_PLN3"] CASE_NUM_LIST = range(int(caseStart), int(caseEnd) + 1, 1) BIT_DEPTH_LIST = range(0, 7, 1) OFT_LIST = range(0, 2, 1) - d_counter = {"PKD3":0, "PLN1":0, "PLN3":0} + d_counter = {"BatchPD_PKD3":0, "BatchPD_PLN1":0, "BatchPD_PLN3":0, "Tensor_PKD3":0, "Tensor_PLN1":0, "Tensor_PLN3":0} for TYPE in TYPE_LIST: @@ -161,9 +169,9 @@ def func_group_finder(case_number): # Add functionality group header if CASE_NUM in NEW_FUNC_GROUP_LIST: FUNC_GROUP = func_group_finder(CASE_NUM) - new_file.write(" ,0,0,0,0\n") + new_file.write("0,0,0,0,0\n") 
new_file.write(FUNC_GROUP + ",0,0,0,0\n") - new_file.write(" ,0,0,0,0\n") + new_file.write("0,0,0,0,0\n") # Set results directory CASE_RESULTS_DIR = RESULTS_DIR + "/" + TYPE + "/case_" + str(CASE_NUM) @@ -183,10 +191,14 @@ def func_group_finder(case_number): for line in case_file: print(line) if not(line.startswith('"Name"')): - if prev != line.split(",")[0]: + if TYPE in TENSOR_TYPE_LIST: new_file.write(line) - prev = line.split(",")[0] d_counter[TYPE] = d_counter[TYPE] + 1 + elif TYPE in BATCHPD_TYPE_LIST: + if prev != line.split(",")[0]: + new_file.write(line) + prev = line.split(",")[0] + d_counter[TYPE] = d_counter[TYPE] + 1 case_file.close() except IOError: print("Unable to open case results") @@ -212,7 +224,13 @@ def func_group_finder(case_number): print(dfPrint_noIndices) except ImportError: - print("\nPandas not available! Results of GPU profiling experiment are available in the following files:\n" + CONSOLIDATED_FILE_PKD3 + "\n" + CONSOLIDATED_FILE_PLN1 + "\n" + CONSOLIDATED_FILE_PLN3 + "\n") + print("\nPandas not available! 
Results of GPU profiling experiment are available in the following files:\n" + \ + CONSOLIDATED_FILE_BATCHPD_PKD3 + "\n" + \ + CONSOLIDATED_FILE_BATCHPD_PLN1 + "\n" + \ + CONSOLIDATED_FILE_BATCHPD_PLN3 + "\n" + \ + CONSOLIDATED_FILE_TENSOR_PKD3 + "\n" + \ + CONSOLIDATED_FILE_TENSOR_PLN1 + "\n" + \ + CONSOLIDATED_FILE_TENSOR_PLN3 + "\n") except IOError: print("Unable to open results in " + RESULTS_DIR + "/consolidated_results_" + TYPE + ".stats.csv") diff --git a/utilities/rpp-performancetests/HIP_NEW/rawLogsGenScript.sh b/utilities/rpp-performancetests/HIP_NEW/rawLogsGenScript.sh index bcfe0e0e6..2b0f9f412 100755 --- a/utilities/rpp-performancetests/HIP_NEW/rawLogsGenScript.sh +++ b/utilities/rpp-performancetests/HIP_NEW/rawLogsGenScript.sh @@ -134,9 +134,12 @@ make -j16 if [[ "$PROFILING_OPTION" -eq 1 ]] then - mkdir "$DST_FOLDER/PKD3" - mkdir "$DST_FOLDER/PLN1" - mkdir "$DST_FOLDER/PLN3" + mkdir "$DST_FOLDER/BatchPD_PKD3" + mkdir "$DST_FOLDER/BatchPD_PLN1" + mkdir "$DST_FOLDER/BatchPD_PLN3" + mkdir "$DST_FOLDER/Tensor_PKD3" + mkdir "$DST_FOLDER/Tensor_PLN1" + mkdir "$DST_FOLDER/Tensor_PLN3" fi printf "\n\n\n\n\n" @@ -170,10 +173,22 @@ do ./BatchPD_hip_pkd3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/BatchPD_hip_pkd3_hip_raw_performance_log.txt" elif [[ "$PROFILING_OPTION" -eq 1 ]] then - mkdir "$DST_FOLDER/PKD3/case_$case" - printf "\nrocprof --basenames on --timestamp on --stats -o $DST_FOLDER/PKD3/case_$case/output_case$case" "_bitDepth$bitDepth" "_oft$outputFormatToggle.csv" "./BatchPD_hip_pkd3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0" - rocprof --basenames on --timestamp on --stats -o "$DST_FOLDER/PKD3/case_$case""/output_case""$case""_bitDepth""$bitDepth""_oft""$outputFormatToggle"".csv" ./BatchPD_hip_pkd3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/BatchPD_hip_pkd3_hip_raw_performance_log.txt" 
+ mkdir "$DST_FOLDER/BatchPD_PKD3/case_$case" + printf "\nrocprof --basenames on --timestamp on --stats -o $DST_FOLDER/BatchPD_PKD3/case_$case/output_case$case" "_bitDepth$bitDepth" "_oft$outputFormatToggle.csv" "./BatchPD_hip_pkd3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0" + rocprof --basenames on --timestamp on --stats -o "$DST_FOLDER/BatchPD_PKD3/case_$case""/output_case""$case""_bitDepth""$bitDepth""_oft""$outputFormatToggle"".csv" ./BatchPD_hip_pkd3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/BatchPD_hip_pkd3_hip_raw_performance_log.txt" fi + + if [[ "$PROFILING_OPTION" -eq 0 ]] + then + printf "\n./Tensor_hip_pkd3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0" + ./Tensor_hip_pkd3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/Tensor_hip_pkd3_hip_raw_performance_log.txt" + elif [[ "$PROFILING_OPTION" -eq 1 ]] + then + mkdir "$DST_FOLDER/Tensor_PKD3/case_$case" + printf "\nrocprof --basenames on --timestamp on --stats -o $DST_FOLDER/Tensor_PKD3/case_$case/output_case$case" "_bitDepth$bitDepth" "_oft$outputFormatToggle.csv" "./Tensor_hip_pkd3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0" + rocprof --basenames on --timestamp on --stats -o "$DST_FOLDER/Tensor_PKD3/case_$case""/output_case""$case""_bitDepth""$bitDepth""_oft""$outputFormatToggle"".csv" ./Tensor_hip_pkd3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/Tensor_hip_pkd3_hip_raw_performance_log.txt" + fi + echo "------------------------------------------------------------------------------------------" done done @@ -214,10 +229,22 @@ do ./BatchPD_hip_pln1 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/BatchPD_hip_pln1_hip_raw_performance_log.txt" elif [[ 
"$PROFILING_OPTION" -eq 1 ]] then - mkdir "$DST_FOLDER/PLN1/case_$case" - printf "\nrocprof --basenames on --timestamp on --stats -o $DST_FOLDER/PLN1/case_$case/output_case$case" "_bitDepth$bitDepth" "_oft$outputFormatToggle.csv" "./BatchPD_hip_pln1 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0" - rocprof --basenames on --timestamp on --stats -o "$DST_FOLDER/PLN1/case_$case""/output_case""$case""_bitDepth""$bitDepth""_oft""$outputFormatToggle"".csv" ./BatchPD_hip_pln1 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/BatchPD_hip_pln1_hip_raw_performance_log.txt" + mkdir "$DST_FOLDER/BatchPD_PLN1/case_$case" + printf "\nrocprof --basenames on --timestamp on --stats -o $DST_FOLDER/BatchPD_PLN1/case_$case/output_case$case" "_bitDepth$bitDepth" "_oft$outputFormatToggle.csv" "./BatchPD_hip_pln1 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0" + rocprof --basenames on --timestamp on --stats -o "$DST_FOLDER/BatchPD_PLN1/case_$case""/output_case""$case""_bitDepth""$bitDepth""_oft""$outputFormatToggle"".csv" ./BatchPD_hip_pln1 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/BatchPD_hip_pln1_hip_raw_performance_log.txt" fi + + if [[ "$PROFILING_OPTION" -eq 0 ]] + then + printf "\n./Tensor_hip_pln1 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0" + ./Tensor_hip_pln1 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/Tensor_hip_pln1_hip_raw_performance_log.txt" + elif [[ "$PROFILING_OPTION" -eq 1 ]] + then + mkdir "$DST_FOLDER/Tensor_PLN1/case_$case" + printf "\nrocprof --basenames on --timestamp on --stats -o $DST_FOLDER/Tensor_PLN1/case_$case/output_case$case" "_bitDepth$bitDepth" "_oft$outputFormatToggle.csv" "./Tensor_hip_pln1 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0" + 
rocprof --basenames on --timestamp on --stats -o "$DST_FOLDER/Tensor_PLN1/case_$case""/output_case""$case""_bitDepth""$bitDepth""_oft""$outputFormatToggle"".csv" ./Tensor_hip_pln1 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/Tensor_hip_pln1_hip_raw_performance_log.txt" + fi + echo "------------------------------------------------------------------------------------------" done done @@ -258,10 +285,22 @@ do ./BatchPD_hip_pln3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/BatchPD_hip_pln3_hip_raw_performance_log.txt" elif [[ "$PROFILING_OPTION" -eq 1 ]] then - mkdir "$DST_FOLDER/PLN3/case_$case" - printf "\nrocprof --basenames on --timestamp on --stats -o $DST_FOLDER/PLN3/case_$case/output_case$case" "_bitDepth$bitDepth" "_oft$outputFormatToggle.csv" "./BatchPD_hip_pln3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0" - rocprof --basenames on --timestamp on --stats -o "$DST_FOLDER/PLN3/case_$case""/output_case""$case""_bitDepth""$bitDepth""_oft""$outputFormatToggle"".csv" ./BatchPD_hip_pln3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/BatchPD_hip_pln3_hip_raw_performance_log.txt" + mkdir "$DST_FOLDER/BatchPD_PLN3/case_$case" + printf "\nrocprof --basenames on --timestamp on --stats -o $DST_FOLDER/BatchPD_PLN3/case_$case/output_case$case" "_bitDepth$bitDepth" "_oft$outputFormatToggle.csv" "./BatchPD_hip_pln3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0" + rocprof --basenames on --timestamp on --stats -o "$DST_FOLDER/BatchPD_PLN3/case_$case""/output_case""$case""_bitDepth""$bitDepth""_oft""$outputFormatToggle"".csv" ./BatchPD_hip_pln3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/BatchPD_hip_pln3_hip_raw_performance_log.txt" + fi + + if [[ 
"$PROFILING_OPTION" -eq 0 ]] + then + printf "\n./Tensor_hip_pln3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0" + ./Tensor_hip_pln3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/Tensor_hip_pln3_hip_raw_performance_log.txt" + elif [[ "$PROFILING_OPTION" -eq 1 ]] + then + mkdir "$DST_FOLDER/Tensor_PLN3/case_$case" + printf "\nrocprof --basenames on --timestamp on --stats -o $DST_FOLDER/Tensor_PLN3/case_$case/output_case$case" "_bitDepth$bitDepth" "_oft$outputFormatToggle.csv" "./Tensor_hip_pln3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0" + rocprof --basenames on --timestamp on --stats -o "$DST_FOLDER/Tensor_PLN3/case_$case""/output_case""$case""_bitDepth""$bitDepth""_oft""$outputFormatToggle"".csv" ./Tensor_hip_pln3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/Tensor_hip_pln3_hip_raw_performance_log.txt" fi + echo "------------------------------------------------------------------------------------------" done done diff --git a/utilities/rpp-performancetests/HOST_NEW/CMakeLists.txt b/utilities/rpp-performancetests/HOST_NEW/CMakeLists.txt index 15d7f51ca..94b2656c7 100644 --- a/utilities/rpp-performancetests/HOST_NEW/CMakeLists.txt +++ b/utilities/rpp-performancetests/HOST_NEW/CMakeLists.txt @@ -7,7 +7,7 @@ find_package(OpenCV REQUIRED) find_package(AMDRPP QUIET) if(NOT OpenCL_FOUND) - message("-- ${Yellow}Rpp_test requires OpenCL, Found ${OpenCL_INCLUDE_DIRS} ${OpenCL_LIBRARIES} ${ColourReset}") + message("-- ${Yellow}Rpp_test requires OpenCL, Found ${OpenCL_INCLUDE_DIRS} ${OpenCL_LIBRARIES} ${ColourReset}") endif() if (OpenCL_FOUND) @@ -19,12 +19,16 @@ if (OpenCL_FOUND) add_executable(BatchPD_host_pkd3 BatchPD_host_pkd3.cpp) add_executable(BatchPD_host_pln1 BatchPD_host_pln1.cpp) add_executable(BatchPD_host_pln3 BatchPD_host_pln3.cpp) + add_executable(Tensor_host_pkd3 
Tensor_host_pkd3.cpp) + add_executable(Tensor_host_pln1 Tensor_host_pln1.cpp) + add_executable(Tensor_host_pln3 Tensor_host_pln3.cpp) # add_executable(Single_host Single_host.cpp) - add_executable(uniqueFunctionalities_host uniqueFunctionalities_host.cpp) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -DOCL_COMPILE=1 -DRPP_BACKEND_OPENCL=1 -std=c++11") target_link_libraries(BatchPD_host_pkd3 ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system) target_link_libraries(BatchPD_host_pln1 ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system) target_link_libraries(BatchPD_host_pln3 ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system) + target_link_libraries(Tensor_host_pkd3 ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system) + target_link_libraries(Tensor_host_pln1 ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system) + target_link_libraries(Tensor_host_pln3 ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system) # target_link_libraries(Single_host ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system ) - target_link_libraries(uniqueFunctionalities_host ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system) endif() \ No newline at end of file diff --git a/utilities/rpp-performancetests/HOST_NEW/Tensor_host_pkd3.cpp b/utilities/rpp-performancetests/HOST_NEW/Tensor_host_pkd3.cpp new file mode 100644 index 000000000..29b627755 --- /dev/null +++ b/utilities/rpp-performancetests/HOST_NEW/Tensor_host_pkd3.cpp @@ -0,0 +1,519 @@ +#include +#include +#include +#include +#include +#include +#include +#include "/opt/rocm/rpp/include/rpp.h" +#include +#include +#include +#include +#include +#include +#include + +using namespace cv; +using namespace std; +using half_float::half; + +typedef half Rpp16f; + +#define RPPPIXELCHECK(pixel) (pixel < 
(Rpp32f)0) ? ((Rpp32f)0) : ((pixel < (Rpp32f)255) ? pixel : ((Rpp32f)255))
+// Macro arguments are parenthesized so that compound expressions expand safely
+#define RPPMAX2(a,b) (((a) > (b)) ? (a) : (b))
+#define RPPMIN2(a,b) (((a) < (b)) ? (a) : (b))
+
+// Performance test driver for the RPP tensor API on the HOST backend with packed 3-channel (NHWC) inputs.
+//
+// Arguments (all required):
+//   argv[1] - folder holding the first set of input images
+//   argv[2] - folder holding the second set of input images (looked up by the file names found in argv[1])
+//   argv[3] - bit depth selector (u8 = 0 / f16 = 1 / f32 = 2 / u8->f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6)
+//   argv[4] - outputFormatToggle (pkd->pkd = 0 / pkd->pln = 1)
+//   argv[5] - test case number (only case 0, brightness, is implemented in this file)
+//   argv[6] - verbosity (1 echoes the inputs)
+//
+// Every file in argv[1] is loaded into one batch, the selected case is run 100 times and the
+// max/min/avg of the per-run clock() measurements are printed.
+// Returns 0 on success, -1 on bad usage, unreadable inputs, allocation failure or when the
+// requested case / bit depth is not implemented.
+int main(int argc, char **argv)
+{
+    // Handle inputs
+
+    const int MIN_ARG_COUNT = 7;
+
+    if (argc < MIN_ARG_COUNT)
+    {
+        printf("\nImproper Usage! Needs all arguments!\n");
+        printf("\nUsage: ./Tensor_host_pkd3 <src1 folder> <src2 folder> <u8 = 0 / f16 = 1 / f32 = 2 / u8->f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> <outputFormatToggle (pkd->pkd = 0 / pkd->pln = 1)> <case number> <verbosity = 0/1>\n");
+        return -1;
+    }
+
+    if (atoi(argv[6]) == 1)
+    {
+        printf("\nInputs for this test case are:");
+        printf("\nsrc1 = %s", argv[1]);
+        printf("\nsrc2 = %s", argv[2]);
+        printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[3]);
+        printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[4]);
+        printf("\ncase number (1:7) = %s", argv[5]);
+    }
+
+    char *src = argv[1];
+    char *src_second = argv[2];
+    int ip_bitDepth = atoi(argv[3]);
+    unsigned int outputFormatToggle = atoi(argv[4]);
+    int test_case = atoi(argv[5]);
+
+    int ip_channel = 3;
+
+    // Set case names
+
+    char funcType[1000] = {"Tensor_HOST_PKD3"};
+
+    // Zero-initialized so that the strcat() calls below stay defined for an unknown case number
+    char funcName[1000] = {0};
+    switch (test_case)
+    {
+    case 0:
+        strcpy(funcName, "brightness");
+        break;
+    default:
+        // Unknown cases are reported as missing functionality by the run loop below
+        break;
+    }
+
+    // Initialize tensor descriptors
+
+    RpptDesc srcDesc, dstDesc;
+    RpptDescPtr srcDescPtr, dstDescPtr;
+    srcDescPtr = &srcDesc;
+    dstDescPtr = &dstDesc;
+
+    // Set src/dst layouts in tensor descriptors
+
+    if (outputFormatToggle == 0)
+    {
+        strcat(funcType, "_toPKD3");
+        srcDescPtr->layout = RpptLayout::NHWC;
+        dstDescPtr->layout = RpptLayout::NHWC;
+    }
+    else if (outputFormatToggle == 1)
+    {
+        strcat(funcType, "_toPLN3");
+        srcDescPtr->layout = RpptLayout::NHWC;
+        dstDescPtr->layout = RpptLayout::NCHW;
+    }
+    else
+    {
+        // Any other value would leave both layouts and the dst strides unset
+        printf("\nInvalid outputFormatToggle! Please input outputFormatToggle = 0 or 1\n");
+        return -1;
+    }
+
+    // Set src/dst data types in tensor descriptors
+
+    if (ip_bitDepth == 0)
+    {
+        strcat(funcName, "_u8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::U8;
+    }
+    else if (ip_bitDepth == 1)
+    {
+        strcat(funcName, "_f16_");
+        srcDescPtr->dataType = RpptDataType::F16;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 2)
+    {
+        strcat(funcName, "_f32_");
+        srcDescPtr->dataType = RpptDataType::F32;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 3)
+    {
+        strcat(funcName, "_u8_f16_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 4)
+    {
+        strcat(funcName, "_u8_f32_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 5)
+    {
+        strcat(funcName, "_i8_");
+        srcDescPtr->dataType = RpptDataType::I8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+    else if (ip_bitDepth == 6)
+    {
+        strcat(funcName, "_u8_i8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+
+    // Other initializations
+
+    int status = 0;
+    int missingFuncFlag = 0;
+    int i = 0, j = 0;
+    int maxHeight = 0, maxWidth = 0;
+    int maxDstHeight = 0, maxDstWidth = 0;
+    unsigned long long count = 0;
+    unsigned long long ioBufferSize = 0;
+    unsigned long long oBufferSize = 0;
+    int noOfImages = 0;
+    Mat image, image_second;
+
+    // Full name used in the log messages: <case>_<bit depth>_<layout conversion>
+
+    char func[1000];
+    strcpy(func, funcName);
+    strcat(func, funcType);
+
+    // Get number of images
+    // The directory stream is opened once, checked, and rewound for the later passes
+
+    struct dirent *de;
+    DIR *dr = opendir(src);
+    if (dr == NULL)
+    {
+        printf("\nUnable to open the src1 folder %s\n", src);
+        return -1;
+    }
+    while ((de = readdir(dr)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        noOfImages += 1;
+    }
+    if (noOfImages == 0)
+    {
+        printf("\nNo images found in the src1 folder %s\n", src);
+        closedir(dr);
+        return -1;
+    }
+
+    // Initialize ROI tensors for src/dst
+
+    RpptROI *roiTensorPtrSrc = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+    RpptROI *roiTensorPtrDst = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+    if (roiTensorPtrSrc == NULL || roiTensorPtrDst == NULL)
+    {
+        printf("\nUnable to allocate the ROI tensors\n");
+        free(roiTensorPtrSrc);
+        free(roiTensorPtrDst);
+        closedir(dr);
+        return -1;
+    }
+
+    // Set ROI tensors types for src/dst
+
+    RpptRoiType roiTypeSrc, roiTypeDst;
+    roiTypeSrc = RpptRoiType::XYWH;
+    roiTypeDst = RpptRoiType::XYWH;
+
+    // Set maxHeight, maxWidth and ROIs for src/dst
+
+    const int images = noOfImages;
+
+    rewinddir(dr);
+    // count is bounded so that entries added to the folder after the counting pass can't overrun the ROI tensors
+    while ((count < (unsigned long long)images) && ((de = readdir(dr)) != NULL))
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+
+        // Bounded path build; an over-long path is truncated and then simply fails to load
+        char temp[1000];
+        snprintf(temp, sizeof(temp), "%s/%s", src, de->d_name);
+
+        image = imread(temp, 1);
+
+        roiTensorPtrSrc[count].xywhROI.xy.x = 0;
+        roiTensorPtrSrc[count].xywhROI.xy.y = 0;
+        roiTensorPtrSrc[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrSrc[count].xywhROI.roiHeight = image.rows;
+
+        roiTensorPtrDst[count].xywhROI.xy.x = 0;
+        roiTensorPtrDst[count].xywhROI.xy.y = 0;
+        roiTensorPtrDst[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrDst[count].xywhROI.roiHeight = image.rows;
+
+        maxHeight = RPPMAX2(maxHeight, roiTensorPtrSrc[count].xywhROI.roiHeight);
+        maxWidth = RPPMAX2(maxWidth, roiTensorPtrSrc[count].xywhROI.roiWidth);
+        maxDstHeight = RPPMAX2(maxDstHeight, roiTensorPtrDst[count].xywhROI.roiHeight);
+        maxDstWidth = RPPMAX2(maxDstWidth, roiTensorPtrDst[count].xywhROI.roiWidth);
+
+        count++;
+    }
+
+    if (maxHeight == 0 || maxWidth == 0)
+    {
+        // Nothing decoded, so the batch buffers below would have zero size
+        printf("\nNo readable images found in the src1 folder %s\n", src);
+        free(roiTensorPtrSrc);
+        free(roiTensorPtrDst);
+        closedir(dr);
+        return -1;
+    }
+
+    // Set numDims, offset, n/c/h/w values for src/dst
+
+    srcDescPtr->numDims = 4;
+    dstDescPtr->numDims = 4;
+
+    srcDescPtr->offset = 0;
+    dstDescPtr->offset = 0;
+
+    srcDescPtr->n = noOfImages;
+    srcDescPtr->h = maxHeight;
+    srcDescPtr->w = maxWidth;
+    srcDescPtr->c = ip_channel;
+
+    dstDescPtr->n = noOfImages;
+    dstDescPtr->h = maxDstHeight;
+    dstDescPtr->w = maxDstWidth;
+    dstDescPtr->c = ip_channel;
+
+    // Pad w up to a multiple of 8 for src/dst (a full extra 8 is added when w is already a multiple of 8)
+
+    srcDescPtr->w = ((srcDescPtr->w / 8) * 8) + 8;
+    dstDescPtr->w = ((dstDescPtr->w / 8) * 8) + 8;
+
+    // Set n/c/h/w strides for src/dst
+
+    srcDescPtr->strides.nStride = ip_channel * srcDescPtr->w * srcDescPtr->h;
+    srcDescPtr->strides.hStride = ip_channel * srcDescPtr->w;
+    srcDescPtr->strides.wStride = ip_channel;
+    srcDescPtr->strides.cStride = 1;
+
+    if (dstDescPtr->layout == RpptLayout::NHWC)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = ip_channel * dstDescPtr->w;
+        dstDescPtr->strides.wStride = ip_channel;
+        dstDescPtr->strides.cStride = 1;
+    }
+    else if (dstDescPtr->layout == RpptLayout::NCHW)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.cStride = dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = dstDescPtr->w;
+        dstDescPtr->strides.wStride = 1;
+    }
+
+    // Set buffer sizes for src/dst
+
+    ioBufferSize = (unsigned long long)srcDescPtr->h * (unsigned long long)srcDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+    oBufferSize = (unsigned long long)dstDescPtr->h * (unsigned long long)dstDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+
+    // Initialize host buffers for src/dst
+    // Inputs are sized from the src descriptor, every output from the dst descriptor
+
+    Rpp8u *input = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *input_second = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *output = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u));
+
+    Rpp16f *inputf16 = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f));
+    Rpp16f *inputf16_second = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f));
+    Rpp16f *outputf16 = (Rpp16f *)calloc(oBufferSize, sizeof(Rpp16f));
+
+    Rpp32f *inputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+    Rpp32f *inputf32_second = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+    Rpp32f *outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f));
+
+    Rpp8s *inputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+    Rpp8s *inputi8_second = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+    Rpp8s *outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s));
+
+    // Single release point shared by the failure paths and the normal exit (free(NULL) is a no-op)
+    auto freeHostBuffers = [&]()
+    {
+        free(roiTensorPtrSrc);
+        free(roiTensorPtrDst);
+        free(input);
+        free(input_second);
+        free(output);
+        free(inputf16);
+        free(inputf16_second);
+        free(outputf16);
+        free(inputf32);
+        free(inputf32_second);
+        free(outputf32);
+        free(inputi8);
+        free(inputi8_second);
+        free(outputi8);
+    };
+
+    if (input == NULL || input_second == NULL || output == NULL ||
+        inputf16 == NULL || inputf16_second == NULL || outputf16 == NULL ||
+        inputf32 == NULL || inputf32_second == NULL || outputf32 == NULL ||
+        inputi8 == NULL || inputi8_second == NULL || outputi8 == NULL)
+    {
+        printf("\nUnable to allocate the host buffers\n");
+        closedir(dr);
+        freeHostBuffers();
+        return -1;
+    }
+
+    // Set 8u host buffers for src/dst
+
+    rewinddir(dr);
+    i = 0;
+
+    while ((i < images) && ((de = readdir(dr)) != NULL))
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+
+        char temp[1000];
+        snprintf(temp, sizeof(temp), "%s/%s", src, de->d_name);
+
+        char temp_second[1000];
+        snprintf(temp_second, sizeof(temp_second), "%s/%s", src_second, de->d_name);
+
+        image = imread(temp, 1);
+        image_second = imread(temp_second, 1);
+
+        // The row copies below read roiHeight rows of roiWidth pixels from both images,
+        // so both must have exactly the dimensions recorded in the ROI pass
+        if (image.rows != roiTensorPtrSrc[i].xywhROI.roiHeight || image.cols != roiTensorPtrSrc[i].xywhROI.roiWidth ||
+            image_second.rows != image.rows || image_second.cols != image.cols)
+        {
+            printf("\nImage %s could not be loaded with matching dimensions from src1 and src2\n", de->d_name);
+            status = -1;
+            break;
+        }
+
+        // 64-bit offset so that large batches don't wrap the 32-bit stride product
+        Rpp8u *input_temp = input + ((unsigned long long)i * srcDescPtr->strides.nStride);
+        Rpp8u *input_second_temp = input_second + ((unsigned long long)i * srcDescPtr->strides.nStride);
+
+        Rpp8u *ip_image = image.data;
+        Rpp8u *ip_image_second = image_second.data;
+
+        Rpp32u elementsInRow = roiTensorPtrSrc[i].xywhROI.roiWidth * ip_channel;
+
+        // Source rows are tightly packed, destination rows are hStride apart (padded width)
+        for (j = 0; j < roiTensorPtrSrc[i].xywhROI.roiHeight; j++)
+        {
+            memcpy(input_temp, ip_image, elementsInRow * sizeof (Rpp8u));
+            memcpy(input_second_temp, ip_image_second, elementsInRow * sizeof (Rpp8u));
+            ip_image += elementsInRow;
+            ip_image_second += elementsInRow;
+            input_temp += srcDescPtr->strides.hStride;
+            input_second_temp += srcDescPtr->strides.hStride;
+        }
+        i++;
+    }
+    closedir(dr);
+
+    if (status != 0)
+    {
+        freeHostBuffers();
+        return -1;
+    }
+
+    // Convert inputs to test various other bit depths
+    // The index is 64-bit to match ioBufferSize
+
+    if (ip_bitDepth == 1)
+    {
+        for (unsigned long long k = 0; k < ioBufferSize; k++)
+        {
+            inputf16[k] = ((Rpp16f)input[k]) / 255.0;
+            inputf16_second[k] = ((Rpp16f)input_second[k]) / 255.0;
+        }
+    }
+    else if (ip_bitDepth == 2)
+    {
+        for (unsigned long long k = 0; k < ioBufferSize; k++)
+        {
+            inputf32[k] = ((Rpp32f)input[k]) / 255.0;
+            inputf32_second[k] = ((Rpp32f)input_second[k]) / 255.0;
+        }
+    }
+    else if (ip_bitDepth == 5)
+    {
+        for (unsigned long long k = 0; k < ioBufferSize; k++)
+        {
+            inputi8[k] = (Rpp8s) (((Rpp32s) input[k]) - 128);
+            inputi8_second[k] = (Rpp8s) (((Rpp32s) input_second[k]) - 128);
+        }
+    }
+
+    // Run case-wise RPP API and measure time
+
+    rppHandle_t handle;
+    rppCreateWithBatchSize(&handle, noOfImages);
+
+    double max_time_used = 0, min_time_used = 500, avg_time_used = 0;
+
+    printf("\nRunning %s 100 times (each time with a batch size of %d images) and computing mean statistics...", func, noOfImages);
+
+    for (int perfRunCount = 0; perfRunCount < 100; perfRunCount++)
+    {
+        clock_t start = 0, end = 0;
+        double start_omp = 0, end_omp = 0;
+        double cpu_time_used, omp_time_used;
+        switch (test_case)
+        {
+        case 0:
+        {
+            Rpp32f alpha[images];
+            Rpp32f beta[images];
+            for (i = 0; i < images; i++)
+            {
+                alpha[i] = 1.75;
+                beta[i] = 50;
+
+                // xywhROI override sample
+                // roiTensorPtrSrc[i].xywhROI.xy.x = 0;
+                // roiTensorPtrSrc[i].xywhROI.xy.y = 0;
+                // roiTensorPtrSrc[i].xywhROI.roiWidth = 100;
+                // roiTensorPtrSrc[i].xywhROI.roiHeight = 180;
+
+                // ltrbROI override sample
+                // roiTensorPtrSrc[i].ltrbROI.lt.x = 50;
+                // roiTensorPtrSrc[i].ltrbROI.lt.y = 50;
+                // roiTensorPtrSrc[i].ltrbROI.rb.x = 199;
+                // roiTensorPtrSrc[i].ltrbROI.rb.y = 149;
+            }
+
+            // Change RpptRoiType for ltrbROI override sample
+            // roiTypeSrc = RpptRoiType::LTRB;
+            // roiTypeDst = RpptRoiType::LTRB;
+
+            // Bit depths 3, 4 and 6 (mixed src/dst types) have no brightness variant here
+            start_omp = omp_get_wtime();
+            start = clock();
+            if (ip_bitDepth == 0)
+                rppt_brightness_host(input, srcDescPtr, output, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 1)
+                rppt_brightness_host(inputf16, srcDescPtr, outputf16, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 2)
+                rppt_brightness_host(inputf32, srcDescPtr, outputf32, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 3)
+                missingFuncFlag = 1;
+            else if (ip_bitDepth == 4)
+                missingFuncFlag = 1;
+            else if (ip_bitDepth == 5)
+                rppt_brightness_host(inputi8, srcDescPtr, outputi8, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 6)
+                missingFuncFlag = 1;
+            else
+                missingFuncFlag = 1;
+            end = clock();
+            end_omp = omp_get_wtime();
+
+            break;
+        }
+        default:
+            missingFuncFlag = 1;
+            break;
+        }
+
+        if (missingFuncFlag == 1)
+        {
+            // Leave through the common cleanup below instead of returning with everything still allocated
+            printf("\nThe functionality %s doesn't yet exist in RPP\n", func);
+            status = -1;
+            break;
+        }
+
+        // The printed statistics use the clock() measurement; the omp_get_wtime() interval is
+        // computed alongside it but is not reported
+        cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
+        omp_time_used = end_omp - start_omp;
+        if (cpu_time_used > max_time_used)
+            max_time_used = cpu_time_used;
+        if (cpu_time_used < min_time_used)
+            min_time_used = cpu_time_used;
+        avg_time_used += cpu_time_used;
+    }
+
+    if (status == 0)
+    {
+        avg_time_used /= 100;
+
+        // Display measured times
+
+        cout << fixed << "\nmax,min,avg = " << max_time_used << "," << min_time_used << "," << avg_time_used << endl;
+    }
+
+    rppDestroyHost(handle);
+
+    // Free memory
+
+    freeHostBuffers();
+
+    return status;
+}
diff --git a/utilities/rpp-performancetests/HOST_NEW/Tensor_host_pln1.cpp b/utilities/rpp-performancetests/HOST_NEW/Tensor_host_pln1.cpp
new file mode 100644
index 000000000..cb5c9801d
--- /dev/null
+++
b/utilities/rpp-performancetests/HOST_NEW/Tensor_host_pln1.cpp
@@ -0,0 +1,517 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "/opt/rocm/rpp/include/rpp.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "helpers/testSuite_helper.hpp"
+
+using namespace cv;
+using namespace std;
+using half_float::half;
+
+typedef half Rpp16f;
+
+#define RPPPIXELCHECK(pixel) (pixel < (Rpp32f)0) ? ((Rpp32f)0) : ((pixel < (Rpp32f)255) ? pixel : ((Rpp32f)255))
+// Macro arguments are parenthesized so that compound expressions expand safely
+#define RPPMAX2(a,b) (((a) > (b)) ? (a) : (b))
+#define RPPMIN2(a,b) (((a) < (b)) ? (a) : (b))
+
+// Performance test driver for the RPP tensor API on the HOST backend with planar 1-channel (NCHW) inputs.
+//
+// Arguments (all required):
+//   argv[1] - folder holding the first set of input images
+//   argv[2] - folder holding the second set of input images (looked up by the file names found in argv[1])
+//   argv[3] - bit depth selector (u8 = 0 / f16 = 1 / f32 = 2 / u8->f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6)
+//   argv[4] - outputFormatToggle, must be 0 for PLN1
+//   argv[5] - test case number (only case 0, brightness, is implemented in this file)
+//   argv[6] - verbosity (1 echoes the inputs)
+//
+// Every file in argv[1] is loaded as grayscale into one batch, the selected case is run 100 times
+// and the max/min/avg of the per-run clock() measurements are printed.
+// Returns 0 on success, -1 on bad usage, unreadable inputs, allocation failure or when the
+// requested case / bit depth is not implemented.
+int main(int argc, char **argv)
+{
+    // Handle inputs
+
+    const int MIN_ARG_COUNT = 7;
+
+    if (argc < MIN_ARG_COUNT)
+    {
+        printf("\nImproper Usage! Needs all arguments!\n");
+        printf("\nUsage: ./Tensor_host_pln1 <src1 folder> <src2 folder> <u8 = 0 / f16 = 1 / f32 = 2 / u8->f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> <outputFormatToggle (0 only for PLN1)> <case number> <verbosity = 0/1>\n");
+        return -1;
+    }
+    if (atoi(argv[4]) != 0)
+    {
+        printf("\nPLN1 cases don't have outputFormatToggle! Please input outputFormatToggle = 0\n");
+        return -1;
+    }
+
+    if (atoi(argv[6]) == 1)
+    {
+        printf("\nInputs for this test case are:");
+        printf("\nsrc1 = %s", argv[1]);
+        printf("\nsrc2 = %s", argv[2]);
+        printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[3]);
+        printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[4]);
+        printf("\ncase number (1:7) = %s", argv[5]);
+    }
+
+    char *src = argv[1];
+    char *src_second = argv[2];
+    int ip_bitDepth = atoi(argv[3]);
+    int test_case = atoi(argv[5]);
+
+    int ip_channel = 1;
+
+    // Set case names
+
+    char funcType[1000] = {"Tensor_HOST_PLN1_toPLN1"};
+
+    // Zero-initialized so that the strcat() calls below stay defined for an unknown case number
+    char funcName[1000] = {0};
+    switch (test_case)
+    {
+    case 0:
+        strcpy(funcName, "brightness");
+        break;
+    default:
+        // Unknown cases are reported as missing functionality by the run loop below
+        break;
+    }
+
+    // Initialize tensor descriptors
+
+    RpptDesc srcDesc, dstDesc;
+    RpptDescPtr srcDescPtr, dstDescPtr;
+    srcDescPtr = &srcDesc;
+    dstDescPtr = &dstDesc;
+
+    // Set src/dst layouts in tensor descriptors
+
+    srcDescPtr->layout = RpptLayout::NCHW;
+    dstDescPtr->layout = RpptLayout::NCHW;
+
+    // Set src/dst data types in tensor descriptors
+
+    if (ip_bitDepth == 0)
+    {
+        strcat(funcName, "_u8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::U8;
+    }
+    else if (ip_bitDepth == 1)
+    {
+        strcat(funcName, "_f16_");
+        srcDescPtr->dataType = RpptDataType::F16;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 2)
+    {
+        strcat(funcName, "_f32_");
+        srcDescPtr->dataType = RpptDataType::F32;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 3)
+    {
+        strcat(funcName, "_u8_f16_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 4)
+    {
+        strcat(funcName, "_u8_f32_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 5)
+    {
+        strcat(funcName, "_i8_");
+        srcDescPtr->dataType = RpptDataType::I8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+    else if (ip_bitDepth == 6)
+    {
+        strcat(funcName, "_u8_i8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+
+    // Other initializations
+
+    int status = 0;
+    int missingFuncFlag = 0;
+    int i = 0, j = 0;
+    int maxHeight = 0, maxWidth = 0;
+    int maxDstHeight = 0, maxDstWidth = 0;
+    unsigned long long count = 0;
+    unsigned long long ioBufferSize = 0;
+    unsigned long long oBufferSize = 0;
+    int noOfImages = 0;
+    Mat image, image_second;
+
+    // Full name used in the log messages: <case>_<bit depth>_<layout conversion>
+
+    char func[1000];
+    strcpy(func, funcName);
+    strcat(func, funcType);
+
+    // Get number of images
+    // The directory stream is opened once, checked, and rewound for the later passes
+
+    struct dirent *de;
+    DIR *dr = opendir(src);
+    if (dr == NULL)
+    {
+        printf("\nUnable to open the src1 folder %s\n", src);
+        return -1;
+    }
+    while ((de = readdir(dr)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        noOfImages += 1;
+    }
+    if (noOfImages == 0)
+    {
+        printf("\nNo images found in the src1 folder %s\n", src);
+        closedir(dr);
+        return -1;
+    }
+
+    // Initialize ROI tensors for src/dst
+
+    RpptROI *roiTensorPtrSrc = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+    RpptROI *roiTensorPtrDst = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+    if (roiTensorPtrSrc == NULL || roiTensorPtrDst == NULL)
+    {
+        printf("\nUnable to allocate the ROI tensors\n");
+        free(roiTensorPtrSrc);
+        free(roiTensorPtrDst);
+        closedir(dr);
+        return -1;
+    }
+
+    // Set ROI tensors types for src/dst
+
+    RpptRoiType roiTypeSrc, roiTypeDst;
+    roiTypeSrc = RpptRoiType::XYWH;
+    roiTypeDst = RpptRoiType::XYWH;
+
+    // Set maxHeight, maxWidth and ROIs for src/dst
+
+    const int images = noOfImages;
+
+    rewinddir(dr);
+    // count is bounded so that entries added to the folder after the counting pass can't overrun the ROI tensors
+    while ((count < (unsigned long long)images) && ((de = readdir(dr)) != NULL))
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+
+        // Bounded path build; an over-long path is truncated and then simply fails to load
+        char temp[1000];
+        snprintf(temp, sizeof(temp), "%s/%s", src, de->d_name);
+
+        image = imread(temp, 0);
+
+        roiTensorPtrSrc[count].xywhROI.xy.x = 0;
+        roiTensorPtrSrc[count].xywhROI.xy.y = 0;
+        roiTensorPtrSrc[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrSrc[count].xywhROI.roiHeight = image.rows;
+
+        roiTensorPtrDst[count].xywhROI.xy.x = 0;
+        roiTensorPtrDst[count].xywhROI.xy.y = 0;
+        roiTensorPtrDst[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrDst[count].xywhROI.roiHeight = image.rows;
+
+        maxHeight = RPPMAX2(maxHeight, roiTensorPtrSrc[count].xywhROI.roiHeight);
+        maxWidth = RPPMAX2(maxWidth, roiTensorPtrSrc[count].xywhROI.roiWidth);
+        maxDstHeight = RPPMAX2(maxDstHeight, roiTensorPtrDst[count].xywhROI.roiHeight);
+        maxDstWidth = RPPMAX2(maxDstWidth, roiTensorPtrDst[count].xywhROI.roiWidth);
+
+        count++;
+    }
+
+    if (maxHeight == 0 || maxWidth == 0)
+    {
+        // Nothing decoded, so the batch buffers below would have zero size
+        printf("\nNo readable images found in the src1 folder %s\n", src);
+        free(roiTensorPtrSrc);
+        free(roiTensorPtrDst);
+        closedir(dr);
+        return -1;
+    }
+
+    // Set numDims, offset, n/c/h/w values for src/dst
+
+    srcDescPtr->numDims = 4;
+    dstDescPtr->numDims = 4;
+
+    srcDescPtr->offset = 0;
+    dstDescPtr->offset = 0;
+
+    srcDescPtr->n = noOfImages;
+    srcDescPtr->c = ip_channel;
+    srcDescPtr->h = maxHeight;
+    srcDescPtr->w = maxWidth;
+
+    dstDescPtr->n = noOfImages;
+    dstDescPtr->c = ip_channel;
+    dstDescPtr->h = maxDstHeight;
+    dstDescPtr->w = maxDstWidth;
+
+    // Pad w up to a multiple of 8 for src/dst (a full extra 8 is added when w is already a multiple of 8)
+
+    srcDescPtr->w = ((srcDescPtr->w / 8) * 8) + 8;
+    dstDescPtr->w = ((dstDescPtr->w / 8) * 8) + 8;
+
+    // Set n/c/h/w strides for src/dst
+
+    srcDescPtr->strides.nStride = ip_channel * srcDescPtr->w * srcDescPtr->h;
+    srcDescPtr->strides.cStride = srcDescPtr->w * srcDescPtr->h;
+    srcDescPtr->strides.hStride = srcDescPtr->w;
+    srcDescPtr->strides.wStride = 1;
+
+    if (dstDescPtr->layout == RpptLayout::NHWC)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = ip_channel * dstDescPtr->w;
+        dstDescPtr->strides.wStride = ip_channel;
+        dstDescPtr->strides.cStride = 1;
+    }
+    else if (dstDescPtr->layout == RpptLayout::NCHW)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.cStride = dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = dstDescPtr->w;
+        dstDescPtr->strides.wStride = 1;
+    }
+
+    // Set buffer sizes for src/dst
+
+    ioBufferSize = (unsigned long long)srcDescPtr->h * (unsigned long long)srcDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+    oBufferSize = (unsigned long long)dstDescPtr->h * (unsigned long long)dstDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+
+    // Initialize host buffers for src/dst
+    // Inputs are sized from the src descriptor, every output from the dst descriptor
+
+    Rpp8u *input = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *input_second = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *output = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u));
+
+    Rpp16f *inputf16 = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f));
+    Rpp16f *inputf16_second = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f));
+    Rpp16f *outputf16 = (Rpp16f *)calloc(oBufferSize, sizeof(Rpp16f));
+
+    Rpp32f *inputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+    Rpp32f *inputf32_second = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+    Rpp32f *outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f));
+
+    Rpp8s *inputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+    Rpp8s *inputi8_second = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+    Rpp8s *outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s));
+
+    // Single release point shared by the failure paths and the normal exit (free(NULL) is a no-op)
+    auto freeHostBuffers = [&]()
+    {
+        free(roiTensorPtrSrc);
+        free(roiTensorPtrDst);
+        free(input);
+        free(input_second);
+        free(output);
+        free(inputf16);
+        free(inputf16_second);
+        free(outputf16);
+        free(inputf32);
+        free(inputf32_second);
+        free(outputf32);
+        free(inputi8);
+        free(inputi8_second);
+        free(outputi8);
+    };
+
+    if (input == NULL || input_second == NULL || output == NULL ||
+        inputf16 == NULL || inputf16_second == NULL || outputf16 == NULL ||
+        inputf32 == NULL || inputf32_second == NULL || outputf32 == NULL ||
+        inputi8 == NULL || inputi8_second == NULL || outputi8 == NULL)
+    {
+        printf("\nUnable to allocate the host buffers\n");
+        closedir(dr);
+        freeHostBuffers();
+        return -1;
+    }
+
+    // Set 8u host buffers for src/dst
+
+    rewinddir(dr);
+    i = 0;
+
+    Rpp32u elementsInRowMax = srcDescPtr->w * ip_channel;
+
+    while ((i < images) && ((de = readdir(dr)) != NULL))
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+
+        char temp[1000];
+        snprintf(temp, sizeof(temp), "%s/%s", src, de->d_name);
+
+        char temp_second[1000];
+        snprintf(temp_second, sizeof(temp_second), "%s/%s", src_second, de->d_name);
+
+        image = imread(temp, 0);
+        image_second = imread(temp_second, 0);
+
+        // The row copies below read roiHeight rows of roiWidth pixels from both images,
+        // so both must have exactly the dimensions recorded in the ROI pass
+        if (image.rows != roiTensorPtrSrc[i].xywhROI.roiHeight || image.cols != roiTensorPtrSrc[i].xywhROI.roiWidth ||
+            image_second.rows != image.rows || image_second.cols != image.cols)
+        {
+            printf("\nImage %s could not be loaded with matching dimensions from src1 and src2\n", de->d_name);
+            status = -1;
+            break;
+        }
+
+        // 64-bit offset so that large batches don't wrap the 32-bit stride product
+        Rpp8u *input_temp = input + ((unsigned long long)i * srcDescPtr->strides.nStride);
+        Rpp8u *input_second_temp = input_second + ((unsigned long long)i * srcDescPtr->strides.nStride);
+
+        Rpp8u *ip_image = image.data;
+        Rpp8u *ip_image_second = image_second.data;
+
+        Rpp32u elementsInRow = roiTensorPtrSrc[i].xywhROI.roiWidth * ip_channel;
+
+        // Source rows are tightly packed, destination rows are elementsInRowMax apart (padded width)
+        for (j = 0; j < roiTensorPtrSrc[i].xywhROI.roiHeight; j++)
+        {
+            memcpy(input_temp, ip_image, elementsInRow * sizeof (Rpp8u));
+            memcpy(input_second_temp, ip_image_second, elementsInRow * sizeof (Rpp8u));
+            ip_image += elementsInRow;
+            ip_image_second += elementsInRow;
+            input_temp += elementsInRowMax;
+            input_second_temp += elementsInRowMax;
+        }
+        i++;
+    }
+    closedir(dr);
+
+    if (status != 0)
+    {
+        freeHostBuffers();
+        return -1;
+    }
+
+    // Convert inputs to test various other bit depths
+    // The index is 64-bit to match ioBufferSize
+
+    if (ip_bitDepth == 1)
+    {
+        for (unsigned long long k = 0; k < ioBufferSize; k++)
+        {
+            inputf16[k] = ((Rpp16f)input[k]) / 255.0;
+            inputf16_second[k] = ((Rpp16f)input_second[k]) / 255.0;
+        }
+    }
+    else if (ip_bitDepth == 2)
+    {
+        for (unsigned long long k = 0; k < ioBufferSize; k++)
+        {
+            inputf32[k] = ((Rpp32f)input[k]) / 255.0;
+            inputf32_second[k] = ((Rpp32f)input_second[k]) / 255.0;
+        }
+    }
+    else if (ip_bitDepth == 5)
+    {
+        for (unsigned long long k = 0; k < ioBufferSize; k++)
+        {
+            inputi8[k] = (Rpp8s) (((Rpp32s) input[k]) - 128);
+            inputi8_second[k] = (Rpp8s) (((Rpp32s) input_second[k]) - 128);
+        }
+    }
+
+    // Run case-wise RPP API and measure time
+
+    rppHandle_t handle;
+    rppCreateWithBatchSize(&handle, noOfImages);
+
+    double max_time_used = 0, min_time_used = 500, avg_time_used = 0;
+
+    printf("\nRunning %s 100 times (each time with a batch size of %d images) and computing mean statistics...", func, noOfImages);
+
+    for (int perfRunCount = 0; perfRunCount < 100; perfRunCount++)
+    {
+        clock_t start = 0, end = 0;
+        double start_omp = 0, end_omp = 0;
+        double cpu_time_used, omp_time_used;
+        switch (test_case)
+        {
+        case 0:
+        {
+            Rpp32f alpha[images];
+            Rpp32f beta[images];
+            for (i = 0; i < images; i++)
+            {
+                alpha[i] = 1.75;
+                beta[i] = 50;
+
+                // xywhROI override sample
+                // roiTensorPtrSrc[i].xywhROI.xy.x = 0;
+                // roiTensorPtrSrc[i].xywhROI.xy.y = 0;
+                // roiTensorPtrSrc[i].xywhROI.roiWidth = 100;
+                // roiTensorPtrSrc[i].xywhROI.roiHeight = 180;
+
+                // ltrbROI override sample
+                // roiTensorPtrSrc[i].ltrbROI.lt.x = 50;
+                // roiTensorPtrSrc[i].ltrbROI.lt.y = 50;
+                // roiTensorPtrSrc[i].ltrbROI.rb.x = 199;
+                // roiTensorPtrSrc[i].ltrbROI.rb.y = 149;
+            }
+
+            // Change RpptRoiType for ltrbROI override sample
+            // roiTypeSrc = RpptRoiType::LTRB;
+            // roiTypeDst = RpptRoiType::LTRB;
+
+            // Bit depths 3, 4 and 6 (mixed src/dst types) have no brightness variant here
+            start_omp = omp_get_wtime();
+            start = clock();
+            if (ip_bitDepth == 0)
+                rppt_brightness_host(input, srcDescPtr, output, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 1)
+                rppt_brightness_host(inputf16, srcDescPtr, outputf16, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 2)
+                rppt_brightness_host(inputf32, srcDescPtr, outputf32, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 3)
+                missingFuncFlag = 1;
+            else if (ip_bitDepth == 4)
+                missingFuncFlag = 1;
+            else if (ip_bitDepth == 5)
+                rppt_brightness_host(inputi8, srcDescPtr, outputi8, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 6)
+                missingFuncFlag = 1;
+            else
+                missingFuncFlag = 1;
+            end = clock();
+            end_omp = omp_get_wtime();
+
+            break;
+        }
+        default:
+            missingFuncFlag = 1;
+            break;
+        }
+
+        if (missingFuncFlag == 1)
+        {
+            // Leave through the common cleanup below instead of returning with everything still allocated
+            printf("\nThe functionality %s doesn't yet exist in RPP\n", func);
+            status = -1;
+            break;
+        }
+
+        // The printed statistics use the clock() measurement; the omp_get_wtime() interval is
+        // computed alongside it but is not reported
+        cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
+        omp_time_used = end_omp - start_omp;
+        if (cpu_time_used > max_time_used)
+            max_time_used = cpu_time_used;
+        if (cpu_time_used < min_time_used)
+            min_time_used = cpu_time_used;
+        avg_time_used += cpu_time_used;
+    }
+
+    if (status == 0)
+    {
+        avg_time_used /= 100;
+
+        // Display measured times
+
+        cout << fixed << "\nmax,min,avg = " << max_time_used << "," << min_time_used << "," << avg_time_used << endl;
+    }
+
+    rppDestroyHost(handle);
+
+    // Free memory
+
+    freeHostBuffers();
+
+    return status;
+}
diff --git a/utilities/rpp-performancetests/HOST_NEW/Tensor_host_pln3.cpp b/utilities/rpp-performancetests/HOST_NEW/Tensor_host_pln3.cpp
new file mode 100644
index 000000000..3bdd5fdd0
--- /dev/null
+++ b/utilities/rpp-performancetests/HOST_NEW/Tensor_host_pln3.cpp
@@ -0,0 +1,595 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "/opt/rocm/rpp/include/rpp.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+using namespace cv;
+using namespace std;
+using half_float::half;
+
+typedef half Rpp16f;
+
+#define RPPPIXELCHECK(pixel) (pixel < (Rpp32f)0) ? ((Rpp32f)0) : ((pixel < (Rpp32f)255) ? pixel : ((Rpp32f)255))
+#define RPPMAX2(a,b) ((a > b) ? a : b)
+#define RPPMIN2(a,b) ((a < b) ? a : b)
+
+int main(int argc, char **argv)
+{
+    // Handle inputs
+
+    const int MIN_ARG_COUNT = 7;
+
+    if (argc < MIN_ARG_COUNT)
+    {
+        printf("\nImproper Usage!
Needs all arguments!\n"); + printf("\nUsage: ./Tensor_host_pln3 f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> pkd = 0 / pkd->pln = 1)> \n"); + return -1; + } + + if (atoi(argv[6]) == 1) + { + printf("\nInputs for this test case are:"); + printf("\nsrc1 = %s", argv[1]); + printf("\nsrc2 = %s", argv[2]); + printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[3]); + printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[4]); + printf("\ncase number (1:7) = %s", argv[5]); + } + + char *src = argv[1]; + char *src_second = argv[2]; + int ip_bitDepth = atoi(argv[3]); + unsigned int outputFormatToggle = atoi(argv[4]); + int test_case = atoi(argv[5]); + + int ip_channel = 3; + + // Set case names + + char funcType[1000] = {"Tensor_HOST_PLN3"}; + + char funcName[1000]; + switch (test_case) + { + case 0: + strcpy(funcName, "brightness"); + break; + } + + // Initialize tensor descriptors + + RpptDesc srcDesc, dstDesc; + RpptDescPtr srcDescPtr, dstDescPtr; + srcDescPtr = &srcDesc; + dstDescPtr = &dstDesc; + + // Set src/dst layouts in tensor descriptors + + if (outputFormatToggle == 0) + { + strcat(funcType, "_toPLN3"); + srcDescPtr->layout = RpptLayout::NCHW; + dstDescPtr->layout = RpptLayout::NCHW; + } + else if (outputFormatToggle == 1) + { + strcat(funcType, "_toPKD3"); + srcDescPtr->layout = RpptLayout::NCHW; + dstDescPtr->layout = RpptLayout::NHWC; + } + + // Set src/dst data types in tensor descriptors + + if (ip_bitDepth == 0) + { + strcat(funcName, "_u8_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::U8; + } + else if (ip_bitDepth == 1) + { + strcat(funcName, "_f16_"); + srcDescPtr->dataType = RpptDataType::F16; + dstDescPtr->dataType = RpptDataType::F16; + } + else if (ip_bitDepth == 2) + { + strcat(funcName, "_f32_"); + srcDescPtr->dataType = RpptDataType::F32; + dstDescPtr->dataType = RpptDataType::F32; + } + else if (ip_bitDepth == 3) + { + strcat(funcName, "_u8_f16_"); + 
srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::F16; + } + else if (ip_bitDepth == 4) + { + strcat(funcName, "_u8_f32_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::F32; + } + else if (ip_bitDepth == 5) + { + strcat(funcName, "_i8_"); + srcDescPtr->dataType = RpptDataType::I8; + dstDescPtr->dataType = RpptDataType::I8; + } + else if (ip_bitDepth == 6) + { + strcat(funcName, "_u8_i8_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::I8; + } + + // Other initializations + + int missingFuncFlag = 0; + int i = 0, j = 0; + int maxHeight = 0, maxWidth = 0; + int maxDstHeight = 0, maxDstWidth = 0; + unsigned long long count = 0; + unsigned long long ioBufferSize = 0; + unsigned long long oBufferSize = 0; + static int noOfImages = 0; + Mat image, image_second; + + // String ops on function name + + char func[1000]; + strcpy(func, funcName); + strcat(func, funcType); + + char src1[1000]; + strcpy(src1, src); + strcat(src1, "/"); + + char src1_second[1000]; + strcpy(src1_second, src_second); + strcat(src1_second, "/"); + + strcat(funcName, funcType); + + // Get number of images + + struct dirent *de; + DIR *dr = opendir(src); + while ((de = readdir(dr)) != NULL) + { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + noOfImages += 1; + } + closedir(dr); + + // Initialize ROI tensors for src/dst + + RpptROI *roiTensorPtrSrc = (RpptROI *) calloc(noOfImages, sizeof(RpptROI)); + RpptROI *roiTensorPtrDst = (RpptROI *) calloc(noOfImages, sizeof(RpptROI)); + + // Set ROI tensors types for src/dst + + RpptRoiType roiTypeSrc, roiTypeDst; + roiTypeSrc = RpptRoiType::XYWH; + roiTypeDst = RpptRoiType::XYWH; + + // Set maxHeight, maxWidth and ROIs for src/dst + + const int images = noOfImages; + char imageNames[images][1000]; + + DIR *dr1 = opendir(src); + while ((de = readdir(dr1)) != NULL) + { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, 
"..") == 0) + continue; + strcpy(imageNames[count], de->d_name); + char temp[1000]; + strcpy(temp, src1); + strcat(temp, imageNames[count]); + + image = imread(temp, 1); + + roiTensorPtrSrc[count].xywhROI.xy.x = 0; + roiTensorPtrSrc[count].xywhROI.xy.y = 0; + roiTensorPtrSrc[count].xywhROI.roiWidth = image.cols; + roiTensorPtrSrc[count].xywhROI.roiHeight = image.rows; + + roiTensorPtrDst[count].xywhROI.xy.x = 0; + roiTensorPtrDst[count].xywhROI.xy.y = 0; + roiTensorPtrDst[count].xywhROI.roiWidth = image.cols; + roiTensorPtrDst[count].xywhROI.roiHeight = image.rows; + + maxHeight = RPPMAX2(maxHeight, roiTensorPtrSrc[count].xywhROI.roiHeight); + maxWidth = RPPMAX2(maxWidth, roiTensorPtrSrc[count].xywhROI.roiWidth); + maxDstHeight = RPPMAX2(maxDstHeight, roiTensorPtrDst[count].xywhROI.roiHeight); + maxDstWidth = RPPMAX2(maxDstWidth, roiTensorPtrDst[count].xywhROI.roiWidth); + + count++; + } + closedir(dr1); + + // Set numDims, offset, n/c/h/w values for src/dst + + srcDescPtr->numDims = 4; + dstDescPtr->numDims = 4; + + srcDescPtr->offset = 0; + dstDescPtr->offset = 0; + + srcDescPtr->n = noOfImages; + srcDescPtr->c = ip_channel; + srcDescPtr->h = maxHeight; + srcDescPtr->w = maxWidth; + + dstDescPtr->n = noOfImages; + dstDescPtr->c = ip_channel; + dstDescPtr->h = maxDstHeight; + dstDescPtr->w = maxDstWidth; + + // Optionally set w stride as a multiple of 8 for src/dst + + srcDescPtr->w = ((srcDescPtr->w / 8) * 8) + 8; + dstDescPtr->w = ((dstDescPtr->w / 8) * 8) + 8; + + // Set n/c/h/w strides for src/dst + + srcDescPtr->strides.nStride = ip_channel * srcDescPtr->w * srcDescPtr->h; + srcDescPtr->strides.cStride = srcDescPtr->w * srcDescPtr->h; + srcDescPtr->strides.hStride = srcDescPtr->w; + srcDescPtr->strides.wStride = 1; + + if (dstDescPtr->layout == RpptLayout::NHWC) + { + dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.hStride = ip_channel * dstDescPtr->w; + dstDescPtr->strides.wStride = ip_channel; + 
dstDescPtr->strides.cStride = 1; + } + else if (dstDescPtr->layout == RpptLayout::NCHW) + { + dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.cStride = dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.hStride = dstDescPtr->w; + dstDescPtr->strides.wStride = 1; + } + + // Set buffer sizes for src/dst (element counts: ioBufferSize from srcDescPtr, oBufferSize from dstDescPtr) + + ioBufferSize = (unsigned long long)srcDescPtr->h * (unsigned long long)srcDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages; + oBufferSize = (unsigned long long)dstDescPtr->h * (unsigned long long)dstDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages; + + // Initialize host buffers for src/dst (every input uses ioBufferSize, every output uses oBufferSize) + + Rpp8u *input = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + Rpp8u *input_second = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + Rpp8u *output = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u)); + + Rpp16f *inputf16 = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f)); + Rpp16f *inputf16_second = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f)); + Rpp16f *outputf16 = (Rpp16f *)calloc(oBufferSize, sizeof(Rpp16f)); + + Rpp32f *inputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f)); + Rpp32f *inputf32_second = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f)); + Rpp32f *outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f)); + + Rpp8s *inputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s)); + Rpp8s *inputi8_second = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s)); + Rpp8s *outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s)); + + // Set 8u host buffers for src/dst + + DIR *dr2 = opendir(src); + DIR *dr2_second = opendir(src_second); + count = 0; + i = 0; + + Rpp32u elementsInRowMax = srcDescPtr->w * ip_channel; + + while ((de = readdir(dr2)) != NULL) + { + Rpp8u *input_temp, *input_second_temp; + input_temp = input + (i * srcDescPtr->strides.nStride); + input_second_temp = input_second + (i * srcDescPtr->strides.nStride); + + if (strcmp(de->d_name, ".") == 0 || 
strcmp(de->d_name, "..") == 0) + continue; + + char temp[1000]; + strcpy(temp, src1); + strcat(temp, de->d_name); + + char temp_second[1000]; + strcpy(temp_second, src1_second); + strcat(temp_second, de->d_name); + + image = imread(temp, 1); + image_second = imread(temp_second, 1); + + Rpp8u *ip_image = image.data; + Rpp8u *ip_image_second = image_second.data; + + Rpp32u elementsInRow = roiTensorPtrSrc[i].xywhROI.roiWidth * ip_channel; + + for (j = 0; j < roiTensorPtrSrc[i].xywhROI.roiHeight; j++) + { + memcpy(input_temp, ip_image, elementsInRow * sizeof (Rpp8u)); + memcpy(input_second_temp, ip_image_second, elementsInRow * sizeof (Rpp8u)); + ip_image += elementsInRow; + ip_image_second += elementsInRow; + input_temp += elementsInRowMax; + input_second_temp += elementsInRowMax; + } + i++; + count += srcDescPtr->strides.nStride; + } + closedir(dr2); + + // Convert default OpenCV PKD3 to PLN3 for first input batch + + Rpp8u *inputCopy = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + memcpy(inputCopy, input, ioBufferSize * sizeof(Rpp8u)); + + Rpp8u *inputTemp, *inputCopyTemp; + inputTemp = input; + inputCopyTemp = inputCopy; + + for (int count = 0; count < noOfImages; count++) + { + Rpp8u *inputTempR, *inputTempG, *inputTempB; + inputTempR = inputTemp; + inputTempG = inputTempR + srcDescPtr->strides.cStride; + inputTempB = inputTempG + srcDescPtr->strides.cStride; + + for (int i = 0; i < srcDescPtr->h; i++) + { + for (int j = 0; j < srcDescPtr->w; j++) + { + *inputTempR = *inputCopyTemp; + inputCopyTemp++; + inputTempR++; + *inputTempG = *inputCopyTemp; + inputCopyTemp++; + inputTempG++; + *inputTempB = *inputCopyTemp; + inputCopyTemp++; + inputTempB++; + } + } + + inputTemp += srcDescPtr->strides.nStride; + } + + free(inputCopy); + + // Convert default OpenCV PKD3 to PLN3 for second input batch + + Rpp8u *inputSecondCopy = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + memcpy(inputSecondCopy, input_second, ioBufferSize * sizeof(Rpp8u)); + + Rpp8u *inputSecondTemp, 
*inputSecondCopyTemp; + inputSecondTemp = input_second; + inputSecondCopyTemp = inputSecondCopy; + + for (int count = 0; count < noOfImages; count++) + { + Rpp8u *inputSecondTempR, *inputSecondTempG, *inputSecondTempB; + inputSecondTempR = inputSecondTemp; + inputSecondTempG = inputSecondTempR + srcDescPtr->strides.cStride; + inputSecondTempB = inputSecondTempG + srcDescPtr->strides.cStride; + + for (int i = 0; i < srcDescPtr->h; i++) + { + for (int j = 0; j < srcDescPtr->w; j++) + { + *inputSecondTempR = *inputSecondCopyTemp; + inputSecondCopyTemp++; + inputSecondTempR++; + *inputSecondTempG = *inputSecondCopyTemp; + inputSecondCopyTemp++; + inputSecondTempG++; + *inputSecondTempB = *inputSecondCopyTemp; + inputSecondCopyTemp++; + inputSecondTempB++; + } + } + + inputSecondTemp += srcDescPtr->strides.nStride; + } + + free(inputSecondCopy); + + // Convert inputs to test various other bit depths + + if (ip_bitDepth == 1) + { + Rpp8u *inputTemp, *input_secondTemp; + Rpp16f *inputf16Temp, *inputf16_secondTemp; + + inputTemp = input; + input_secondTemp = input_second; + + inputf16Temp = inputf16; + inputf16_secondTemp = inputf16_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputf16Temp = ((Rpp16f)*inputTemp) / 255.0; + *inputf16_secondTemp = ((Rpp16f)*input_secondTemp) / 255.0; + inputTemp++; + inputf16Temp++; + input_secondTemp++; + inputf16_secondTemp++; + } + } + else if (ip_bitDepth == 2) + { + Rpp8u *inputTemp, *input_secondTemp; + Rpp32f *inputf32Temp, *inputf32_secondTemp; + + inputTemp = input; + input_secondTemp = input_second; + + inputf32Temp = inputf32; + inputf32_secondTemp = inputf32_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputf32Temp = ((Rpp32f)*inputTemp) / 255.0; + *inputf32_secondTemp = ((Rpp32f)*input_secondTemp) / 255.0; + inputTemp++; + inputf32Temp++; + input_secondTemp++; + inputf32_secondTemp++; + } + } + else if (ip_bitDepth == 5) + { + Rpp8u *inputTemp, *input_secondTemp; + Rpp8s *inputi8Temp, 
*inputi8_secondTemp; + + inputTemp = input; + input_secondTemp = input_second; + + inputi8Temp = inputi8; + inputi8_secondTemp = inputi8_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputi8Temp = (Rpp8s) (((Rpp32s) *inputTemp) - 128); + *inputi8_secondTemp = (Rpp8s) (((Rpp32s) *input_secondTemp) - 128); + inputTemp++; + inputi8Temp++; + input_secondTemp++; + inputi8_secondTemp++; + } + } + + // Run case-wise RPP API and measure time + + rppHandle_t handle; + rppCreateWithBatchSize(&handle, noOfImages); + + double max_time_used = 0, min_time_used = 500, avg_time_used = 0; + + string test_case_name; + + printf("\nRunning %s 100 times (each time with a batch size of %d images) and computing mean statistics...", func, noOfImages); + + for (int perfRunCount = 0; perfRunCount < 100; perfRunCount++) + { + clock_t start, end; + double start_omp, end_omp; + double cpu_time_used, omp_time_used; + switch (test_case) + { + case 0: + { + test_case_name = "brightness"; + + Rpp32f alpha[images]; + Rpp32f beta[images]; + for (i = 0; i < images; i++) + { + alpha[i] = 1.75; + beta[i] = 50; + + // xywhROI override sample + // roiTensorPtrSrc[i].xywhROI.xy.x = 0; + // roiTensorPtrSrc[i].xywhROI.xy.y = 0; + // roiTensorPtrSrc[i].xywhROI.roiWidth = 100; + // roiTensorPtrSrc[i].xywhROI.roiHeight = 180; + + // ltrbROI override sample + // roiTensorPtrSrc[i].ltrbROI.lt.x = 50; + // roiTensorPtrSrc[i].ltrbROI.lt.y = 50; + // roiTensorPtrSrc[i].ltrbROI.rb.x = 199; + // roiTensorPtrSrc[i].ltrbROI.rb.y = 149; + } + + // Change RpptRoiType for ltrbROI override sample + // roiTypeSrc = RpptRoiType::LTRB; + // roiTypeDst = RpptRoiType::LTRB; + + start_omp = omp_get_wtime(); + start = clock(); + if (ip_bitDepth == 0) + rppt_brightness_host(input, srcDescPtr, output, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 1) + rppt_brightness_host(inputf16, srcDescPtr, outputf16, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle); + else if 
(ip_bitDepth == 2) + rppt_brightness_host(inputf32, srcDescPtr, outputf32, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 3) + missingFuncFlag = 1; + else if (ip_bitDepth == 4) + missingFuncFlag = 1; + else if (ip_bitDepth == 5) + rppt_brightness_host(inputi8, srcDescPtr, outputi8, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 6) + missingFuncFlag = 1; + else + missingFuncFlag = 1; + end = clock(); + end_omp = omp_get_wtime(); + + break; + } + default: + missingFuncFlag = 1; + break; + } + + if (missingFuncFlag == 1) + { + printf("\nThe functionality %s doesn't yet exist in RPP\n", func); + return -1; + } + + cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC; + omp_time_used = end_omp - start_omp; + if (cpu_time_used > max_time_used) + max_time_used = cpu_time_used; + if (cpu_time_used < min_time_used) + min_time_used = cpu_time_used; + avg_time_used += cpu_time_used; + } + + avg_time_used /= 100; + + // Display measured times + + cout << fixed << "\nmax,min,avg = " << max_time_used << "," << min_time_used << "," << avg_time_used << endl; + + rppDestroyHost(handle); + + // Free memory + + free(roiTensorPtrSrc); + free(roiTensorPtrDst); + free(input); + free(input_second); + free(output); + free(inputf16); + free(inputf16_second); + free(outputf16); + free(inputf32); + free(inputf32_second); + free(outputf32); + free(inputi8); + free(inputi8_second); + free(outputi8); + + return 0; +} diff --git a/utilities/rpp-performancetests/HOST_NEW/generatePerformanceLogs.py b/utilities/rpp-performancetests/HOST_NEW/generatePerformanceLogs.py index 423b13320..2e47cc255 100644 --- a/utilities/rpp-performancetests/HOST_NEW/generatePerformanceLogs.py +++ b/utilities/rpp-performancetests/HOST_NEW/generatePerformanceLogs.py @@ -27,7 +27,10 @@ log_file_list = [ "../OUTPUT_PERFORMANCE_LOGS_HOST_NEW/BatchPD_host_pkd3_host_raw_performance_log.txt", 
"../OUTPUT_PERFORMANCE_LOGS_HOST_NEW/BatchPD_host_pln3_host_raw_performance_log.txt", - "../OUTPUT_PERFORMANCE_LOGS_HOST_NEW/BatchPD_host_pln1_host_raw_performance_log.txt" + "../OUTPUT_PERFORMANCE_LOGS_HOST_NEW/BatchPD_host_pln1_host_raw_performance_log.txt", + "../OUTPUT_PERFORMANCE_LOGS_HOST_NEW/Tensor_host_pkd3_host_raw_performance_log.txt", + "../OUTPUT_PERFORMANCE_LOGS_HOST_NEW/Tensor_host_pln3_host_raw_performance_log.txt", + "../OUTPUT_PERFORMANCE_LOGS_HOST_NEW/Tensor_host_pln1_host_raw_performance_log.txt" ] functionality_group_list = [ diff --git a/utilities/rpp-performancetests/HOST_NEW/rawLogsGenScript.sh b/utilities/rpp-performancetests/HOST_NEW/rawLogsGenScript.sh index f46cfe52b..1763873ae 100755 --- a/utilities/rpp-performancetests/HOST_NEW/rawLogsGenScript.sh +++ b/utilities/rpp-performancetests/HOST_NEW/rawLogsGenScript.sh @@ -152,6 +152,10 @@ do printf "\n./BatchPD_host_pkd3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0" ./BatchPD_host_pkd3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/BatchPD_host_pkd3_host_raw_performance_log.txt" + + printf "\n./Tensor_host_pkd3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0" + ./Tensor_host_pkd3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/Tensor_host_pkd3_host_raw_performance_log.txt" + echo "------------------------------------------------------------------------------------------" done done @@ -188,6 +192,10 @@ do printf "\n./BatchPD_host_pln1 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0" ./BatchPD_host_pln1 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/BatchPD_host_pln1_host_raw_performance_log.txt" + + printf "\n./Tensor_host_pln1 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0" + ./Tensor_host_pln1 
"$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/Tensor_host_pln1_host_raw_performance_log.txt" + echo "------------------------------------------------------------------------------------------" done done @@ -224,6 +232,10 @@ do printf "\n./BatchPD_host_pln3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0" ./BatchPD_host_pln3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/BatchPD_host_pln3_host_raw_performance_log.txt" + + printf "\n./Tensor_host_pln3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0" + ./Tensor_host_pln3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/Tensor_host_pln3_host_raw_performance_log.txt" + echo "------------------------------------------------------------------------------------------" done done diff --git a/utilities/rpp-unittests/HIP_NEW/CMakeLists.txt b/utilities/rpp-unittests/HIP_NEW/CMakeLists.txt index bd5254b85..1683839b5 100644 --- a/utilities/rpp-unittests/HIP_NEW/CMakeLists.txt +++ b/utilities/rpp-unittests/HIP_NEW/CMakeLists.txt @@ -23,12 +23,18 @@ if (hip_FOUND) add_executable(BatchPD_hip_pkd3 BatchPD_hip_pkd3.cpp) add_executable(BatchPD_hip_pln1 BatchPD_hip_pln1.cpp) add_executable(BatchPD_hip_pln3 BatchPD_hip_pln3.cpp) + add_executable(Tensor_hip_pkd3 Tensor_hip_pkd3.cpp) + add_executable(Tensor_hip_pln3 Tensor_hip_pln3.cpp) + add_executable(Tensor_hip_pln1 Tensor_hip_pln1.cpp) # add_executable(Single_hip Single_hip.cpp) add_executable(uniqueFunctionalities_hip uniqueFunctionalities_hip.cpp) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -DHIP_COMPILE=1 -DRPP_BACKEND_HIP=1 -std=c++11") target_link_libraries(BatchPD_hip_pkd3 ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host) target_link_libraries(BatchPD_hip_pln1 ${OpenCV_LIBS} -lamd_rpp 
${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host) target_link_libraries(BatchPD_hip_pln3 ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host) + target_link_libraries(Tensor_hip_pkd3 ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host) + target_link_libraries(Tensor_hip_pln1 ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host) + target_link_libraries(Tensor_hip_pln3 ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host) # target_link_libraries(Single_hip ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host) target_link_libraries(uniqueFunctionalities_hip ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host) endif() \ No newline at end of file diff --git a/utilities/rpp-unittests/HIP_NEW/Tensor_hip_pkd3.cpp b/utilities/rpp-unittests/HIP_NEW/Tensor_hip_pkd3.cpp new file mode 100644 index 000000000..0bd00b636 --- /dev/null +++ b/utilities/rpp-unittests/HIP_NEW/Tensor_hip_pkd3.cpp @@ -0,0 +1,831 @@ +#include <stdio.h> +#include <dirent.h> +#include <string.h> +#include <opencv2/core/core.hpp> +#include <opencv2/highgui/highgui.hpp> +#include <opencv2/opencv.hpp> +#include <iostream> +#include "/opt/rocm/rpp/include/rpp.h" +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> +#include <time.h> +#include <omp.h> +#include <hip/hip_fp16.h> +#include <fstream> + +using namespace cv; +using namespace std; + +#define RPPPIXELCHECK(pixel) (pixel < (Rpp32f)0) ? ((Rpp32f)0) : ((pixel < (Rpp32f)255) ? pixel : ((Rpp32f)255)) +#define RPPMAX2(a,b) ((a > b) ? a : b) +#define RPPMIN2(a,b) ((a < b) ? a : b) + +int main(int argc, char **argv) +{ + // Handle inputs + + const int MIN_ARG_COUNT = 8; + + if (argc < MIN_ARG_COUNT) + { + printf("\nImproper Usage! 
Needs all arguments!\n"); + printf("\nUsage: ./Tensor_hip_pkd3 f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> pkd = 0 / pkd->pln = 1)> \n"); + return -1; + } + + if (atoi(argv[7]) == 1) + { + printf("\nInputs for this test case are:"); + printf("\nsrc1 = %s", argv[1]); + printf("\nsrc2 = %s", argv[2]); + printf("\ndst = %s", argv[3]); + printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[4]); + printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[5]); + printf("\ncase number (1:7) = %s", argv[6]); + } + + char *src = argv[1]; + char *src_second = argv[2]; + char *dst = argv[3]; + int ip_bitDepth = atoi(argv[4]); + unsigned int outputFormatToggle = atoi(argv[5]); + int test_case = atoi(argv[6]); + + int ip_channel = 3; + + // Set case names + + char funcType[1000] = {"Tensor_HIP_PKD3"}; + + char funcName[1000]; + switch (test_case) + { + case 0: + strcpy(funcName, "brightness"); + // outputFormatToggle = 0; + break; + } + + // Initialize tensor descriptors + + RpptDesc srcDesc, dstDesc; + RpptDescPtr srcDescPtr, dstDescPtr; + srcDescPtr = &srcDesc; + dstDescPtr = &dstDesc; + + // Set src/dst layouts in tensor descriptors + + if (outputFormatToggle == 0) + { + strcat(funcType, "_toPKD3"); + srcDescPtr->layout = RpptLayout::NHWC; + dstDescPtr->layout = RpptLayout::NHWC; + } + else if (outputFormatToggle == 1) + { + strcat(funcType, "_toPLN3"); + srcDescPtr->layout = RpptLayout::NHWC; + dstDescPtr->layout = RpptLayout::NCHW; + } + + // Set src/dst data types in tensor descriptors + + if (ip_bitDepth == 0) + { + strcat(funcName, "_u8_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::U8; + } + else if (ip_bitDepth == 1) + { + strcat(funcName, "_f16_"); + srcDescPtr->dataType = RpptDataType::F16; + dstDescPtr->dataType = RpptDataType::F16; + } + else if (ip_bitDepth == 2) + { + strcat(funcName, "_f32_"); + srcDescPtr->dataType = RpptDataType::F32; + dstDescPtr->dataType = 
RpptDataType::F32; + } + else if (ip_bitDepth == 3) + { + strcat(funcName, "_u8_f16_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::F16; + } + else if (ip_bitDepth == 4) + { + strcat(funcName, "_u8_f32_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::F32; + } + else if (ip_bitDepth == 5) + { + strcat(funcName, "_i8_"); + srcDescPtr->dataType = RpptDataType::I8; + dstDescPtr->dataType = RpptDataType::I8; + } + else if (ip_bitDepth == 6) + { + strcat(funcName, "_u8_i8_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::I8; + } + + // Other initializations + + int missingFuncFlag = 0; + int i = 0, j = 0; + int maxHeight = 0, maxWidth = 0; + int maxDstHeight = 0, maxDstWidth = 0; + unsigned long long count = 0; + unsigned long long ioBufferSize = 0; + unsigned long long oBufferSize = 0; + static int noOfImages = 0; + Mat image, image_second; + + // String ops on function name + + char func[1000]; + strcpy(func, funcName); + strcat(func, funcType); + printf("\nRunning %s...", func); + + char src1[1000]; + strcpy(src1, src); + strcat(src1, "/"); + + char src1_second[1000]; + strcpy(src1_second, src_second); + strcat(src1_second, "/"); + + strcat(funcName, funcType); + strcat(dst, "/"); + strcat(dst, funcName); + + // Get number of images + + struct dirent *de; + DIR *dr = opendir(src); + while ((de = readdir(dr)) != NULL) + { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + noOfImages += 1; + } + closedir(dr); + + // Initialize ROI tensors for src/dst + + RpptROI *roiTensorPtrSrc = (RpptROI *) calloc(noOfImages, sizeof(RpptROI)); + RpptROI *roiTensorPtrDst = (RpptROI *) calloc(noOfImages, sizeof(RpptROI)); + + RpptROI *d_roiTensorPtrSrc, *d_roiTensorPtrDst; + hipMalloc(&d_roiTensorPtrSrc, noOfImages * sizeof(RpptROI)); + hipMalloc(&d_roiTensorPtrDst, noOfImages * sizeof(RpptROI)); + + // Set ROI tensors types for src/dst + + 
RpptRoiType roiTypeSrc, roiTypeDst; + roiTypeSrc = RpptRoiType::XYWH; + roiTypeDst = RpptRoiType::XYWH; + + // Set maxHeight, maxWidth and ROIs for src/dst + + const int images = noOfImages; + char imageNames[images][1000]; + + DIR *dr1 = opendir(src); + while ((de = readdir(dr1)) != NULL) + { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + strcpy(imageNames[count], de->d_name); + char temp[1000]; + strcpy(temp, src1); + strcat(temp, imageNames[count]); + + image = imread(temp, 1); + + roiTensorPtrSrc[count].xywhROI.xy.x = 0; + roiTensorPtrSrc[count].xywhROI.xy.y = 0; + roiTensorPtrSrc[count].xywhROI.roiWidth = image.cols; + roiTensorPtrSrc[count].xywhROI.roiHeight = image.rows; + + roiTensorPtrDst[count].xywhROI.xy.x = 0; + roiTensorPtrDst[count].xywhROI.xy.y = 0; + roiTensorPtrDst[count].xywhROI.roiWidth = image.cols; + roiTensorPtrDst[count].xywhROI.roiHeight = image.rows; + + maxHeight = RPPMAX2(maxHeight, roiTensorPtrSrc[count].xywhROI.roiHeight); + maxWidth = RPPMAX2(maxWidth, roiTensorPtrSrc[count].xywhROI.roiWidth); + maxDstHeight = RPPMAX2(maxDstHeight, roiTensorPtrDst[count].xywhROI.roiHeight); + maxDstWidth = RPPMAX2(maxDstWidth, roiTensorPtrDst[count].xywhROI.roiWidth); + + count++; + } + closedir(dr1); + + // Set numDims, offset, n/c/h/w values for src/dst + + srcDescPtr->numDims = 4; + dstDescPtr->numDims = 4; + + srcDescPtr->offset = 0; + dstDescPtr->offset = 0; + + srcDescPtr->n = noOfImages; + srcDescPtr->h = maxHeight; + srcDescPtr->w = maxWidth; + srcDescPtr->c = ip_channel; + + dstDescPtr->n = noOfImages; + dstDescPtr->h = maxDstHeight; + dstDescPtr->w = maxDstWidth; + dstDescPtr->c = ip_channel; + + // Optionally set w stride as a multiple of 8 for src/dst + + srcDescPtr->w = ((srcDescPtr->w / 8) * 8) + 8; + dstDescPtr->w = ((dstDescPtr->w / 8) * 8) + 8; + + // Set n/c/h/w strides for src/dst + + srcDescPtr->strides.nStride = ip_channel * srcDescPtr->w * srcDescPtr->h; + srcDescPtr->strides.hStride = 
ip_channel * srcDescPtr->w; + srcDescPtr->strides.wStride = ip_channel; + srcDescPtr->strides.cStride = 1; + + if (dstDescPtr->layout == RpptLayout::NHWC) + { + dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.hStride = ip_channel * dstDescPtr->w; + dstDescPtr->strides.wStride = ip_channel; + dstDescPtr->strides.cStride = 1; + } + else if (dstDescPtr->layout == RpptLayout::NCHW) + { + dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.cStride = dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.hStride = dstDescPtr->w; + dstDescPtr->strides.wStride = 1; + } + + // Set buffer sizes for src/dst + + ioBufferSize = (unsigned long long)srcDescPtr->h * (unsigned long long)srcDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages; + oBufferSize = (unsigned long long)dstDescPtr->h * (unsigned long long)dstDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages; + + // Initialize 8u host buffers for src/dst + + Rpp8u *input = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + Rpp8u *input_second = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + Rpp8u *output = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u)); + + // Set 8u host buffers for src/dst + + DIR *dr2 = opendir(src); + DIR *dr2_second = opendir(src_second); + count = 0; + i = 0; + + while ((de = readdir(dr2)) != NULL) + { + Rpp8u *input_temp, *input_second_temp; + input_temp = input + (i * srcDescPtr->strides.nStride); + input_second_temp = input_second + (i * srcDescPtr->strides.nStride); + + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + + char temp[1000]; + strcpy(temp, src1); + strcat(temp, de->d_name); + + char temp_second[1000]; + strcpy(temp_second, src1_second); + strcat(temp_second, de->d_name); + + image = imread(temp, 1); + image_second = imread(temp_second, 1); + + Rpp8u *ip_image = image.data; + Rpp8u *ip_image_second = image_second.data; + + 
Rpp32u elementsInRow = roiTensorPtrSrc[i].xywhROI.roiWidth * ip_channel; + + for (j = 0; j < roiTensorPtrSrc[i].xywhROI.roiHeight; j++) + { + memcpy(input_temp, ip_image, elementsInRow * sizeof (Rpp8u)); + memcpy(input_second_temp, ip_image_second, elementsInRow * sizeof (Rpp8u)); + ip_image += elementsInRow; + ip_image_second += elementsInRow; + input_temp += srcDescPtr->strides.hStride; + input_second_temp += srcDescPtr->strides.hStride; + } + i++; + count += srcDescPtr->strides.nStride; + } + closedir(dr2); + + // Convert inputs to test various other bit depths and copy to hip buffers + + half *inputf16, *inputf16_second, *outputf16; + Rpp32f *inputf32, *inputf32_second, *outputf32; + Rpp8s *inputi8, *inputi8_second, *outputi8; + int *d_input, *d_input_second, *d_inputf16, *d_inputf16_second, *d_inputf32, *d_inputf32_second, *d_inputi8, *d_inputi8_second; + int *d_output, *d_outputf16, *d_outputf32, *d_outputi8; + + if (ip_bitDepth == 0) + { + hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_output, oBufferSize * sizeof(Rpp8u)); + hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_output, output, oBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 1) + { + inputf16 = (half *)calloc(ioBufferSize, sizeof(half)); + inputf16_second = (half *)calloc(ioBufferSize, sizeof(half)); + outputf16 = (half *)calloc(oBufferSize, sizeof(half)); + + Rpp8u *inputTemp, *input_secondTemp; + half *inputf16Temp, *inputf16_secondTemp; + + inputTemp = input; + input_secondTemp = input_second; + + inputf16Temp = inputf16; + inputf16_secondTemp = inputf16_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputf16Temp = (half)(((float)*inputTemp) / 255.0); + *inputf16_secondTemp = (half)(((float)*input_secondTemp) / 255.0); + inputTemp++; 
+ inputf16Temp++; + input_secondTemp++; + inputf16_secondTemp++; + } + + hipMalloc(&d_inputf16, ioBufferSize * sizeof(half)); + hipMalloc(&d_inputf16_second, ioBufferSize * sizeof(half)); + hipMalloc(&d_outputf16, oBufferSize * sizeof(half)); + hipMemcpy(d_inputf16, inputf16, ioBufferSize * sizeof(half), hipMemcpyHostToDevice); + hipMemcpy(d_inputf16_second, inputf16_second, ioBufferSize * sizeof(half), hipMemcpyHostToDevice); + hipMemcpy(d_outputf16, outputf16, oBufferSize * sizeof(half), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 2) + { + inputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f)); + inputf32_second = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f)); + outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f)); + + Rpp8u *inputTemp, *input_secondTemp; + Rpp32f *inputf32Temp, *inputf32_secondTemp; + + inputTemp = input; + input_secondTemp = input_second; + + inputf32Temp = inputf32; + inputf32_secondTemp = inputf32_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputf32Temp = ((Rpp32f)*inputTemp) / 255.0; + *inputf32_secondTemp = ((Rpp32f)*input_secondTemp) / 255.0; + inputTemp++; + inputf32Temp++; + input_secondTemp++; + inputf32_secondTemp++; + } + + hipMalloc(&d_inputf32, ioBufferSize * sizeof(Rpp32f)); + hipMalloc(&d_inputf32_second, ioBufferSize * sizeof(Rpp32f)); + hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f)); + hipMemcpy(d_inputf32, inputf32, ioBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice); + hipMemcpy(d_inputf32_second, inputf32_second, ioBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice); + hipMemcpy(d_outputf32, outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 3) + { + outputf16 = (half *)calloc(oBufferSize, sizeof(half)); + hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_outputf16, oBufferSize * sizeof(half)); + hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), 
hipMemcpyHostToDevice); + hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_outputf16, outputf16, oBufferSize * sizeof(half), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 4) + { + outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f)); + hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f)); + hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_outputf32, outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 5) + { + inputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s)); + inputi8_second = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s)); + outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s)); + + Rpp8u *inputTemp, *input_secondTemp; + Rpp8s *inputi8Temp, *inputi8_secondTemp; + + inputTemp = input; + input_secondTemp = input_second; + + inputi8Temp = inputi8; + inputi8_secondTemp = inputi8_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputi8Temp = (Rpp8s) (((Rpp32s) *inputTemp) - 128); + *inputi8_secondTemp = (Rpp8s) (((Rpp32s) *input_secondTemp) - 128); + inputTemp++; + inputi8Temp++; + input_secondTemp++; + inputi8_secondTemp++; + } + + hipMalloc(&d_inputi8, ioBufferSize * sizeof(Rpp8s)); + hipMalloc(&d_inputi8_second, ioBufferSize * sizeof(Rpp8s)); + hipMalloc(&d_outputi8, oBufferSize * sizeof(Rpp8s)); + hipMemcpy(d_inputi8, inputi8, ioBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice); + hipMemcpy(d_inputi8_second, inputi8_second, ioBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice); + hipMemcpy(d_outputi8, outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 6) + { + outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s)); + hipMalloc(&d_input, 
ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_outputi8, oBufferSize * sizeof(Rpp8s)); + hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_outputi8, outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice); + } + + // Run case-wise RPP API and measure time + + rppHandle_t handle; + hipStream_t stream; + hipStreamCreate(&stream); + rppCreateWithStreamAndBatchSize(&handle, stream, noOfImages); + + clock_t start, end; + double gpu_time_used; + + string test_case_name; + + switch (test_case) + { + case 0: + { + test_case_name = "brightness"; + + Rpp32f alpha[images]; + Rpp32f beta[images]; + for (i = 0; i < images; i++) + { + alpha[i] = 1.75; + beta[i] = 50; + + // xywhROI override sample + // roiTensorPtrSrc[i].xywhROI.xy.x = 0; + // roiTensorPtrSrc[i].xywhROI.xy.y = 0; + // roiTensorPtrSrc[i].xywhROI.roiWidth = 100; + // roiTensorPtrSrc[i].xywhROI.roiHeight = 180; + + // ltrbROI override sample + // roiTensorPtrSrc[i].ltrbROI.lt.x = 50; + // roiTensorPtrSrc[i].ltrbROI.lt.y = 30; + // roiTensorPtrSrc[i].ltrbROI.rb.x = 210; + // roiTensorPtrSrc[i].ltrbROI.rb.y = 210; + } + + // Change RpptRoiType for ltrbROI override sample + // roiTypeSrc = RpptRoiType::LTRB; + // roiTypeDst = RpptRoiType::LTRB; + + + hipMemcpy(d_roiTensorPtrSrc, roiTensorPtrSrc, images * sizeof(RpptROI), hipMemcpyHostToDevice); + + start = clock(); + + if (ip_bitDepth == 0) + rppt_brightness_gpu(d_input, srcDescPtr, d_output, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 1) + rppt_brightness_gpu(d_inputf16, srcDescPtr, d_outputf16, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 2) + rppt_brightness_gpu(d_inputf32, srcDescPtr, d_outputf32, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle); + else if 
(ip_bitDepth == 3) + missingFuncFlag = 1; + else if (ip_bitDepth == 4) + missingFuncFlag = 1; + else if (ip_bitDepth == 5) + rppt_brightness_gpu(d_inputi8, srcDescPtr, d_outputi8, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 6) + missingFuncFlag = 1; + else + missingFuncFlag = 1; + + end = clock(); + + break; + } + default: + missingFuncFlag = 1; + break; + } + + if (missingFuncFlag == 1) + { + printf("\nThe functionality %s doesn't yet exist in RPP\n", func); + return -1; + } + + // Display measured times + + gpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC; + cout << "\nGPU Time - BatchPD : " << gpu_time_used; + printf("\n"); + + // Reconvert other bit depths to 8u for output display purposes + + string fileName = std::to_string(ip_bitDepth); + ofstream outputFile (fileName + ".csv"); + + if (ip_bitDepth == 0) + { + hipMemcpy(output, d_output, oBufferSize * sizeof(Rpp8u), hipMemcpyDeviceToHost); + Rpp8u *outputTemp; + outputTemp = output; + + if (outputFile.is_open()) + { + for (int i = 0; i < oBufferSize; i++) + { + outputFile << (Rpp32u) *outputTemp << ","; + outputTemp++; + } + outputFile.close(); + } + else + cout << "Unable to open file!"; + } + else if ((ip_bitDepth == 1) || (ip_bitDepth == 3)) + { + hipMemcpy(outputf16, d_outputf16, oBufferSize * sizeof(half), hipMemcpyDeviceToHost); + Rpp8u *outputTemp; + outputTemp = output; + half *outputf16Temp; + outputf16Temp = outputf16; + + if (outputFile.is_open()) + { + for (int i = 0; i < oBufferSize; i++) + { + outputFile << (char) *outputf16Temp << ","; + *outputTemp = (Rpp8u)RPPPIXELCHECK((float)*outputf16Temp * 255.0); + outputf16Temp++; + outputTemp++; + } + outputFile.close(); + } + else + cout << "Unable to open file!"; + } + else if ((ip_bitDepth == 2) || (ip_bitDepth == 4)) + { + hipMemcpy(outputf32, d_outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyDeviceToHost); + Rpp8u *outputTemp; + outputTemp = output; + Rpp32f *outputf32Temp; + outputf32Temp 
= outputf32; + + if (outputFile.is_open()) + { + for (int i = 0; i < oBufferSize; i++) + { + outputFile << *outputf32Temp << ","; + *outputTemp = (Rpp8u)RPPPIXELCHECK(*outputf32Temp * 255.0); + outputf32Temp++; + outputTemp++; + } + outputFile.close(); + } + else + cout << "Unable to open file!"; + } + else if ((ip_bitDepth == 5) || (ip_bitDepth == 6)) + { + hipMemcpy(outputi8, d_outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyDeviceToHost); + Rpp8u *outputTemp; + outputTemp = output; + Rpp8s *outputi8Temp; + outputi8Temp = outputi8; + + if (outputFile.is_open()) + { + for (int i = 0; i < oBufferSize; i++) + { + outputFile << (Rpp32s) *outputi8Temp << ","; + *outputTemp = (Rpp8u) RPPPIXELCHECK(((Rpp32s) *outputi8Temp) + 128); + outputi8Temp++; + outputTemp++; + } + outputFile.close(); + } + else + cout << "Unable to open file!"; + } + + // Calculate exact dstROI in XYWH format for OpenCV dump + + if (roiTypeSrc == RpptRoiType::LTRB) + { + for (int i = 0; i < dstDescPtr->n; i++) + { + int ltX = roiTensorPtrSrc[i].ltrbROI.lt.x; + int ltY = roiTensorPtrSrc[i].ltrbROI.lt.y; + int rbX = roiTensorPtrSrc[i].ltrbROI.rb.x; + int rbY = roiTensorPtrSrc[i].ltrbROI.rb.y; + + roiTensorPtrSrc[i].xywhROI.xy.x = ltX; + roiTensorPtrSrc[i].xywhROI.xy.y = ltY; + roiTensorPtrSrc[i].xywhROI.roiWidth = rbX - ltX + 1; + roiTensorPtrSrc[i].xywhROI.roiHeight = rbY - ltY + 1; + } + } + + RpptROI roiDefault; + RpptROIPtr roiPtrDefault; + roiPtrDefault = &roiDefault; + roiPtrDefault->xywhROI.xy.x = 0; + roiPtrDefault->xywhROI.xy.y = 0; + roiPtrDefault->xywhROI.roiWidth = dstDescPtr->w; + roiPtrDefault->xywhROI.roiHeight = dstDescPtr->h; + + for (int i = 0; i < dstDescPtr->n; i++) + { + roiTensorPtrSrc[i].xywhROI.roiWidth = RPPMIN2(roiPtrDefault->xywhROI.roiWidth - roiTensorPtrSrc[i].xywhROI.xy.x, roiTensorPtrSrc[i].xywhROI.roiWidth); + roiTensorPtrSrc[i].xywhROI.roiHeight = RPPMIN2(roiPtrDefault->xywhROI.roiHeight - roiTensorPtrSrc[i].xywhROI.xy.y, roiTensorPtrSrc[i].xywhROI.roiHeight); + 
roiTensorPtrSrc[i].xywhROI.xy.x = RPPMAX2(roiPtrDefault->xywhROI.xy.x, roiTensorPtrSrc[i].xywhROI.xy.x); + roiTensorPtrSrc[i].xywhROI.xy.y = RPPMAX2(roiPtrDefault->xywhROI.xy.y, roiTensorPtrSrc[i].xywhROI.xy.y); + } + + // Convert any PLN3 outputs to the corresponding PKD3 version for OpenCV dump + + if (dstDescPtr->layout == RpptLayout::NCHW) + { + Rpp8u *outputCopy = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u)); + memcpy(outputCopy, output, oBufferSize * sizeof(Rpp8u)); + + Rpp8u *outputTemp, *outputCopyTemp; + outputTemp = output; + outputCopyTemp = outputCopy; + + for (int count = 0; count < dstDescPtr->n; count++) + { + Rpp8u *outputCopyTempR, *outputCopyTempG, *outputCopyTempB; + outputCopyTempR = outputCopyTemp; + outputCopyTempG = outputCopyTempR + dstDescPtr->strides.cStride; + outputCopyTempB = outputCopyTempG + dstDescPtr->strides.cStride; + + for (int i = 0; i < dstDescPtr->h; i++) + { + for (int j = 0; j < dstDescPtr->w; j++) + { + *outputTemp = *outputCopyTempR; + outputTemp++; + outputCopyTempR++; + *outputTemp = *outputCopyTempG; + outputTemp++; + outputCopyTempG++; + *outputTemp = *outputCopyTempB; + outputTemp++; + outputCopyTempB++; + } + } + + outputCopyTemp += dstDescPtr->strides.nStride; + } + + free(outputCopy); + } + + rppDestroyGPU(handle); + + // OpenCV dump + + mkdir(dst, 0700); + strcat(dst, "/"); + count = 0; + + for (j = 0; j < dstDescPtr->n; j++) + { + int height = roiTensorPtrSrc[j].xywhROI.roiHeight; + int width = roiTensorPtrSrc[j].xywhROI.roiWidth; + + int op_size = height * width * ip_channel; + Rpp8u *temp_output = (Rpp8u *)calloc(op_size, sizeof(Rpp8u)); + Rpp8u *temp_output_row; + temp_output_row = temp_output; + Rpp32u elementsInRow = width * ip_channel; + Rpp8u *output_row = output + count; + + for (int k = 0; k < height; k++) + { + memcpy(temp_output_row, (output_row), elementsInRow * sizeof (Rpp8u)); + temp_output_row += elementsInRow; + output_row += srcDescPtr->strides.hStride; + } + count += dstDescPtr->strides.nStride; 
+ + char temp[1000]; + strcpy(temp, dst); + strcat(temp, imageNames[j]); + + Mat mat_op_image; + mat_op_image = Mat(height, width, CV_8UC3, temp_output); + imwrite(temp, mat_op_image); + + free(temp_output); + } + + // Free memory + + free(roiTensorPtrSrc); + free(roiTensorPtrDst); + hipFree(d_roiTensorPtrSrc); + hipFree(d_roiTensorPtrDst); + free(input); + free(input_second); + free(output); + + if (ip_bitDepth == 0) + { + hipFree(d_input); + hipFree(d_input_second); + hipFree(d_output); + } + else if (ip_bitDepth == 1) + { + free(inputf16); + free(inputf16_second); + free(outputf16); + hipFree(d_inputf16); + hipFree(d_inputf16_second); + hipFree(d_outputf16); + } + else if (ip_bitDepth == 2) + { + free(inputf32); + free(inputf32_second); + free(outputf32); + hipFree(d_inputf32); + hipFree(d_inputf32_second); + hipFree(d_outputf32); + } + else if (ip_bitDepth == 3) + { + free(outputf16); + hipFree(d_input); + hipFree(d_input_second); + hipFree(d_outputf16); + } + else if (ip_bitDepth == 4) + { + free(outputf32); + hipFree(d_input); + hipFree(d_input_second); + hipFree(d_outputf32); + } + else if (ip_bitDepth == 5) + { + free(inputi8); + free(inputi8_second); + free(outputi8); + hipFree(d_inputi8); + hipFree(d_inputi8_second); + hipFree(d_outputi8); + } + else if (ip_bitDepth == 6) + { + free(outputi8); + hipFree(d_input); + hipFree(d_input_second); + hipFree(d_outputi8); + } + + return 0; +} diff --git a/utilities/rpp-unittests/HIP_NEW/Tensor_hip_pln1.cpp b/utilities/rpp-unittests/HIP_NEW/Tensor_hip_pln1.cpp new file mode 100644 index 000000000..b763a62bf --- /dev/null +++ b/utilities/rpp-unittests/HIP_NEW/Tensor_hip_pln1.cpp @@ -0,0 +1,789 @@ +#include +#include +#include +#include +#include +#include +#include +#include "/opt/rocm/rpp/include/rpp.h" +#include +#include +#include +#include +#include +#include +#include +#include "helpers/testSuite_helper.hpp" + +using namespace cv; +using namespace std; + +#define RPPPIXELCHECK(pixel) (pixel < (Rpp32f)0) ? 
((Rpp32f)0) : ((pixel < (Rpp32f)255) ? pixel : ((Rpp32f)255)) +#define RPPMAX2(a,b) ((a > b) ? a : b) +#define RPPMIN2(a,b) ((a < b) ? a : b) + +int main(int argc, char **argv) +{ + // Handle inputs + + const int MIN_ARG_COUNT = 8; + + if (argc < MIN_ARG_COUNT) + { + printf("\nImproper Usage! Needs all arguments!\n"); + printf("\nUsage: ./Tensor_hip_pln1 f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> pkd = 0 / pkd->pln = 1)> \n"); + return -1; + } + if (atoi(argv[5]) != 0) + { + printf("\nPLN1 cases don't have outputFormatToggle! Please input outputFormatToggle = 0\n"); + return -1; + } + + if (atoi(argv[7]) == 1) + { + printf("\nInputs for this test case are:"); + printf("\nsrc1 = %s", argv[1]); + printf("\nsrc2 = %s", argv[2]); + printf("\ndst = %s", argv[3]); + printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[4]); + printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[5]); + printf("\ncase number (1:7) = %s", argv[6]); + } + + char *src = argv[1]; + char *src_second = argv[2]; + char *dst = argv[3]; + int ip_bitDepth = atoi(argv[4]); + unsigned int outputFormatToggle = atoi(argv[5]); + int test_case = atoi(argv[6]); + + int ip_channel = 1; + + // Set case names + + char funcType[1000] = {"Tensor_HIP_PLN1_toPLN1"}; + + char funcName[1000]; + switch (test_case) + { + case 0: + strcpy(funcName, "brightness"); + outputFormatToggle = 0; + break; + } + + // Initialize tensor descriptors + + RpptDesc srcDesc, dstDesc; + RpptDescPtr srcDescPtr, dstDescPtr; + srcDescPtr = &srcDesc; + dstDescPtr = &dstDesc; + + // Set src/dst layouts in tensor descriptors + + srcDescPtr->layout = RpptLayout::NCHW; + dstDescPtr->layout = RpptLayout::NCHW; + + // Set src/dst data types in tensor descriptors + + if (ip_bitDepth == 0) + { + strcat(funcName, "_u8_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::U8; + } + else if (ip_bitDepth == 1) + { + strcat(funcName, "_f16_"); + srcDescPtr->dataType = 
RpptDataType::F16; + dstDescPtr->dataType = RpptDataType::F16; + } + else if (ip_bitDepth == 2) + { + strcat(funcName, "_f32_"); + srcDescPtr->dataType = RpptDataType::F32; + dstDescPtr->dataType = RpptDataType::F32; + } + else if (ip_bitDepth == 3) + { + strcat(funcName, "_u8_f16_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::F16; + } + else if (ip_bitDepth == 4) + { + strcat(funcName, "_u8_f32_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::F32; + } + else if (ip_bitDepth == 5) + { + strcat(funcName, "_i8_"); + srcDescPtr->dataType = RpptDataType::I8; + dstDescPtr->dataType = RpptDataType::I8; + } + else if (ip_bitDepth == 6) + { + strcat(funcName, "_u8_i8_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::I8; + } + + // Other initializations + + int missingFuncFlag = 0; + int i = 0, j = 0; + int maxHeight = 0, maxWidth = 0; + int maxDstHeight = 0, maxDstWidth = 0; + unsigned long long count = 0; + unsigned long long ioBufferSize = 0; + unsigned long long oBufferSize = 0; + static int noOfImages = 0; + Mat image, image_second; + + // String ops on function name + + char func[1000]; + strcpy(func, funcName); + strcat(func, funcType); + printf("\nRunning %s...", func); + + char src1[1000]; + strcpy(src1, src); + strcat(src1, "/"); + + char src1_second[1000]; + strcpy(src1_second, src_second); + strcat(src1_second, "/"); + + strcat(funcName, funcType); + strcat(dst, "/"); + strcat(dst, funcName); + + // Get number of images + + struct dirent *de; + DIR *dr = opendir(src); + while ((de = readdir(dr)) != NULL) + { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + noOfImages += 1; + } + closedir(dr); + + // Initialize ROI tensors for src/dst + + RpptROI *roiTensorPtrSrc = (RpptROI *) calloc(noOfImages, sizeof(RpptROI)); + RpptROI *roiTensorPtrDst = (RpptROI *) calloc(noOfImages, sizeof(RpptROI)); + + RpptROI 
*d_roiTensorPtrSrc, *d_roiTensorPtrDst; + hipMalloc(&d_roiTensorPtrSrc, noOfImages * sizeof(RpptROI)); + hipMalloc(&d_roiTensorPtrDst, noOfImages * sizeof(RpptROI)); + + // Set ROI tensors types for src/dst + + RpptRoiType roiTypeSrc, roiTypeDst; + roiTypeSrc = RpptRoiType::XYWH; + roiTypeDst = RpptRoiType::XYWH; + + // Set maxHeight, maxWidth and ROIs for src/dst + + const int images = noOfImages; + char imageNames[images][1000]; + + DIR *dr1 = opendir(src); + while ((de = readdir(dr1)) != NULL) + { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + strcpy(imageNames[count], de->d_name); + char temp[1000]; + strcpy(temp, src1); + strcat(temp, imageNames[count]); + + image = imread(temp, 0); + + roiTensorPtrSrc[count].xywhROI.xy.x = 0; + roiTensorPtrSrc[count].xywhROI.xy.y = 0; + roiTensorPtrSrc[count].xywhROI.roiWidth = image.cols; + roiTensorPtrSrc[count].xywhROI.roiHeight = image.rows; + + roiTensorPtrDst[count].xywhROI.xy.x = 0; + roiTensorPtrDst[count].xywhROI.xy.y = 0; + roiTensorPtrDst[count].xywhROI.roiWidth = image.cols; + roiTensorPtrDst[count].xywhROI.roiHeight = image.rows; + + maxHeight = RPPMAX2(maxHeight, roiTensorPtrSrc[count].xywhROI.roiHeight); + maxWidth = RPPMAX2(maxWidth, roiTensorPtrSrc[count].xywhROI.roiWidth); + maxDstHeight = RPPMAX2(maxDstHeight, roiTensorPtrDst[count].xywhROI.roiHeight); + maxDstWidth = RPPMAX2(maxDstWidth, roiTensorPtrDst[count].xywhROI.roiWidth); + + count++; + } + closedir(dr1); + + // Set numDims, offset, n/c/h/w values, n/c/h/w strides for src/dst + + srcDescPtr->numDims = 4; + dstDescPtr->numDims = 4; + + srcDescPtr->offset = 0; + dstDescPtr->offset = 0; + + srcDescPtr->n = noOfImages; + srcDescPtr->c = ip_channel; + srcDescPtr->h = maxHeight; + srcDescPtr->w = maxWidth; + + dstDescPtr->n = noOfImages; + dstDescPtr->c = ip_channel; + dstDescPtr->h = maxDstHeight; + dstDescPtr->w = maxDstWidth; + + // Optionally set w stride as a multiple of 8 for src/dst + + srcDescPtr->w = 
((srcDescPtr->w / 8) * 8) + 8; + dstDescPtr->w = ((dstDescPtr->w / 8) * 8) + 8; + + // Set n/c/h/w strides for src/dst + + srcDescPtr->strides.nStride = ip_channel * srcDescPtr->w * srcDescPtr->h; + srcDescPtr->strides.cStride = srcDescPtr->w * srcDescPtr->h; + srcDescPtr->strides.hStride = srcDescPtr->w; + srcDescPtr->strides.wStride = 1; + + if (dstDescPtr->layout == RpptLayout::NHWC) + { + dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.hStride = ip_channel * dstDescPtr->w; + dstDescPtr->strides.wStride = ip_channel; + dstDescPtr->strides.cStride = 1; + } + else if (dstDescPtr->layout == RpptLayout::NCHW) + { + dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.cStride = dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.hStride = dstDescPtr->w; + dstDescPtr->strides.wStride = 1; + } + + // Set buffer sizes for src/dst + + ioBufferSize = (unsigned long long)srcDescPtr->h * (unsigned long long)srcDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages; + oBufferSize = (unsigned long long)dstDescPtr->h * (unsigned long long)dstDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages; + + // Initialize host buffers for src/dst + + Rpp8u *input = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + Rpp8u *input_second = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + Rpp8u *output = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u)); + + // Set 8u host buffers for src/dst + + DIR *dr2 = opendir(src); + DIR *dr2_second = opendir(src_second); + count = 0; + i = 0; + + Rpp32u elementsInRowMax = srcDescPtr->w * ip_channel; + + while ((de = readdir(dr2)) != NULL) + { + Rpp8u *input_temp, *input_second_temp; + input_temp = input + (i * srcDescPtr->strides.nStride); + input_second_temp = input_second + (i * srcDescPtr->strides.nStride); + + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + + char temp[1000]; + 
strcpy(temp, src1); + strcat(temp, de->d_name); + + char temp_second[1000]; + strcpy(temp_second, src1_second); + strcat(temp_second, de->d_name); + + image = imread(temp, 0); + image_second = imread(temp_second, 0); + + Rpp8u *ip_image = image.data; + Rpp8u *ip_image_second = image_second.data; + + Rpp32u elementsInRow = roiTensorPtrSrc[i].xywhROI.roiWidth * ip_channel; + + for (j = 0; j < roiTensorPtrSrc[i].xywhROI.roiHeight; j++) + { + memcpy(input_temp, ip_image, elementsInRow * sizeof (Rpp8u)); + memcpy(input_second_temp, ip_image_second, elementsInRow * sizeof (Rpp8u)); + ip_image += elementsInRow; + ip_image_second += elementsInRow; + input_temp += elementsInRowMax; + input_second_temp += elementsInRowMax; + } + i++; + count += srcDescPtr->strides.nStride; + } + closedir(dr2); + + // Convert inputs to test various other bit depths and copy to hip buffers + + half *inputf16, *inputf16_second, *outputf16; + Rpp32f *inputf32, *inputf32_second, *outputf32; + Rpp8s *inputi8, *inputi8_second, *outputi8; + int *d_input, *d_input_second, *d_inputf16, *d_inputf16_second, *d_inputf32, *d_inputf32_second, *d_inputi8, *d_inputi8_second; + int *d_output, *d_outputf16, *d_outputf32, *d_outputi8; + + if (ip_bitDepth == 0) + { + hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_output, oBufferSize * sizeof(Rpp8u)); + hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_output, output, oBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 1) + { + inputf16 = (half *)calloc(ioBufferSize, sizeof(half)); + inputf16_second = (half *)calloc(ioBufferSize, sizeof(half)); + outputf16 = (half *)calloc(oBufferSize, sizeof(half)); + + Rpp8u *inputTemp, *input_secondTemp; + half *inputf16Temp, *inputf16_secondTemp; + + inputTemp = input; + 
input_secondTemp = input_second; + + inputf16Temp = inputf16; + inputf16_secondTemp = inputf16_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputf16Temp = (half)(((float)*inputTemp) / 255.0); + *inputf16_secondTemp = (half)(((float)*input_secondTemp) / 255.0); + inputTemp++; + inputf16Temp++; + input_secondTemp++; + inputf16_secondTemp++; + } + + hipMalloc(&d_inputf16, ioBufferSize * sizeof(half)); + hipMalloc(&d_inputf16_second, ioBufferSize * sizeof(half)); + hipMalloc(&d_outputf16, oBufferSize * sizeof(half)); + hipMemcpy(d_inputf16, inputf16, ioBufferSize * sizeof(half), hipMemcpyHostToDevice); + hipMemcpy(d_inputf16_second, inputf16_second, ioBufferSize * sizeof(half), hipMemcpyHostToDevice); + hipMemcpy(d_outputf16, outputf16, oBufferSize * sizeof(half), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 2) + { + inputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f)); + inputf32_second = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f)); + outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f)); + + Rpp8u *inputTemp, *input_secondTemp; + Rpp32f *inputf32Temp, *inputf32_secondTemp; + + inputTemp = input; + input_secondTemp = input_second; + + inputf32Temp = inputf32; + inputf32_secondTemp = inputf32_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputf32Temp = ((Rpp32f)*inputTemp) / 255.0; + *inputf32_secondTemp = ((Rpp32f)*input_secondTemp) / 255.0; + inputTemp++; + inputf32Temp++; + input_secondTemp++; + inputf32_secondTemp++; + } + + hipMalloc(&d_inputf32, ioBufferSize * sizeof(Rpp32f)); + hipMalloc(&d_inputf32_second, ioBufferSize * sizeof(Rpp32f)); + hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f)); + hipMemcpy(d_inputf32, inputf32, ioBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice); + hipMemcpy(d_inputf32_second, inputf32_second, ioBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice); + hipMemcpy(d_outputf32, outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 3) + { + 
outputf16 = (half *)calloc(oBufferSize, sizeof(half)); + hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_outputf16, oBufferSize * sizeof(half)); + hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_outputf16, outputf16, oBufferSize * sizeof(half), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 4) + { + outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f)); + hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f)); + hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_outputf32, outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 5) + { + inputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s)); + inputi8_second = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s)); + outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s)); + + Rpp8u *inputTemp, *input_secondTemp; + Rpp8s *inputi8Temp, *inputi8_secondTemp; + + inputTemp = input; + input_secondTemp = input_second; + + inputi8Temp = inputi8; + inputi8_secondTemp = inputi8_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputi8Temp = (Rpp8s) (((Rpp32s) *inputTemp) - 128); + *inputi8_secondTemp = (Rpp8s) (((Rpp32s) *input_secondTemp) - 128); + inputTemp++; + inputi8Temp++; + input_secondTemp++; + inputi8_secondTemp++; + } + + hipMalloc(&d_inputi8, ioBufferSize * sizeof(Rpp8s)); + hipMalloc(&d_inputi8_second, ioBufferSize * sizeof(Rpp8s)); + hipMalloc(&d_outputi8, oBufferSize * sizeof(Rpp8s)); + hipMemcpy(d_inputi8, inputi8, ioBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice); + hipMemcpy(d_inputi8_second, 
inputi8_second, ioBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice); + hipMemcpy(d_outputi8, outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 6) + { + outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s)); + hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_outputi8, oBufferSize * sizeof(Rpp8s)); + hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_outputi8, outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice); + } + + // Run case-wise RPP API and measure time + + rppHandle_t handle; + hipStream_t stream; + hipStreamCreate(&stream); + rppCreateWithStreamAndBatchSize(&handle, stream, noOfImages); + + clock_t start, end; + double gpu_time_used; + + string test_case_name; + + switch (test_case) + { + case 0: + { + test_case_name = "brightness"; + + Rpp32f alpha[images]; + Rpp32f beta[images]; + for (i = 0; i < images; i++) + { + alpha[i] = 1.75; + beta[i] = 50; + + // xywhROI override sample + // roiTensorPtrSrc[i].xywhROI.xy.x = 0; + // roiTensorPtrSrc[i].xywhROI.xy.y = 0; + // roiTensorPtrSrc[i].xywhROI.roiWidth = 100; + // roiTensorPtrSrc[i].xywhROI.roiHeight = 180; + + // ltrbROI override sample + // roiTensorPtrSrc[i].ltrbROI.lt.x = 50; + // roiTensorPtrSrc[i].ltrbROI.lt.y = 30; + // roiTensorPtrSrc[i].ltrbROI.rb.x = 210; + // roiTensorPtrSrc[i].ltrbROI.rb.y = 210; + } + + // Change RpptRoiType for ltrbROI override sample + // roiTypeSrc = RpptRoiType::LTRB; + // roiTypeDst = RpptRoiType::LTRB; + + hipMemcpy(d_roiTensorPtrSrc, roiTensorPtrSrc, images * sizeof(RpptROI), hipMemcpyHostToDevice); + + start = clock(); + + if (ip_bitDepth == 0) + rppt_brightness_gpu(d_input, srcDescPtr, d_output, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 1) + 
rppt_brightness_gpu(d_inputf16, srcDescPtr, d_outputf16, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 2) + rppt_brightness_gpu(d_inputf32, srcDescPtr, d_outputf32, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 3) + missingFuncFlag = 1; + else if (ip_bitDepth == 4) + missingFuncFlag = 1; + else if (ip_bitDepth == 5) + rppt_brightness_gpu(d_inputi8, srcDescPtr, d_outputi8, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 6) + missingFuncFlag = 1; + else + missingFuncFlag = 1; + + end = clock(); + + break; + } + default: + missingFuncFlag = 1; + break; + } + + if (missingFuncFlag == 1) + { + printf("\nThe functionality %s doesn't yet exist in RPP\n", func); + return -1; + } + + // Display measured times + + gpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC; + cout << "\nGPU Time - BatchPD : " << gpu_time_used; + printf("\n"); + + // Reconvert other bit depths to 8u for output display purposes + + string fileName = std::to_string(ip_bitDepth); + ofstream outputFile (fileName + ".csv"); + + if (ip_bitDepth == 0) + { + hipMemcpy(output, d_output, oBufferSize * sizeof(Rpp8u), hipMemcpyDeviceToHost); + Rpp8u *outputTemp; + outputTemp = output; + + if (outputFile.is_open()) + { + for (int i = 0; i < oBufferSize; i++) + { + outputFile << (Rpp32u) *outputTemp << ","; + outputTemp++; + } + outputFile.close(); + } + else + cout << "Unable to open file!"; + } + else if ((ip_bitDepth == 1) || (ip_bitDepth == 3)) + { + hipMemcpy(outputf16, d_outputf16, oBufferSize * sizeof(half), hipMemcpyDeviceToHost); + Rpp8u *outputTemp; + outputTemp = output; + half *outputf16Temp; + outputf16Temp = outputf16; + + if (outputFile.is_open()) + { + for (int i = 0; i < oBufferSize; i++) + { + outputFile << (char) *outputf16Temp << ","; + *outputTemp = (Rpp8u)RPPPIXELCHECK((float)*outputf16Temp * 255.0); + outputf16Temp++; + outputTemp++; + } + 
outputFile.close(); + } + else + cout << "Unable to open file!"; + } + else if ((ip_bitDepth == 2) || (ip_bitDepth == 4)) + { + hipMemcpy(outputf32, d_outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyDeviceToHost); + Rpp8u *outputTemp; + outputTemp = output; + Rpp32f *outputf32Temp; + outputf32Temp = outputf32; + + if (outputFile.is_open()) + { + for (int i = 0; i < oBufferSize; i++) + { + outputFile << *outputf32Temp << ","; + *outputTemp = (Rpp8u)RPPPIXELCHECK(*outputf32Temp * 255.0); + outputf32Temp++; + outputTemp++; + } + outputFile.close(); + } + else + cout << "Unable to open file!"; + } + else if ((ip_bitDepth == 5) || (ip_bitDepth == 6)) + { + hipMemcpy(outputi8, d_outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyDeviceToHost); + Rpp8u *outputTemp; + outputTemp = output; + Rpp8s *outputi8Temp; + outputi8Temp = outputi8; + + if (outputFile.is_open()) + { + for (int i = 0; i < oBufferSize; i++) + { + outputFile << (Rpp32s) *outputi8Temp << ","; + *outputTemp = (Rpp8u) RPPPIXELCHECK(((Rpp32s) *outputi8Temp) + 128); + outputi8Temp++; + outputTemp++; + } + outputFile.close(); + } + else + cout << "Unable to open file!"; + } + + // Calculate exact dstROI in XYWH format for OpenCV dump + + if (roiTypeSrc == RpptRoiType::LTRB) + { + for (int i = 0; i < dstDescPtr->n; i++) + { + int ltX = roiTensorPtrSrc[i].ltrbROI.lt.x; + int ltY = roiTensorPtrSrc[i].ltrbROI.lt.y; + int rbX = roiTensorPtrSrc[i].ltrbROI.rb.x; + int rbY = roiTensorPtrSrc[i].ltrbROI.rb.y; + + roiTensorPtrSrc[i].xywhROI.xy.x = ltX; + roiTensorPtrSrc[i].xywhROI.xy.y = ltY; + roiTensorPtrSrc[i].xywhROI.roiWidth = rbX - ltX + 1; + roiTensorPtrSrc[i].xywhROI.roiHeight = rbY - ltY + 1; + } + } + + RpptROI roiDefault; + RpptROIPtr roiPtrDefault; + roiPtrDefault = &roiDefault; + roiPtrDefault->xywhROI.xy.x = 0; + roiPtrDefault->xywhROI.xy.y = 0; + roiPtrDefault->xywhROI.roiWidth = dstDescPtr->w; + roiPtrDefault->xywhROI.roiHeight = dstDescPtr->h; + + for (int i = 0; i < dstDescPtr->n; i++) + { + 
roiTensorPtrSrc[i].xywhROI.roiWidth = RPPMIN2(roiPtrDefault->xywhROI.roiWidth - roiTensorPtrSrc[i].xywhROI.xy.x, roiTensorPtrSrc[i].xywhROI.roiWidth); + roiTensorPtrSrc[i].xywhROI.roiHeight = RPPMIN2(roiPtrDefault->xywhROI.roiHeight - roiTensorPtrSrc[i].xywhROI.xy.y, roiTensorPtrSrc[i].xywhROI.roiHeight); + roiTensorPtrSrc[i].xywhROI.xy.x = RPPMAX2(roiPtrDefault->xywhROI.xy.x, roiTensorPtrSrc[i].xywhROI.xy.x); + roiTensorPtrSrc[i].xywhROI.xy.y = RPPMAX2(roiPtrDefault->xywhROI.xy.y, roiTensorPtrSrc[i].xywhROI.xy.y); + } + + rppDestroyGPU(handle); + + // OpenCV dump + + mkdir(dst, 0700); + strcat(dst, "/"); + count = 0; + elementsInRowMax = dstDescPtr->w * ip_channel; + + for (j = 0; j < dstDescPtr->n; j++) + { + int height = roiTensorPtrSrc[j].xywhROI.roiHeight; + int width = roiTensorPtrSrc[j].xywhROI.roiWidth; + + int op_size = height * width * ip_channel; + Rpp8u *temp_output = (Rpp8u *)calloc(op_size, sizeof(Rpp8u)); + Rpp8u *temp_output_row; + temp_output_row = temp_output; + Rpp32u elementsInRow = width * ip_channel; + Rpp8u *output_row = output + count; + + for (int k = 0; k < height; k++) + { + memcpy(temp_output_row, (output_row), elementsInRow * sizeof (Rpp8u)); + temp_output_row += elementsInRow; + output_row += elementsInRowMax; + } + count += dstDescPtr->strides.nStride; + + char temp[1000]; + strcpy(temp, dst); + strcat(temp, imageNames[j]); + + Mat mat_op_image; + mat_op_image = Mat(height, width, CV_8UC1, temp_output); + imwrite(temp, mat_op_image); + + free(temp_output); + } + + // Free memory + + free(roiTensorPtrSrc); + free(roiTensorPtrDst); + hipFree(d_roiTensorPtrSrc); + hipFree(d_roiTensorPtrDst); + free(input); + free(input_second); + free(output); + + if (ip_bitDepth == 0) + { + hipFree(d_input); + hipFree(d_input_second); + hipFree(d_output); + } + else if (ip_bitDepth == 1) + { + free(inputf16); + free(inputf16_second); + free(outputf16); + hipFree(d_inputf16); + hipFree(d_inputf16_second); + hipFree(d_outputf16); + } + else if 
(ip_bitDepth == 2) + { + free(inputf32); + free(inputf32_second); + free(outputf32); + hipFree(d_inputf32); + hipFree(d_inputf32_second); + hipFree(d_outputf32); + } + else if (ip_bitDepth == 3) + { + free(outputf16); + hipFree(d_input); + hipFree(d_input_second); + hipFree(d_outputf16); + } + else if (ip_bitDepth == 4) + { + free(outputf32); + hipFree(d_input); + hipFree(d_input_second); + hipFree(d_outputf32); + } + else if (ip_bitDepth == 5) + { + free(inputi8); + free(inputi8_second); + free(outputi8); + hipFree(d_inputi8); + hipFree(d_inputi8_second); + hipFree(d_outputi8); + } + else if (ip_bitDepth == 6) + { + free(outputi8); + hipFree(d_input); + hipFree(d_input_second); + hipFree(d_outputi8); + } + + return 0; +} diff --git a/utilities/rpp-unittests/HIP_NEW/Tensor_hip_pln3.cpp b/utilities/rpp-unittests/HIP_NEW/Tensor_hip_pln3.cpp new file mode 100644 index 000000000..e7aaed7c1 --- /dev/null +++ b/utilities/rpp-unittests/HIP_NEW/Tensor_hip_pln3.cpp @@ -0,0 +1,907 @@ +#include +#include +#include +#include +#include +#include +#include +#include "/opt/rocm/rpp/include/rpp.h" +#include +#include +#include +#include +#include +#include +#include + +using namespace cv; +using namespace std; + +#define RPPPIXELCHECK(pixel) (pixel < (Rpp32f)0) ? ((Rpp32f)0) : ((pixel < (Rpp32f)255) ? pixel : ((Rpp32f)255)) +#define RPPMAX2(a,b) ((a > b) ? a : b) +#define RPPMIN2(a,b) ((a < b) ? a : b) + +int main(int argc, char **argv) +{ + // Handle inputs + + const int MIN_ARG_COUNT = 8; + + if (argc < MIN_ARG_COUNT) + { + printf("\nImproper Usage! 
Needs all arguments!\n"); + printf("\nUsage: ./Tensor_hip_pln3 f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> pkd = 0 / pkd->pln = 1)> \n"); + return -1; + } + + if (atoi(argv[7]) == 1) + { + printf("\nInputs for this test case are:"); + printf("\nsrc1 = %s", argv[1]); + printf("\nsrc2 = %s", argv[2]); + printf("\ndst = %s", argv[3]); + printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[4]); + printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[5]); + printf("\ncase number (1:7) = %s", argv[6]); + } + + char *src = argv[1]; + char *src_second = argv[2]; + + // "/<funcName>" and a trailing "/" are appended to dst further down, so work on a local copy: + // appending in place to argv[3] would write past the end of the argument string. + // 100 bytes of the buffer are kept in reserve for those suffixes. + char dstPath[1000]; + if (strlen(argv[3]) >= sizeof(dstPath) - 100) + { + printf("\ndst path is too long!\n"); + return -1; + } + strcpy(dstPath, argv[3]); + char *dst = dstPath; + + int ip_bitDepth = atoi(argv[4]); + unsigned int outputFormatToggle = atoi(argv[5]); + int test_case = atoi(argv[6]); + + int ip_channel = 3; + + // Set case names + + char funcType[1000] = {"Tensor_HIP_PLN3"}; + + // Start from an empty string: for a test_case the switch does not handle, the strcat calls below still need a terminated buffer + char funcName[1000] = {0}; + switch (test_case) + { + case 0: + strcpy(funcName, "brightness"); + // outputFormatToggle = 0; + break; + } + + // Initialize tensor descriptors + + RpptDesc srcDesc, dstDesc; + RpptDescPtr srcDescPtr, dstDescPtr; + srcDescPtr = &srcDesc; + dstDescPtr = &dstDesc; + + // Set src/dst layouts in tensor descriptors + + if (outputFormatToggle == 0) + { + strcat(funcType, "_toPLN3"); + srcDescPtr->layout = RpptLayout::NCHW; + dstDescPtr->layout = RpptLayout::NCHW; + } + else if (outputFormatToggle == 1) + { + strcat(funcType, "_toPKD3"); + srcDescPtr->layout = RpptLayout::NCHW; + dstDescPtr->layout = RpptLayout::NHWC; + } + + // Set src/dst data types in tensor descriptors + + if (ip_bitDepth == 0) + { + strcat(funcName, "_u8_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::U8; + } + else if (ip_bitDepth == 1) + { + strcat(funcName, "_f16_"); + srcDescPtr->dataType = RpptDataType::F16; + dstDescPtr->dataType = RpptDataType::F16; + } + else if (ip_bitDepth == 2) + { + strcat(funcName, "_f32_"); + srcDescPtr->dataType = RpptDataType::F32; + dstDescPtr->dataType = 
RpptDataType::F32; + } + else if (ip_bitDepth == 3) + { + strcat(funcName, "_u8_f16_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::F16; + } + else if (ip_bitDepth == 4) + { + strcat(funcName, "_u8_f32_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::F32; + } + else if (ip_bitDepth == 5) + { + strcat(funcName, "_i8_"); + srcDescPtr->dataType = RpptDataType::I8; + dstDescPtr->dataType = RpptDataType::I8; + } + else if (ip_bitDepth == 6) + { + strcat(funcName, "_u8_i8_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::I8; + } + + // Other initializations + + int missingFuncFlag = 0; + int i = 0, j = 0; + int maxHeight = 0, maxWidth = 0; + int maxDstHeight = 0, maxDstWidth = 0; + unsigned long long count = 0; + unsigned long long ioBufferSize = 0; + unsigned long long oBufferSize = 0; + static int noOfImages = 0; + Mat image, image_second; + + // String ops on function name + + char func[1000]; + strcpy(func, funcName); + strcat(func, funcType); + printf("\nRunning %s...", func); + + char src1[1000]; + strcpy(src1, src); + strcat(src1, "/"); + + char src1_second[1000]; + strcpy(src1_second, src_second); + strcat(src1_second, "/"); + + strcat(funcName, funcType); + strcat(dst, "/"); + strcat(dst, funcName); + + // Get number of images + + struct dirent *de; + DIR *dr = opendir(src); + while ((de = readdir(dr)) != NULL) + { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + noOfImages += 1; + } + closedir(dr); + + // Initialize ROI tensors for src/dst + + RpptROI *roiTensorPtrSrc = (RpptROI *) calloc(noOfImages, sizeof(RpptROI)); + RpptROI *roiTensorPtrDst = (RpptROI *) calloc(noOfImages, sizeof(RpptROI)); + + RpptROI *d_roiTensorPtrSrc, *d_roiTensorPtrDst; + hipMalloc(&d_roiTensorPtrSrc, noOfImages * sizeof(RpptROI)); + hipMalloc(&d_roiTensorPtrDst, noOfImages * sizeof(RpptROI)); + + // Set ROI tensors types for src/dst + + 
RpptRoiType roiTypeSrc, roiTypeDst; + roiTypeSrc = RpptRoiType::XYWH; + roiTypeDst = RpptRoiType::XYWH; + + // Set maxHeight, maxWidth and ROIs for src/dst + + const int images = noOfImages; + char imageNames[images][1000]; + + DIR *dr1 = opendir(src); + while ((de = readdir(dr1)) != NULL) + { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + strcpy(imageNames[count], de->d_name); + char temp[1000]; + strcpy(temp, src1); + strcat(temp, imageNames[count]); + + image = imread(temp, 1); + + roiTensorPtrSrc[count].xywhROI.xy.x = 0; + roiTensorPtrSrc[count].xywhROI.xy.y = 0; + roiTensorPtrSrc[count].xywhROI.roiWidth = image.cols; + roiTensorPtrSrc[count].xywhROI.roiHeight = image.rows; + + roiTensorPtrDst[count].xywhROI.xy.x = 0; + roiTensorPtrDst[count].xywhROI.xy.y = 0; + roiTensorPtrDst[count].xywhROI.roiWidth = image.cols; + roiTensorPtrDst[count].xywhROI.roiHeight = image.rows; + + maxHeight = RPPMAX2(maxHeight, roiTensorPtrSrc[count].xywhROI.roiHeight); + maxWidth = RPPMAX2(maxWidth, roiTensorPtrSrc[count].xywhROI.roiWidth); + maxDstHeight = RPPMAX2(maxDstHeight, roiTensorPtrDst[count].xywhROI.roiHeight); + maxDstWidth = RPPMAX2(maxDstWidth, roiTensorPtrDst[count].xywhROI.roiWidth); + + count++; + } + closedir(dr1); + + // Set numDims, offset, n/c/h/w values, n/c/h/w strides for src/dst + + srcDescPtr->numDims = 4; + dstDescPtr->numDims = 4; + + srcDescPtr->offset = 0; + dstDescPtr->offset = 0; + + srcDescPtr->n = noOfImages; + srcDescPtr->c = ip_channel; + srcDescPtr->h = maxHeight; + srcDescPtr->w = maxWidth; + + dstDescPtr->n = noOfImages; + dstDescPtr->c = ip_channel; + dstDescPtr->h = maxDstHeight; + dstDescPtr->w = maxDstWidth; + + // Optionally set w stride as a multiple of 8 for src/dst + + srcDescPtr->w = ((srcDescPtr->w / 8) * 8) + 8; + dstDescPtr->w = ((dstDescPtr->w / 8) * 8) + 8; + + // Set n/c/h/w strides for src/dst + + srcDescPtr->strides.nStride = ip_channel * srcDescPtr->w * srcDescPtr->h; + 
srcDescPtr->strides.cStride = srcDescPtr->w * srcDescPtr->h; + srcDescPtr->strides.hStride = srcDescPtr->w; + srcDescPtr->strides.wStride = 1; + + if (dstDescPtr->layout == RpptLayout::NHWC) + { + dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.hStride = ip_channel * dstDescPtr->w; + dstDescPtr->strides.wStride = ip_channel; + dstDescPtr->strides.cStride = 1; + } + else if (dstDescPtr->layout == RpptLayout::NCHW) + { + dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.cStride = dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.hStride = dstDescPtr->w; + dstDescPtr->strides.wStride = 1; + } + + // Set buffer sizes for src/dst + + ioBufferSize = (unsigned long long)srcDescPtr->h * (unsigned long long)srcDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages; + oBufferSize = (unsigned long long)dstDescPtr->h * (unsigned long long)dstDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages; + + // Initialize host buffers for src/dst + + Rpp8u *input = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + Rpp8u *input_second = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + Rpp8u *output = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u)); + + // Set 8u host buffers for src/dst + + DIR *dr2 = opendir(src); + DIR *dr2_second = opendir(src_second); + count = 0; + i = 0; + + Rpp32u elementsInRowMax = srcDescPtr->w * ip_channel; + + while ((de = readdir(dr2)) != NULL) + { + Rpp8u *input_temp, *input_second_temp; + input_temp = input + (i * srcDescPtr->strides.nStride); + input_second_temp = input_second + (i * srcDescPtr->strides.nStride); + + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + + char temp[1000]; + strcpy(temp, src1); + strcat(temp, de->d_name); + + char temp_second[1000]; + strcpy(temp_second, src1_second); + strcat(temp_second, de->d_name); + + image = imread(temp, 1); + image_second = 
imread(temp_second, 1); + + Rpp8u *ip_image = image.data; + Rpp8u *ip_image_second = image_second.data; + + Rpp32u elementsInRow = roiTensorPtrSrc[i].xywhROI.roiWidth * ip_channel; + + for (j = 0; j < roiTensorPtrSrc[i].xywhROI.roiHeight; j++) + { + memcpy(input_temp, ip_image, elementsInRow * sizeof (Rpp8u)); + memcpy(input_second_temp, ip_image_second, elementsInRow * sizeof (Rpp8u)); + ip_image += elementsInRow; + ip_image_second += elementsInRow; + input_temp += elementsInRowMax; + input_second_temp += elementsInRowMax; + } + i++; + count += srcDescPtr->strides.nStride; + } + closedir(dr2); + + // Convert default OpenCV PKD3 to PLN3 for first input batch + + Rpp8u *inputCopy = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + memcpy(inputCopy, input, ioBufferSize * sizeof(Rpp8u)); + + Rpp8u *inputTemp, *inputCopyTemp; + inputTemp = input; + inputCopyTemp = inputCopy; + + for (int count = 0; count < noOfImages; count++) + { + Rpp8u *inputTempR, *inputTempG, *inputTempB; + inputTempR = inputTemp; + inputTempG = inputTempR + srcDescPtr->strides.cStride; + inputTempB = inputTempG + srcDescPtr->strides.cStride; + + for (int i = 0; i < srcDescPtr->h; i++) + { + for (int j = 0; j < srcDescPtr->w; j++) + { + *inputTempR = *inputCopyTemp; + inputCopyTemp++; + inputTempR++; + *inputTempG = *inputCopyTemp; + inputCopyTemp++; + inputTempG++; + *inputTempB = *inputCopyTemp; + inputCopyTemp++; + inputTempB++; + } + } + + inputTemp += srcDescPtr->strides.nStride; + } + + free(inputCopy); + + // Convert default OpenCV PKD3 to PLN3 for second input batch + + Rpp8u *inputSecondCopy = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + memcpy(inputSecondCopy, input_second, ioBufferSize * sizeof(Rpp8u)); + + Rpp8u *inputSecondTemp, *inputSecondCopyTemp; + inputSecondTemp = input_second; + inputSecondCopyTemp = inputSecondCopy; + + for (int count = 0; count < noOfImages; count++) + { + Rpp8u *inputSecondTempR, *inputSecondTempG, *inputSecondTempB; + inputSecondTempR = inputSecondTemp; + 
inputSecondTempG = inputSecondTempR + srcDescPtr->strides.cStride; + inputSecondTempB = inputSecondTempG + srcDescPtr->strides.cStride; + + for (int i = 0; i < srcDescPtr->h; i++) + { + for (int j = 0; j < srcDescPtr->w; j++) + { + *inputSecondTempR = *inputSecondCopyTemp; + inputSecondCopyTemp++; + inputSecondTempR++; + *inputSecondTempG = *inputSecondCopyTemp; + inputSecondCopyTemp++; + inputSecondTempG++; + *inputSecondTempB = *inputSecondCopyTemp; + inputSecondCopyTemp++; + inputSecondTempB++; + } + } + + inputSecondTemp += srcDescPtr->strides.nStride; + } + + free(inputSecondCopy); + + // Convert inputs to test various other bit depths and copy to hip buffers + + half *inputf16, *inputf16_second, *outputf16; + Rpp32f *inputf32, *inputf32_second, *outputf32; + Rpp8s *inputi8, *inputi8_second, *outputi8; + int *d_input, *d_input_second, *d_inputf16, *d_inputf16_second, *d_inputf32, *d_inputf32_second, *d_inputi8, *d_inputi8_second; + int *d_output, *d_outputf16, *d_outputf32, *d_outputi8; + + if (ip_bitDepth == 0) + { + hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_output, oBufferSize * sizeof(Rpp8u)); + hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_output, output, oBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 1) + { + inputf16 = (half *)calloc(ioBufferSize, sizeof(half)); + inputf16_second = (half *)calloc(ioBufferSize, sizeof(half)); + outputf16 = (half *)calloc(oBufferSize, sizeof(half)); + + Rpp8u *inputTemp, *input_secondTemp; + half *inputf16Temp, *inputf16_secondTemp; + + inputTemp = input; + input_secondTemp = input_second; + + inputf16Temp = inputf16; + inputf16_secondTemp = inputf16_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputf16Temp = (half)(((float)*inputTemp) / 255.0); + 
*inputf16_secondTemp = (half)(((float)*input_secondTemp) / 255.0); + inputTemp++; + inputf16Temp++; + input_secondTemp++; + inputf16_secondTemp++; + } + + hipMalloc(&d_inputf16, ioBufferSize * sizeof(half)); + hipMalloc(&d_inputf16_second, ioBufferSize * sizeof(half)); + hipMalloc(&d_outputf16, oBufferSize * sizeof(half)); + hipMemcpy(d_inputf16, inputf16, ioBufferSize * sizeof(half), hipMemcpyHostToDevice); + hipMemcpy(d_inputf16_second, inputf16_second, ioBufferSize * sizeof(half), hipMemcpyHostToDevice); + hipMemcpy(d_outputf16, outputf16, oBufferSize * sizeof(half), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 2) + { + inputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f)); + inputf32_second = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f)); + outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f)); + + Rpp8u *inputTemp, *input_secondTemp; + Rpp32f *inputf32Temp, *inputf32_secondTemp; + + inputTemp = input; + input_secondTemp = input_second; + + inputf32Temp = inputf32; + inputf32_secondTemp = inputf32_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputf32Temp = ((Rpp32f)*inputTemp) / 255.0; + *inputf32_secondTemp = ((Rpp32f)*input_secondTemp) / 255.0; + inputTemp++; + inputf32Temp++; + input_secondTemp++; + inputf32_secondTemp++; + } + + hipMalloc(&d_inputf32, ioBufferSize * sizeof(Rpp32f)); + hipMalloc(&d_inputf32_second, ioBufferSize * sizeof(Rpp32f)); + hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f)); + hipMemcpy(d_inputf32, inputf32, ioBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice); + hipMemcpy(d_inputf32_second, inputf32_second, ioBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice); + hipMemcpy(d_outputf32, outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 3) + { + outputf16 = (half *)calloc(oBufferSize, sizeof(half)); + hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_outputf16, oBufferSize * 
sizeof(half)); + hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_outputf16, outputf16, oBufferSize * sizeof(half), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 4) + { + outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f)); + hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f)); + hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_outputf32, outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 5) + { + inputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s)); + inputi8_second = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s)); + outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s)); + + Rpp8u *inputTemp, *input_secondTemp; + Rpp8s *inputi8Temp, *inputi8_secondTemp; + + inputTemp = input; + input_secondTemp = input_second; + + inputi8Temp = inputi8; + inputi8_secondTemp = inputi8_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputi8Temp = (Rpp8s) (((Rpp32s) *inputTemp) - 128); + *inputi8_secondTemp = (Rpp8s) (((Rpp32s) *input_secondTemp) - 128); + inputTemp++; + inputi8Temp++; + input_secondTemp++; + inputi8_secondTemp++; + } + + hipMalloc(&d_inputi8, ioBufferSize * sizeof(Rpp8s)); + hipMalloc(&d_inputi8_second, ioBufferSize * sizeof(Rpp8s)); + hipMalloc(&d_outputi8, oBufferSize * sizeof(Rpp8s)); + hipMemcpy(d_inputi8, inputi8, ioBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice); + hipMemcpy(d_inputi8_second, inputi8_second, ioBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice); + hipMemcpy(d_outputi8, outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice); + } + else if (ip_bitDepth == 6) + { + outputi8 = 
(Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s)); + hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u)); + hipMalloc(&d_outputi8, oBufferSize * sizeof(Rpp8s)); + hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice); + hipMemcpy(d_outputi8, outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice); + } + + // Run case-wise RPP API and measure time + + rppHandle_t handle; + hipStream_t stream; + hipStreamCreate(&stream); + rppCreateWithStreamAndBatchSize(&handle, stream, noOfImages); + + clock_t start, end; + double gpu_time_used; + + string test_case_name; + + switch (test_case) + { + case 0: + { + test_case_name = "brightness"; + + Rpp32f alpha[images]; + Rpp32f beta[images]; + for (i = 0; i < images; i++) + { + alpha[i] = 1.75; + beta[i] = 50; + + // xywhROI override sample + // roiTensorPtrSrc[i].xywhROI.xy.x = 0; + // roiTensorPtrSrc[i].xywhROI.xy.y = 0; + // roiTensorPtrSrc[i].xywhROI.roiWidth = 100; + // roiTensorPtrSrc[i].xywhROI.roiHeight = 180; + + // ltrbROI override sample + // roiTensorPtrSrc[i].ltrbROI.lt.x = 50; + // roiTensorPtrSrc[i].ltrbROI.lt.y = 30; + // roiTensorPtrSrc[i].ltrbROI.rb.x = 210; + // roiTensorPtrSrc[i].ltrbROI.rb.y = 210; + } + + // Change RpptRoiType for ltrbROI override sample + // roiTypeSrc = RpptRoiType::LTRB; + // roiTypeDst = RpptRoiType::LTRB; + + hipMemcpy(d_roiTensorPtrSrc, roiTensorPtrSrc, images * sizeof(RpptROI), hipMemcpyHostToDevice); + + start = clock(); + + if (ip_bitDepth == 0) + rppt_brightness_gpu(d_input, srcDescPtr, d_output, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 1) + rppt_brightness_gpu(d_inputf16, srcDescPtr, d_outputf16, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 2) + rppt_brightness_gpu(d_inputf32, srcDescPtr, d_outputf32, 
dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 3) + missingFuncFlag = 1; + else if (ip_bitDepth == 4) + missingFuncFlag = 1; + else if (ip_bitDepth == 5) + rppt_brightness_gpu(d_inputi8, srcDescPtr, d_outputi8, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 6) + missingFuncFlag = 1; + else + missingFuncFlag = 1; + + end = clock(); + + break; + } + default: + missingFuncFlag = 1; + break; + } + + if (missingFuncFlag == 1) + { + printf("\nThe functionality %s doesn't yet exist in RPP\n", func); + return -1; + } + + // Display measured times + + gpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC; + cout << "\nGPU Time - BatchPD : " << gpu_time_used; + printf("\n"); + + // Reconvert other bit depths to 8u for output display purposes + + string fileName = std::to_string(ip_bitDepth); + ofstream outputFile (fileName + ".csv"); + + if (ip_bitDepth == 0) + { + hipMemcpy(output, d_output, oBufferSize * sizeof(Rpp8u), hipMemcpyDeviceToHost); + Rpp8u *outputTemp; + outputTemp = output; + + if (outputFile.is_open()) + { + for (int i = 0; i < oBufferSize; i++) + { + outputFile << (Rpp32u) *outputTemp << ","; + outputTemp++; + } + outputFile.close(); + } + else + cout << "Unable to open file!"; + } + else if ((ip_bitDepth == 1) || (ip_bitDepth == 3)) + { + hipMemcpy(outputf16, d_outputf16, oBufferSize * sizeof(half), hipMemcpyDeviceToHost); + Rpp8u *outputTemp; + outputTemp = output; + half *outputf16Temp; + outputf16Temp = outputf16; + + if (outputFile.is_open()) + { + for (int i = 0; i < oBufferSize; i++) + { + // Dump the f16 value as a number (via float); a char cast would stream a single raw byte into the CSV + outputFile << (float) *outputf16Temp << ","; + *outputTemp = (Rpp8u)RPPPIXELCHECK((float)*outputf16Temp * 255.0); + outputf16Temp++; + outputTemp++; + } + outputFile.close(); + } + else + cout << "Unable to open file!"; + } + else if ((ip_bitDepth == 2) || (ip_bitDepth == 4)) + { + hipMemcpy(outputf32, d_outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyDeviceToHost); + Rpp8u 
*outputTemp; + outputTemp = output; + Rpp32f *outputf32Temp; + outputf32Temp = outputf32; + + if (outputFile.is_open()) + { + for (int i = 0; i < oBufferSize; i++) + { + outputFile << *outputf32Temp << ","; + *outputTemp = (Rpp8u)RPPPIXELCHECK(*outputf32Temp * 255.0); + outputf32Temp++; + outputTemp++; + } + outputFile.close(); + } + else + cout << "Unable to open file!"; + } + else if ((ip_bitDepth == 5) || (ip_bitDepth == 6)) + { + hipMemcpy(outputi8, d_outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyDeviceToHost); + Rpp8u *outputTemp; + outputTemp = output; + Rpp8s *outputi8Temp; + outputi8Temp = outputi8; + + if (outputFile.is_open()) + { + for (int i = 0; i < oBufferSize; i++) + { + outputFile << (Rpp32s) *outputi8Temp << ","; + *outputTemp = (Rpp8u) RPPPIXELCHECK(((Rpp32s) *outputi8Temp) + 128); + outputi8Temp++; + outputTemp++; + } + outputFile.close(); + } + else + cout << "Unable to open file!"; + } + + // Calculate exact dstROI in XYWH format for OpenCV dump + + if (roiTypeSrc == RpptRoiType::LTRB) + { + for (int i = 0; i < dstDescPtr->n; i++) + { + int ltX = roiTensorPtrSrc[i].ltrbROI.lt.x; + int ltY = roiTensorPtrSrc[i].ltrbROI.lt.y; + int rbX = roiTensorPtrSrc[i].ltrbROI.rb.x; + int rbY = roiTensorPtrSrc[i].ltrbROI.rb.y; + + roiTensorPtrSrc[i].xywhROI.xy.x = ltX; + roiTensorPtrSrc[i].xywhROI.xy.y = ltY; + roiTensorPtrSrc[i].xywhROI.roiWidth = rbX - ltX + 1; + roiTensorPtrSrc[i].xywhROI.roiHeight = rbY - ltY + 1; + } + } + + RpptROI roiDefault; + RpptROIPtr roiPtrDefault; + roiPtrDefault = &roiDefault; + roiPtrDefault->xywhROI.xy.x = 0; + roiPtrDefault->xywhROI.xy.y = 0; + roiPtrDefault->xywhROI.roiWidth = dstDescPtr->w; + roiPtrDefault->xywhROI.roiHeight = dstDescPtr->h; + + for (int i = 0; i < dstDescPtr->n; i++) + { + roiTensorPtrSrc[i].xywhROI.roiWidth = RPPMIN2(roiPtrDefault->xywhROI.roiWidth - roiTensorPtrSrc[i].xywhROI.xy.x, roiTensorPtrSrc[i].xywhROI.roiWidth); + roiTensorPtrSrc[i].xywhROI.roiHeight = RPPMIN2(roiPtrDefault->xywhROI.roiHeight - 
roiTensorPtrSrc[i].xywhROI.xy.y, roiTensorPtrSrc[i].xywhROI.roiHeight); + roiTensorPtrSrc[i].xywhROI.xy.x = RPPMAX2(roiPtrDefault->xywhROI.xy.x, roiTensorPtrSrc[i].xywhROI.xy.x); + roiTensorPtrSrc[i].xywhROI.xy.y = RPPMAX2(roiPtrDefault->xywhROI.xy.y, roiTensorPtrSrc[i].xywhROI.xy.y); + } + + // Convert any PLN3 outputs to the corresponding PKD3 version for OpenCV dump + + if (dstDescPtr->layout == RpptLayout::NCHW) + { + Rpp8u *outputCopy = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u)); + memcpy(outputCopy, output, oBufferSize * sizeof(Rpp8u)); + + Rpp8u *outputTemp, *outputCopyTemp; + outputTemp = output; + outputCopyTemp = outputCopy; + + for (int count = 0; count < dstDescPtr->n; count++) + { + Rpp8u *outputCopyTempR, *outputCopyTempG, *outputCopyTempB; + outputCopyTempR = outputCopyTemp; + outputCopyTempG = outputCopyTempR + dstDescPtr->strides.cStride; + outputCopyTempB = outputCopyTempG + dstDescPtr->strides.cStride; + + for (int i = 0; i < dstDescPtr->h; i++) + { + for (int j = 0; j < dstDescPtr->w; j++) + { + *outputTemp = *outputCopyTempR; + outputTemp++; + outputCopyTempR++; + *outputTemp = *outputCopyTempG; + outputTemp++; + outputCopyTempG++; + *outputTemp = *outputCopyTempB; + outputTemp++; + outputCopyTempB++; + } + } + + outputCopyTemp += dstDescPtr->strides.nStride; + } + + free(outputCopy); + } + + rppDestroyGPU(handle); + + // OpenCV dump + + mkdir(dst, 0700); + strcat(dst, "/"); + count = 0; + elementsInRowMax = dstDescPtr->w * ip_channel; + + for (j = 0; j < dstDescPtr->n; j++) + { + int height = roiTensorPtrSrc[j].xywhROI.roiHeight; + int width = roiTensorPtrSrc[j].xywhROI.roiWidth; + + int op_size = height * width * ip_channel; + Rpp8u *temp_output = (Rpp8u *)calloc(op_size, sizeof(Rpp8u)); + Rpp8u *temp_output_row; + temp_output_row = temp_output; + Rpp32u elementsInRow = width * ip_channel; + Rpp8u *output_row = output + count; + + for (int k = 0; k < height; k++) + { + memcpy(temp_output_row, (output_row), elementsInRow * sizeof (Rpp8u)); + 
temp_output_row += elementsInRow; + output_row += elementsInRowMax; + } + count += dstDescPtr->strides.nStride; + + char temp[1000]; + strcpy(temp, dst); + strcat(temp, imageNames[j]); + + Mat mat_op_image; + mat_op_image = Mat(height, width, CV_8UC3, temp_output); + imwrite(temp, mat_op_image); + + free(temp_output); + } + + // Free memory + + free(roiTensorPtrSrc); + free(roiTensorPtrDst); + hipFree(d_roiTensorPtrSrc); + hipFree(d_roiTensorPtrDst); + free(input); + free(input_second); + free(output); + + if (ip_bitDepth == 0) + { + hipFree(d_input); + hipFree(d_input_second); + hipFree(d_output); + } + else if (ip_bitDepth == 1) + { + free(inputf16); + free(inputf16_second); + free(outputf16); + hipFree(d_inputf16); + hipFree(d_inputf16_second); + hipFree(d_outputf16); + } + else if (ip_bitDepth == 2) + { + free(inputf32); + free(inputf32_second); + free(outputf32); + hipFree(d_inputf32); + hipFree(d_inputf32_second); + hipFree(d_outputf32); + } + else if (ip_bitDepth == 3) + { + free(outputf16); + hipFree(d_input); + hipFree(d_input_second); + hipFree(d_outputf16); + } + else if (ip_bitDepth == 4) + { + free(outputf32); + hipFree(d_input); + hipFree(d_input_second); + hipFree(d_outputf32); + } + else if (ip_bitDepth == 5) + { + free(inputi8); + free(inputi8_second); + free(outputi8); + hipFree(d_inputi8); + hipFree(d_inputi8_second); + hipFree(d_outputi8); + } + else if (ip_bitDepth == 6) + { + free(outputi8); + hipFree(d_input); + hipFree(d_input_second); + hipFree(d_outputi8); + } + + return 0; +} diff --git a/utilities/rpp-unittests/HIP_NEW/testAllScript.sh b/utilities/rpp-unittests/HIP_NEW/testAllScript.sh index f39ed8bfb..0faaa34c7 100755 --- a/utilities/rpp-unittests/HIP_NEW/testAllScript.sh +++ b/utilities/rpp-unittests/HIP_NEW/testAllScript.sh @@ -196,6 +196,10 @@ do printf "\n./BatchPD_hip_pkd3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $DST_FOLDER_TEMP $bitDepth $outputFormatToggle $case 0" ./BatchPD_hip_pkd3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" 
"$DST_FOLDER_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" + + printf "\n./Tensor_hip_pkd3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $DST_FOLDER_TEMP $bitDepth $outputFormatToggle $case 0" + ./Tensor_hip_pkd3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$DST_FOLDER_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" + echo "------------------------------------------------------------------------------------------" done done @@ -252,6 +256,10 @@ do printf "\n./BatchPD_hip_pln1 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $DST_FOLDER_TEMP $bitDepth $outputFormatToggle $case 0" ./BatchPD_hip_pln1 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$DST_FOLDER_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" + + printf "\n./Tensor_hip_pln1 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $DST_FOLDER_TEMP $bitDepth $outputFormatToggle $case 0" + ./Tensor_hip_pln1 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$DST_FOLDER_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" + echo "------------------------------------------------------------------------------------------" done done @@ -308,6 +316,10 @@ do printf "\n./BatchPD_hip_pln3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $DST_FOLDER_TEMP $bitDepth $outputFormatToggle $case 0" ./BatchPD_hip_pln3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$DST_FOLDER_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" + + printf "\n./Tensor_hip_pln3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $DST_FOLDER_TEMP $bitDepth $outputFormatToggle $case 0" + ./Tensor_hip_pln3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$DST_FOLDER_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" + echo "------------------------------------------------------------------------------------------" done done diff --git a/utilities/rpp-unittests/HOST_NEW/CMakeLists.txt b/utilities/rpp-unittests/HOST_NEW/CMakeLists.txt index 15d7f51ca..0454e3c13 100644 --- a/utilities/rpp-unittests/HOST_NEW/CMakeLists.txt +++ b/utilities/rpp-unittests/HOST_NEW/CMakeLists.txt @@ -7,7 +7,7 @@ find_package(OpenCV REQUIRED) 
find_package(AMDRPP QUIET) if(NOT OpenCL_FOUND) - message("-- ${Yellow}Rpp_test requires OpenCL, Found ${OpenCL_INCLUDE_DIRS} ${OpenCL_LIBRARIES} ${ColourReset}") + message("-- ${Yellow}Rpp_test requires OpenCL, Found ${OpenCL_INCLUDE_DIRS} ${OpenCL_LIBRARIES} ${ColourReset}") endif() if (OpenCL_FOUND) @@ -19,12 +19,18 @@ if (OpenCL_FOUND) add_executable(BatchPD_host_pkd3 BatchPD_host_pkd3.cpp) add_executable(BatchPD_host_pln1 BatchPD_host_pln1.cpp) add_executable(BatchPD_host_pln3 BatchPD_host_pln3.cpp) + add_executable(Tensor_host_pkd3 Tensor_host_pkd3.cpp) + add_executable(Tensor_host_pln3 Tensor_host_pln3.cpp) + add_executable(Tensor_host_pln1 Tensor_host_pln1.cpp) # add_executable(Single_host Single_host.cpp) add_executable(uniqueFunctionalities_host uniqueFunctionalities_host.cpp) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -DOCL_COMPILE=1 -DRPP_BACKEND_OPENCL=1 -std=c++11") target_link_libraries(BatchPD_host_pkd3 ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system) target_link_libraries(BatchPD_host_pln1 ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system) target_link_libraries(BatchPD_host_pln3 ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system) + target_link_libraries(Tensor_host_pkd3 ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system) + target_link_libraries(Tensor_host_pln3 ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system) + target_link_libraries(Tensor_host_pln1 ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system) # target_link_libraries(Single_host ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system ) target_link_libraries(uniqueFunctionalities_host ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system) endif() \ No newline at end of file diff --git 
a/utilities/rpp-unittests/HOST_NEW/Tensor_host_pkd3.cpp b/utilities/rpp-unittests/HOST_NEW/Tensor_host_pkd3.cpp new file mode 100644 index 000000000..2ef7860c2 --- /dev/null +++ b/utilities/rpp-unittests/HOST_NEW/Tensor_host_pkd3.cpp @@ -0,0 +1,710 @@ +#include +#include +#include +#include +#include +#include +#include +#include "/opt/rocm/rpp/include/rpp.h" +#include +#include +#include +#include +#include +#include +#include + +using namespace cv; +using namespace std; +using half_float::half; + +typedef half Rpp16f; + +#define RPPPIXELCHECK(pixel) (pixel < (Rpp32f)0) ? ((Rpp32f)0) : ((pixel < (Rpp32f)255) ? pixel : ((Rpp32f)255)) +#define RPPMAX2(a,b) ((a > b) ? a : b) +#define RPPMIN2(a,b) ((a < b) ? a : b) + +int main(int argc, char **argv) +{ + // Handle inputs + + const int MIN_ARG_COUNT = 8; + + if (argc < MIN_ARG_COUNT) + { + printf("\nImproper Usage! Needs all arguments!\n"); + printf("\nUsage: ./Tensor_host_pkd3 f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> pkd = 0 / pkd->pln = 1)> \n"); + return -1; + } + + if (atoi(argv[7]) == 1) + { + printf("\nInputs for this test case are:"); + printf("\nsrc1 = %s", argv[1]); + printf("\nsrc2 = %s", argv[2]); + printf("\ndst = %s", argv[3]); + printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[4]); + printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[5]); + printf("\ncase number (1:7) = %s", argv[6]); + } + + char *src = argv[1]; + char *src_second = argv[2]; + char *dst = argv[3]; + int ip_bitDepth = atoi(argv[4]); + unsigned int outputFormatToggle = atoi(argv[5]); + int test_case = atoi(argv[6]); + + int ip_channel = 3; + + // Set case names + + char funcType[1000] = {"Tensor_HOST_PKD3"}; + + char funcName[1000]; + switch (test_case) + { + case 0: + strcpy(funcName, "brightness"); + break; + } + + // Initialize tensor descriptors + + RpptDesc srcDesc, dstDesc; + RpptDescPtr srcDescPtr, dstDescPtr; + srcDescPtr = &srcDesc; + dstDescPtr = &dstDesc; + + // Set 
src/dst layouts in tensor descriptors + + if (outputFormatToggle == 0) + { + strcat(funcType, "_toPKD3"); + srcDescPtr->layout = RpptLayout::NHWC; + dstDescPtr->layout = RpptLayout::NHWC; + } + else if (outputFormatToggle == 1) + { + strcat(funcType, "_toPLN3"); + srcDescPtr->layout = RpptLayout::NHWC; + dstDescPtr->layout = RpptLayout::NCHW; + } + + // Set src/dst data types in tensor descriptors + + if (ip_bitDepth == 0) + { + strcat(funcName, "_u8_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::U8; + } + else if (ip_bitDepth == 1) + { + strcat(funcName, "_f16_"); + srcDescPtr->dataType = RpptDataType::F16; + dstDescPtr->dataType = RpptDataType::F16; + } + else if (ip_bitDepth == 2) + { + strcat(funcName, "_f32_"); + srcDescPtr->dataType = RpptDataType::F32; + dstDescPtr->dataType = RpptDataType::F32; + } + else if (ip_bitDepth == 3) + { + strcat(funcName, "_u8_f16_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::F16; + } + else if (ip_bitDepth == 4) + { + strcat(funcName, "_u8_f32_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::F32; + } + else if (ip_bitDepth == 5) + { + strcat(funcName, "_i8_"); + srcDescPtr->dataType = RpptDataType::I8; + dstDescPtr->dataType = RpptDataType::I8; + } + else if (ip_bitDepth == 6) + { + strcat(funcName, "_u8_i8_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::I8; + } + + // Other initializations + + int missingFuncFlag = 0; + int i = 0, j = 0; + int maxHeight = 0, maxWidth = 0; + int maxDstHeight = 0, maxDstWidth = 0; + unsigned long long count = 0; + unsigned long long ioBufferSize = 0; + unsigned long long oBufferSize = 0; + static int noOfImages = 0; + Mat image, image_second; + + // String ops on function name + + char func[1000]; + strcpy(func, funcName); + strcat(func, funcType); + printf("\nRunning %s...", func); + + char src1[1000]; + strcpy(src1, src); + strcat(src1, "/"); 
+ + char src1_second[1000]; + strcpy(src1_second, src_second); + strcat(src1_second, "/"); + + strcat(funcName, funcType); + strcat(dst, "/"); + strcat(dst, funcName); + + // Get number of images + + struct dirent *de; + DIR *dr = opendir(src); + while ((de = readdir(dr)) != NULL) + { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + noOfImages += 1; + } + closedir(dr); + + // Initialize ROI tensors for src/dst + + RpptROI *roiTensorPtrSrc = (RpptROI *) calloc(noOfImages, sizeof(RpptROI)); + RpptROI *roiTensorPtrDst = (RpptROI *) calloc(noOfImages, sizeof(RpptROI)); + + // Set ROI tensors types for src/dst + + RpptRoiType roiTypeSrc, roiTypeDst; + roiTypeSrc = RpptRoiType::XYWH; + roiTypeDst = RpptRoiType::XYWH; + + // Set maxHeight, maxWidth and ROIs for src/dst + + const int images = noOfImages; + char imageNames[images][1000]; + + DIR *dr1 = opendir(src); + while ((de = readdir(dr1)) != NULL) + { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + strcpy(imageNames[count], de->d_name); + char temp[1000]; + strcpy(temp, src1); + strcat(temp, imageNames[count]); + + image = imread(temp, 1); + + roiTensorPtrSrc[count].xywhROI.xy.x = 0; + roiTensorPtrSrc[count].xywhROI.xy.y = 0; + roiTensorPtrSrc[count].xywhROI.roiWidth = image.cols; + roiTensorPtrSrc[count].xywhROI.roiHeight = image.rows; + + roiTensorPtrDst[count].xywhROI.xy.x = 0; + roiTensorPtrDst[count].xywhROI.xy.y = 0; + roiTensorPtrDst[count].xywhROI.roiWidth = image.cols; + roiTensorPtrDst[count].xywhROI.roiHeight = image.rows; + + maxHeight = RPPMAX2(maxHeight, roiTensorPtrSrc[count].xywhROI.roiHeight); + maxWidth = RPPMAX2(maxWidth, roiTensorPtrSrc[count].xywhROI.roiWidth); + maxDstHeight = RPPMAX2(maxDstHeight, roiTensorPtrDst[count].xywhROI.roiHeight); + maxDstWidth = RPPMAX2(maxDstWidth, roiTensorPtrDst[count].xywhROI.roiWidth); + + count++; + } + closedir(dr1); + + // Set numDims, offset, n/c/h/w values for src/dst + + srcDescPtr->numDims 
= 4; + dstDescPtr->numDims = 4; + + srcDescPtr->offset = 0; + dstDescPtr->offset = 0; + + srcDescPtr->n = noOfImages; + srcDescPtr->h = maxHeight; + srcDescPtr->w = maxWidth; + srcDescPtr->c = ip_channel; + + dstDescPtr->n = noOfImages; + dstDescPtr->h = maxDstHeight; + dstDescPtr->w = maxDstWidth; + dstDescPtr->c = ip_channel; + + // Optionally set w stride as a multiple of 8 for src/dst + + srcDescPtr->w = ((srcDescPtr->w / 8) * 8) + 8; + dstDescPtr->w = ((dstDescPtr->w / 8) * 8) + 8; + + // Set n/c/h/w strides for src/dst + + srcDescPtr->strides.nStride = ip_channel * srcDescPtr->w * srcDescPtr->h; + srcDescPtr->strides.hStride = ip_channel * srcDescPtr->w; + srcDescPtr->strides.wStride = ip_channel; + srcDescPtr->strides.cStride = 1; + + if (dstDescPtr->layout == RpptLayout::NHWC) + { + dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.hStride = ip_channel * dstDescPtr->w; + dstDescPtr->strides.wStride = ip_channel; + dstDescPtr->strides.cStride = 1; + } + else if (dstDescPtr->layout == RpptLayout::NCHW) + { + dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.cStride = dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.hStride = dstDescPtr->w; + dstDescPtr->strides.wStride = 1; + } + + // Set buffer sizes for src/dst + + ioBufferSize = (unsigned long long)srcDescPtr->h * (unsigned long long)srcDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages; + oBufferSize = (unsigned long long)dstDescPtr->h * (unsigned long long)dstDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages; + + // Initialize host buffers for src/dst + + Rpp8u *input = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + Rpp8u *input_second = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + Rpp8u *output = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u)); + + Rpp16f *inputf16 = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f)); + Rpp16f *inputf16_second = (Rpp16f 
*)calloc(ioBufferSize, sizeof(Rpp16f)); + Rpp16f *outputf16 = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f)); + + Rpp32f *inputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f)); + Rpp32f *inputf32_second = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f)); + Rpp32f *outputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f)); + + Rpp8s *inputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s)); + Rpp8s *inputi8_second = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s)); + Rpp8s *outputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s)); + + // Set 8u host buffers for src/dst + + DIR *dr2 = opendir(src); + DIR *dr2_second = opendir(src_second); + count = 0; + i = 0; + + while ((de = readdir(dr2)) != NULL) + { + Rpp8u *input_temp, *input_second_temp; + input_temp = input + (i * srcDescPtr->strides.nStride); + input_second_temp = input_second + (i * srcDescPtr->strides.nStride); + + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + + char temp[1000]; + strcpy(temp, src1); + strcat(temp, de->d_name); + + char temp_second[1000]; + strcpy(temp_second, src1_second); + strcat(temp_second, de->d_name); + + image = imread(temp, 1); + image_second = imread(temp_second, 1); + + Rpp8u *ip_image = image.data; + Rpp8u *ip_image_second = image_second.data; + + Rpp32u elementsInRow = roiTensorPtrSrc[i].xywhROI.roiWidth * ip_channel; + + for (j = 0; j < roiTensorPtrSrc[i].xywhROI.roiHeight; j++) + { + memcpy(input_temp, ip_image, elementsInRow * sizeof (Rpp8u)); + memcpy(input_second_temp, ip_image_second, elementsInRow * sizeof (Rpp8u)); + ip_image += elementsInRow; + ip_image_second += elementsInRow; + input_temp += srcDescPtr->strides.hStride; + input_second_temp += srcDescPtr->strides.hStride; + } + i++; + count += srcDescPtr->strides.nStride; + } + closedir(dr2); + + // Convert inputs to test various other bit depths + + if (ip_bitDepth == 1) + { + Rpp8u *inputTemp, *input_secondTemp; + Rpp16f *inputf16Temp, *inputf16_secondTemp; + + inputTemp = input; + 
input_secondTemp = input_second; + + inputf16Temp = inputf16; + inputf16_secondTemp = inputf16_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputf16Temp = ((Rpp16f)*inputTemp) / 255.0; + *inputf16_secondTemp = ((Rpp16f)*input_secondTemp) / 255.0; + inputTemp++; + inputf16Temp++; + input_secondTemp++; + inputf16_secondTemp++; + } + } + else if (ip_bitDepth == 2) + { + Rpp8u *inputTemp, *input_secondTemp; + Rpp32f *inputf32Temp, *inputf32_secondTemp; + + inputTemp = input; + input_secondTemp = input_second; + + inputf32Temp = inputf32; + inputf32_secondTemp = inputf32_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputf32Temp = ((Rpp32f)*inputTemp) / 255.0; + *inputf32_secondTemp = ((Rpp32f)*input_secondTemp) / 255.0; + inputTemp++; + inputf32Temp++; + input_secondTemp++; + inputf32_secondTemp++; + } + } + else if (ip_bitDepth == 5) + { + Rpp8u *inputTemp, *input_secondTemp; + Rpp8s *inputi8Temp, *inputi8_secondTemp; + + inputTemp = input; + input_secondTemp = input_second; + + inputi8Temp = inputi8; + inputi8_secondTemp = inputi8_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputi8Temp = (Rpp8s) (((Rpp32s) *inputTemp) - 128); + *inputi8_secondTemp = (Rpp8s) (((Rpp32s) *input_secondTemp) - 128); + inputTemp++; + inputi8Temp++; + input_secondTemp++; + inputi8_secondTemp++; + } + } + + // Run case-wise RPP API and measure time + + rppHandle_t handle; + rppCreateWithBatchSize(&handle, noOfImages); + clock_t start, end; + double start_omp, end_omp; + double cpu_time_used, omp_time_used; + + string test_case_name; + + switch (test_case) + { + case 0: + { + test_case_name = "brightness"; + + Rpp32f alpha[images]; + Rpp32f beta[images]; + for (i = 0; i < images; i++) + { + alpha[i] = 1.75; + beta[i] = 50; + + // xywhROI override sample + // roiTensorPtrSrc[i].xywhROI.xy.x = 0; + // roiTensorPtrSrc[i].xywhROI.xy.y = 0; + // roiTensorPtrSrc[i].xywhROI.roiWidth = 100; + // roiTensorPtrSrc[i].xywhROI.roiHeight = 180; + + // ltrbROI override 
sample + // roiTensorPtrSrc[i].ltrbROI.lt.x = 50; + // roiTensorPtrSrc[i].ltrbROI.lt.y = 50; + // roiTensorPtrSrc[i].ltrbROI.rb.x = 199; + // roiTensorPtrSrc[i].ltrbROI.rb.y = 149; + } + + // Change RpptRoiType for ltrbROI override sample + // roiTypeSrc = RpptRoiType::LTRB; + // roiTypeDst = RpptRoiType::LTRB; + + start_omp = omp_get_wtime(); + start = clock(); + if (ip_bitDepth == 0) + rppt_brightness_host(input, srcDescPtr, output, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 1) + rppt_brightness_host(inputf16, srcDescPtr, outputf16, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 2) + rppt_brightness_host(inputf32, srcDescPtr, outputf32, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 3) + missingFuncFlag = 1; + else if (ip_bitDepth == 4) + missingFuncFlag = 1; + else if (ip_bitDepth == 5) + rppt_brightness_host(inputi8, srcDescPtr, outputi8, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 6) + missingFuncFlag = 1; + else + missingFuncFlag = 1; + end = clock(); + end_omp = omp_get_wtime(); + + break; + } + default: + missingFuncFlag = 1; + break; + } + + if (missingFuncFlag == 1) + { + printf("\nThe functionality %s doesn't yet exist in RPP\n", func); + return -1; + } + + // Display measured times + + cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC; + omp_time_used = end_omp - start_omp; + cout << "\nCPU Time - BatchPD : " << cpu_time_used; + cout << "\nOMP Time - BatchPD : " << omp_time_used; + printf("\n"); + + // Reconvert other bit depths to 8u for output display purposes + + string fileName = std::to_string(ip_bitDepth); + ofstream outputFile (fileName + ".csv"); + + if (ip_bitDepth == 0) + { + Rpp8u *outputTemp; + outputTemp = output; + + if (outputFile.is_open()) + { + for (int i = 0; i < oBufferSize; i++) + { + outputFile << (Rpp32u) *outputTemp << ","; + outputTemp++; + } + 
outputFile.close(); + } + else + cout << "Unable to open file!"; + + } + else if ((ip_bitDepth == 1) || (ip_bitDepth == 3)) + { + Rpp8u *outputTemp; + outputTemp = output; + Rpp16f *outputf16Temp; + outputf16Temp = outputf16; + + if (outputFile.is_open()) + { + for (int i = 0; i < oBufferSize; i++) + { + outputFile << *outputf16Temp << ","; + *outputTemp = (Rpp8u)RPPPIXELCHECK(*outputf16Temp * 255.0); + outputf16Temp++; + outputTemp++; + } + outputFile.close(); + } + else + cout << "Unable to open file!"; + + } + else if ((ip_bitDepth == 2) || (ip_bitDepth == 4)) + { + Rpp8u *outputTemp; + outputTemp = output; + Rpp32f *outputf32Temp; + outputf32Temp = outputf32; + + if (outputFile.is_open()) + { + for (int i = 0; i < oBufferSize; i++) + { + outputFile << *outputf32Temp << ","; + *outputTemp = (Rpp8u)RPPPIXELCHECK(*outputf32Temp * 255.0); + outputf32Temp++; + outputTemp++; + } + outputFile.close(); + } + else + cout << "Unable to open file!"; + } + else if ((ip_bitDepth == 5) || (ip_bitDepth == 6)) + { + Rpp8u *outputTemp; + outputTemp = output; + Rpp8s *outputi8Temp; + outputi8Temp = outputi8; + + if (outputFile.is_open()) + { + for (int i = 0; i < oBufferSize; i++) + { + outputFile << (Rpp32s) *outputi8Temp << ","; + *outputTemp = (Rpp8u) RPPPIXELCHECK(((Rpp32s) *outputi8Temp) + 128); + outputi8Temp++; + outputTemp++; + } + outputFile.close(); + } + else + cout << "Unable to open file!"; + } + + // Calculate exact dstROI in XYWH format for OpenCV dump + + if (roiTypeSrc == RpptRoiType::LTRB) + { + for (int i = 0; i < dstDescPtr->n; i++) + { + int ltX = roiTensorPtrSrc[i].ltrbROI.lt.x; + int ltY = roiTensorPtrSrc[i].ltrbROI.lt.y; + int rbX = roiTensorPtrSrc[i].ltrbROI.rb.x; + int rbY = roiTensorPtrSrc[i].ltrbROI.rb.y; + + roiTensorPtrSrc[i].xywhROI.xy.x = ltX; + roiTensorPtrSrc[i].xywhROI.xy.y = ltY; + roiTensorPtrSrc[i].xywhROI.roiWidth = rbX - ltX + 1; + roiTensorPtrSrc[i].xywhROI.roiHeight = rbY - ltY + 1; + } + } + + RpptROI roiDefault; + RpptROIPtr 
roiPtrDefault; + roiPtrDefault = &roiDefault; + roiPtrDefault->xywhROI.xy.x = 0; + roiPtrDefault->xywhROI.xy.y = 0; + roiPtrDefault->xywhROI.roiWidth = dstDescPtr->w; + roiPtrDefault->xywhROI.roiHeight = dstDescPtr->h; + + for (int i = 0; i < dstDescPtr->n; i++) + { + roiTensorPtrSrc[i].xywhROI.roiWidth = RPPMIN2(roiPtrDefault->xywhROI.roiWidth - roiTensorPtrSrc[i].xywhROI.xy.x, roiTensorPtrSrc[i].xywhROI.roiWidth); + roiTensorPtrSrc[i].xywhROI.roiHeight = RPPMIN2(roiPtrDefault->xywhROI.roiHeight - roiTensorPtrSrc[i].xywhROI.xy.y, roiTensorPtrSrc[i].xywhROI.roiHeight); + roiTensorPtrSrc[i].xywhROI.xy.x = RPPMAX2(roiPtrDefault->xywhROI.xy.x, roiTensorPtrSrc[i].xywhROI.xy.x); + roiTensorPtrSrc[i].xywhROI.xy.y = RPPMAX2(roiPtrDefault->xywhROI.xy.y, roiTensorPtrSrc[i].xywhROI.xy.y); + } + + // Convert any PLN3 outputs to the corresponding PKD3 version for OpenCV dump + + if (dstDescPtr->layout == RpptLayout::NCHW) + { + Rpp8u *outputCopy = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u)); + memcpy(outputCopy, output, oBufferSize * sizeof(Rpp8u)); + + Rpp8u *outputTemp, *outputCopyTemp; + outputTemp = output; + outputCopyTemp = outputCopy; + + for (int count = 0; count < dstDescPtr->n; count++) + { + Rpp8u *outputCopyTempR, *outputCopyTempG, *outputCopyTempB; + outputCopyTempR = outputCopyTemp; + outputCopyTempG = outputCopyTempR + dstDescPtr->strides.cStride; + outputCopyTempB = outputCopyTempG + dstDescPtr->strides.cStride; + + for (int i = 0; i < dstDescPtr->h; i++) + { + for (int j = 0; j < dstDescPtr->w; j++) + { + *outputTemp = *outputCopyTempR; + outputTemp++; + outputCopyTempR++; + *outputTemp = *outputCopyTempG; + outputTemp++; + outputCopyTempG++; + *outputTemp = *outputCopyTempB; + outputTemp++; + outputCopyTempB++; + } + } + + outputCopyTemp += dstDescPtr->strides.nStride; + } + + free(outputCopy); + } + + rppDestroyHost(handle); + + // OpenCV dump + + mkdir(dst, 0700); + strcat(dst, "/"); + count = 0; + + for (j = 0; j < dstDescPtr->n; j++) + { + int height = 
roiTensorPtrSrc[j].xywhROI.roiHeight; + int width = roiTensorPtrSrc[j].xywhROI.roiWidth; + + int op_size = height * width * ip_channel; + Rpp8u *temp_output = (Rpp8u *)calloc(op_size, sizeof(Rpp8u)); + Rpp8u *temp_output_row; + temp_output_row = temp_output; + Rpp32u elementsInRow = width * ip_channel; + Rpp8u *output_row = output + count; + + for (int k = 0; k < height; k++) + { + memcpy(temp_output_row, (output_row), elementsInRow * sizeof (Rpp8u)); + temp_output_row += elementsInRow; + output_row += srcDescPtr->strides.hStride; + } + count += dstDescPtr->strides.nStride; + + char temp[1000]; + strcpy(temp, dst); + strcat(temp, imageNames[j]); + + Mat mat_op_image; + mat_op_image = Mat(height, width, CV_8UC3, temp_output); + imwrite(temp, mat_op_image); + + free(temp_output); + } + + // Free memory + + free(roiTensorPtrSrc); + free(roiTensorPtrDst); + free(input); + free(input_second); + free(output); + free(inputf16); + free(inputf16_second); + free(outputf16); + free(inputf32); + free(inputf32_second); + free(outputf32); + free(inputi8); + free(inputi8_second); + free(outputi8); + + return 0; +} diff --git a/utilities/rpp-unittests/HOST_NEW/Tensor_host_pln1.cpp b/utilities/rpp-unittests/HOST_NEW/Tensor_host_pln1.cpp new file mode 100644 index 000000000..66019ecf4 --- /dev/null +++ b/utilities/rpp-unittests/HOST_NEW/Tensor_host_pln1.cpp @@ -0,0 +1,669 @@ +#include +#include +#include +#include +#include +#include +#include +#include "/opt/rocm/rpp/include/rpp.h" +#include +#include +#include +#include +#include +#include +#include +#include "helpers/testSuite_helper.hpp" + +using namespace cv; +using namespace std; +using half_float::half; + +typedef half Rpp16f; + +#define RPPPIXELCHECK(pixel) (pixel < (Rpp32f)0) ? ((Rpp32f)0) : ((pixel < (Rpp32f)255) ? pixel : ((Rpp32f)255)) +#define RPPMAX2(a,b) ((a > b) ? a : b) +#define RPPMIN2(a,b) ((a < b) ? 
a : b) + +int main(int argc, char **argv) +{ + // Handle inputs + + const int MIN_ARG_COUNT = 8; + + if (argc < MIN_ARG_COUNT) + { + printf("\nImproper Usage! Needs all arguments!\n"); + printf("\nUsage: ./Tensor_host_pln1 f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> pkd = 0 / pkd->pln = 1)> \n"); + return -1; + } + if (atoi(argv[5]) != 0) + { + printf("\nPLN1 cases don't have outputFormatToggle! Please input outputFormatToggle = 0\n"); + return -1; + } + + if (atoi(argv[7]) == 1) + { + printf("\nInputs for this test case are:"); + printf("\nsrc1 = %s", argv[1]); + printf("\nsrc2 = %s", argv[2]); + printf("\ndst = %s", argv[3]); + printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[4]); + printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[5]); + printf("\ncase number (1:7) = %s", argv[6]); + } + + char *src = argv[1]; + char *src_second = argv[2]; + char *dst = argv[3]; + int ip_bitDepth = atoi(argv[4]); + unsigned int outputFormatToggle = atoi(argv[5]); + int test_case = atoi(argv[6]); + + int ip_channel = 1; + + // Set case names + + char funcType[1000] = {"Tensor_HOST_PLN1_toPLN1"}; + + char funcName[1000]; + switch (test_case) + { + case 0: + strcpy(funcName, "brightness"); + break; + } + + // Initialize tensor descriptors + + RpptDesc srcDesc, dstDesc; + RpptDescPtr srcDescPtr, dstDescPtr; + srcDescPtr = &srcDesc; + dstDescPtr = &dstDesc; + + // Set src/dst layouts in tensor descriptors + + srcDescPtr->layout = RpptLayout::NCHW; + dstDescPtr->layout = RpptLayout::NCHW; + + // Set src/dst data types in tensor descriptors + + if (ip_bitDepth == 0) + { + strcat(funcName, "_u8_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::U8; + } + else if (ip_bitDepth == 1) + { + strcat(funcName, "_f16_"); + srcDescPtr->dataType = RpptDataType::F16; + dstDescPtr->dataType = RpptDataType::F16; + } + else if (ip_bitDepth == 2) + { + strcat(funcName, "_f32_"); + srcDescPtr->dataType = 
RpptDataType::F32; + dstDescPtr->dataType = RpptDataType::F32; + } + else if (ip_bitDepth == 3) + { + strcat(funcName, "_u8_f16_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::F16; + } + else if (ip_bitDepth == 4) + { + strcat(funcName, "_u8_f32_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::F32; + } + else if (ip_bitDepth == 5) + { + strcat(funcName, "_i8_"); + srcDescPtr->dataType = RpptDataType::I8; + dstDescPtr->dataType = RpptDataType::I8; + } + else if (ip_bitDepth == 6) + { + strcat(funcName, "_u8_i8_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::I8; + } + + // Other initializations + + int missingFuncFlag = 0; + int i = 0, j = 0; + int maxHeight = 0, maxWidth = 0; + int maxDstHeight = 0, maxDstWidth = 0; + unsigned long long count = 0; + unsigned long long ioBufferSize = 0; + unsigned long long oBufferSize = 0; + static int noOfImages = 0; + Mat image, image_second; + + // String ops on function name + + char func[1000]; + strcpy(func, funcName); + strcat(func, funcType); + printf("\nRunning %s...", func); + + char src1[1000]; + strcpy(src1, src); + strcat(src1, "/"); + + char src1_second[1000]; + strcpy(src1_second, src_second); + strcat(src1_second, "/"); + + strcat(funcName, funcType); + strcat(dst, "/"); + strcat(dst, funcName); + + // Get number of images + + struct dirent *de; + DIR *dr = opendir(src); + while ((de = readdir(dr)) != NULL) + { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + noOfImages += 1; + } + closedir(dr); + + // Initialize ROI tensors for src/dst + + RpptROI *roiTensorPtrSrc = (RpptROI *) calloc(noOfImages, sizeof(RpptROI)); + RpptROI *roiTensorPtrDst = (RpptROI *) calloc(noOfImages, sizeof(RpptROI)); + + // Set ROI tensors types for src/dst + + RpptRoiType roiTypeSrc, roiTypeDst; + roiTypeSrc = RpptRoiType::XYWH; + roiTypeDst = RpptRoiType::XYWH; + + // Set maxHeight, maxWidth and 
ROIs for src/dst + + const int images = noOfImages; + char imageNames[images][1000]; + + DIR *dr1 = opendir(src); + while ((de = readdir(dr1)) != NULL) + { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + strcpy(imageNames[count], de->d_name); + char temp[1000]; + strcpy(temp, src1); + strcat(temp, imageNames[count]); + + image = imread(temp, 0); + + roiTensorPtrSrc[count].xywhROI.xy.x = 0; + roiTensorPtrSrc[count].xywhROI.xy.y = 0; + roiTensorPtrSrc[count].xywhROI.roiWidth = image.cols; + roiTensorPtrSrc[count].xywhROI.roiHeight = image.rows; + + roiTensorPtrDst[count].xywhROI.xy.x = 0; + roiTensorPtrDst[count].xywhROI.xy.y = 0; + roiTensorPtrDst[count].xywhROI.roiWidth = image.cols; + roiTensorPtrDst[count].xywhROI.roiHeight = image.rows; + + maxHeight = RPPMAX2(maxHeight, roiTensorPtrSrc[count].xywhROI.roiHeight); + maxWidth = RPPMAX2(maxWidth, roiTensorPtrSrc[count].xywhROI.roiWidth); + maxDstHeight = RPPMAX2(maxDstHeight, roiTensorPtrDst[count].xywhROI.roiHeight); + maxDstWidth = RPPMAX2(maxDstWidth, roiTensorPtrDst[count].xywhROI.roiWidth); + + count++; + } + closedir(dr1); + + // Set numDims, offset, n/c/h/w values for src/dst + + srcDescPtr->numDims = 4; + dstDescPtr->numDims = 4; + + srcDescPtr->offset = 0; + dstDescPtr->offset = 0; + + srcDescPtr->n = noOfImages; + srcDescPtr->c = ip_channel; + srcDescPtr->h = maxHeight; + srcDescPtr->w = maxWidth; + + dstDescPtr->n = noOfImages; + dstDescPtr->c = ip_channel; + dstDescPtr->h = maxDstHeight; + dstDescPtr->w = maxDstWidth; + + // Optionally set w stride as a multiple of 8 for src/dst + + srcDescPtr->w = ((srcDescPtr->w / 8) * 8) + 8; + dstDescPtr->w = ((dstDescPtr->w / 8) * 8) + 8; + + // Set n/c/h/w strides for src/dst + + srcDescPtr->strides.nStride = ip_channel * srcDescPtr->w * srcDescPtr->h; + srcDescPtr->strides.cStride = srcDescPtr->w * srcDescPtr->h; + srcDescPtr->strides.hStride = srcDescPtr->w; + srcDescPtr->strides.wStride = 1; + + if (dstDescPtr->layout == 
RpptLayout::NHWC) + { + dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.hStride = ip_channel * dstDescPtr->w; + dstDescPtr->strides.wStride = ip_channel; + dstDescPtr->strides.cStride = 1; + } + else if (dstDescPtr->layout == RpptLayout::NCHW) + { + dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.cStride = dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.hStride = dstDescPtr->w; + dstDescPtr->strides.wStride = 1; + } + + // Set buffer sizes for src/dst + + ioBufferSize = (unsigned long long)srcDescPtr->h * (unsigned long long)srcDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages; + oBufferSize = (unsigned long long)dstDescPtr->h * (unsigned long long)dstDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages; + + // Initialize host buffers for src/dst + + Rpp8u *input = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + Rpp8u *input_second = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + Rpp8u *output = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u)); + + Rpp16f *inputf16 = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f)); + Rpp16f *inputf16_second = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f)); + Rpp16f *outputf16 = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f)); + + Rpp32f *inputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f)); + Rpp32f *inputf32_second = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f)); + Rpp32f *outputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f)); + + Rpp8s *inputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s)); + Rpp8s *inputi8_second = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s)); + Rpp8s *outputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s)); + + // Set 8u host buffers for src/dst + + DIR *dr2 = opendir(src); + DIR *dr2_second = opendir(src_second); + count = 0; + i = 0; + + Rpp32u elementsInRowMax = srcDescPtr->w * ip_channel; + + while ((de = readdir(dr2)) != NULL) + { + Rpp8u *input_temp, 
*input_second_temp; + input_temp = input + (i * srcDescPtr->strides.nStride); + input_second_temp = input_second + (i * srcDescPtr->strides.nStride); + + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + + char temp[1000]; + strcpy(temp, src1); + strcat(temp, de->d_name); + + char temp_second[1000]; + strcpy(temp_second, src1_second); + strcat(temp_second, de->d_name); + + image = imread(temp, 0); + image_second = imread(temp_second, 0); + + Rpp8u *ip_image = image.data; + Rpp8u *ip_image_second = image_second.data; + + Rpp32u elementsInRow = roiTensorPtrSrc[i].xywhROI.roiWidth * ip_channel; + + for (j = 0; j < roiTensorPtrSrc[i].xywhROI.roiHeight; j++) + { + memcpy(input_temp, ip_image, elementsInRow * sizeof (Rpp8u)); + memcpy(input_second_temp, ip_image_second, elementsInRow * sizeof (Rpp8u)); + ip_image += elementsInRow; + ip_image_second += elementsInRow; + input_temp += elementsInRowMax; + input_second_temp += elementsInRowMax; + } + i++; + count += srcDescPtr->strides.nStride; + } + closedir(dr2); + + // Convert inputs to test various other bit depths + + if (ip_bitDepth == 1) + { + Rpp8u *inputTemp, *input_secondTemp; + Rpp16f *inputf16Temp, *inputf16_secondTemp; + + inputTemp = input; + input_secondTemp = input_second; + + inputf16Temp = inputf16; + inputf16_secondTemp = inputf16_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputf16Temp = ((Rpp16f)*inputTemp) / 255.0; + *inputf16_secondTemp = ((Rpp16f)*input_secondTemp) / 255.0; + inputTemp++; + inputf16Temp++; + input_secondTemp++; + inputf16_secondTemp++; + } + } + else if (ip_bitDepth == 2) + { + Rpp8u *inputTemp, *input_secondTemp; + Rpp32f *inputf32Temp, *inputf32_secondTemp; + + inputTemp = input; + input_secondTemp = input_second; + + inputf32Temp = inputf32; + inputf32_secondTemp = inputf32_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputf32Temp = ((Rpp32f)*inputTemp) / 255.0; + *inputf32_secondTemp = ((Rpp32f)*input_secondTemp) / 255.0; + 
inputTemp++; + inputf32Temp++; + input_secondTemp++; + inputf32_secondTemp++; + } + } + else if (ip_bitDepth == 5) + { + Rpp8u *inputTemp, *input_secondTemp; + Rpp8s *inputi8Temp, *inputi8_secondTemp; + + inputTemp = input; + input_secondTemp = input_second; + + inputi8Temp = inputi8; + inputi8_secondTemp = inputi8_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputi8Temp = (Rpp8s) (((Rpp32s) *inputTemp) - 128); + *inputi8_secondTemp = (Rpp8s) (((Rpp32s) *input_secondTemp) - 128); + inputTemp++; + inputi8Temp++; + input_secondTemp++; + inputi8_secondTemp++; + } + } + + // Run case-wise RPP API and measure time + + rppHandle_t handle; + rppCreateWithBatchSize(&handle, noOfImages); + clock_t start, end; + double start_omp, end_omp; + double cpu_time_used, omp_time_used; + + string test_case_name; + + switch (test_case) + { + case 0: + { + test_case_name = "brightness"; + + Rpp32f alpha[images]; + Rpp32f beta[images]; + for (i = 0; i < images; i++) + { + alpha[i] = 1.75; + beta[i] = 50; + + // xywhROI override sample + // roiTensorPtrSrc[i].xywhROI.xy.x = 0; + // roiTensorPtrSrc[i].xywhROI.xy.y = 0; + // roiTensorPtrSrc[i].xywhROI.roiWidth = 100; + // roiTensorPtrSrc[i].xywhROI.roiHeight = 180; + + // ltrbROI override sample + // roiTensorPtrSrc[i].ltrbROI.lt.x = 50; + // roiTensorPtrSrc[i].ltrbROI.lt.y = 50; + // roiTensorPtrSrc[i].ltrbROI.rb.x = 199; + // roiTensorPtrSrc[i].ltrbROI.rb.y = 149; + } + + // Change RpptRoiType for ltrbROI override sample + // roiTypeSrc = RpptRoiType::LTRB; + // roiTypeDst = RpptRoiType::LTRB; + + start_omp = omp_get_wtime(); + start = clock(); + if (ip_bitDepth == 0) + rppt_brightness_host(input, srcDescPtr, output, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 1) + rppt_brightness_host(inputf16, srcDescPtr, outputf16, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 2) + rppt_brightness_host(inputf32, srcDescPtr, outputf32, dstDescPtr, 
alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 3) + missingFuncFlag = 1; + else if (ip_bitDepth == 4) + missingFuncFlag = 1; + else if (ip_bitDepth == 5) + rppt_brightness_host(inputi8, srcDescPtr, outputi8, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 6) + missingFuncFlag = 1; + else + missingFuncFlag = 1; + end = clock(); + end_omp = omp_get_wtime(); + + break; + } + default: + missingFuncFlag = 1; + break; + } + + if (missingFuncFlag == 1) + { + printf("\nThe functionality %s doesn't yet exist in RPP\n", func); + return -1; + } + + // Display measured times + + cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC; + omp_time_used = end_omp - start_omp; + cout << "\nCPU Time - BatchPD : " << cpu_time_used; + cout << "\nOMP Time - BatchPD : " << omp_time_used; + printf("\n"); + + // Reconvert other bit depths to 8u for output display purposes + + string fileName = std::to_string(ip_bitDepth); + ofstream outputFile (fileName + ".csv"); + + if (ip_bitDepth == 0) + { + Rpp8u *outputTemp; + outputTemp = output; + + if (outputFile.is_open()) + { + for (int i = 0; i < oBufferSize; i++) + { + outputFile << (Rpp32u) *outputTemp << ","; + outputTemp++; + } + outputFile.close(); + } + else + cout << "Unable to open file!"; + + } + else if ((ip_bitDepth == 1) || (ip_bitDepth == 3)) + { + Rpp8u *outputTemp; + outputTemp = output; + Rpp16f *outputf16Temp; + outputf16Temp = outputf16; + + if (outputFile.is_open()) + { + for (int i = 0; i < oBufferSize; i++) + { + outputFile << *outputf16Temp << ","; + *outputTemp = (Rpp8u)RPPPIXELCHECK(*outputf16Temp * 255.0); + outputf16Temp++; + outputTemp++; + } + outputFile.close(); + } + else + cout << "Unable to open file!"; + + } + else if ((ip_bitDepth == 2) || (ip_bitDepth == 4)) + { + Rpp8u *outputTemp; + outputTemp = output; + Rpp32f *outputf32Temp; + outputf32Temp = outputf32; + + if (outputFile.is_open()) + { + for (int i = 0; i < oBufferSize; i++) + { + 
outputFile << *outputf32Temp << ","; + *outputTemp = (Rpp8u)RPPPIXELCHECK(*outputf32Temp * 255.0); + outputf32Temp++; + outputTemp++; + } + outputFile.close(); + } + else + cout << "Unable to open file!"; + } + else if ((ip_bitDepth == 5) || (ip_bitDepth == 6)) + { + Rpp8u *outputTemp; + outputTemp = output; + Rpp8s *outputi8Temp; + outputi8Temp = outputi8; + + if (outputFile.is_open()) + { + for (int i = 0; i < oBufferSize; i++) + { + outputFile << (Rpp32s) *outputi8Temp << ","; + *outputTemp = (Rpp8u) RPPPIXELCHECK(((Rpp32s) *outputi8Temp) + 128); + outputi8Temp++; + outputTemp++; + } + outputFile.close(); + } + else + cout << "Unable to open file!"; + } + + // Calculate exact dstROI in XYWH format for OpenCV dump + + if (roiTypeSrc == RpptRoiType::LTRB) + { + for (int i = 0; i < dstDescPtr->n; i++) + { + int ltX = roiTensorPtrSrc[i].ltrbROI.lt.x; + int ltY = roiTensorPtrSrc[i].ltrbROI.lt.y; + int rbX = roiTensorPtrSrc[i].ltrbROI.rb.x; + int rbY = roiTensorPtrSrc[i].ltrbROI.rb.y; + + roiTensorPtrSrc[i].xywhROI.xy.x = ltX; + roiTensorPtrSrc[i].xywhROI.xy.y = ltY; + roiTensorPtrSrc[i].xywhROI.roiWidth = rbX - ltX + 1; + roiTensorPtrSrc[i].xywhROI.roiHeight = rbY - ltY + 1; + } + } + + RpptROI roiDefault; + RpptROIPtr roiPtrDefault; + roiPtrDefault = &roiDefault; + roiPtrDefault->xywhROI.xy.x = 0; + roiPtrDefault->xywhROI.xy.y = 0; + roiPtrDefault->xywhROI.roiWidth = dstDescPtr->w; + roiPtrDefault->xywhROI.roiHeight = dstDescPtr->h; + + for (int i = 0; i < dstDescPtr->n; i++) + { + roiTensorPtrSrc[i].xywhROI.roiWidth = RPPMIN2(roiPtrDefault->xywhROI.roiWidth - roiTensorPtrSrc[i].xywhROI.xy.x, roiTensorPtrSrc[i].xywhROI.roiWidth); + roiTensorPtrSrc[i].xywhROI.roiHeight = RPPMIN2(roiPtrDefault->xywhROI.roiHeight - roiTensorPtrSrc[i].xywhROI.xy.y, roiTensorPtrSrc[i].xywhROI.roiHeight); + roiTensorPtrSrc[i].xywhROI.xy.x = RPPMAX2(roiPtrDefault->xywhROI.xy.x, roiTensorPtrSrc[i].xywhROI.xy.x); + roiTensorPtrSrc[i].xywhROI.xy.y = RPPMAX2(roiPtrDefault->xywhROI.xy.y, 
roiTensorPtrSrc[i].xywhROI.xy.y); + } + + rppDestroyHost(handle); + + // OpenCV dump + + mkdir(dst, 0700); + strcat(dst, "/"); + count = 0; + elementsInRowMax = dstDescPtr->w * ip_channel; + + for (j = 0; j < dstDescPtr->n; j++) + { + int height = roiTensorPtrSrc[j].xywhROI.roiHeight; + int width = roiTensorPtrSrc[j].xywhROI.roiWidth; + + int op_size = height * width * ip_channel; + Rpp8u *temp_output = (Rpp8u *)calloc(op_size, sizeof(Rpp8u)); + Rpp8u *temp_output_row; + temp_output_row = temp_output; + Rpp32u elementsInRow = width * ip_channel; + Rpp8u *output_row = output + count; + + for (int k = 0; k < height; k++) + { + memcpy(temp_output_row, (output_row), elementsInRow * sizeof (Rpp8u)); + temp_output_row += elementsInRow; + output_row += elementsInRowMax; + } + count += dstDescPtr->strides.nStride; + + char temp[1000]; + strcpy(temp, dst); + strcat(temp, imageNames[j]); + + Mat mat_op_image; + mat_op_image = Mat(height, width, CV_8UC1, temp_output); + imwrite(temp, mat_op_image); + + free(temp_output); + } + + // Free memory + + free(roiTensorPtrSrc); + free(roiTensorPtrDst); + free(input); + free(input_second); + free(output); + free(inputf16); + free(inputf16_second); + free(outputf16); + free(inputf32); + free(inputf32_second); + free(outputf32); + free(inputi8); + free(inputi8_second); + free(outputi8); + + return 0; +} diff --git a/utilities/rpp-unittests/HOST_NEW/Tensor_host_pln3.cpp b/utilities/rpp-unittests/HOST_NEW/Tensor_host_pln3.cpp new file mode 100644 index 000000000..13435e7fa --- /dev/null +++ b/utilities/rpp-unittests/HOST_NEW/Tensor_host_pln3.cpp @@ -0,0 +1,787 @@ +#include +#include +#include +#include +#include +#include +#include +#include "/opt/rocm/rpp/include/rpp.h" +#include +#include +#include +#include +#include +#include +#include + +using namespace cv; +using namespace std; +using half_float::half; + +typedef half Rpp16f; + +#define RPPPIXELCHECK(pixel) (pixel < (Rpp32f)0) ? ((Rpp32f)0) : ((pixel < (Rpp32f)255) ? 
pixel : ((Rpp32f)255)) +#define RPPMAX2(a,b) ((a > b) ? a : b) +#define RPPMIN2(a,b) ((a < b) ? a : b) + +int main(int argc, char **argv) +{ + // Handle inputs + + const int MIN_ARG_COUNT = 8; + + if (argc < MIN_ARG_COUNT) + { + printf("\nImproper Usage! Needs all arguments!\n"); + printf("\nUsage: ./Tensor_host_pln3 f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> pkd = 0 / pkd->pln = 1)> \n"); + return -1; + } + + if (atoi(argv[7]) == 1) + { + printf("\nInputs for this test case are:"); + printf("\nsrc1 = %s", argv[1]); + printf("\nsrc2 = %s", argv[2]); + printf("\ndst = %s", argv[3]); + printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[4]); + printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[5]); + printf("\ncase number (1:7) = %s", argv[6]); + } + + char *src = argv[1]; + char *src_second = argv[2]; + char *dst = argv[3]; + int ip_bitDepth = atoi(argv[4]); + unsigned int outputFormatToggle = atoi(argv[5]); + int test_case = atoi(argv[6]); + + int ip_channel = 3; + + // Set case names + + char funcType[1000] = {"Tensor_HOST_PLN3"}; + + char funcName[1000]; + switch (test_case) + { + case 0: + strcpy(funcName, "brightness"); + break; + } + + // Initialize tensor descriptors + + RpptDesc srcDesc, dstDesc; + RpptDescPtr srcDescPtr, dstDescPtr; + srcDescPtr = &srcDesc; + dstDescPtr = &dstDesc; + + // Set src/dst layouts in tensor descriptors + + if (outputFormatToggle == 0) + { + strcat(funcType, "_toPLN3"); + srcDescPtr->layout = RpptLayout::NCHW; + dstDescPtr->layout = RpptLayout::NCHW; + } + else if (outputFormatToggle == 1) + { + strcat(funcType, "_toPKD3"); + srcDescPtr->layout = RpptLayout::NCHW; + dstDescPtr->layout = RpptLayout::NHWC; + } + + // Set src/dst data types in tensor descriptors + + if (ip_bitDepth == 0) + { + strcat(funcName, "_u8_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::U8; + } + else if (ip_bitDepth == 1) + { + strcat(funcName, "_f16_"); + 
srcDescPtr->dataType = RpptDataType::F16; + dstDescPtr->dataType = RpptDataType::F16; + } + else if (ip_bitDepth == 2) + { + strcat(funcName, "_f32_"); + srcDescPtr->dataType = RpptDataType::F32; + dstDescPtr->dataType = RpptDataType::F32; + } + else if (ip_bitDepth == 3) + { + strcat(funcName, "_u8_f16_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::F16; + } + else if (ip_bitDepth == 4) + { + strcat(funcName, "_u8_f32_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::F32; + } + else if (ip_bitDepth == 5) + { + strcat(funcName, "_i8_"); + srcDescPtr->dataType = RpptDataType::I8; + dstDescPtr->dataType = RpptDataType::I8; + } + else if (ip_bitDepth == 6) + { + strcat(funcName, "_u8_i8_"); + srcDescPtr->dataType = RpptDataType::U8; + dstDescPtr->dataType = RpptDataType::I8; + } + + // Other initializations + + int missingFuncFlag = 0; + int i = 0, j = 0; + int maxHeight = 0, maxWidth = 0; + int maxDstHeight = 0, maxDstWidth = 0; + unsigned long long count = 0; + unsigned long long ioBufferSize = 0; + unsigned long long oBufferSize = 0; + static int noOfImages = 0; + Mat image, image_second; + + // String ops on function name + + char func[1000]; + strcpy(func, funcName); + strcat(func, funcType); + printf("\nRunning %s...", func); + + char src1[1000]; + strcpy(src1, src); + strcat(src1, "/"); + + char src1_second[1000]; + strcpy(src1_second, src_second); + strcat(src1_second, "/"); + + strcat(funcName, funcType); + strcat(dst, "/"); + strcat(dst, funcName); + + // Get number of images + + struct dirent *de; + DIR *dr = opendir(src); + while ((de = readdir(dr)) != NULL) + { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + noOfImages += 1; + } + closedir(dr); + + // Initialize ROI tensors for src/dst + + RpptROI *roiTensorPtrSrc = (RpptROI *) calloc(noOfImages, sizeof(RpptROI)); + RpptROI *roiTensorPtrDst = (RpptROI *) calloc(noOfImages, sizeof(RpptROI)); + + // 
Set ROI tensors types for src/dst + + RpptRoiType roiTypeSrc, roiTypeDst; + roiTypeSrc = RpptRoiType::XYWH; + roiTypeDst = RpptRoiType::XYWH; + + // Set maxHeight, maxWidth and ROIs for src/dst + + const int images = noOfImages; + char imageNames[images][1000]; + + DIR *dr1 = opendir(src); + while ((de = readdir(dr1)) != NULL) + { + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + strcpy(imageNames[count], de->d_name); + char temp[1000]; + strcpy(temp, src1); + strcat(temp, imageNames[count]); + + image = imread(temp, 1); + + roiTensorPtrSrc[count].xywhROI.xy.x = 0; + roiTensorPtrSrc[count].xywhROI.xy.y = 0; + roiTensorPtrSrc[count].xywhROI.roiWidth = image.cols; + roiTensorPtrSrc[count].xywhROI.roiHeight = image.rows; + + roiTensorPtrDst[count].xywhROI.xy.x = 0; + roiTensorPtrDst[count].xywhROI.xy.y = 0; + roiTensorPtrDst[count].xywhROI.roiWidth = image.cols; + roiTensorPtrDst[count].xywhROI.roiHeight = image.rows; + + maxHeight = RPPMAX2(maxHeight, roiTensorPtrSrc[count].xywhROI.roiHeight); + maxWidth = RPPMAX2(maxWidth, roiTensorPtrSrc[count].xywhROI.roiWidth); + maxDstHeight = RPPMAX2(maxDstHeight, roiTensorPtrDst[count].xywhROI.roiHeight); + maxDstWidth = RPPMAX2(maxDstWidth, roiTensorPtrDst[count].xywhROI.roiWidth); + + count++; + } + closedir(dr1); + + // Set numDims, offset, n/c/h/w values for src/dst + + srcDescPtr->numDims = 4; + dstDescPtr->numDims = 4; + + srcDescPtr->offset = 0; + dstDescPtr->offset = 0; + + srcDescPtr->n = noOfImages; + srcDescPtr->c = ip_channel; + srcDescPtr->h = maxHeight; + srcDescPtr->w = maxWidth; + + dstDescPtr->n = noOfImages; + dstDescPtr->c = ip_channel; + dstDescPtr->h = maxDstHeight; + dstDescPtr->w = maxDstWidth; + + // Optionally set w stride as a multiple of 8 for src/dst + + srcDescPtr->w = ((srcDescPtr->w / 8) * 8) + 8; + dstDescPtr->w = ((dstDescPtr->w / 8) * 8) + 8; + + // Set n/c/h/w strides for src/dst + + srcDescPtr->strides.nStride = ip_channel * srcDescPtr->w * srcDescPtr->h; + 
srcDescPtr->strides.cStride = srcDescPtr->w * srcDescPtr->h; + srcDescPtr->strides.hStride = srcDescPtr->w; + srcDescPtr->strides.wStride = 1; + + if (dstDescPtr->layout == RpptLayout::NHWC) + { + dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.hStride = ip_channel * dstDescPtr->w; + dstDescPtr->strides.wStride = ip_channel; + dstDescPtr->strides.cStride = 1; + } + else if (dstDescPtr->layout == RpptLayout::NCHW) + { + dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.cStride = dstDescPtr->w * dstDescPtr->h; + dstDescPtr->strides.hStride = dstDescPtr->w; + dstDescPtr->strides.wStride = 1; + } + + // Set buffer sizes for src/dst + + ioBufferSize = (unsigned long long)srcDescPtr->h * (unsigned long long)srcDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages; + oBufferSize = (unsigned long long)dstDescPtr->h * (unsigned long long)dstDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages; + + // Initialize host buffers for src/dst + + Rpp8u *input = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + Rpp8u *input_second = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + Rpp8u *output = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u)); + + Rpp16f *inputf16 = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f)); + Rpp16f *inputf16_second = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f)); + Rpp16f *outputf16 = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f)); + + Rpp32f *inputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f)); + Rpp32f *inputf32_second = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f)); + Rpp32f *outputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f)); + + Rpp8s *inputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s)); + Rpp8s *inputi8_second = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s)); + Rpp8s *outputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s)); + + // Set 8u host buffers for src/dst + + DIR *dr2 = opendir(src); + DIR 
*dr2_second = opendir(src_second); + count = 0; + i = 0; + + Rpp32u elementsInRowMax = srcDescPtr->w * ip_channel; + + while ((de = readdir(dr2)) != NULL) + { + Rpp8u *input_temp, *input_second_temp; + input_temp = input + (i * srcDescPtr->strides.nStride); + input_second_temp = input_second + (i * srcDescPtr->strides.nStride); + + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + + char temp[1000]; + strcpy(temp, src1); + strcat(temp, de->d_name); + + char temp_second[1000]; + strcpy(temp_second, src1_second); + strcat(temp_second, de->d_name); + + image = imread(temp, 1); + image_second = imread(temp_second, 1); + + Rpp8u *ip_image = image.data; + Rpp8u *ip_image_second = image_second.data; + + Rpp32u elementsInRow = roiTensorPtrSrc[i].xywhROI.roiWidth * ip_channel; + + for (j = 0; j < roiTensorPtrSrc[i].xywhROI.roiHeight; j++) + { + memcpy(input_temp, ip_image, elementsInRow * sizeof (Rpp8u)); + memcpy(input_second_temp, ip_image_second, elementsInRow * sizeof (Rpp8u)); + ip_image += elementsInRow; + ip_image_second += elementsInRow; + input_temp += elementsInRowMax; + input_second_temp += elementsInRowMax; + } + i++; + count += srcDescPtr->strides.nStride; + } + closedir(dr2); + + // Convert default OpenCV PKD3 to PLN3 for first input batch + + Rpp8u *inputCopy = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + memcpy(inputCopy, input, ioBufferSize * sizeof(Rpp8u)); + + Rpp8u *inputTemp, *inputCopyTemp; + inputTemp = input; + inputCopyTemp = inputCopy; + + for (int count = 0; count < noOfImages; count++) + { + Rpp8u *inputTempR, *inputTempG, *inputTempB; + inputTempR = inputTemp; + inputTempG = inputTempR + srcDescPtr->strides.cStride; + inputTempB = inputTempG + srcDescPtr->strides.cStride; + + for (int i = 0; i < srcDescPtr->h; i++) + { + for (int j = 0; j < srcDescPtr->w; j++) + { + *inputTempR = *inputCopyTemp; + inputCopyTemp++; + inputTempR++; + *inputTempG = *inputCopyTemp; + inputCopyTemp++; + inputTempG++; + *inputTempB = 
*inputCopyTemp; + inputCopyTemp++; + inputTempB++; + } + } + + inputTemp += srcDescPtr->strides.nStride; + } + + free(inputCopy); + + // Convert default OpenCV PKD3 to PLN3 for second input batch + + Rpp8u *inputSecondCopy = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u)); + memcpy(inputSecondCopy, input_second, ioBufferSize * sizeof(Rpp8u)); + + Rpp8u *inputSecondTemp, *inputSecondCopyTemp; + inputSecondTemp = input_second; + inputSecondCopyTemp = inputSecondCopy; + + for (int count = 0; count < noOfImages; count++) + { + Rpp8u *inputSecondTempR, *inputSecondTempG, *inputSecondTempB; + inputSecondTempR = inputSecondTemp; + inputSecondTempG = inputSecondTempR + srcDescPtr->strides.cStride; + inputSecondTempB = inputSecondTempG + srcDescPtr->strides.cStride; + + for (int i = 0; i < srcDescPtr->h; i++) + { + for (int j = 0; j < srcDescPtr->w; j++) + { + *inputSecondTempR = *inputSecondCopyTemp; + inputSecondCopyTemp++; + inputSecondTempR++; + *inputSecondTempG = *inputSecondCopyTemp; + inputSecondCopyTemp++; + inputSecondTempG++; + *inputSecondTempB = *inputSecondCopyTemp; + inputSecondCopyTemp++; + inputSecondTempB++; + } + } + + inputSecondTemp += srcDescPtr->strides.nStride; + } + + free(inputSecondCopy); + + // Convert inputs to test various other bit depths + + if (ip_bitDepth == 1) + { + Rpp8u *inputTemp, *input_secondTemp; + Rpp16f *inputf16Temp, *inputf16_secondTemp; + + inputTemp = input; + input_secondTemp = input_second; + + inputf16Temp = inputf16; + inputf16_secondTemp = inputf16_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputf16Temp = ((Rpp16f)*inputTemp) / 255.0; + *inputf16_secondTemp = ((Rpp16f)*input_secondTemp) / 255.0; + inputTemp++; + inputf16Temp++; + input_secondTemp++; + inputf16_secondTemp++; + } + } + else if (ip_bitDepth == 2) + { + Rpp8u *inputTemp, *input_secondTemp; + Rpp32f *inputf32Temp, *inputf32_secondTemp; + + inputTemp = input; + input_secondTemp = input_second; + + inputf32Temp = inputf32; + inputf32_secondTemp = 
inputf32_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputf32Temp = ((Rpp32f)*inputTemp) / 255.0; + *inputf32_secondTemp = ((Rpp32f)*input_secondTemp) / 255.0; + inputTemp++; + inputf32Temp++; + input_secondTemp++; + inputf32_secondTemp++; + } + } + else if (ip_bitDepth == 5) + { + Rpp8u *inputTemp, *input_secondTemp; + Rpp8s *inputi8Temp, *inputi8_secondTemp; + + inputTemp = input; + input_secondTemp = input_second; + + inputi8Temp = inputi8; + inputi8_secondTemp = inputi8_second; + + for (int i = 0; i < ioBufferSize; i++) + { + *inputi8Temp = (Rpp8s) (((Rpp32s) *inputTemp) - 128); + *inputi8_secondTemp = (Rpp8s) (((Rpp32s) *input_secondTemp) - 128); + inputTemp++; + inputi8Temp++; + input_secondTemp++; + inputi8_secondTemp++; + } + } + + // Run case-wise RPP API and measure time + + rppHandle_t handle; + rppCreateWithBatchSize(&handle, noOfImages); + clock_t start, end; + double start_omp, end_omp; + double cpu_time_used, omp_time_used; + + string test_case_name; + + switch (test_case) + { + case 0: + { + test_case_name = "brightness"; + + Rpp32f alpha[images]; + Rpp32f beta[images]; + for (i = 0; i < images; i++) + { + alpha[i] = 1.75; + beta[i] = 50; + + // xywhROI override sample + // roiTensorPtrSrc[i].xywhROI.xy.x = 0; + // roiTensorPtrSrc[i].xywhROI.xy.y = 0; + // roiTensorPtrSrc[i].xywhROI.roiWidth = 100; + // roiTensorPtrSrc[i].xywhROI.roiHeight = 180; + + // ltrbROI override sample + // roiTensorPtrSrc[i].ltrbROI.lt.x = 50; + // roiTensorPtrSrc[i].ltrbROI.lt.y = 50; + // roiTensorPtrSrc[i].ltrbROI.rb.x = 199; + // roiTensorPtrSrc[i].ltrbROI.rb.y = 149; + } + + // Change RpptRoiType for ltrbROI override sample + // roiTypeSrc = RpptRoiType::LTRB; + // roiTypeDst = RpptRoiType::LTRB; + + start_omp = omp_get_wtime(); + start = clock(); + if (ip_bitDepth == 0) + rppt_brightness_host(input, srcDescPtr, output, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 1) + rppt_brightness_host(inputf16, srcDescPtr, 
outputf16, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 2) + rppt_brightness_host(inputf32, srcDescPtr, outputf32, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 3) + missingFuncFlag = 1; + else if (ip_bitDepth == 4) + missingFuncFlag = 1; + else if (ip_bitDepth == 5) + rppt_brightness_host(inputi8, srcDescPtr, outputi8, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle); + else if (ip_bitDepth == 6) + missingFuncFlag = 1; + else + missingFuncFlag = 1; + end = clock(); + end_omp = omp_get_wtime(); + + break; + } + default: + missingFuncFlag = 1; + break; + } + + if (missingFuncFlag == 1) + { + printf("\nThe functionality %s doesn't yet exist in RPP\n", func); + return -1; + } + + // Display measured times + + cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC; + omp_time_used = end_omp - start_omp; + cout << "\nCPU Time - BatchPD : " << cpu_time_used; + cout << "\nOMP Time - BatchPD : " << omp_time_used; + printf("\n"); + + // Reconvert other bit depths to 8u for output display purposes + + string fileName = std::to_string(ip_bitDepth); + ofstream outputFile (fileName + ".csv"); + + if (ip_bitDepth == 0) + { + Rpp8u *outputTemp; + outputTemp = output; + + if (outputFile.is_open()) + { + for (int i = 0; i < oBufferSize; i++) + { + outputFile << (Rpp32u) *outputTemp << ","; + outputTemp++; + } + outputFile.close(); + } + else + cout << "Unable to open file!"; + + } + else if ((ip_bitDepth == 1) || (ip_bitDepth == 3)) + { + Rpp8u *outputTemp; + outputTemp = output; + Rpp16f *outputf16Temp; + outputf16Temp = outputf16; + + if (outputFile.is_open()) + { + for (int i = 0; i < oBufferSize; i++) + { + outputFile << *outputf16Temp << ","; + *outputTemp = (Rpp8u)RPPPIXELCHECK(*outputf16Temp * 255.0); + outputf16Temp++; + outputTemp++; + } + outputFile.close(); + } + else + cout << "Unable to open file!"; + + } + else if ((ip_bitDepth == 2) || (ip_bitDepth == 4)) + { + Rpp8u 
*outputTemp; + outputTemp = output; + Rpp32f *outputf32Temp; + outputf32Temp = outputf32; + + if (outputFile.is_open()) + { + for (int i = 0; i < oBufferSize; i++) + { + outputFile << *outputf32Temp << ","; + *outputTemp = (Rpp8u)RPPPIXELCHECK(*outputf32Temp * 255.0); + outputf32Temp++; + outputTemp++; + } + outputFile.close(); + } + else + cout << "Unable to open file!"; + } + else if ((ip_bitDepth == 5) || (ip_bitDepth == 6)) + { + Rpp8u *outputTemp; + outputTemp = output; + Rpp8s *outputi8Temp; + outputi8Temp = outputi8; + + if (outputFile.is_open()) + { + for (int i = 0; i < oBufferSize; i++) + { + outputFile << (Rpp32s) *outputi8Temp << ","; + *outputTemp = (Rpp8u) RPPPIXELCHECK(((Rpp32s) *outputi8Temp) + 128); + outputi8Temp++; + outputTemp++; + } + outputFile.close(); + } + else + cout << "Unable to open file!"; + } + + // Calculate exact dstROI in XYWH format for OpenCV dump + + if (roiTypeSrc == RpptRoiType::LTRB) + { + for (int i = 0; i < dstDescPtr->n; i++) + { + int ltX = roiTensorPtrSrc[i].ltrbROI.lt.x; + int ltY = roiTensorPtrSrc[i].ltrbROI.lt.y; + int rbX = roiTensorPtrSrc[i].ltrbROI.rb.x; + int rbY = roiTensorPtrSrc[i].ltrbROI.rb.y; + + roiTensorPtrSrc[i].xywhROI.xy.x = ltX; + roiTensorPtrSrc[i].xywhROI.xy.y = ltY; + roiTensorPtrSrc[i].xywhROI.roiWidth = rbX - ltX + 1; + roiTensorPtrSrc[i].xywhROI.roiHeight = rbY - ltY + 1; + } + } + + RpptROI roiDefault; + RpptROIPtr roiPtrDefault; + roiPtrDefault = &roiDefault; + roiPtrDefault->xywhROI.xy.x = 0; + roiPtrDefault->xywhROI.xy.y = 0; + roiPtrDefault->xywhROI.roiWidth = dstDescPtr->w; + roiPtrDefault->xywhROI.roiHeight = dstDescPtr->h; + + for (int i = 0; i < dstDescPtr->n; i++) + { + roiTensorPtrSrc[i].xywhROI.roiWidth = RPPMIN2(roiPtrDefault->xywhROI.roiWidth - roiTensorPtrSrc[i].xywhROI.xy.x, roiTensorPtrSrc[i].xywhROI.roiWidth); + roiTensorPtrSrc[i].xywhROI.roiHeight = RPPMIN2(roiPtrDefault->xywhROI.roiHeight - roiTensorPtrSrc[i].xywhROI.xy.y, roiTensorPtrSrc[i].xywhROI.roiHeight); + 
roiTensorPtrSrc[i].xywhROI.xy.x = RPPMAX2(roiPtrDefault->xywhROI.xy.x, roiTensorPtrSrc[i].xywhROI.xy.x); + roiTensorPtrSrc[i].xywhROI.xy.y = RPPMAX2(roiPtrDefault->xywhROI.xy.y, roiTensorPtrSrc[i].xywhROI.xy.y); + } + + // Convert any PLN3 outputs to the corresponding PKD3 version for OpenCV dump + + if (dstDescPtr->layout == RpptLayout::NCHW) + { + Rpp8u *outputCopy = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u)); + memcpy(outputCopy, output, oBufferSize * sizeof(Rpp8u)); + + Rpp8u *outputTemp, *outputCopyTemp; + outputTemp = output; + outputCopyTemp = outputCopy; + + for (int count = 0; count < dstDescPtr->n; count++) + { + Rpp8u *outputCopyTempR, *outputCopyTempG, *outputCopyTempB; + outputCopyTempR = outputCopyTemp; + outputCopyTempG = outputCopyTempR + dstDescPtr->strides.cStride; + outputCopyTempB = outputCopyTempG + dstDescPtr->strides.cStride; + + for (int i = 0; i < dstDescPtr->h; i++) + { + for (int j = 0; j < dstDescPtr->w; j++) + { + *outputTemp = *outputCopyTempR; + outputTemp++; + outputCopyTempR++; + *outputTemp = *outputCopyTempG; + outputTemp++; + outputCopyTempG++; + *outputTemp = *outputCopyTempB; + outputTemp++; + outputCopyTempB++; + } + } + + outputCopyTemp += dstDescPtr->strides.nStride; + } + + free(outputCopy); + } + + rppDestroyHost(handle); + + // OpenCV dump + + mkdir(dst, 0700); + strcat(dst, "/"); + count = 0; + elementsInRowMax = dstDescPtr->w * ip_channel; + + for (j = 0; j < dstDescPtr->n; j++) + { + int height = roiTensorPtrSrc[j].xywhROI.roiHeight; + int width = roiTensorPtrSrc[j].xywhROI.roiWidth; + + int op_size = height * width * ip_channel; + Rpp8u *temp_output = (Rpp8u *)calloc(op_size, sizeof(Rpp8u)); + Rpp8u *temp_output_row; + temp_output_row = temp_output; + Rpp32u elementsInRow = width * ip_channel; + Rpp8u *output_row = output + count; + + for (int k = 0; k < height; k++) + { + memcpy(temp_output_row, (output_row), elementsInRow * sizeof (Rpp8u)); + temp_output_row += elementsInRow; + output_row += elementsInRowMax; + } + 
count += dstDescPtr->strides.nStride; + + char temp[1000]; + strcpy(temp, dst); + strcat(temp, imageNames[j]); + + Mat mat_op_image; + mat_op_image = Mat(height, width, CV_8UC3, temp_output); + imwrite(temp, mat_op_image); + + free(temp_output); + } + + // Free memory + + free(roiTensorPtrSrc); + free(roiTensorPtrDst); + free(input); + free(input_second); + free(output); + free(inputf16); + free(inputf16_second); + free(outputf16); + free(inputf32); + free(inputf32_second); + free(outputf32); + free(inputi8); + free(inputi8_second); + free(outputi8); + + return 0; +} diff --git a/utilities/rpp-unittests/HOST_NEW/testAllScript.sh b/utilities/rpp-unittests/HOST_NEW/testAllScript.sh index 302f0524f..10de9c954 100755 --- a/utilities/rpp-unittests/HOST_NEW/testAllScript.sh +++ b/utilities/rpp-unittests/HOST_NEW/testAllScript.sh @@ -196,6 +196,10 @@ do printf "\n./BatchPD_host_pkd3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $DST_FOLDER_TEMP $bitDepth $outputFormatToggle $case 0" ./BatchPD_host_pkd3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$DST_FOLDER_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" + + printf "\n./Tensor_host_pkd3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $DST_FOLDER_TEMP $bitDepth $outputFormatToggle $case 0" + ./Tensor_host_pkd3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$DST_FOLDER_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" + echo "------------------------------------------------------------------------------------------" done done @@ -252,6 +256,10 @@ do printf "\n./BatchPD_host_pln1 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $DST_FOLDER_TEMP $bitDepth $outputFormatToggle $case 0" ./BatchPD_host_pln1 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$DST_FOLDER_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" + + printf "\n./Tensor_host_pln1 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $DST_FOLDER_TEMP $bitDepth $outputFormatToggle $case 0" + ./Tensor_host_pln1 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$DST_FOLDER_TEMP" "$bitDepth" "$outputFormatToggle" 
"$case" "0" + echo "------------------------------------------------------------------------------------------" done done @@ -308,6 +316,10 @@ do printf "\n./BatchPD_host_pln3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $DST_FOLDER_TEMP $bitDepth $outputFormatToggle $case 0" ./BatchPD_host_pln3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$DST_FOLDER_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" + + printf "\n./Tensor_host_pln3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $DST_FOLDER_TEMP $bitDepth $outputFormatToggle $case 0" + ./Tensor_host_pln3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$DST_FOLDER_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" + echo "------------------------------------------------------------------------------------------" done done @@ -328,7 +340,7 @@ then printf "\n\nUsage: ./uniqueFunctionalities_host f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> " - for ((case=0;case<13;case++)) + for ((case=0;case<12;case++)) do printf "\n\n\n\n" | tee -a "$DST_FOLDER/uniqueFunctionalities_host_log.txt" echo "--------------------------------" | tee -a "$DST_FOLDER/uniqueFunctionalities_host_log.txt" diff --git a/utilities/rpp-unittests/HOST_NEW/uniqueFunctionalities_host.cpp b/utilities/rpp-unittests/HOST_NEW/uniqueFunctionalities_host.cpp index 2f1f4bff5..dd3574ba0 100644 --- a/utilities/rpp-unittests/HOST_NEW/uniqueFunctionalities_host.cpp +++ b/utilities/rpp-unittests/HOST_NEW/uniqueFunctionalities_host.cpp @@ -109,6 +109,9 @@ int main(int argc, char **argv) printf("\nip_bitDepth = %d\ntest_case = %d", ip_bitDepth, test_case); + rppHandle_t handle; + rppCreate(&handle); + clock_t start, end; double start_omp, end_omp; double cpu_time_used, omp_time_used; @@ -122,54 +125,6 @@ int main(int argc, char **argv) { test_case_name = "tensor_transpose"; - // Test Case 1 - Rpp32u totalNumberOfElements = 36; - Rpp32u tensorDimension = 3; - Rpp32u tensorDimensionValues[3] = {3, 3, 4}; - Rpp32u dimension1 = 0, dimension2 = 1; - Rpp8u srcPtr[36] = {255, 254, 253, 252, 
251, 250, 249, 248, 247, 246, 245, 244, 130, 129, 128, 127, 126, 117, 113, 121, 127, 111, 100, 108, 65, 66, 67, 68, 69, 70, 71, 72, 13, 24, 15, 16}; - Rpp8u dstPtr[36] = {0}; - - // Test Case 2 - // Rpp32u totalNumberOfElements = 48; - // Rpp32u tensorDimension = 3; - // Rpp32u tensorDimensionValues[3] = {4, 4, 3}; - // Rpp32u dimension1 = 0, dimension2 = 1; - // Rpp8u srcPtr[48] = {255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 130, 129, 128, 127, 126, 117, 113, 121, 127, 111, 100, 108, 91, 95, 92, 98, 65, 66, 67, 68, 69, 70, 71, 72, 49, 47, 55, 51, 41, 39, 38, 34, 13, 24, 15, 16}; - // Rpp8u dstPtr[48] = {0}; - - start = clock(); - start_omp = omp_get_wtime(); - if (ip_bitDepth == 0) - rppi_tensor_transpose_u8_host(srcPtr, dstPtr, dimension1, dimension2, tensorDimension, tensorDimensionValues); - else if (ip_bitDepth == 1) - missingFuncFlag = 1; - else if (ip_bitDepth == 2) - missingFuncFlag = 1; - else if (ip_bitDepth == 3) - missingFuncFlag = 1; - else if (ip_bitDepth == 4) - missingFuncFlag = 1; - else if (ip_bitDepth == 5) - missingFuncFlag = 1; - else if (ip_bitDepth == 6) - missingFuncFlag = 1; - else - missingFuncFlag = 1; - end_omp = omp_get_wtime(); - end = clock(); - - printf("\n\nInput:\n"); - displayTensor(srcPtr, totalNumberOfElements); - printf("\n\nOutput of tensor_transpose:\n"); - displayTensor(dstPtr, totalNumberOfElements); - - break; - } - case 1: - { - test_case_name = "transpose"; - // Test Case 1 // Rpp32u totalNumberOfElements = 24; // Rpp32u perm[4] = {0, 3, 1, 2}; @@ -185,9 +140,9 @@ int main(int argc, char **argv) Rpp32u perm[4] = {0, 3, 1, 2}; Rpp32u shape[4] = {2, 4, 5, 3}; Rpp8u srcPtr[120] = { - 255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 130, 129, 128, 127, 126, 125, 124, 123, 122, 121, 120, 119, 5, 4, 3, 2, 1, 0, - 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 115, 114, 113, 112, 111, 110, - 240, 239, 238, 237, 236, 235, 234, 233, 232, 231, 230, 229, 
200, 199, 198, 197, 196, 195, 194, 193, 192, 191, 190, 189, 140, 139, 138, 137, 136, 135, + 255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 130, 129, 128, 127, 126, 125, 124, 123, 122, 121, 120, 119, 5, 4, 3, 2, 1, 0, + 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 115, 114, 113, 112, 111, 110, + 240, 239, 238, 237, 236, 235, 234, 233, 232, 231, 230, 229, 200, 199, 198, 197, 196, 195, 194, 193, 192, 191, 190, 189, 140, 139, 138, 137, 136, 135, 70, 69, 68, 67, 66, 65, 64, 63, 62, 61, 60, 59, 170, 169, 168, 167, 166, 165, 164, 163, 162, 161, 160, 159, 15, 14, 13, 12, 11, 10 }; Rpp8u dstPtr[120] = {0}; @@ -206,17 +161,17 @@ int main(int argc, char **argv) start = clock(); start_omp = omp_get_wtime(); if (ip_bitDepth == 0) - rppi_transpose_u8_host(srcPtr, dstPtr, perm, shape); + rppi_tensor_transpose_u8_host(srcPtr, dstPtr, shape, perm, handle); else if (ip_bitDepth == 1) - rppi_transpose_f16_host(srcPtr16f, dstPtr16f, perm, shape); + rppi_tensor_transpose_f16_host(srcPtr16f, dstPtr16f, shape, perm, handle); else if (ip_bitDepth == 2) - rppi_transpose_f32_host(srcPtr32f, dstPtr32f, perm, shape); + rppi_tensor_transpose_f32_host(srcPtr32f, dstPtr32f, shape, perm, handle); else if (ip_bitDepth == 3) missingFuncFlag = 1; else if (ip_bitDepth == 4) missingFuncFlag = 1; else if (ip_bitDepth == 5) - rppi_transpose_i8_host(srcPtr8s, dstPtr8s, perm, shape); + rppi_tensor_transpose_i8_host(srcPtr8s, dstPtr8s, shape, perm, handle); else if (ip_bitDepth == 6) missingFuncFlag = 1; else @@ -260,15 +215,15 @@ int main(int argc, char **argv) missingFuncFlag = 1; else missingFuncFlag = 1; - + break; } - case 2: + case 1: { test_case_name = "tensor_add"; rppHandle_t handle; - + Rpp8u srcPtr1[36] = {255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 130, 129, 128, 127, 126, 117, 113, 121, 127, 111, 100, 108, 65, 66, 67, 68, 69, 70, 71, 72, 13, 24, 15, 16}; Rpp8u srcPtr2[36] = {16, 15, 24, 13, 72, 71, 70, 69, 68, 67, 
66, 65, 108, 100, 111, 127, 121, 113, 117, 126, 127, 128, 129, 130, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255}; @@ -318,12 +273,12 @@ int main(int argc, char **argv) break; } - case 3: + case 2: { test_case_name = "tensor_subtract"; rppHandle_t handle; - + Rpp8u srcPtr1[36] = {255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 130, 129, 128, 127, 126, 117, 113, 121, 127, 111, 100, 108, 65, 66, 67, 68, 69, 70, 71, 72, 13, 24, 15, 16}; Rpp8u srcPtr2[36] = {16, 15, 24, 13, 72, 71, 70, 69, 68, 67, 66, 65, 108, 100, 111, 127, 121, 113, 117, 126, 127, 128, 129, 130, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255}; @@ -373,12 +328,12 @@ int main(int argc, char **argv) break; } - case 4: + case 3: { test_case_name = "tensor_multiply"; rppHandle_t handle; - + Rpp8u srcPtr1[36] = {255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 130, 129, 128, 127, 126, 117, 113, 121, 127, 111, 100, 108, 65, 66, 67, 68, 69, 70, 71, 72, 13, 24, 15, 16}; Rpp8u srcPtr2[36] = {16, 15, 24, 13, 72, 71, 70, 69, 68, 67, 66, 65, 108, 100, 111, 127, 121, 113, 117, 126, 127, 128, 129, 130, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255}; @@ -428,12 +383,12 @@ int main(int argc, char **argv) break; } - case 5: + case 4: { test_case_name = "tensor_matrix_multiply"; rppHandle_t handle; - + Rpp32u tensorDimensionValues1[2] = {3, 2}; Rpp32u tensorDimensionValues2[2] = {2, 4}; @@ -484,12 +439,12 @@ int main(int argc, char **argv) break; } - case 6: + case 5: { test_case_name = "min_max_loc"; rppHandle_t handle; - + Rpp8u srcPtr[36] = {255, 130, 65, 254, 129, 66, 253, 128, 67, 252, 127, 68, 251, 126, 69, 250, 117, 70, 249, 113, 71, 248, 121, 72, 247, 127, 13, 246, 111, 24, 245, 100, 15, 244, 108, 16}; RppiSize srcSize1Channel, srcSize3Channel; @@ -544,12 +499,12 @@ int main(int argc, char **argv) break; } - case 7: + case 6: { test_case_name = "mean_stddev"; rppHandle_t handle; - + Rpp8u srcPtr[36] = {255, 130, 65, 254, 129, 66, 253, 128, 67, 252, 
127, 68, 251, 126, 69, 250, 117, 70, 249, 113, 71, 248, 121, 72, 247, 127, 13, 246, 111, 24, 245, 100, 15, 244, 108, 16}; RppiSize srcSize1Channel, srcSize3Channel; @@ -603,17 +558,17 @@ int main(int argc, char **argv) break; } - case 8: + case 7: { test_case_name = "control_flow"; rppHandle_t handle; - + bool b1 = true, b2 = false; bool b3 = true; Rpp8u u1 = 120, u2 = 100; Rpp8u u3 = 20; - + start = clock(); start_omp = omp_get_wtime(); rpp_bool_control_flow(b1, b2, &b3, RPP_SCALAR_OP_AND, handle ); @@ -638,10 +593,10 @@ int main(int argc, char **argv) break; } - case 9: + case 8: { test_case_name = "histogram"; - + rppHandle_t handle; int count = 0; @@ -650,7 +605,7 @@ int main(int argc, char **argv) RppiSize srcSize; Rpp32u *outputHistogram = (Rpp32u *) calloc (bins, sizeof(Rpp32u)); Rpp32u *outputHistogramTemp; - + memset(outputHistogram, 0, bins * sizeof(Rpp32u)); srcSize.height = 6; srcSize.width = 6; @@ -734,12 +689,12 @@ int main(int argc, char **argv) break; } - case 10: + case 9: { test_case_name = "convert_bit_depth"; rppHandle_t handle; - + Rpp8u srcPtr[36] = {255, 130, 65, 254, 129, 66, 253, 128, 67, 252, 127, 68, 251, 126, 69, 250, 117, 70, 249, 113, 71, 248, 121, 72, 247, 127, 13, 246, 111, 24, 245, 100, 15, 244, 108, 16}; Rpp8s dstPtr8s[36]; Rpp16u dstPtr16u[36]; @@ -842,12 +797,12 @@ int main(int argc, char **argv) break; } - case 11: + case 10: { test_case_name = "tensor_convert_bit_depth"; rppHandle_t handle; - + Rpp8u srcPtr[36] = {255, 130, 65, 254, 129, 66, 253, 128, 67, 252, 127, 68, 251, 126, 69, 250, 117, 70, 249, 113, 71, 248, 121, 72, 247, 127, 13, 246, 111, 24, 245, 100, 15, 244, 108, 16}; Rpp8s dstPtr8s[36]; Rpp16u dstPtr16u[36]; @@ -898,7 +853,7 @@ int main(int argc, char **argv) break; } - case 12: + case 11: { test_case_name = "tensor_look_up_table";