From ac1907fbd9294b952b7a798deacab87e380d6907 Mon Sep 17 00:00:00 2001
From: Abishek <52214183+r-abishek@users.noreply.github.com>
Date: Thu, 23 Sep 2021 13:50:30 -0700
Subject: [PATCH] RPP Tensor Support (#70)

* add definitions for rpp tensor api

* Initial commit

* Initial commit - pln1/pln3 tensor testsuite

* Mods for tensor test suite

* Mods for brightness tensor host

* arrangementParams to layoutParams

* Rename to tensor_augmentations

* Fix tensor host test suites

* Modify host tensor support for brightness

* Initial commit for tensor hip test suite

* Multiple of 8 stride option

* Add initial tensor support for hip

* Tensor test suite support for hip pln

* Fixes for GPU tensor support

* Add host ROI null check

* Initial commit for perf tests

* Perf tests for RPP tensor support

* Add gpu support for ltrb to xywh, remove roiType, fix pln3 brightness methods

* Remove method1 for pln3 gpu, keep method2

* Fix hip tensor unittests

* Add support for fused layout conversion on host

* Add tensor unittest suite support for layout toggle

* Add tensor perf tests for host - initial commit

* Add tensor host test suite for perf tests

* Add support for NHWC-NCHW toggle in HIP

* Add test suite support for layout toggle

* Reset hip unittests script

* Unroll pln3 kernel

* Add initial multi-bitDepth host support, remove templates

* Move SSE code to macros in rpp_cpu_simd

* Add support for f32 in brightness

* Macro changes, Add support for f16 brightness

* Add support for tensor i8

* Enable multi-bitDepth support in host perf tests

* Add initial multi-bitDepth support for HIP

* Add support for load24s in hip common, toggle layouts

* Enable perf tests for multi-bitdepth in hip test suite

* Fix bug in perf tests for tensor hip suite

* Add mods to use d_float8, d_float24 and d_uint6

* Add f16 support in hip

* Add f16 support in perf tests

* Reduce loads and stores

* Typecast to float4 mod

* Modify RPPMAX2/MIN2 to std::max/min

* Pass all arguments to sse macros

* Reduce scope of time vars

* Add omp_time_used

* Change host to hip in folder name and help

* Change error enums to negative

* Avoid pointer or index increment by collating loads

* Use variadic functions and pack templating to handle loads/stores

* Fix i8 blank image issue in hip

* Combine loads in f16/f32 and organize rpp_hip_common file

* Fix I8 store issue - trials

* Fix I8 store issue

* Add manual typecast to float4

* Use int4 to read roiTensorPtrSrc

* rppi_validate cleanup

* Test suite build fix

Co-authored-by: rrawther <Rajy.MeeyakhanRawther@amd.com>
---
 include/rpp.h                                 |    1 +
 include/rppdefs.h                             |  393 ++++--
 include/rppt.h                                |   14 +
 include/rppt_tensor_augmentations.h           |   72 ++
 src/include/cpu/rpp_cpu_common.hpp            |   25 +-
 src/include/cpu/rpp_cpu_simd.hpp              |  908 ++++++++++----
 src/include/hip/rpp_hip_common.hpp            |  548 ++++++++-
 src/modules/CMakeLists.txt                    |    4 +-
 src/modules/cpu/host_tensor_augmentations.hpp | 1068 +++++++++++++++++
 src/modules/hip/hip_tensor_augmentations.hpp  |   30 +
 src/modules/hip/kernel/brightness.hpp         |  288 +++++
 src/modules/hip/kernel/roi_conversion.hpp     |   32 +
 src/modules/hip/kernel/rpp_hip_host_decls.hpp |    4 +
 src/modules/rppi_validate.hpp                 |  403 +++----
 src/modules/rppt_tensor_augmentations.cpp     |  182 +++
 .../HIP_NEW/CMakeLists.txt                    |    6 +
 .../HIP_NEW/Tensor_hip_pkd3.cpp               |  635 ++++++++++
 .../HIP_NEW/Tensor_hip_pln1.cpp               |  632 ++++++++++
 .../HIP_NEW/Tensor_hip_pln3.cpp               |  710 +++++++++++
 .../HIP_NEW/generatePerformanceLogs.py        |   42 +-
 .../HIP_NEW/rawLogsGenScript.sh               |   63 +-
 .../HOST_NEW/CMakeLists.txt                   |   10 +-
 .../HOST_NEW/Tensor_host_pkd3.cpp             |  519 ++++++++
 .../HOST_NEW/Tensor_host_pln1.cpp             |  517 ++++++++
 .../HOST_NEW/Tensor_host_pln3.cpp             |  595 +++++++++
 .../HOST_NEW/generatePerformanceLogs.py       |    5 +-
 .../HOST_NEW/rawLogsGenScript.sh              |   12 +
 .../rpp-unittests/HIP_NEW/CMakeLists.txt      |    6 +
 .../rpp-unittests/HIP_NEW/Tensor_hip_pkd3.cpp |  831 +++++++++++++
 .../rpp-unittests/HIP_NEW/Tensor_hip_pln1.cpp |  789 ++++++++++++
 .../rpp-unittests/HIP_NEW/Tensor_hip_pln3.cpp |  907 ++++++++++++++
 .../rpp-unittests/HIP_NEW/testAllScript.sh    |   12 +
 .../rpp-unittests/HOST_NEW/CMakeLists.txt     |    8 +-
 .../HOST_NEW/Tensor_host_pkd3.cpp             |  710 +++++++++++
 .../HOST_NEW/Tensor_host_pln1.cpp             |  669 +++++++++++
 .../HOST_NEW/Tensor_host_pln3.cpp             |  787 ++++++++++++
 .../rpp-unittests/HOST_NEW/testAllScript.sh   |   14 +-
 .../HOST_NEW/uniqueFunctionalities_host.cpp   |  113 +-
 38 files changed, 11886 insertions(+), 678 deletions(-)
 create mode 100644 include/rppt.h
 create mode 100644 include/rppt_tensor_augmentations.h
 create mode 100644 src/modules/cpu/host_tensor_augmentations.hpp
 create mode 100644 src/modules/hip/hip_tensor_augmentations.hpp
 create mode 100644 src/modules/hip/kernel/brightness.hpp
 create mode 100644 src/modules/hip/kernel/roi_conversion.hpp
 create mode 100644 src/modules/rppt_tensor_augmentations.cpp
 create mode 100644 utilities/rpp-performancetests/HIP_NEW/Tensor_hip_pkd3.cpp
 create mode 100644 utilities/rpp-performancetests/HIP_NEW/Tensor_hip_pln1.cpp
 create mode 100644 utilities/rpp-performancetests/HIP_NEW/Tensor_hip_pln3.cpp
 create mode 100644 utilities/rpp-performancetests/HOST_NEW/Tensor_host_pkd3.cpp
 create mode 100644 utilities/rpp-performancetests/HOST_NEW/Tensor_host_pln1.cpp
 create mode 100644 utilities/rpp-performancetests/HOST_NEW/Tensor_host_pln3.cpp
 create mode 100644 utilities/rpp-unittests/HIP_NEW/Tensor_hip_pkd3.cpp
 create mode 100644 utilities/rpp-unittests/HIP_NEW/Tensor_hip_pln1.cpp
 create mode 100644 utilities/rpp-unittests/HIP_NEW/Tensor_hip_pln3.cpp
 create mode 100644 utilities/rpp-unittests/HOST_NEW/Tensor_host_pkd3.cpp
 create mode 100644 utilities/rpp-unittests/HOST_NEW/Tensor_host_pln1.cpp
 create mode 100644 utilities/rpp-unittests/HOST_NEW/Tensor_host_pln3.cpp

diff --git a/include/rpp.h b/include/rpp.h
index daf6ed8f3..f8d5c092c 100644
--- a/include/rpp.h
+++ b/include/rpp.h
@@ -49,6 +49,7 @@ extern "C" {
 #include "rppcore.h"
 #include "rppdefs.h"
 #include "rppi.h"
+#include "rppt.h"
 #include "rppversion.h"
 
 
diff --git a/include/rppdefs.h b/include/rppdefs.h
index 9efd4d26a..7ac0e5bd2 100644
--- a/include/rppdefs.h
+++ b/include/rppdefs.h
@@ -2,16 +2,23 @@
    MulticoreWare Inc.
 */
 
-#ifndef RPPIDEFS_H
-#define RPPIDEFS_H
+#ifndef RPPDEFS_H
+#define RPPDEFS_H
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+#include <stddef.h>
 #ifdef OCL_COMPILE
 #include <CL/cl.h>
 #endif
 
+
+
+
+
+/******************** RPP typedefs ********************/
+
 typedef unsigned char       Rpp8u;
 typedef signed char         Rpp8s;
 typedef unsigned short      Rpp16u;
@@ -22,28 +29,28 @@ typedef unsigned long long  Rpp64u;
 typedef long long           Rpp64s;
 typedef float               Rpp32f;
 typedef double              Rpp64f;
-typedef void*              RppPtr_t;
+typedef void*               RppPtr_t;
+typedef size_t              RppSize_t;
 
 typedef enum
 {
-    RPP_SUCCESS = 0,
-    RPP_ERROR   = 1,
+    RPP_SUCCESS             = 0,
+    RPP_ERROR               = -1,
 } RppStatus;
 
- typedef enum
- {
-    rppStatusSuccess = 0,
-    rppStatusBadParm   = 1,
-    rppStatusUnknownError = 2,
-    rppStatusNotInitialized = 3,
-    rppStatusInvalidValue = 4,
-    rppStatusAllocFailed = 5,
-    rppStatusInternalError = 6,
-    rppStatusNotImplemented = 7,
-    rppStatusUnsupportedOp = 8,
+typedef enum
+{
+    rppStatusSuccess        = 0,
+    rppStatusBadParm        = -1,
+    rppStatusUnknownError   = -2,
+    rppStatusNotInitialized = -3,
+    rppStatusInvalidValue   = -4,
+    rppStatusAllocFailed    = -5,
+    rppStatusInternalError  = -6,
+    rppStatusNotImplemented = -7,
+    rppStatusUnsupportedOp  = -8,
 } rppStatus_t;
 
-
 typedef enum
 {
     RPPI_HORIZONTAL_AXIS,
@@ -53,28 +60,52 @@ typedef enum
 
 typedef enum
 {
-  RPP_SCALAR_OP_AND = 1,
-  RPP_SCALAR_OP_OR,
-  RPP_SCALAR_OP_XOR,
-  RPP_SCALAR_OP_NAND,
-  RPP_SCALAR_OP_EQUAL,
-  RPP_SCALAR_OP_NOTEQUAL,
-  RPP_SCALAR_OP_LESS,
-  RPP_SCALAR_OP_LESSEQ,
-  RPP_SCALAR_OP_GREATER,
-  RPP_SCALAR_OP_GREATEREQ,
-  RPP_SCALAR_OP_ADD,
-  RPP_SCALAR_OP_SUBTRACT,
-  RPP_SCALAR_OP_MULTIPLY,
-  RPP_SCALAR_OP_DIVIDE,
-  RPP_SCALAR_OP_MODULUS,
-  RPP_SCALAR_OP_MIN,
-  RPP_SCALAR_OP_MAX,
+    RPP_SCALAR_OP_AND       = 1,
+    RPP_SCALAR_OP_OR,
+    RPP_SCALAR_OP_XOR,
+    RPP_SCALAR_OP_NAND,
+    RPP_SCALAR_OP_EQUAL,
+    RPP_SCALAR_OP_NOTEQUAL,
+    RPP_SCALAR_OP_LESS,
+    RPP_SCALAR_OP_LESSEQ,
+    RPP_SCALAR_OP_GREATER,
+    RPP_SCALAR_OP_GREATEREQ,
+    RPP_SCALAR_OP_ADD,
+    RPP_SCALAR_OP_SUBTRACT,
+    RPP_SCALAR_OP_MULTIPLY,
+    RPP_SCALAR_OP_DIVIDE,
+    RPP_SCALAR_OP_MODULUS,
+    RPP_SCALAR_OP_MIN,
+    RPP_SCALAR_OP_MAX,
 } RppOp;
 
 typedef enum
 {
-    RGB_HSV = 1,
+    U8_S8,
+    S8_U8,
+} RppConvertBitDepthMode;
+
+typedef struct
+{
+    Rpp32f rho;
+    Rpp32f theta;
+} RppPointPolar;
+
+typedef struct
+{
+    Rpp32u channelParam;
+    Rpp32u bufferMultiplier;
+} RppLayoutParams;
+
+
+
+
+
+/******************** RPPI typedefs ********************/
+
+typedef enum
+{
+    RGB_HSV                 = 1,
     HSV_RGB
 } RppiColorConvertMode;
 
@@ -91,33 +122,36 @@ typedef enum
     RPPI_CHN_PACKED
 } RppiChnFormat;
 
-typedef struct {
+typedef struct
+{
     unsigned int width;
     unsigned int height;
-    } RppiSize;
+} RppiSize;
 
-typedef struct{
-     int x;
-     int y;
+typedef struct
+{
+    int x;
+    int y;
 } RppiPoint;
 
 typedef struct
-   {
-       int x;
-       int y;
-       int width;
-       int height;
-   } RppiRect;
+{
+    int x;
+    int y;
+    int width;
+    int height;
+} RppiRect;
 
-// roiHeight & roiWidth needs to be changed to xend & yend
-typedef struct {
+typedef struct
+{
     unsigned int x;
     unsigned int y;
     unsigned int roiWidth;
     unsigned int roiHeight;
-    } RppiROI;
+} RppiROI;
 
-typedef enum{
+typedef enum
+{
     GAUSS3,
     GAUSS5,
     GAUSS3x1,
@@ -126,62 +160,132 @@ typedef enum{
     AVG5
 } RppiBlur;
 
-typedef enum{
+typedef enum
+{
     ZEROPAD,
     NOPAD
 } RppiPad;
 
-typedef enum{
-    U8_S8,
-    S8_U8,
-} RppConvertBitDepthMode;
-
-typedef enum{
+typedef enum
+{
     RGB,
     HSV
 } RppiFormat;
 
-typedef struct {
-       Rpp32f rho;
-       Rpp32f theta;
-   } RppPointPolar;
 
-typedef struct{
+
+
+
+/******************** RPPT typedefs ********************/
+
+typedef enum    // element-type tag; stored in RpptDesc::dataType
+{
+    U8,     // = 0 (implicit); presumably unsigned 8-bit elements - TODO confirm
+    F32,    // = 1; presumably 32-bit float elements - TODO confirm
+    F16,    // = 2; presumably 16-bit float elements - TODO confirm
+    I8      // = 3; presumably signed 8-bit elements - TODO confirm
+} RpptDataType;
+
+typedef enum    // memory-layout tag; stored in RpptDesc::layout
+{
+    NCHW,   // = 0 (implicit)
+    NHWC    // = 1
+} RpptLayout;
+
+typedef enum    // names the two ROI representations held by the RpptROI union
+{
+    LTRB,   // = 0 (implicit); matches RpptRoiLtrb
+    XYWH    // = 1; matches RpptRoiXywh
+
+} RpptRoiType;
+
+typedef struct    // ROI expressed as two corner points
+{
+    RppiPoint lt, rb;    // lt = (x, y) of one corner, rb = (x, y) of the opposite one; presumably left-top / right-bottom - TODO confirm
+
+} RpptRoiLtrb;
+
+typedef struct    // ROI expressed as an origin point plus extents
+{
+    RppiPoint xy;            // origin (x, y)
+    int roiWidth, roiHeight; // extents; signed int, unlike the unsigned fields of RppiROI
+
+} RpptRoiXywh;
+
+typedef union    // one ROI, viewable in either representation over the same storage
+{
+    RpptRoiLtrb ltrbROI;    // two RppiPoint values (four ints)
+    RpptRoiXywh xywhROI;    // one RppiPoint plus two ints (four ints)
+
+} RpptROI, *RpptROIPtr;
+
+typedef struct    // one stride value per n/c/h/w dimension
+{
+    Rpp32u nStride;    // NOTE(review): units (elements vs bytes) are not visible in this header - confirm
+    Rpp32u cStride;
+    Rpp32u hStride;
+    Rpp32u wStride;
+} RpptStrides;
+
+typedef struct    // tensor descriptor; passed to the rppt_* API as RpptDescPtr
+{
+    RppSize_t numDims;       // dimension count (RppSize_t is size_t)
+    Rpp32u offset;           // NOTE(review): presumably an offset to the first element - units not shown here, confirm
+    RpptDataType dataType;   // element type tag
+    RpptLayout layout;       // NCHW or NHWC
+    Rpp32u n, c, h, w;       // extent of each of the four dimensions
+    RpptStrides strides;     // stride for each of the four dimensions
+} RpptDesc, *RpptDescPtr;
+
+
+
+
+
+/******************** HOST memory typedefs ********************/
+
+typedef struct
+{
     Rpp64f *doublemem;
-}memRpp64f;
+} memRpp64f;
 
-typedef struct{
+typedef struct
+{
     Rpp32f *floatmem;
-}memRpp32f;
+} memRpp32f;
 
-typedef struct{
+typedef struct
+{
     Rpp32u *uintmem;
-}memRpp32u;
+} memRpp32u;
 
-typedef struct{
+typedef struct
+{
     Rpp32s *intmem;
-}memRpp32s;
+} memRpp32s;
 
-typedef struct{
+typedef struct
+{
     Rpp8u *ucharmem;
-}memRpp8u;
+} memRpp8u;
 
-typedef struct{
+typedef struct
+{
     Rpp8s *charmem;
-}memRpp8s;
+} memRpp8s;
 
-typedef struct{
+typedef struct
+{
     Rpp32u *height;
     Rpp32u *width;
-}memSize;
+} memSize;
 
-// roiHeight & roiWidth needs to be changed to xend & yend
-typedef struct{
+typedef struct
+{
     Rpp32u *x;
     Rpp32u *y;
     Rpp32u *roiHeight;
     Rpp32u *roiWidth;
-}memRoi;
+} memROI;
 
 typedef struct {
     RppiSize *srcSize;
@@ -199,57 +303,73 @@ typedef struct {
     Rpp64u *dstBatchIndex;
     Rpp32u *inc;
     Rpp32u *dstInc;
-}memCPU;
+} memCPU;
+
+
+
+
 
 #ifdef OCL_COMPILE
 
-typedef struct{
+/******************** OCL memory typedefs ********************/
+
+typedef struct
+{
     cl_mem floatmem;
-}clmemRpp32f;
+} clmemRpp32f;
 
 
-typedef struct{
+typedef struct
+{
     cl_mem doublemem;
-}clmemRpp64f;
+} clmemRpp64f;
 
-typedef struct{
+typedef struct
+{
     cl_mem uintmem;
-}clmemRpp32u;
+} clmemRpp32u;
 
-typedef struct{
+typedef struct
+{
     cl_mem intmem;
-}clmemRpp32s;
+} clmemRpp32s;
 
-typedef struct{
+typedef struct
+{
     cl_mem ucharmem;
-}clmemRpp8u;
+} clmemRpp8u;
 
-typedef struct{
+typedef struct
+{
     cl_mem charmem;
-}clmemRpp8s;
+} clmemRpp8s;
 
-typedef struct{
+typedef struct
+{
     cl_mem height;
     cl_mem width;
-}clmemSize;
+} clmemSize;
 
-typedef struct{
+typedef struct
+{
     cl_mem x;
     cl_mem y;
     cl_mem roiHeight;
     cl_mem roiWidth;
-}clmemRoi;
-typedef struct{
+} clmemROI;
+
+typedef struct
+{
     memSize csrcSize;
     memSize cdstSize;
     memSize cmaxSrcSize;
     memSize cmaxDstSize;
-    memRoi croiPoints;
+    memROI croiPoints;
     clmemSize srcSize;
     clmemSize dstSize;
     clmemSize maxSrcSize;
     clmemSize maxDstSize;
-    clmemRoi roiPoints;
+    clmemROI roiPoints;
     clmemRpp32f floatArr[10];
     clmemRpp64f doubleArr[10];
     clmemRpp32u uintArr[10];
@@ -263,54 +383,69 @@ typedef struct{
 } memGPU;
 
 
-#else
-typedef struct{
+
+
+
+#else // HIP_COMPILE, and the fallback when no backend macro is defined: memMgmt below uses memGPU unconditionally
+
+/******************** HIP memory typedefs ********************/
+
+typedef struct
+{
     Rpp32f* floatmem;
-}hipMemRpp32f;
+} hipMemRpp32f;
 
-typedef struct{
+typedef struct
+{
     Rpp64f* doublemem;
-}hipMemRpp64f;
+} hipMemRpp64f;
 
-typedef struct{
+typedef struct
+{
     Rpp32u* uintmem;
-}hipMemRpp32u;
+} hipMemRpp32u;
 
-typedef struct{
+typedef struct
+{
     Rpp32s* intmem;
-}hipMemRpp32s;
+} hipMemRpp32s;
 
-typedef struct{
+typedef struct
+{
     Rpp8u* ucharmem;
-}hipMemRpp8u;
+} hipMemRpp8u;
 
-typedef struct{
+typedef struct
+{
     Rpp8s* charmem;
-}hipMemRpp8s;
+} hipMemRpp8s;
 
-typedef struct{
+typedef struct
+{
     Rpp32u* height;
     Rpp32u* width;
-}hipMemSize;
+} hipMemSize;
 
-// roiHeight & roiWidth needs to be changed to xend & yend
-typedef struct{
+typedef struct
+{
     Rpp32u* x;
     Rpp32u* y;
     Rpp32u* roiHeight;
     Rpp32u* roiWidth;
-}hipMemRoi;
-typedef struct{
+} hipMemROI;
+
+typedef struct
+{
     memSize csrcSize;
     memSize cdstSize;
     memSize cmaxSrcSize;
     memSize cmaxDstSize;
-    memRoi croiPoints;
+    memROI croiPoints;
     hipMemSize srcSize;
     hipMemSize dstSize;
     hipMemSize maxSrcSize;
     hipMemSize maxDstSize;
-    hipMemRoi roiPoints;
+    hipMemROI roiPoints;
     hipMemRpp32f floatArr[10];
     hipMemRpp64f doubleArr[10];
     hipMemRpp32u uintArr[10];
@@ -323,14 +458,22 @@ typedef struct{
     Rpp32u* dstInc;
 } memGPU;
 
-#endif
+#endif //BACKEND
 
-typedef struct{
-        memCPU mcpu;
-        memGPU mgpu;
+
+
+
+
+/******************** Memory management and handle typedefs ********************/
+
+typedef struct
+{
+    memCPU mcpu;
+    memGPU mgpu;
 } memMgmt;
 
-typedef struct{
+typedef struct
+{
     RppPtr_t cpuHandle;
     Rpp32u nbatchSize;
     memMgmt mem;
@@ -344,4 +487,4 @@ typedef struct{
 #ifdef __cplusplus
 }
 #endif
-#endif /* RPPIDEFS_H */
+#endif /* RPPDEFS_H */
diff --git a/include/rppt.h b/include/rppt.h
new file mode 100644
index 000000000..3f1b54686
--- /dev/null
+++ b/include/rppt.h
@@ -0,0 +1,14 @@
+#ifndef RPPT_H
+#define RPPT_H
+
+#include "rpp.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "rppt_tensor_augmentations.h"
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* RPPT_H */
diff --git a/include/rppt_tensor_augmentations.h b/include/rppt_tensor_augmentations.h
new file mode 100644
index 000000000..31678594a
--- /dev/null
+++ b/include/rppt_tensor_augmentations.h
@@ -0,0 +1,72 @@
+/*
+Copyright (c) 2019 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef RPPT_TENSOR_AUGMENTATIONS_H
+#define RPPT_TENSOR_AUGMENTATIONS_H
+#include "rpp.h"
+#include "rppdefs.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// ----------------------------------------
+// CPU brightness functions declaration
+// ----------------------------------------
+/* Brightness augmentation for an NCHW/NHWC layout tensor
+*param[in] srcPtr source tensor memory
+*param[in] srcDescPtr source tensor descriptor
+*param[out] dstPtr destination tensor memory
+*param[in] dstDescPtr destination tensor descriptor
+*param[in] alphaTensor alpha values for brightness calculation (1D tensor of size batchSize with 0 <= alpha <= 20 for each image in batch)
+*param[in] betaTensor beta values for brightness calculation (1D tensor of size batchSize with 0 <= beta <= 255 for each image in batch)
+*param[in] roiTensorPtrSrc ROI data for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+*param[in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+*returns a RppStatus enumeration.
+*retval RPP_SUCCESS : successful completion
+*retval RPP_ERROR : Error
+*/
+RppStatus
+rppt_brightness_host(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *alphaTensor, Rpp32f *betaTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+
+// ----------------------------------------
+// GPU brightness functions declaration
+// ----------------------------------------
+/* Brightness augmentation for an NCHW/NHWC layout tensor
+*param[in] srcPtr source tensor memory
+*param[in] srcDescPtr source tensor descriptor
+*param[out] dstPtr destination tensor memory
+*param[in] dstDescPtr destination tensor descriptor
+*param[in] alphaTensor alpha values for brightness calculation (1D tensor of size batchSize with 0 <= alpha <= 20 for each image in batch)
+*param[in] betaTensor beta values for brightness calculation (1D tensor of size batchSize with 0 <= beta <= 255 for each image in batch)
+*param[in] roiTensorPtrSrc ROI data for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+*param[in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
+*returns a RppStatus enumeration.
+*retval RPP_SUCCESS : successful completion
+*retval RPP_ERROR : Error
+*/
+RppStatus
+rppt_brightness_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *alphaTensor, Rpp32f *betaTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
\ No newline at end of file
diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp
index f320757b3..66a3d07f9 100644
--- a/src/include/cpu/rpp_cpu_common.hpp
+++ b/src/include/cpu/rpp_cpu_common.hpp
@@ -8,8 +8,8 @@
 #include <rppdefs.h>
 #include <omp.h>
 #include <half.hpp>
-using half_float::half;
-typedef half                Rpp16f;
+using halfhpp = half_float::half;
+typedef halfhpp Rpp16f;
 #include "rpp_cpu_simd.hpp"
 
 #define PI                              3.14159265
@@ -25,6 +25,7 @@ typedef half                Rpp16f;
 #define RPPCEIL(a)                      ((int) (a + 1.0))
 #define RPPISEVEN(a)                    ((a % 2 == 0) ? 1 : 0)
 #define RPPPIXELCHECK(pixel)            (pixel < (Rpp32f) 0) ? ((Rpp32f) 0) : ((pixel < (Rpp32f) 255) ? pixel : ((Rpp32f) 255))
+#define RPPPIXELCHECKF32(pixel)         (((pixel) < (Rpp32f) 0) ? ((Rpp32f) 0) : (((pixel) < (Rpp32f) 1) ? (pixel) : ((Rpp32f) 1)))    // clamp to [0, 1]; argument and expansion parenthesized so the macro composes inside larger expressions (pixel is still evaluated more than once)
 #define RPPPIXELCHECKI8(pixel)          (pixel < (Rpp32f) -128) ? ((Rpp32f) -128) : ((pixel < (Rpp32f) 127) ? pixel : ((Rpp32f) 127))
 #define RPPISGREATER(pixel, value)      ((pixel > value) ? 1 : 0)
 #define RPPISLESSER(pixel, value)       ((pixel < value) ? 1 : 0)
@@ -2078,6 +2079,26 @@ inline RppStatus custom_convolve_image_host(T* srcPtr, RppiSize srcSize, U* dstP
 
 // Compute Functions
 
+inline RppStatus compute_xywh_from_ltrb_host(RpptROIPtr roiPtrInput, RpptROIPtr roiPtrImage)    // Reads *roiPtrInput as LTRB and writes the equivalent XYWH ROI into *roiPtrImage
+{
+    roiPtrImage->xywhROI.xy.x = roiPtrInput->ltrbROI.lt.x;    // origin is the lt corner, copied unchanged
+    roiPtrImage->xywhROI.xy.y = roiPtrInput->ltrbROI.lt.y;
+    roiPtrImage->xywhROI.roiWidth = roiPtrInput->ltrbROI.rb.x - roiPtrInput->ltrbROI.lt.x + 1;    // +1: the rb coordinate is counted as part of the ROI (inclusive)
+    roiPtrImage->xywhROI.roiHeight = roiPtrInput->ltrbROI.rb.y - roiPtrInput->ltrbROI.lt.y + 1;    // same inclusive convention vertically
+
+    return RPP_SUCCESS;    // no failure path: inputs are not validated and the pointers are not null-checked
+}
+
+inline RppStatus compute_roi_boundary_check_host(RpptROIPtr roiPtrImage, RpptROIPtr roiPtr, RpptROIPtr roiPtrDefault)    // Writes into *roiPtr the XYWH ROI of *roiPtrImage limited by *roiPtrDefault; all three are accessed through xywhROI
+{
+    roiPtr->xywhROI.xy.x = std::max(roiPtrDefault->xywhROI.xy.x, roiPtrImage->xywhROI.xy.x);    // origin x: the larger of the default and requested x
+    roiPtr->xywhROI.xy.y = std::max(roiPtrDefault->xywhROI.xy.y, roiPtrImage->xywhROI.xy.y);    // origin y: likewise
+    roiPtr->xywhROI.roiWidth = std::min(roiPtrDefault->xywhROI.roiWidth - roiPtrImage->xywhROI.xy.x, roiPtrImage->xywhROI.roiWidth);    // NOTE(review): the bound subtracts the requested x, not the clamped roiPtr x, and ignores roiPtrDefault's own x - confirm this is intended
+    roiPtr->xywhROI.roiHeight = std::min(roiPtrDefault->xywhROI.roiHeight - roiPtrImage->xywhROI.xy.y, roiPtrImage->xywhROI.roiHeight);    // NOTE(review): same pattern vertically; the result can be <= 0 when the requested y exceeds the default height
+
+    return RPP_SUCCESS;    // no failure path: the pointers are not null-checked here
+}
+
 template<typename T>
 inline RppStatus compute_subimage_location_host(T* ptr, T** ptrSubImage,
                                          RppiSize size, RppiSize *sizeSubImage,
diff --git a/src/include/cpu/rpp_cpu_simd.hpp b/src/include/cpu/rpp_cpu_simd.hpp
index 0209e7512..054ae1194 100644
--- a/src/include/cpu/rpp_cpu_simd.hpp
+++ b/src/include/cpu/rpp_cpu_simd.hpp
@@ -1,6 +1,6 @@
 #ifndef AMD_RPP_RPP_CPU_SIMD_HPP
 #define AMD_RPP_RPP_CPU_SIMD_HPP
-#if 1
+
 #if _WIN32
 #include <intrin.h>
 #else
@@ -10,45 +10,491 @@
 #endif
 
 #define __AVX2__ 1
+#define __SSE4_1__ 1
 
 #define M256I(m256i_register) (*((_m256i_union*)&m256i_register))
-typedef union {
-    char               m256i_i8[32];
-    short              m256i_i16[16];
-    int                m256i_i32[8];
-    long long          m256i_i64[4];
-    __m128i            m256i_i128[2];
-}_m256i_union;
+typedef union
+{
+    char m256i_i8[32];
+    short m256i_i16[16];
+    int m256i_i32[8];
+    long long m256i_i64[4];
+    __m128i m256i_i128[2];
+} _m256i_union;
 
 #if defined(_MSC_VER)
 #define SIMD_ALIGN_VAR(type, name, alignment) \
-  __declspec(align(alignment)) type name
+    __declspec(align(alignment)) type name
 #else
 #define SIMD_ALIGN_VAR(type, name, alignment) \
-  type __attribute__((__aligned__(alignment))) name
+    type __attribute__((__aligned__(alignment))) name
 #endif // _MSC_VER
 
 #define SIMD_CONST_PI(name, val0, val1, val2, val3) \
-  SIMD_ALIGN_VAR(static const int, _xmm_const_##name[4], 16) = { \
-    static_cast<int>(val3), \
-    static_cast<int>(val2), \
-    static_cast<int>(val1), \
-    static_cast<int>(val0)  \
-  }
+    SIMD_ALIGN_VAR(static const int, _xmm_const_##name[4], 16) = { \
+        static_cast<int>(val3), \
+        static_cast<int>(val2), \
+        static_cast<int>(val1), \
+        static_cast<int>(val0)  \
+    }
 
 #define SIMD_CONST_PS(name, val0, val1, val2, val3) \
-  SIMD_ALIGN_VAR(static const float, _xmm_const_##name[4], 16) = { \
-    static_cast<float>(val3), \
-    static_cast<float>(val2), \
-    static_cast<float>(val1), \
-    static_cast<float>(val0)  \
-  }
+    SIMD_ALIGN_VAR(static const float, _xmm_const_##name[4], 16) = { \
+        static_cast<float>(val3), \
+        static_cast<float>(val2), \
+        static_cast<float>(val1), \
+        static_cast<float>(val0)  \
+    }
 
 #define SIMD_GET_PS(name) (*(const __m128  *)_xmm_const_##name)
 
+inline RppStatus rpp_load48_u8pkd3_to_f32pln3(Rpp8u *srcPtr, __m128 *p)    /* Loads 16 packed RGB u8 pixels from srcPtr and widens them to planar f32: p[0-3] = R, p[4-7] = G, p[8-11] = B */
+{
+    __m128i px[8];
+    __m128i pxMask = _mm_setr_epi8(0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 12, 13, 14, 15);    /* gathers every 3rd byte so the 4 values of each channel become contiguous; bytes 12-15 stay in place */
+    __m128i pxZero = _mm_setzero_si128();    /* zero bytes interleaved by the unpacks below to zero-extend */
+
+    px[0] = _mm_loadu_si128((__m128i *)srcPtr);           /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need RGB 01-04 */
+    px[1] = _mm_loadu_si128((__m128i *)(srcPtr + 12));    /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need RGB 05-08 */
+    px[2] = _mm_loadu_si128((__m128i *)(srcPtr + 24));    /* load [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|R13|G13|B13|R14] - Need RGB 09-12 */
+    px[3] = _mm_loadu_si128((__m128i *)(srcPtr + 36));    /* load [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|R17|G17|B17|R18] - Need RGB 13-16; this 16-byte load reads srcPtr[48..51], 4 bytes beyond the 48 consumed */
+    px[0] = _mm_shuffle_epi8(px[0], pxMask);    /* shuffle to get [R01|R02|R03|R04|G01|G02|G03|G04 || B01|B02|B03|B04|R05|G05|B05|R06] - Need R01-04, G01-04, B01-04 */
+    px[1] = _mm_shuffle_epi8(px[1], pxMask);    /* shuffle to get [R05|R06|R07|R08|G05|G06|G07|G08 || B05|B06|B07|B08|R09|G09|B09|R10] - Need R05-08, G05-08, B05-08 */
+    px[2] = _mm_shuffle_epi8(px[2], pxMask);    /* shuffle to get [R09|R10|R11|R12|G09|G10|G11|G12 || B09|B10|B11|B12|R13|G13|B13|R14] - Need R09-12, G09-12, B09-12 */
+    px[3] = _mm_shuffle_epi8(px[3], pxMask);    /* shuffle to get [R13|R14|R15|R16|G13|G14|G15|G16 || B13|B14|B15|B16|R17|G17|B17|R18] - Need R13-16, G13-16, B13-16 */
+    px[4] = _mm_unpackhi_epi8(px[0], pxZero);    /* unpack 8 hi-pixels of px[0] */
+    px[5] = _mm_unpackhi_epi8(px[1], pxZero);    /* unpack 8 hi-pixels of px[1] */
+    px[6] = _mm_unpackhi_epi8(px[2], pxZero);    /* unpack 8 hi-pixels of px[2] */
+    px[7] = _mm_unpackhi_epi8(px[3], pxZero);    /* unpack 8 hi-pixels of px[3] */
+    px[0] = _mm_unpacklo_epi8(px[0], pxZero);    /* unpack 8 lo-pixels of px[0] */
+    px[1] = _mm_unpacklo_epi8(px[1], pxZero);    /* unpack 8 lo-pixels of px[1] */
+    px[2] = _mm_unpacklo_epi8(px[2], pxZero);    /* unpack 8 lo-pixels of px[2] */
+    px[3] = _mm_unpacklo_epi8(px[3], pxZero);    /* unpack 8 lo-pixels of px[3] */
+    p[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[0], pxZero));    /* unpack 4 lo-pixels of px[0] - Contains R01-04 */
+    p[1] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[1], pxZero));    /* unpack 4 lo-pixels of px[1] - Contains R05-08 */
+    p[2] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[2], pxZero));    /* unpack 4 lo-pixels of px[2] - Contains R09-12 */
+    p[3] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[3], pxZero));    /* unpack 4 lo-pixels of px[3] - Contains R13-16 */
+    p[4] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[0], pxZero));    /* unpack 4 hi-pixels of px[0] - Contains G01-04 */
+    p[5] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[1], pxZero));    /* unpack 4 hi-pixels of px[1] - Contains G05-08 */
+    p[6] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[2], pxZero));    /* unpack 4 hi-pixels of px[2] - Contains G09-12 */
+    p[7] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[3], pxZero));    /* unpack 4 hi-pixels of px[3] - Contains G13-16 */
+    p[8] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[4], pxZero));    /* unpack 4 lo-pixels of px[4] - Contains B01-04 */
+    p[9] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[5], pxZero));    /* unpack 4 lo-pixels of px[5] - Contains B05-08 */
+    p[10] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[6], pxZero));    /* unpack 4 lo-pixels of px[6] - Contains B09-12 */
+    p[11] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[7], pxZero));    /* unpack 4 lo-pixels of px[7] - Contains B13-16 */
+
+    return RPP_SUCCESS;
+}
+
+inline RppStatus rpp_store48_f32pln3_to_u8pln3(Rpp8u *dstPtrR, Rpp8u *dstPtrG, Rpp8u *dstPtrB, __m128 *p)    /* Converts planar f32 (p[0-3] = R, p[4-7] = G, p[8-11] = B) to u8 with unsigned saturation and stores 16 bytes to each of the three destination planes */
+{
+    __m128i px[8];
+
+    px[4] = _mm_cvtps_epi32(p[0]);    /* convert to int32 for R */
+    px[5] = _mm_cvtps_epi32(p[1]);    /* convert to int32 for R */
+    px[6] = _mm_cvtps_epi32(p[2]);    /* convert to int32 for R */
+    px[7] = _mm_cvtps_epi32(p[3]);    /* convert to int32 for R */
+    px[4] = _mm_packus_epi32(px[4], px[5]);    /* pack pixels 0-7 for R */
+    px[5] = _mm_packus_epi32(px[6], px[7]);    /* pack pixels 8-15 for R */
+    px[0] = _mm_packus_epi16(px[4], px[5]);    /* pack pixels 0-15 for R */
+    px[4] = _mm_cvtps_epi32(p[4]);    /* convert to int32 for G */
+    px[5] = _mm_cvtps_epi32(p[5]);    /* convert to int32 for G */
+    px[6] = _mm_cvtps_epi32(p[6]);    /* convert to int32 for G */
+    px[7] = _mm_cvtps_epi32(p[7]);    /* convert to int32 for G */
+    px[4] = _mm_packus_epi32(px[4], px[5]);    /* pack pixels 0-7 for G */
+    px[5] = _mm_packus_epi32(px[6], px[7]);    /* pack pixels 8-15 for G */
+    px[1] = _mm_packus_epi16(px[4], px[5]);    /* pack pixels 0-15 for G */
+    px[4] = _mm_cvtps_epi32(p[8]);    /* convert to int32 for B */
+    px[5] = _mm_cvtps_epi32(p[9]);    /* convert to int32 for B */
+    px[6] = _mm_cvtps_epi32(p[10]);    /* convert to int32 for B */
+    px[7] = _mm_cvtps_epi32(p[11]);    /* convert to int32 for B */
+    px[4] = _mm_packus_epi32(px[4], px[5]);    /* pack pixels 0-7 for B */
+    px[5] = _mm_packus_epi32(px[6], px[7]);    /* pack pixels 8-15 for B */
+    px[2] = _mm_packus_epi16(px[4], px[5]);    /* pack pixels 0-15 for B */
+    _mm_storeu_si128((__m128i *)dstPtrR, px[0]);    /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */
+    _mm_storeu_si128((__m128i *)dstPtrG, px[1]);    /* store [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */
+    _mm_storeu_si128((__m128i *)dstPtrB, px[2]);    /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */
+
+    return RPP_SUCCESS;
+}
+
+inline RppStatus rpp_load48_u8pln3_to_f32pln3(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *srcPtrB, __m128 *p)    /* Loads 16 u8 values from each of three planes and widens them to f32: p[0-3] = R, p[4-7] = G, p[8-11] = B */
+{
+    __m128i px[6];
+    __m128i pxZero = _mm_setzero_si128();    /* zero bytes interleaved by the unpacks below to zero-extend */
+
+    px[0] = _mm_loadu_si128((__m128i *)srcPtrR);    /* load [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */
+    px[1] = _mm_loadu_si128((__m128i *)srcPtrG);    /* load [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */
+    px[2] = _mm_loadu_si128((__m128i *)srcPtrB);    /* load [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */
+    px[3] = _mm_unpackhi_epi8(px[0], pxZero);    /* unpack 8 hi-pixels of px[0] */
+    px[4] = _mm_unpackhi_epi8(px[1], pxZero);    /* unpack 8 hi-pixels of px[1] */
+    px[5] = _mm_unpackhi_epi8(px[2], pxZero);    /* unpack 8 hi-pixels of px[2] */
+    px[0] = _mm_unpacklo_epi8(px[0], pxZero);    /* unpack 8 lo-pixels of px[0] */
+    px[1] = _mm_unpacklo_epi8(px[1], pxZero);    /* unpack 8 lo-pixels of px[1] */
+    px[2] = _mm_unpacklo_epi8(px[2], pxZero);    /* unpack 8 lo-pixels of px[2] */
+    p[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[0], pxZero));    /* pixels 0-3 of original px[0] containing 16 R values */
+    p[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[0], pxZero));    /* pixels 4-7 of original px[0] containing 16 R values */
+    p[2] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[3], pxZero));    /* pixels 8-11 of original px[0] containing 16 R values */
+    p[3] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[3], pxZero));    /* pixels 12-15 of original px[0] containing 16 R values */
+    p[4] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[1], pxZero));    /* pixels 0-3 of original px[1] containing 16 G values */
+    p[5] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[1], pxZero));    /* pixels 4-7 of original px[1] containing 16 G values */
+    p[6] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[4], pxZero));    /* pixels 8-11 of original px[1] containing 16 G values */
+    p[7] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[4], pxZero));    /* pixels 12-15 of original px[1] containing 16 G values */
+    p[8] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[2], pxZero));    /* pixels 0-3 of original px[2] containing 16 B values */
+    p[9] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[2], pxZero));    /* pixels 4-7 of original px[2] containing 16 B values */
+    p[10] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[5], pxZero));    /* pixels 8-11 of original px[2] containing 16 B values */
+    p[11] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[5], pxZero));    /* pixels 12-15 of original px[2] containing 16 B values */
+
+    return RPP_SUCCESS;
+}
+
+inline RppStatus rpp_store48_f32pln3_to_u8pkd3(Rpp8u *dstPtr, __m128 *p)    /* Converts planar f32 (p[0-3] = R, p[4-7] = G, p[8-11] = B) to u8 with unsigned saturation and stores 16 packed RGB pixels at dstPtr */
+{
+    __m128i px[7];
+    __m128i pxMask = _mm_setr_epi8(0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11, 12, 13, 14, 15);    /* interleaves the R, G and B quads into RGB triplets; bytes 12-15 stay in place */
+    __m128i pxZero = _mm_setzero_si128();    /* integer zero vector: _mm_packus_epi32 takes __m128i operands, so the former __m128 zero only compiled under lax vector conversions */
+
+    px[4] = _mm_cvtps_epi32(p[0]);    /* convert to int32 for R01-04 */
+    px[5] = _mm_cvtps_epi32(p[4]);    /* convert to int32 for G01-04 */
+    px[6] = _mm_cvtps_epi32(p[8]);    /* convert to int32 for B01-04 */
+    px[4] = _mm_packus_epi32(px[4], px[5]);    /* pack pixels 0-7 as R01-04|G01-04 */
+    px[5] = _mm_packus_epi32(px[6], pxZero);    /* pack pixels 8-15 as B01-04|X01-04 */
+    px[0] = _mm_packus_epi16(px[4], px[5]);    /* pack pixels 0-15 as [R01|R02|R03|R04|G01|G02|G03|G04|B01|B02|B03|B04|00|00|00|00] */
+    px[4] = _mm_cvtps_epi32(p[1]);    /* convert to int32 for R05-08 */
+    px[5] = _mm_cvtps_epi32(p[5]);    /* convert to int32 for G05-08 */
+    px[6] = _mm_cvtps_epi32(p[9]);    /* convert to int32 for B05-08 */
+    px[4] = _mm_packus_epi32(px[4], px[5]);    /* pack pixels 0-7 as R05-08|G05-08 */
+    px[5] = _mm_packus_epi32(px[6], pxZero);    /* pack pixels 8-15 as B05-08|X01-04 */
+    px[1] = _mm_packus_epi16(px[4], px[5]);    /* pack pixels 0-15 as [R05|R06|R07|R08|G05|G06|G07|G08|B05|B06|B07|B08|00|00|00|00] */
+    px[4] = _mm_cvtps_epi32(p[2]);    /* convert to int32 for R09-12 */
+    px[5] = _mm_cvtps_epi32(p[6]);    /* convert to int32 for G09-12 */
+    px[6] = _mm_cvtps_epi32(p[10]);    /* convert to int32 for B09-12 */
+    px[4] = _mm_packus_epi32(px[4], px[5]);    /* pack pixels 0-7 as R09-12|G09-12 */
+    px[5] = _mm_packus_epi32(px[6], pxZero);    /* pack pixels 8-15 as B09-12|X01-04 */
+    px[2] = _mm_packus_epi16(px[4], px[5]);    /* pack pixels 0-15 as [R09|R10|R11|R12|G09|G10|G11|G12|B09|B10|B11|B12|00|00|00|00] */
+    px[4] = _mm_cvtps_epi32(p[3]);    /* convert to int32 for R13-16 */
+    px[5] = _mm_cvtps_epi32(p[7]);    /* convert to int32 for G13-16 */
+    px[6] = _mm_cvtps_epi32(p[11]);    /* convert to int32 for B13-16 */
+    px[4] = _mm_packus_epi32(px[4], px[5]);    /* pack pixels 0-7 as R13-16|G13-16 */
+    px[5] = _mm_packus_epi32(px[6], pxZero);    /* pack pixels 8-15 as B13-16|X01-04 */
+    px[3] = _mm_packus_epi16(px[4], px[5]);    /* pack pixels 0-15 as [R13|R14|R15|R16|G13|G14|G15|G16|B13|B14|B15|B16|00|00|00|00] */
+    px[0] = _mm_shuffle_epi8(px[0], pxMask);    /* shuffle to get [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|00|00|00|00] */
+    px[1] = _mm_shuffle_epi8(px[1], pxMask);    /* shuffle to get [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|00|00|00|00] */
+    px[2] = _mm_shuffle_epi8(px[2], pxMask);    /* shuffle to get [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|00|00|00|00] */
+    px[3] = _mm_shuffle_epi8(px[3], pxMask);    /* shuffle to get [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] */
+    _mm_storeu_si128((__m128i *)dstPtr, px[0]);           /* store [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|00|00|00|00] - the 4 padding bytes are overwritten by the next store */
+    _mm_storeu_si128((__m128i *)(dstPtr + 12), px[1]);    /* store [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|00|00|00|00] */
+    _mm_storeu_si128((__m128i *)(dstPtr + 24), px[2]);    /* store [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|00|00|00|00] */
+    _mm_storeu_si128((__m128i *)(dstPtr + 36), px[3]);    /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|00|00|00|00] - writes zeros to dstPtr[48..51], 4 bytes beyond the 48 produced */
+
+    return RPP_SUCCESS;
+}
+
+inline RppStatus rpp_load16_u8_to_f32(Rpp8u *srcPtr, __m128 *p)    /* Loads 16 consecutive u8 values from srcPtr and widens them to f32 in p[0-3], 4 values per register */
+{
+    __m128i px[2];
+    __m128i pxZero = _mm_setzero_si128();    /* zero bytes interleaved by the unpacks below to zero-extend */
+
+    px[0] =  _mm_loadu_si128((__m128i *)srcPtr);    /* load pixels 0-15 */
+    px[1] = _mm_unpackhi_epi8(px[0], pxZero);    /* pixels 8-15 */
+    px[0] = _mm_unpacklo_epi8(px[0], pxZero);    /* pixels 0-7 */
+    p[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[0], pxZero));    /* pixels 0-3 */
+    p[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[0], pxZero));    /* pixels 4-7 */
+    p[2] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[1], pxZero));    /* pixels 8-11 */
+    p[3] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[1], pxZero));    /* pixels 12-15 */
+
+    return RPP_SUCCESS;
+}
+
+inline RppStatus rpp_store16_f32_to_u8(Rpp8u *dstPtr, __m128 *p)    /* Converts the 16 f32 values in p[0-3] to u8 with unsigned saturation and stores them as 16 consecutive bytes at dstPtr */
+{
+    __m128i px[4];
+
+    px[0] = _mm_cvtps_epi32(p[0]);    /* pixels 0-3 */
+    px[1] = _mm_cvtps_epi32(p[1]);    /* pixels 4-7 */
+    px[2] = _mm_cvtps_epi32(p[2]);    /* pixels 8-11 */
+    px[3] = _mm_cvtps_epi32(p[3]);    /* pixels 12-15 */
+    px[0] = _mm_packus_epi32(px[0], px[1]);    /* pixels 0-7 */
+    px[1] = _mm_packus_epi32(px[2], px[3]);    /* pixels 8-15 */
+    px[0] = _mm_packus_epi16(px[0], px[1]);    /* pixels 0-15 */
+    _mm_storeu_si128((__m128i *)dstPtr, px[0]);    /* store pixels 0-15 */
+
+    return RPP_SUCCESS;
+}
+
+inline RppStatus rpp_load12_f32pkd3_to_f32pln3(Rpp32f *srcPtr, __m128 *p)    /* Loads 4 packed 3-channel f32 pixels from srcPtr and transposes them so that p[0], p[1] and p[2] each hold one channel of the 4 pixels */
+{
+    p[0] = _mm_loadu_ps(srcPtr);    /* srcPtr[0..3]: the 3 channels of pixel 0 plus one surplus float */
+    p[1] = _mm_loadu_ps(srcPtr + 3);    /* srcPtr[3..6]: the 3 channels of pixel 1 plus one surplus float */
+    p[2] = _mm_loadu_ps(srcPtr + 6);    /* srcPtr[6..9]: the 3 channels of pixel 2 plus one surplus float */
+    p[3] = _mm_loadu_ps(srcPtr + 9);    /* srcPtr[9..12]: the 3 channels of pixel 3 plus srcPtr[12], one float beyond the 12 consumed */
+    _MM_TRANSPOSE4_PS(p[0], p[1], p[2], p[3]);    /* in-place 4x4 transpose: p[0-2] become the three channel rows, p[3] collects the surplus 4th lanes (srcPtr[3], [6], [9], [12]) */
+
+    return RPP_SUCCESS;
+}
+
+inline RppStatus rpp_store12_f32pln3_to_f32pln3(Rpp32f *dstPtrR, Rpp32f *dstPtrG, Rpp32f *dstPtrB, __m128 *p)    /* Stores p[0], p[1] and p[2] (4 f32 values each) to the three destination planes; p[3] is not used */
+{
+    _mm_storeu_ps(dstPtrR, p[0]);    /* 4 values to dstPtrR (unaligned store) */
+    _mm_storeu_ps(dstPtrG, p[1]);    /* 4 values to dstPtrG */
+    _mm_storeu_ps(dstPtrB, p[2]);    /* 4 values to dstPtrB */
+
+    return RPP_SUCCESS;
+}
+
+inline RppStatus rpp_load12_f32pln3_to_f32pln3(Rpp32f *srcPtrR, Rpp32f *srcPtrG, Rpp32f *srcPtrB, __m128 *p)    /* Loads 4 f32 values from each of three planes into p[0], p[1] and p[2]; p[3] is not written */
+{
+    p[0] = _mm_loadu_ps(srcPtrR);    /* 4 values from srcPtrR (unaligned load) */
+    p[1] = _mm_loadu_ps(srcPtrG);    /* 4 values from srcPtrG */
+    p[2] = _mm_loadu_ps(srcPtrB);    /* 4 values from srcPtrB */
+
+    return RPP_SUCCESS;
+}
+
+inline RppStatus rpp_store12_f32pln3_to_f32pkd3(Rpp32f *dstPtr, __m128 *p)    /* Transposes the channel rows p[0-2] (together with p[3]) into 4 packed pixels and stores them at dstPtr; p is modified in place */
+{
+    _MM_TRANSPOSE4_PS(p[0], p[1], p[2], p[3]);    /* in-place 4x4 transpose; p[3] is read as the 4th row - NOTE(review): callers presumably must have p[3] initialised, confirm */
+    _mm_storeu_ps(dstPtr, p[0]);    /* pixel 0 to dstPtr[0..2]; lane 3 lands in dstPtr[3] and is overwritten by the next store */
+    _mm_storeu_ps(dstPtr + 3, p[1]);    /* pixel 1 to dstPtr[3..5]; lane 3 is overwritten by the next store */
+    _mm_storeu_ps(dstPtr + 6, p[2]);    /* pixel 2 to dstPtr[6..8]; lane 3 is overwritten by the next store */
+    _mm_storeu_ps(dstPtr + 9, p[3]);    /* pixel 3 to dstPtr[9..11]; lane 3 is written to dstPtr[12], one float beyond the 12 produced */
+
+    return RPP_SUCCESS;
+}
+
+inline RppStatus rpp_load4_f32_to_f32(Rpp32f *srcPtr, __m128 *p)    /* Loads 4 consecutive f32 values from srcPtr into p[0] */
+{
+    p[0] = _mm_loadu_ps(srcPtr);    /* unaligned load, srcPtr needs no 16-byte alignment */
+
+    return RPP_SUCCESS;
+}
+
+inline RppStatus rpp_store4_f32_to_f32(Rpp32f *dstPtr, __m128 *p)    /* Stores the 4 f32 values of p[0] to dstPtr */
+{
+    _mm_storeu_ps(dstPtr, p[0]);    /* unaligned store, dstPtr needs no 16-byte alignment */
+
+    return RPP_SUCCESS;
+}
+
+inline RppStatus rpp_load48_i8pkd3_to_f32pln3(Rpp8s *srcPtr, __m128 *p)    /* Loads 16 packed RGB i8 pixels from srcPtr, offsets them by +128 into the 0-255 range and widens them to planar f32: p[0-3] = R, p[4-7] = G, p[8-11] = B */
+{
+    __m128i px[8];
+    __m128i pxMask = _mm_setr_epi8(0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 12, 13, 14, 15);    /* gathers every 3rd byte so the 4 values of each channel become contiguous; bytes 12-15 stay in place */
+    __m128i pxZero = _mm_setzero_si128();    /* zero bytes interleaved by the unpacks below to zero-extend */
+    __m128i pxConvertI8 = _mm_set1_epi8((char)128);    /* 0x80 in every byte; adding it flips the sign bit, mapping i8 [-128, 127] onto u8 [0, 255] */
+
+    px[0] = _mm_loadu_si128((__m128i *)srcPtr);           /* load [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|R05|G05|B05|R06] - Need RGB 01-04 */
+    px[1] = _mm_loadu_si128((__m128i *)(srcPtr + 12));    /* load [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|R09|G09|B09|R10] - Need RGB 05-08 */
+    px[2] = _mm_loadu_si128((__m128i *)(srcPtr + 24));    /* load [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|R13|G13|B13|R14] - Need RGB 09-12 */
+    px[3] = _mm_loadu_si128((__m128i *)(srcPtr + 36));    /* load [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|R17|G17|B17|R18] - Need RGB 13-16; this 16-byte load reads srcPtr[48..51], 4 bytes beyond the 48 consumed */
+    px[0] = _mm_shuffle_epi8(px[0], pxMask);    /* shuffle to get [R01|R02|R03|R04|G01|G02|G03|G04 || B01|B02|B03|B04|R05|G05|B05|R06] - Need R01-04, G01-04, B01-04 */
+    px[1] = _mm_shuffle_epi8(px[1], pxMask);    /* shuffle to get [R05|R06|R07|R08|G05|G06|G07|G08 || B05|B06|B07|B08|R09|G09|B09|R10] - Need R05-08, G05-08, B05-08 */
+    px[2] = _mm_shuffle_epi8(px[2], pxMask);    /* shuffle to get [R09|R10|R11|R12|G09|G10|G11|G12 || B09|B10|B11|B12|R13|G13|B13|R14] - Need R09-12, G09-12, B09-12 */
+    px[3] = _mm_shuffle_epi8(px[3], pxMask);    /* shuffle to get [R13|R14|R15|R16|G13|G14|G15|G16 || B13|B14|B15|B16|R17|G17|B17|R18] - Need R13-16, G13-16, B13-16 */
+    px[0] = _mm_add_epi8(px[0], pxConvertI8);    /* convert to u8 for px0 compute */
+    px[1] = _mm_add_epi8(px[1], pxConvertI8);    /* convert to u8 for px1 compute */
+    px[2] = _mm_add_epi8(px[2], pxConvertI8);    /* convert to u8 for px2 compute */
+    px[3] = _mm_add_epi8(px[3], pxConvertI8);    /* convert to u8 for px3 compute */
+    px[4] = _mm_unpackhi_epi8(px[0], pxZero);    /* unpack 8 hi-pixels of px[0] */
+    px[5] = _mm_unpackhi_epi8(px[1], pxZero);    /* unpack 8 hi-pixels of px[1] */
+    px[6] = _mm_unpackhi_epi8(px[2], pxZero);    /* unpack 8 hi-pixels of px[2] */
+    px[7] = _mm_unpackhi_epi8(px[3], pxZero);    /* unpack 8 hi-pixels of px[3] */
+    px[0] = _mm_unpacklo_epi8(px[0], pxZero);    /* unpack 8 lo-pixels of px[0] */
+    px[1] = _mm_unpacklo_epi8(px[1], pxZero);    /* unpack 8 lo-pixels of px[1] */
+    px[2] = _mm_unpacklo_epi8(px[2], pxZero);    /* unpack 8 lo-pixels of px[2] */
+    px[3] = _mm_unpacklo_epi8(px[3], pxZero);    /* unpack 8 lo-pixels of px[3] */
+    p[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[0], pxZero));    /* unpack 4 lo-pixels of px[0] - Contains R01-04 */
+    p[1] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[1], pxZero));    /* unpack 4 lo-pixels of px[1] - Contains R05-08 */
+    p[2] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[2], pxZero));    /* unpack 4 lo-pixels of px[2] - Contains R09-12 */
+    p[3] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[3], pxZero));    /* unpack 4 lo-pixels of px[3] - Contains R13-16 */
+    p[4] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[0], pxZero));    /* unpack 4 hi-pixels of px[0] - Contains G01-04 */
+    p[5] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[1], pxZero));    /* unpack 4 hi-pixels of px[1] - Contains G05-08 */
+    p[6] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[2], pxZero));    /* unpack 4 hi-pixels of px[2] - Contains G09-12 */
+    p[7] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[3], pxZero));    /* unpack 4 hi-pixels of px[3] - Contains G13-16 */
+    p[8] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[4], pxZero));    /* unpack 4 lo-pixels of px[4] - Contains B01-04 */
+    p[9] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[5], pxZero));    /* unpack 4 lo-pixels of px[5] - Contains B05-08 */
+    p[10] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[6], pxZero));    /* unpack 4 lo-pixels of px[6] - Contains B09-12 */
+    p[11] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[7], pxZero));    /* unpack 4 lo-pixels of px[7] - Contains B13-16 */
+
+    return RPP_SUCCESS;
+}
+
+inline RppStatus rpp_store48_f32pln3_to_i8pln3(Rpp8s *dstPtrR, Rpp8s *dstPtrG, Rpp8s *dstPtrB, __m128 *p)    /* Converts planar f32 in the 0-255 range (p[0-3] = R, p[4-7] = G, p[8-11] = B) to u8 with unsigned saturation, subtracts 128 to obtain i8 and stores 16 bytes to each destination plane */
+{
+    __m128i px[8];
+    __m128i pxConvertI8 = _mm_set1_epi8((char)128);    /* 0x80 in every byte; subtracting it flips the sign bit, mapping u8 [0, 255] back onto i8 [-128, 127] */
+
+    px[4] = _mm_cvtps_epi32(p[0]);    /* convert to int32 for R */
+    px[5] = _mm_cvtps_epi32(p[1]);    /* convert to int32 for R */
+    px[6] = _mm_cvtps_epi32(p[2]);    /* convert to int32 for R */
+    px[7] = _mm_cvtps_epi32(p[3]);    /* convert to int32 for R */
+    px[4] = _mm_packus_epi32(px[4], px[5]);    /* pack pixels 0-7 for R */
+    px[5] = _mm_packus_epi32(px[6], px[7]);    /* pack pixels 8-15 for R */
+    px[0] = _mm_packus_epi16(px[4], px[5]);    /* pack pixels 0-15 for R */
+    px[4] = _mm_cvtps_epi32(p[4]);    /* convert to int32 for G */
+    px[5] = _mm_cvtps_epi32(p[5]);    /* convert to int32 for G */
+    px[6] = _mm_cvtps_epi32(p[6]);    /* convert to int32 for G */
+    px[7] = _mm_cvtps_epi32(p[7]);    /* convert to int32 for G */
+    px[4] = _mm_packus_epi32(px[4], px[5]);    /* pack pixels 0-7 for G */
+    px[5] = _mm_packus_epi32(px[6], px[7]);    /* pack pixels 8-15 for G */
+    px[1] = _mm_packus_epi16(px[4], px[5]);    /* pack pixels 0-15 for G */
+    px[4] = _mm_cvtps_epi32(p[8]);    /* convert to int32 for B */
+    px[5] = _mm_cvtps_epi32(p[9]);    /* convert to int32 for B */
+    px[6] = _mm_cvtps_epi32(p[10]);    /* convert to int32 for B */
+    px[7] = _mm_cvtps_epi32(p[11]);    /* convert to int32 for B */
+    px[4] = _mm_packus_epi32(px[4], px[5]);    /* pack pixels 0-7 for B */
+    px[5] = _mm_packus_epi32(px[6], px[7]);    /* pack pixels 8-15 for B */
+    px[2] = _mm_packus_epi16(px[4], px[5]);    /* pack pixels 0-15 for B */
+    px[0] = _mm_sub_epi8(px[0], pxConvertI8);    /* convert back to i8 for px0 store */
+    px[1] = _mm_sub_epi8(px[1], pxConvertI8);    /* convert back to i8 for px1 store */
+    px[2] = _mm_sub_epi8(px[2], pxConvertI8);    /* convert back to i8 for px2 store */
+    _mm_storeu_si128((__m128i *)dstPtrR, px[0]);    /* store [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */
+    _mm_storeu_si128((__m128i *)dstPtrG, px[1]);    /* store [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */
+    _mm_storeu_si128((__m128i *)dstPtrB, px[2]);    /* store [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */
+
+    return RPP_SUCCESS;
+}
+
+inline RppStatus rpp_load48_i8pln3_to_f32pln3(Rpp8s *srcPtrR, Rpp8s *srcPtrG, Rpp8s *srcPtrB, __m128 *p)    /* Loads 16 i8 values from each of three planes, offsets them by +128 into the 0-255 range and widens them to f32: p[0-3] = R, p[4-7] = G, p[8-11] = B */
+{
+    __m128i px[6];
+    __m128i pxZero = _mm_setzero_si128();    /* zero bytes interleaved by the unpacks below to zero-extend */
+    __m128i pxConvertI8 = _mm_set1_epi8((char)128);    /* 0x80 in every byte; adding it flips the sign bit, mapping i8 [-128, 127] onto u8 [0, 255] */
+
+    px[0] = _mm_loadu_si128((__m128i *)srcPtrR);    /* load [R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16] */
+    px[1] = _mm_loadu_si128((__m128i *)srcPtrG);    /* load [G01|G02|G03|G04|G05|G06|G07|G08|G09|G10|G11|G12|G13|G14|G15|G16] */
+    px[2] = _mm_loadu_si128((__m128i *)srcPtrB);    /* load [B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16] */
+    px[0] = _mm_add_epi8(px[0], pxConvertI8);    /* convert to u8 for px0 compute */
+    px[1] = _mm_add_epi8(px[1], pxConvertI8);    /* convert to u8 for px1 compute */
+    px[2] = _mm_add_epi8(px[2], pxConvertI8);    /* convert to u8 for px2 compute */
+    px[3] = _mm_unpackhi_epi8(px[0], pxZero);    /* unpack 8 hi-pixels of px[0] */
+    px[4] = _mm_unpackhi_epi8(px[1], pxZero);    /* unpack 8 hi-pixels of px[1] */
+    px[5] = _mm_unpackhi_epi8(px[2], pxZero);    /* unpack 8 hi-pixels of px[2] */
+    px[0] = _mm_unpacklo_epi8(px[0], pxZero);    /* unpack 8 lo-pixels of px[0] */
+    px[1] = _mm_unpacklo_epi8(px[1], pxZero);    /* unpack 8 lo-pixels of px[1] */
+    px[2] = _mm_unpacklo_epi8(px[2], pxZero);    /* unpack 8 lo-pixels of px[2] */
+    p[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[0], pxZero));    /* pixels 0-3 of original px[0] containing 16 R values */
+    p[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[0], pxZero));    /* pixels 4-7 of original px[0] containing 16 R values */
+    p[2] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[3], pxZero));    /* pixels 8-11 of original px[0] containing 16 R values */
+    p[3] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[3], pxZero));    /* pixels 12-15 of original px[0] containing 16 R values */
+    p[4] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[1], pxZero));    /* pixels 0-3 of original px[1] containing 16 G values */
+    p[5] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[1], pxZero));    /* pixels 4-7 of original px[1] containing 16 G values */
+    p[6] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[4], pxZero));    /* pixels 8-11 of original px[1] containing 16 G values */
+    p[7] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[4], pxZero));    /* pixels 12-15 of original px[1] containing 16 G values */
+    p[8] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[2], pxZero));    /* pixels 0-3 of original px[2] containing 16 B values */
+    p[9] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[2], pxZero));    /* pixels 4-7 of original px[2] containing 16 B values */
+    p[10] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[5], pxZero));    /* pixels 8-11 of original px[2] containing 16 B values */
+    p[11] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[5], pxZero));    /* pixels 12-15 of original px[2] containing 16 B values */
+
+    return RPP_SUCCESS;
+}
+
+inline RppStatus rpp_store48_f32pln3_to_i8pkd3(Rpp8s *dstPtr, __m128 *p)    /* Converts planar f32 in the 0-255 range (p[0-3] = R, p[4-7] = G, p[8-11] = B) to u8 with unsigned saturation, subtracts 128 to obtain i8 and stores 16 packed RGB pixels at dstPtr */
+{
+    __m128i px[7];
+    __m128i pxMask = _mm_setr_epi8(0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11, 12, 13, 14, 15);    /* interleaves the R, G and B quads into RGB triplets; bytes 12-15 stay in place */
+    __m128i pxConvertI8 = _mm_set1_epi8((char)128);    /* 0x80 in every byte; subtracting it flips the sign bit, mapping u8 [0, 255] back onto i8 [-128, 127] */
+    __m128i pxZero = _mm_setzero_si128();    /* integer zero vector: _mm_packus_epi32 takes __m128i operands, so the former __m128 zero only compiled under lax vector conversions */
+
+    px[4] = _mm_cvtps_epi32(p[0]);    /* convert to int32 for R01-04 */
+    px[5] = _mm_cvtps_epi32(p[4]);    /* convert to int32 for G01-04 */
+    px[6] = _mm_cvtps_epi32(p[8]);    /* convert to int32 for B01-04 */
+    px[4] = _mm_packus_epi32(px[4], px[5]);    /* pack pixels 0-7 as R01-04|G01-04 */
+    px[5] = _mm_packus_epi32(px[6], pxZero);    /* pack pixels 8-15 as B01-04|X01-04 */
+    px[0] = _mm_packus_epi16(px[4], px[5]);    /* pack pixels 0-15 as [R01|R02|R03|R04|G01|G02|G03|G04|B01|B02|B03|B04|00|00|00|00] */
+    px[4] = _mm_cvtps_epi32(p[1]);    /* convert to int32 for R05-08 */
+    px[5] = _mm_cvtps_epi32(p[5]);    /* convert to int32 for G05-08 */
+    px[6] = _mm_cvtps_epi32(p[9]);    /* convert to int32 for B05-08 */
+    px[4] = _mm_packus_epi32(px[4], px[5]);    /* pack pixels 0-7 as R05-08|G05-08 */
+    px[5] = _mm_packus_epi32(px[6], pxZero);    /* pack pixels 8-15 as B05-08|X01-04 */
+    px[1] = _mm_packus_epi16(px[4], px[5]);    /* pack pixels 0-15 as [R05|R06|R07|R08|G05|G06|G07|G08|B05|B06|B07|B08|00|00|00|00] */
+    px[4] = _mm_cvtps_epi32(p[2]);    /* convert to int32 for R09-12 */
+    px[5] = _mm_cvtps_epi32(p[6]);    /* convert to int32 for G09-12 */
+    px[6] = _mm_cvtps_epi32(p[10]);    /* convert to int32 for B09-12 */
+    px[4] = _mm_packus_epi32(px[4], px[5]);    /* pack pixels 0-7 as R09-12|G09-12 */
+    px[5] = _mm_packus_epi32(px[6], pxZero);    /* pack pixels 8-15 as B09-12|X01-04 */
+    px[2] = _mm_packus_epi16(px[4], px[5]);    /* pack pixels 0-15 as [R09|R10|R11|R12|G09|G10|G11|G12|B09|B10|B11|B12|00|00|00|00] */
+    px[4] = _mm_cvtps_epi32(p[3]);    /* convert to int32 for R13-16 */
+    px[5] = _mm_cvtps_epi32(p[7]);    /* convert to int32 for G13-16 */
+    px[6] = _mm_cvtps_epi32(p[11]);    /* convert to int32 for B13-16 */
+    px[4] = _mm_packus_epi32(px[4], px[5]);    /* pack pixels 0-7 as R13-16|G13-16 */
+    px[5] = _mm_packus_epi32(px[6], pxZero);    /* pack pixels 8-15 as B13-16|X01-04 */
+    px[3] = _mm_packus_epi16(px[4], px[5]);    /* pack pixels 0-15 as [R13|R14|R15|R16|G13|G14|G15|G16|B13|B14|B15|B16|00|00|00|00] */
+    px[0] = _mm_sub_epi8(px[0], pxConvertI8);    /* convert back to i8 for px0 store - the 4 zero padding bytes become 0x80 */
+    px[1] = _mm_sub_epi8(px[1], pxConvertI8);    /* convert back to i8 for px1 store */
+    px[2] = _mm_sub_epi8(px[2], pxConvertI8);    /* convert back to i8 for px2 store */
+    px[3] = _mm_sub_epi8(px[3], pxConvertI8);    /* convert back to i8 for px3 store */
+    px[0] = _mm_shuffle_epi8(px[0], pxMask);    /* shuffle to get [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|80|80|80|80] */
+    px[1] = _mm_shuffle_epi8(px[1], pxMask);    /* shuffle to get [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|80|80|80|80] */
+    px[2] = _mm_shuffle_epi8(px[2], pxMask);    /* shuffle to get [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|80|80|80|80] */
+    px[3] = _mm_shuffle_epi8(px[3], pxMask);    /* shuffle to get [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|80|80|80|80] */
+    _mm_storeu_si128((__m128i *)dstPtr, px[0]);           /* store [R01|G01|B01|R02|G02|B02|R03|G03|B03|R04|G04|B04|80|80|80|80] - the 4 padding bytes are overwritten by the next store */
+    _mm_storeu_si128((__m128i *)(dstPtr + 12), px[1]);    /* store [R05|G05|B05|R06|G06|B06|R07|G07|B07|R08|G08|B08|80|80|80|80] */
+    _mm_storeu_si128((__m128i *)(dstPtr + 24), px[2]);    /* store [R09|G09|B09|R10|G10|B10|R11|G11|B11|R12|G12|B12|80|80|80|80] */
+    _mm_storeu_si128((__m128i *)(dstPtr + 36), px[3]);    /* store [R13|G13|B13|R14|G14|B14|R15|G15|B15|R16|G16|B16|80|80|80|80] - writes 0x80 (-128) to dstPtr[48..51], 4 bytes beyond the 48 produced */
+
+    return RPP_SUCCESS;
+}
+
+inline RppStatus rpp_load16_i8_to_f32(Rpp8s *srcPtr, __m128 *p)    /* Loads 16 consecutive i8 values from srcPtr, offsets them by +128 into the 0-255 range and widens them to f32 in p[0-3] */
+{
+    __m128i px[2];
+    __m128i pxZero = _mm_setzero_si128();    /* zero bytes interleaved by the unpacks below to zero-extend */
+    __m128i pxConvertI8 = _mm_set1_epi8((char)128);    /* 0x80 in every byte; adding it flips the sign bit, mapping i8 [-128, 127] onto u8 [0, 255] */
+
+    px[0] =  _mm_loadu_si128((__m128i *)srcPtr);    /* load pixels 0-15 */
+    px[0] = _mm_add_epi8(px[0], pxConvertI8);    /* convert to u8 for px0 compute */
+    px[1] = _mm_unpackhi_epi8(px[0], pxZero);    /* pixels 8-15 */
+    px[0] = _mm_unpacklo_epi8(px[0], pxZero);    /* pixels 0-7 */
+    p[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[0], pxZero));    /* pixels 0-3 */
+    p[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[0], pxZero));    /* pixels 4-7 */
+    p[2] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(px[1], pxZero));    /* pixels 8-11 */
+    p[3] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(px[1], pxZero));    /* pixels 12-15 */
+
+    return RPP_SUCCESS;
+}
+
+inline RppStatus rpp_store16_f32_to_i8(Rpp8s *dstPtr, __m128 *p)    /* Converts the 16 f32 values in p[0-3] (0-255 range) to u8 with unsigned saturation, subtracts 128 to obtain i8 and stores 16 consecutive bytes at dstPtr */
+{
+    __m128i px[4];
+    __m128i pxConvertI8 = _mm_set1_epi8((char)128);    /* 0x80 in every byte; subtracting it flips the sign bit, mapping u8 [0, 255] back onto i8 [-128, 127] */
+
+    px[0] = _mm_cvtps_epi32(p[0]);    /* pixels 0-3 */
+    px[1] = _mm_cvtps_epi32(p[1]);    /* pixels 4-7 */
+    px[2] = _mm_cvtps_epi32(p[2]);    /* pixels 8-11 */
+    px[3] = _mm_cvtps_epi32(p[3]);    /* pixels 12-15 */
+    px[0] = _mm_packus_epi32(px[0], px[1]);    /* pixels 0-7 */
+    px[1] = _mm_packus_epi32(px[2], px[3]);    /* pixels 8-15 */
+    px[0] = _mm_packus_epi16(px[0], px[1]);    /* pixels 0-15 */
+    px[0] = _mm_sub_epi8(px[0], pxConvertI8);    /* convert back to i8 for px0 store */
+    _mm_storeu_si128((__m128i *)dstPtr, px[0]);    /* store pixels 0-15 */
+
+    return RPP_SUCCESS;
+}
+
+template <typename FuncType, typename... ArgTypes>
+inline RppStatus rpp_simd_load(FuncType &&rpp_simd_load_routine, ArgTypes&&... args)
+{
+    /* Perfect-forwards args to the given load routine and returns that routine's RppStatus instead of discarding it */
+
+    return std::forward<FuncType>(rpp_simd_load_routine)(std::forward<ArgTypes>(args)...);
+}
+
+template <typename FuncType, typename... ArgTypes>
+inline RppStatus rpp_simd_store(FuncType &&rpp_simd_store_routine, ArgTypes&&... args)
+{
+    /* Perfect-forwards args to the given store routine and returns that routine's RppStatus instead of discarding it */
+
+    return std::forward<FuncType>(rpp_simd_store_routine)(std::forward<ArgTypes>(args)...);
+}
+
 // Shuffle floats in `src` by using SSE2 `pshufd` instead of `shufps`, if possible.
 #define SIMD_SHUFFLE_PS(src, imm) \
-  _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(src), imm))
+    _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(src), imm))
+
+#define CHECK_SIMD  0
+#define FP_BITS     16
+#define FP_MUL      (1<<FP_BITS)
 
 SIMD_CONST_PI(full       , 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
 SIMD_CONST_PI(sn         , 0x80000000, 0x80000000, 0x80000000, 0x80000000);
@@ -82,49 +528,44 @@ inline __m128i _mm_mullo_epi8(__m128i a, __m128i b)
 
 inline void _mm_print_epi8(__m128i vPrintArray)
 {
-  char printArray[16];
-  _mm_storeu_si128((__m128i *)printArray, vPrintArray);
-  printf("\n");
-  for (int ct = 0; ct < 16; ct++)
-  {
-      printf("%d ", printArray[ct]);
-  }
+    char printArray[16];    // debug helper scratch: one entry per byte lane; plain char, so bytes >= 0x80 print as negative only where char is signed
+    _mm_storeu_si128((__m128i *)printArray, vPrintArray);    // unaligned store of all 16 bytes
+    printf("\n");    // the newline precedes the values; none is written after them
+    for (int ct = 0; ct < 16; ct++)
+    {
+        printf("%d ", printArray[ct]);    // lane 0 first, space separated
+    }
 }
 
 inline void _mm_print_epi32(__m128i vPrintArray)
 {
-  int printArray[4];
-  _mm_storeu_si128((__m128i *)printArray, vPrintArray);
-  printf("\n");
-  for (int ct = 0; ct < 4; ct++)
-  {
-      printf("%d ", printArray[ct]);
-  }
+    int printArray[4];    // debug helper scratch: one entry per 32-bit lane, printed as signed decimal
+    _mm_storeu_si128((__m128i *)printArray, vPrintArray);    // unaligned store of all four lanes
+    printf("\n");    // the newline precedes the values; none is written after them
+    for (int ct = 0; ct < 4; ct++)
+    {
+        printf("%d ", printArray[ct]);    // lane 0 first, space separated
+    }
 }
 
 inline void _mm_print_ps(__m128 vPrintArray)
 {
-  float printArray[4];
-  _mm_storeu_ps(printArray, vPrintArray);
-  printf("\n");
-  for (int ct = 0; ct < 4; ct++)
-  {
-      printf("%0.6f ", printArray[ct]);
-  }
+    float printArray[4];    // debug helper scratch: one entry per float lane
+    _mm_storeu_ps(printArray, vPrintArray);    // unaligned store of all four lanes
+    printf("\n");    // the newline precedes the values; none is written after them
+    for (int ct = 0; ct < 4; ct++)
+    {
+        printf("%0.6f ", printArray[ct]);    // lane 0 first, six decimals, space separated
+    }
 }
 
-#define CHECK_SIMD  0
-#define FP_BITS     16
-#define FP_MUL      (1<<FP_BITS)
-
-
 static inline Rpp32u HorMin(__m128i pmin)
 {
     pmin = _mm_min_epu8(pmin, _mm_shuffle_epi32(pmin, _MM_SHUFFLE(3, 2, 3, 2)));
     pmin = _mm_min_epu8(pmin, _mm_shuffle_epi32(pmin, _MM_SHUFFLE(1, 1, 1, 1)));
     pmin = _mm_min_epu8(pmin, _mm_shufflelo_epi16(pmin, _MM_SHUFFLE(1, 1, 1, 1)));
     pmin = _mm_min_epu8(pmin, _mm_srli_epi16(pmin, 8));
-    return (_mm_cvtsi128_si32(pmin) & 0x000000FF);    
+    return (_mm_cvtsi128_si32(pmin) & 0x000000FF);
 }
 
 static inline Rpp32u HorMax(__m128i pmax)
@@ -156,12 +597,10 @@ static inline Rpp32u HorMax256(__m256i pmax)
     pmax_128 = M256I(pmax).m256i_i128[0];
     pmax_128 = _mm_max_epi8(pmax_128, _mm_shufflelo_epi16(pmax_128, _MM_SHUFFLE(1, 1, 1, 1)));
     pmax_128 = _mm_max_epi8(pmax_128, _mm_srli_epi16(pmax_128, 8));
-    return (_mm_cvtsi128_si32(pmax_128) & 0x000000FF);    
+    return (_mm_cvtsi128_si32(pmax_128) & 0x000000FF);
 }
 #endif
 
-#define __SSE4_1__ 1
-
 static  inline __m128 fast_exp_sse (__m128 x)
 {
     __m128 t, f, e, p, r;
@@ -171,7 +610,7 @@ static  inline __m128 fast_exp_sse (__m128 x)
     __m128 c1  = _mm_set1_ps (0.657636276f);
     __m128 c2  = _mm_set1_ps (1.00172476f);
 
-    /* exp(x) = 2^i * 2^f; i = floor (log2(e) * x), 0 <= f <= 1 */   
+    /* exp(x) = 2^i * 2^f; i = floor (log2(e) * x), 0 <= f <= 1 */
     t = _mm_mul_ps (x, l2e);             /* t = log2(e) * x */
 #ifdef __SSE4_1__
     e = _mm_floor_ps (t);                /* floor(t) */
@@ -221,92 +660,92 @@ static const __m128 _ps_coscof_p1 = _mm_set1_ps(-1.388731625493765E-003f);
 static const __m128 _ps_coscof_p2 = _mm_set1_ps( 4.166664568298827E-002f);
 static const __m128 _ps_cephes_FOPI = _mm_set1_ps(1.27323954473516f); // 4 / M_PI
 
-static inline void sincos_ps(__m128 x, __m128 *s, __m128 *c) {
+static inline void sincos_ps(__m128 x, __m128 *s, __m128 *c)
+{
 
 #if 0
 #ifdef MATH_SSE41 // _mm_round_ps is SSE4.1
-  // XXX Added in MathGeoLib: Take a modulo of the input in 2pi to try to enhance the precision with large input values.
-  x = modf_ps(x, _mm_set1_ps(2.f*3.141592654f));
+     // XXX Added in MathGeoLib: Take a modulo of the input in 2pi to try to enhance the precision with large input values.
+    x = modf_ps(x, _mm_set1_ps(2.f*3.141592654f));
 #endif
 #endif
 
-  /* extract the sign bit (upper one) */
-  __m128 sign_bit_sin = _mm_and_ps(x, _ps_sign_mask);
-  /* take the absolute value */
-  x = _mm_xor_ps(x, sign_bit_sin);
-  
-  /* scale by 4/Pi */
-  __m128 y = _mm_mul_ps(x, _ps_cephes_FOPI);
-    
-  /* store the integer part of y in emm2 */
-  __m128i emm2 = _mm_cvttps_epi32(y);
-
-  /* j=(j+1) & (~1) (see the cephes sources) */
-  emm2 = _mm_add_epi32(emm2, _pi32_1);
-  emm2 = _mm_and_si128(emm2, _pi32_inv1);
-  y = _mm_cvtepi32_ps(emm2);
-
-  __m128i emm4 = emm2;
-
-  /* get the swap sign flag for the sine */
-  __m128i emm0 = _mm_and_si128(emm2, _pi32_4);
-  emm0 = _mm_slli_epi32(emm0, 29);
-  __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);
-
-  /* get the polynom selection mask for the sine*/
-  emm2 = _mm_and_si128(emm2, _pi32_2);
-  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
-  __m128 poly_mask = _mm_castsi128_ps(emm2);
-  /* The magic pass: "Extended precision modular arithmetic" 
-     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  __m128 xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1);
-  __m128 xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2);
-  __m128 xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3);
-  x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3));
-
-  emm4 = _mm_sub_epi32(emm4, _pi32_2);
-  emm4 = _mm_andnot_si128(emm4, _pi32_4);
-  emm4 = _mm_slli_epi32(emm4, 29);
-  __m128 sign_bit_cos = _mm_castsi128_ps(emm4);
-
-  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
-  
-  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  __m128 z = _mm_mul_ps(x,x);
-  y = _ps_coscof_p0;
-
-  y = _mm_mul_ps(y, z);
-  y = _mm_add_ps(y, _ps_coscof_p1);
-  y = _mm_mul_ps(y, z);
-  y = _mm_add_ps(y, _ps_coscof_p2);
-  y = _mm_mul_ps(y, _mm_mul_ps(z, z));
-  __m128 tmp = _mm_mul_ps(z, _ps_0p5);
-  y = _mm_sub_ps(y, tmp);
-  y = _mm_add_ps(y, _ps_1);
-  
-  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
-
-  __m128 y2 = _ps_sincof_p0;
-  y2 = _mm_mul_ps(y2, z);
-  y2 = _mm_add_ps(y2, _ps_sincof_p1);
-  y2 = _mm_mul_ps(y2, z);
-  y2 = _mm_add_ps(y2, _ps_sincof_p2);
-  y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x));
-  y2 = _mm_add_ps(y2, x);
-
-  /* select the correct result from the two polynoms */  
-  xmm3 = poly_mask;
-  __m128 ysin2 = _mm_and_ps(xmm3, y2);
-  __m128 ysin1 = _mm_andnot_ps(xmm3, y);
-  y2 = _mm_sub_ps(y2,ysin2);
-  y = _mm_sub_ps(y, ysin1);
-
-  xmm1 = _mm_add_ps(ysin1,ysin2);
-  xmm2 = _mm_add_ps(y,y2);
- 
-  /* update the sign */
-  *s = _mm_xor_ps(xmm1, sign_bit_sin);
-  *c = _mm_xor_ps(xmm2, sign_bit_cos);
+    // Extract the sign bit (upper one)
+    __m128 sign_bit_sin = _mm_and_ps(x, _ps_sign_mask);
+    // take the absolute value
+    x = _mm_xor_ps(x, sign_bit_sin);
+
+    // Scale by 4/Pi
+    __m128 y = _mm_mul_ps(x, _ps_cephes_FOPI);
+
+    // Store the integer part of y in emm2
+    __m128i emm2 = _mm_cvttps_epi32(y);
+
+    // j=(j+1) & (~1) (see the cephes sources)
+    emm2 = _mm_add_epi32(emm2, _pi32_1);
+    emm2 = _mm_and_si128(emm2, _pi32_inv1);
+    y = _mm_cvtepi32_ps(emm2);
+
+    __m128i emm4 = emm2;
+
+    // Get the swap sign flag for the sine
+    __m128i emm0 = _mm_and_si128(emm2, _pi32_4);
+    emm0 = _mm_slli_epi32(emm0, 29);
+    __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);
+
+    // Get the polynom selection mask for the sine
+    emm2 = _mm_and_si128(emm2, _pi32_2);
+    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+    __m128 poly_mask = _mm_castsi128_ps(emm2);
+    // The magic pass: "Extended precision modular arithmetic": x = ((x - y * DP1) - y * DP2) - y * DP3
+    __m128 xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1);
+    __m128 xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2);
+    __m128 xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3);
+    x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3));
+
+    emm4 = _mm_sub_epi32(emm4, _pi32_2);
+    emm4 = _mm_andnot_si128(emm4, _pi32_4);
+    emm4 = _mm_slli_epi32(emm4, 29);
+    __m128 sign_bit_cos = _mm_castsi128_ps(emm4);
+
+    sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+
+    // Evaluate the first polynom  (0 <= x <= Pi/4)
+    __m128 z = _mm_mul_ps(x,x);
+    y = _ps_coscof_p0;
+
+    y = _mm_mul_ps(y, z);
+    y = _mm_add_ps(y, _ps_coscof_p1);
+    y = _mm_mul_ps(y, z);
+    y = _mm_add_ps(y, _ps_coscof_p2);
+    y = _mm_mul_ps(y, _mm_mul_ps(z, z));
+    __m128 tmp = _mm_mul_ps(z, _ps_0p5);
+    y = _mm_sub_ps(y, tmp);
+    y = _mm_add_ps(y, _ps_1);
+
+    // Evaluate the second polynom  (-Pi/4 <= x <= 0)
+
+    __m128 y2 = _ps_sincof_p0;
+    y2 = _mm_mul_ps(y2, z);
+    y2 = _mm_add_ps(y2, _ps_sincof_p1);
+    y2 = _mm_mul_ps(y2, z);
+    y2 = _mm_add_ps(y2, _ps_sincof_p2);
+    y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x));
+    y2 = _mm_add_ps(y2, x);
+
+    // Select the correct result from the two polynoms
+    xmm3 = poly_mask;
+    __m128 ysin2 = _mm_and_ps(xmm3, y2);
+    __m128 ysin1 = _mm_andnot_ps(xmm3, y);
+    y2 = _mm_sub_ps(y2,ysin2);
+    y = _mm_sub_ps(y, ysin1);
+
+    xmm1 = _mm_add_ps(ysin1,ysin2);
+    xmm2 = _mm_add_ps(y,y2);
+
+    // Update the sign
+    *s = _mm_xor_ps(xmm1, sign_bit_sin);
+    *c = _mm_xor_ps(xmm2, sign_bit_cos);
 }
 
 static const __m128 _ps_atanrange_hi = _mm_set1_ps(2.414213562373095);
@@ -322,111 +761,110 @@ static const __m128 _ps_atancof_p3 = _mm_set1_ps(3.33329491539E-1);
 
 static inline __m128 atan_ps( __m128 x )
 {
-	__m128 sign_bit, y;
-
-	sign_bit = x;
-	/* take the absolute value */
-	x = _mm_and_ps( x, _ps_inv_sign_mask );
-	/* extract the sign bit (upper one) */
-	sign_bit = _mm_and_ps( sign_bit, _ps_sign_mask );
-
-/* range reduction, init x and y depending on range */
-
-	/* x > 2.414213562373095 */
-	__m128 cmp0 = _mm_cmpgt_ps( x, _ps_atanrange_hi );
-	/* x > 0.4142135623730950 */
-	__m128 cmp1 = _mm_cmpgt_ps( x, _ps_atanrange_lo );
-
-	/* x > 0.4142135623730950 && !( x > 2.414213562373095 ) */
-	__m128 cmp2 = _mm_andnot_ps( cmp0, cmp1 );
-
-	/* -( 1.0/x ) */
-	__m128 y0 = _mm_and_ps( cmp0, _ps_cephes_PIO2F );
-	__m128 x0 = _mm_div_ps( _ps_1, x );
-	x0 = _mm_xor_ps( x0, _ps_sign_mask );
-
-	__m128 y1 = _mm_and_ps( cmp2, _ps_cephes_PIO4F );
-	/* (x-1.0)/(x+1.0) */
-	__m128 x1_o = _mm_sub_ps( x, _ps_1 );
-	__m128 x1_u = _mm_add_ps( x, _ps_1 );
-	__m128 x1 = _mm_div_ps( x1_o, x1_u );
-
-	__m128 x2 = _mm_and_ps( cmp2, x1 );
-	x0 = _mm_and_ps( cmp0, x0 );
-	x2 = _mm_or_ps( x2, x0 );
-	cmp1 = _mm_or_ps( cmp0, cmp2 );
-	x2 = _mm_and_ps( cmp1, x2 );
-	x = _mm_andnot_ps( cmp1, x );
-	x = _mm_or_ps( x2, x );
-
-	y = _mm_or_ps( y0, y1 );
-
-	__m128 zz = _mm_mul_ps( x, x );
-	__m128 acc = _ps_atancof_p0;
-	acc = _mm_mul_ps( acc, zz );
-	acc = _mm_sub_ps( acc, _ps_atancof_p1 );
-	acc = _mm_mul_ps( acc, zz );
-	acc = _mm_add_ps( acc, _ps_atancof_p2 );
-	acc = _mm_mul_ps( acc, zz );
-	acc = _mm_sub_ps( acc, _ps_atancof_p3 );
-	acc = _mm_mul_ps( acc, zz );
-	acc = _mm_mul_ps( acc, x );
-	acc = _mm_add_ps( acc, x );
-	y = _mm_add_ps( y, acc );
-
-	/* update the sign */
-	y = _mm_xor_ps( y, sign_bit );
-
-	return y;
+    __m128 sign_bit, y;
+
+    sign_bit = x;
+    // Take the absolute value
+    x = _mm_and_ps( x, _ps_inv_sign_mask );
+    // Extract the sign bit (upper one)
+    sign_bit = _mm_and_ps( sign_bit, _ps_sign_mask );
+
+    // Range reduction, init x and y depending on range
+
+    // x > 2.414213562373095
+    __m128 cmp0 = _mm_cmpgt_ps( x, _ps_atanrange_hi );
+    // x > 0.4142135623730950
+    __m128 cmp1 = _mm_cmpgt_ps( x, _ps_atanrange_lo );
+
+    // x > 0.4142135623730950 && !( x > 2.414213562373095 )
+    __m128 cmp2 = _mm_andnot_ps( cmp0, cmp1 );
+
+    // y0 = PIO2F in the lanes where cmp0 is set; x0 = -( 1.0/x )
+    __m128 y0 = _mm_and_ps( cmp0, _ps_cephes_PIO2F );
+    __m128 x0 = _mm_div_ps( _ps_1, x );
+    x0 = _mm_xor_ps( x0, _ps_sign_mask );
+
+    __m128 y1 = _mm_and_ps( cmp2, _ps_cephes_PIO4F );
+    // (x-1.0)/(x+1.0)
+    __m128 x1_o = _mm_sub_ps( x, _ps_1 );
+    __m128 x1_u = _mm_add_ps( x, _ps_1 );
+    __m128 x1 = _mm_div_ps( x1_o, x1_u );
+
+    __m128 x2 = _mm_and_ps( cmp2, x1 );
+    x0 = _mm_and_ps( cmp0, x0 );
+    x2 = _mm_or_ps( x2, x0 );
+    cmp1 = _mm_or_ps( cmp0, cmp2 );
+    x2 = _mm_and_ps( cmp1, x2 );
+    x = _mm_andnot_ps( cmp1, x );
+    x = _mm_or_ps( x2, x );
+
+    y = _mm_or_ps( y0, y1 );
+
+    __m128 zz = _mm_mul_ps( x, x );
+    __m128 acc = _ps_atancof_p0;
+    acc = _mm_mul_ps( acc, zz );
+    acc = _mm_sub_ps( acc, _ps_atancof_p1 );
+    acc = _mm_mul_ps( acc, zz );
+    acc = _mm_add_ps( acc, _ps_atancof_p2 );
+    acc = _mm_mul_ps( acc, zz );
+    acc = _mm_sub_ps( acc, _ps_atancof_p3 );
+    acc = _mm_mul_ps( acc, zz );
+    acc = _mm_mul_ps( acc, x );
+    acc = _mm_add_ps( acc, x );
+    y = _mm_add_ps( y, acc );
+
+    // Update the sign
+    y = _mm_xor_ps( y, sign_bit );
+
+    return y;
 }
 
 static inline __m128 atan2_ps( __m128 y, __m128 x )
 {
-	__m128 x_eq_0 = _mm_cmpeq_ps( x, _ps_0 );
-	__m128 x_gt_0 = _mm_cmpgt_ps( x, _ps_0 );
-	__m128 x_le_0 = _mm_cmple_ps( x, _ps_0 );
-	__m128 y_eq_0 = _mm_cmpeq_ps( y, _ps_0 );
-	__m128 x_lt_0 = _mm_cmplt_ps( x, _ps_0 );
-	__m128 y_lt_0 = _mm_cmplt_ps( y, _ps_0 );
-
-	__m128 zero_mask = _mm_and_ps( x_eq_0, y_eq_0 );
-	__m128 zero_mask_other_case = _mm_and_ps( y_eq_0, x_gt_0 );
-	zero_mask = _mm_or_ps( zero_mask, zero_mask_other_case );
-
-	__m128 pio2_mask = _mm_andnot_ps( y_eq_0, x_eq_0 );
-	__m128 pio2_mask_sign = _mm_and_ps( y_lt_0, _ps_sign_mask );
-	__m128 pio2_result = _ps_cephes_PIO2F;
-	pio2_result = _mm_xor_ps( pio2_result, pio2_mask_sign );
-	pio2_result = _mm_and_ps( pio2_mask, pio2_result );
-
-	__m128 pi_mask = _mm_and_ps( y_eq_0, x_le_0 );
-	__m128 pi = _ps_cephes_PIF;
-	__m128 pi_result = _mm_and_ps( pi_mask, pi );
-
-	__m128 swap_sign_mask_offset = _mm_and_ps( x_lt_0, y_lt_0 );
-	swap_sign_mask_offset = _mm_and_ps( swap_sign_mask_offset, _ps_sign_mask );
-
-	__m128 offset0 = _mm_setzero_ps();
-	__m128 offset1 = _ps_cephes_PIF;
-	offset1 = _mm_xor_ps( offset1, swap_sign_mask_offset );
-
-	__m128 offset = _mm_andnot_ps( x_lt_0, offset0 );
-	offset = _mm_and_ps( x_lt_0, offset1 );
-
-	__m128 arg = _mm_div_ps( y, x );
-	__m128 atan_result = atan_ps( arg );
-	atan_result = _mm_add_ps( atan_result, offset );
-
-	/* select between zero_result, pio2_result and atan_result */
-
-	__m128 result = _mm_andnot_ps( zero_mask, pio2_result );
-	atan_result = _mm_andnot_ps( pio2_mask, atan_result );
-	atan_result = _mm_andnot_ps( pio2_mask, atan_result);
-	result = _mm_or_ps( result, atan_result );
-	result = _mm_or_ps( result, pi_result );
-
-	return result;
+    __m128 x_eq_0 = _mm_cmpeq_ps( x, _ps_0 );    // Lane-wise atan2(y, x): first classify every lane by the sign/zero state of x and y
+    __m128 x_gt_0 = _mm_cmpgt_ps( x, _ps_0 );
+    __m128 x_le_0 = _mm_cmple_ps( x, _ps_0 );
+    __m128 y_eq_0 = _mm_cmpeq_ps( y, _ps_0 );
+    __m128 x_lt_0 = _mm_cmplt_ps( x, _ps_0 );
+    __m128 y_lt_0 = _mm_cmplt_ps( y, _ps_0 );
+
+    __m128 zero_mask = _mm_and_ps( x_eq_0, y_eq_0 );    // lanes excluded from the pio2/atan paths: (x == 0 && y == 0) ...
+    __m128 zero_mask_other_case = _mm_and_ps( y_eq_0, x_gt_0 );    // ... or (y == 0 && x > 0)
+    zero_mask = _mm_or_ps( zero_mask, zero_mask_other_case );
+
+    __m128 pio2_mask = _mm_andnot_ps( y_eq_0, x_eq_0 );    // x == 0 && y != 0: result is PIO2F carrying the sign of y
+    __m128 pio2_mask_sign = _mm_and_ps( y_lt_0, _ps_sign_mask );
+    __m128 pio2_result = _ps_cephes_PIO2F;
+    pio2_result = _mm_xor_ps( pio2_result, pio2_mask_sign );
+    pio2_result = _mm_and_ps( pio2_mask, pio2_result );
+
+    __m128 pi_mask = _mm_and_ps( y_eq_0, x_le_0 );    // y == 0 && x <= 0: PIF is OR-ed into the result
+    __m128 pi = _ps_cephes_PIF;
+    __m128 pi_result = _mm_and_ps( pi_mask, pi );
+
+    __m128 swap_sign_mask_offset = _mm_and_ps( x_lt_0, y_lt_0 );    // x < 0 && y < 0: the PIF offset gets its sign flipped
+    swap_sign_mask_offset = _mm_and_ps( swap_sign_mask_offset, _ps_sign_mask );
+
+    __m128 offset0 = _mm_setzero_ps();
+    __m128 offset1 = _ps_cephes_PIF;
+    offset1 = _mm_xor_ps( offset1, swap_sign_mask_offset );
+
+    // Blend per lane: offset1 where x < 0, offset0 elsewhere (the offset0 half used to be computed and then overwritten)
+    __m128 offset = _mm_or_ps( _mm_andnot_ps( x_lt_0, offset0 ), _mm_and_ps( x_lt_0, offset1 ) );
+
+    __m128 arg = _mm_div_ps( y, x );    // NaN for 0/0 and +-Inf for y/0; those lanes are masked out of atan_result below
+    __m128 atan_result = atan_ps( arg );
+    atan_result = _mm_add_ps( atan_result, offset );
+
+    // Select between zero_result, pio2_result and atan_result
+
+    __m128 result = _mm_andnot_ps( zero_mask, pio2_result );
+    atan_result = _mm_andnot_ps( pio2_mask, atan_result );
+    atan_result = _mm_andnot_ps( zero_mask, atan_result );    // was a repeat of the pio2_mask line; clearing zero_mask lanes keeps the 0/0 NaN out of the result
+    result = _mm_or_ps( result, atan_result );
+    result = _mm_or_ps( result, pi_result );    // note: x == 0 && y == 0 also satisfies pi_mask, so that lane ends up as PIF
+
+    return result;
 }
 
-#endif
 #endif //AMD_RPP_RPP_CPU_SIMD_HPP
\ No newline at end of file
diff --git a/src/include/hip/rpp_hip_common.hpp b/src/include/hip/rpp_hip_common.hpp
index 0e6fbe06e..8394f0747 100644
--- a/src/include/hip/rpp_hip_common.hpp
+++ b/src/include/hip/rpp_hip_common.hpp
@@ -5,11 +5,58 @@
 #include <hip/hip_runtime_api.h>
 #include <hip/hip_runtime.h>
 #include <hip/hip_ext.h>
+#include <hip/hip_fp16.h>
 #include <rppdefs.h>
 #include <vector>
 #include <half.hpp>
-using half_float::half;
-typedef half Rpp16f;
+using halfhpp = half_float::half;
+typedef halfhpp Rpp16f;
+
+typedef struct d_float8    // 8 floats held as two float4 halves
+{
+    float4 x;    // elements 0-3
+    float4 y;    // elements 4-7
+} d_float8;
+
+typedef struct d_float24    // 24 floats held as three d_float8 groups
+{
+    d_float8 x;    // elements 0-7
+    d_float8 y;    // elements 8-15
+    d_float8 z;    // elements 16-23
+} d_float24;
+
+typedef struct d_uint6    // six 32-bit unsigned words (24 bytes); the U8 load24 routines use it as the raw container for 24 packed bytes
+{
+    uint2 x;    // words 0-1
+    uint2 y;    // words 2-3
+    uint2 z;    // words 4-5
+} d_uint6;
+
+typedef struct d_int6    // six 32-bit signed words (24 bytes); the I8 load24 routines use it as the raw container for 24 packed bytes
+{
+    int2 x;    // words 0-1
+    int2 y;    // words 2-3
+    int2 z;    // words 4-5
+} d_int6;
+
+typedef struct d_half4    // 4 halves held as two half2 pairs
+{
+    half2 x;    // elements 0-1
+    half2 y;    // elements 2-3
+} d_half4;
+
+typedef struct d_half8    // 8 halves held as two d_half4 groups
+{
+    d_half4 x;    // elements 0-3
+    d_half4 y;    // elements 4-7
+} d_half8;
+
+typedef struct d_half24    // 24 halves held as three d_half8 groups
+{
+    d_half8 x;    // elements 0-7
+    d_half8 y;    // elements 8-15
+    d_half8 z;    // elements 16-23
+} d_half24;
 
 enum class RPPTensorDataType
 {
@@ -44,6 +91,19 @@ struct RPPTensorFunctionMetaData
     }
 };
 
+#define BYTE_TO_BINARY_PATTERN "%c%c%c%c%c%c%c%c"    // printf format for the eight chars BYTE_TO_BINARY expands to (MSB first); the argument is evaluated eight times, so pass a side-effect-free expression
+#define BYTE_TO_BINARY(byte)  \
+  ((byte) & 0x80 ? '1' : '0'), \
+  ((byte) & 0x40 ? '1' : '0'), \
+  ((byte) & 0x20 ? '1' : '0'), \
+  ((byte) & 0x10 ? '1' : '0'), \
+  ((byte) & 0x08 ? '1' : '0'), \
+  ((byte) & 0x04 ? '1' : '0'), \
+  ((byte) & 0x02 ? '1' : '0'), \
+  ((byte) & 0x01 ? '1' : '0')
+
+/******************** HOST FUNCTIONS ********************/
+
 inline int getplnpkdind(RppiChnFormat &format)
 {
     return format == RPPI_CHN_PLANAR ? 1 : 3;
@@ -72,4 +132,488 @@ inline RppStatus generate_gaussian_kernel_gpu(Rpp32f stdDev, Rpp32f* kernel, Rpp
 
     return RPP_SUCCESS;
 }
+
+/******************** DEVICE FUNCTIONS ********************/
+
+// -------------------- Set 1 - Packing --------------------
+
+// Packing to U8s: converts the four floats of src to bytes and packs them into one 32-bit word
+
+__device__ __forceinline__ uint rpp_hip_pack(float4 src)
+{
+    return __builtin_amdgcn_cvt_pk_u8_f32(src.w, 3,    // outermost call: src.w goes to byte 3
+           __builtin_amdgcn_cvt_pk_u8_f32(src.z, 2,    // src.z goes to byte 2
+           __builtin_amdgcn_cvt_pk_u8_f32(src.y, 1,    // src.y goes to byte 1
+           __builtin_amdgcn_cvt_pk_u8_f32(src.x, 0, 0))));    // innermost call starts from 0 and fills byte 0 with src.x; rounding/saturation are those of the builtin - see the AMDGPU ISA docs
+}
+
+// Packing to I8s: clamps each lane to [-128, 127], casts to signed char (truncation toward zero, unchanged for in-range values) and packs the four bytes into one 32-bit word
+
+__device__ __forceinline__ uint rpp_hip_pack_i8(float4 src)
+{
+    char4 dst_c4;    // converting an out-of-range float to signed char is undefined, hence the clamps below (NaN clamps to -128)
+    dst_c4.w = (signed char)(fminf(fmaxf(src.w, -128.0f), 127.0f));
+    dst_c4.z = (signed char)(fminf(fmaxf(src.z, -128.0f), 127.0f));
+    dst_c4.y = (signed char)(fminf(fmaxf(src.y, -128.0f), 127.0f));
+    dst_c4.x = (signed char)(fminf(fmaxf(src.x, -128.0f), 127.0f));
+
+    return *(uint *)&dst_c4;    // reinterpret the four bytes as one 32-bit word (x is the first member of char4)
+}
+
+// -------------------- Set 2 - Un-Packing --------------------
+
+// Un-Packing from U8s: each helper extracts one byte of a 32-bit word (0 = least significant) and widens it to float in 0..255
+
+__device__ __forceinline__ float rpp_hip_unpack0(uint src)
+{
+    return (float)(src & 0xFF);    // bits 0-7
+}
+
+__device__ __forceinline__ float rpp_hip_unpack1(uint src)
+{
+    return (float)((src >> 8) & 0xFF);    // bits 8-15
+}
+
+__device__ __forceinline__ float rpp_hip_unpack2(uint src)
+{
+    return (float)((src >> 16) & 0xFF);    // bits 16-23
+}
+
+__device__ __forceinline__ float rpp_hip_unpack3(uint src)
+{
+    return (float)((src >> 24) & 0xFF);    // bits 24-31
+}
+
+__device__ __forceinline__ float4 rpp_hip_unpack(uint src)
+{
+    return make_float4(rpp_hip_unpack0(src), rpp_hip_unpack1(src), rpp_hip_unpack2(src), rpp_hip_unpack3(src));    // .x = byte 0 ... .w = byte 3
+}
+
+// Un-Packing from I8s: int overloads of the byte extractors; each byte is reinterpreted as signed char before widening to float
+
+__device__ __forceinline__ float rpp_hip_unpack0(int src)
+{
+    return (float)(signed char)(src & 0xFF);    // bits 0-7, sign-interpreted
+}
+
+__device__ __forceinline__ float rpp_hip_unpack1(int src)
+{
+    return (float)(signed char)((src >> 8) & 0xFF);    // bits 8-15, sign-interpreted
+}
+
+__device__ __forceinline__ float rpp_hip_unpack2(int src)
+{
+    return (float)(signed char)((src >> 16) & 0xFF);    // bits 16-23, sign-interpreted
+}
+
+__device__ __forceinline__ float rpp_hip_unpack3(int src)
+{
+    return (float)(signed char)((src >> 24) & 0xFF);    // bits 24-31, sign-interpreted; the mask removes any sign extension from the shift
+}
+
+__device__ __forceinline__ float4 rpp_hip_unpack_from_i8(int src)
+{
+    return make_float4(rpp_hip_unpack0(src), rpp_hip_unpack1(src), rpp_hip_unpack2(src), rpp_hip_unpack3(src));    // int argument selects the signed overloads; .x = byte 0 ... .w = byte 3
+}
+
+// -------------------- Set 3 - Loads --------------------
+
+// U8 loads without layout toggle (8 U8 pixels): reads 8 bytes starting at srcPtr[srcIdx] and widens each to float
+
+__device__ __forceinline__ void rpp_hip_load8_and_unpack_to_float8(uchar *srcPtr, uint srcIdx, d_float8 *src_f8)
+{
+    uint2 src = *((uint2 *)(&srcPtr[srcIdx]));    // one 8-byte load; NOTE(review): presumably needs &srcPtr[srcIdx] to be 8-byte aligned - confirm at call sites
+    src_f8->x = rpp_hip_unpack(src.x);    // first word -> elements 0-3
+    src_f8->y = rpp_hip_unpack(src.y);    // second word -> elements 4-7
+}
+
+// F32 loads without layout toggle (8 F32 pixels): copies 8 consecutive floats starting at srcPtr[srcIdx]
+
+__device__ __forceinline__ void rpp_hip_load8_and_unpack_to_float8(float *srcPtr, uint srcIdx, d_float8 *src_f8)
+{
+    *src_f8 = *((d_float8 *)(&srcPtr[srcIdx]));    // straight struct copy, no conversion; NOTE(review): presumably needs float4 alignment of &srcPtr[srcIdx] - confirm at call sites
+}
+
+// I8 loads without layout toggle (8 I8 pixels): reads 8 bytes starting at srcPtr[srcIdx] and widens each, as signed, to float
+
+__device__ __forceinline__ void rpp_hip_load8_and_unpack_to_float8(signed char *srcPtr, uint srcIdx, d_float8 *src_f8)
+{
+    int2 src = *((int2 *)(&srcPtr[srcIdx]));    // one 8-byte load; NOTE(review): presumably needs &srcPtr[srcIdx] to be 8-byte aligned - confirm at call sites
+    src_f8->x = rpp_hip_unpack_from_i8(src.x);    // first word -> elements 0-3
+    src_f8->y = rpp_hip_unpack_from_i8(src.y);    // second word -> elements 4-7
+}
+
+// F16 loads without layout toggle (8 F16 pixels): reads 8 halves starting at srcPtr[srcIdx] and converts each to float
+
+__device__ __forceinline__ void rpp_hip_load8_and_unpack_to_float8(half *srcPtr, uint srcIdx, d_float8 *src_f8)
+{
+    d_half8 src_h8;
+    src_h8 = *((d_half8 *)(&srcPtr[srcIdx]));    // copy of four half2 pairs (16 bytes)
+
+    float2 src1_f2, src2_f2;    // scratch for two converted pairs at a time
+
+    src1_f2 = __half22float2(src_h8.x.x);    // elements 0-1
+    src2_f2 = __half22float2(src_h8.x.y);    // elements 2-3
+    src_f8->x = make_float4(src1_f2.x, src1_f2.y, src2_f2.x, src2_f2.y);
+
+    src1_f2 = __half22float2(src_h8.y.x);    // elements 4-5
+    src2_f2 = __half22float2(src_h8.y.y);    // elements 6-7
+    src_f8->y = make_float4(src1_f2.x, src1_f2.y, src2_f2.x, src2_f2.y);
+}
+
+// U8 loads with layout toggle PKD3 to PLN3 (24 U8 values = 8 pixels x 3 interleaved channels, de-interleaved into three groups of 8 floats)
+
+__device__ __forceinline__ void rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(uchar *srcPtr, uint srcIdx, d_float24 *src_f24)
+{
+    d_uint6 src = *((d_uint6 *)(&srcPtr[srcIdx]));    // 24 consecutive bytes read as six 32-bit words; byte numbers below assume the low byte of a word comes first in memory
+
+    src_f24->x.x = make_float4(rpp_hip_unpack0(src.x.x), rpp_hip_unpack3(src.x.x), rpp_hip_unpack2(src.x.y), rpp_hip_unpack1(src.y.x));    // 1st channel, pixels 0-3 (source bytes 0, 3, 6, 9)
+    src_f24->x.y = make_float4(rpp_hip_unpack0(src.y.y), rpp_hip_unpack3(src.y.y), rpp_hip_unpack2(src.z.x), rpp_hip_unpack1(src.z.y));    // 1st channel, pixels 4-7 (source bytes 12, 15, 18, 21)
+
+    src_f24->y.x = make_float4(rpp_hip_unpack1(src.x.x), rpp_hip_unpack0(src.x.y), rpp_hip_unpack3(src.x.y), rpp_hip_unpack2(src.y.x));    // 2nd channel, pixels 0-3 (source bytes 1, 4, 7, 10)
+    src_f24->y.y = make_float4(rpp_hip_unpack1(src.y.y), rpp_hip_unpack0(src.z.x), rpp_hip_unpack3(src.z.x), rpp_hip_unpack2(src.z.y));    // 2nd channel, pixels 4-7 (source bytes 13, 16, 19, 22)
+
+    src_f24->z.x = make_float4(rpp_hip_unpack2(src.x.x), rpp_hip_unpack1(src.x.y), rpp_hip_unpack0(src.y.x), rpp_hip_unpack3(src.y.x));    // 3rd channel, pixels 0-3 (source bytes 2, 5, 8, 11)
+    src_f24->z.y = make_float4(rpp_hip_unpack2(src.y.y), rpp_hip_unpack1(src.z.x), rpp_hip_unpack0(src.z.y), rpp_hip_unpack3(src.z.y));    // 3rd channel, pixels 4-7 (source bytes 14, 17, 20, 23)
+}
+
+// F32 loads with layout toggle PKD3 to PLN3 (24 F32 values = 8 pixels x 3 interleaved channels): source float i goes to group (i % 3) at position (i / 3)
+
+__device__ __forceinline__ void rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(float *srcPtr, uint srcIdx, d_float24 *src_f24)
+{
+    d_float24 *srcPtr_f24;
+    srcPtr_f24 = (d_float24 *)&srcPtr[srcIdx];    // view of the 24 source floats in place (no copy)
+
+    src_f24->x.x.x = srcPtr_f24->x.x.x;    // source floats 0-7: pixels 0-1 complete, plus the first two channels of pixel 2
+    src_f24->y.x.x = srcPtr_f24->x.x.y;
+    src_f24->z.x.x = srcPtr_f24->x.x.z;
+    src_f24->x.x.y = srcPtr_f24->x.x.w;
+    src_f24->y.x.y = srcPtr_f24->x.y.x;
+    src_f24->z.x.y = srcPtr_f24->x.y.y;
+    src_f24->x.x.z = srcPtr_f24->x.y.z;
+    src_f24->y.x.z = srcPtr_f24->x.y.w;
+
+    src_f24->z.x.z = srcPtr_f24->y.x.x;    // source floats 8-15: rest of pixel 2, pixels 3-4, first channel of pixel 5
+    src_f24->x.x.w = srcPtr_f24->y.x.y;
+    src_f24->y.x.w = srcPtr_f24->y.x.z;
+    src_f24->z.x.w = srcPtr_f24->y.x.w;
+    src_f24->x.y.x = srcPtr_f24->y.y.x;
+    src_f24->y.y.x = srcPtr_f24->y.y.y;
+    src_f24->z.y.x = srcPtr_f24->y.y.z;
+    src_f24->x.y.y = srcPtr_f24->y.y.w;
+
+    src_f24->y.y.y = srcPtr_f24->z.x.x;    // source floats 16-23: rest of pixel 5, pixels 6-7
+    src_f24->z.y.y = srcPtr_f24->z.x.y;
+    src_f24->x.y.z = srcPtr_f24->z.x.z;
+    src_f24->y.y.z = srcPtr_f24->z.x.w;
+    src_f24->z.y.z = srcPtr_f24->z.y.x;
+    src_f24->x.y.w = srcPtr_f24->z.y.y;
+    src_f24->y.y.w = srcPtr_f24->z.y.z;
+    src_f24->z.y.w = srcPtr_f24->z.y.w;
+}
+
+// I8 loads with layout toggle PKD3 to PLN3 (24 I8 values = 8 pixels x 3 interleaved channels, de-interleaved into three groups of 8 floats)
+
+__device__ __forceinline__ void rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(signed char *srcPtr, uint srcIdx, d_float24 *src_f24)
+{
+    d_int6 src = *((d_int6 *)(&srcPtr[srcIdx]));    // 24 consecutive bytes read as six signed words, so the int (sign-interpreting) unpack overloads are selected below
+
+    src_f24->x.x = make_float4(rpp_hip_unpack0(src.x.x), rpp_hip_unpack3(src.x.x), rpp_hip_unpack2(src.x.y), rpp_hip_unpack1(src.y.x));    // 1st channel, pixels 0-3 (source bytes 0, 3, 6, 9)
+    src_f24->x.y = make_float4(rpp_hip_unpack0(src.y.y), rpp_hip_unpack3(src.y.y), rpp_hip_unpack2(src.z.x), rpp_hip_unpack1(src.z.y));    // 1st channel, pixels 4-7 (source bytes 12, 15, 18, 21)
+
+    src_f24->y.x = make_float4(rpp_hip_unpack1(src.x.x), rpp_hip_unpack0(src.x.y), rpp_hip_unpack3(src.x.y), rpp_hip_unpack2(src.y.x));    // 2nd channel, pixels 0-3 (source bytes 1, 4, 7, 10)
+    src_f24->y.y = make_float4(rpp_hip_unpack1(src.y.y), rpp_hip_unpack0(src.z.x), rpp_hip_unpack3(src.z.x), rpp_hip_unpack2(src.z.y));    // 2nd channel, pixels 4-7 (source bytes 13, 16, 19, 22)
+
+    src_f24->z.x = make_float4(rpp_hip_unpack2(src.x.x), rpp_hip_unpack1(src.x.y), rpp_hip_unpack0(src.y.x), rpp_hip_unpack3(src.y.x));    // 3rd channel, pixels 0-3 (source bytes 2, 5, 8, 11)
+    src_f24->z.y = make_float4(rpp_hip_unpack2(src.y.y), rpp_hip_unpack1(src.z.x), rpp_hip_unpack0(src.z.y), rpp_hip_unpack3(src.z.y));    // 3rd channel, pixels 4-7 (source bytes 14, 17, 20, 23)
+}
+
+// F16 loads with layout toggle PKD3 to PLN3 (24 F16 values = 8 pixels x 3 interleaved channels, converted to float and de-interleaved)
+
+__device__ __forceinline__ void rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(half *srcPtr, uint srcIdx, d_float24 *src_f24)
+{
+    d_half24 *src_h24;
+    src_h24 = (d_half24 *)&srcPtr[srcIdx];    // view of the 24 source halves as twelve half2 pairs (no copy)
+
+    src_f24->x.x.x = __half2float(__low2half(src_h24->x.x.x));    // 1st channel: source halves 0, 3, 6, 9, 12, 15, 18, 21 (low/high alternate because the stride of 3 is odd)
+    src_f24->x.x.y = __half2float(__high2half(src_h24->x.x.y));
+    src_f24->x.x.z = __half2float(__low2half(src_h24->x.y.y));
+    src_f24->x.x.w = __half2float(__high2half(src_h24->y.x.x));
+    src_f24->x.y.x = __half2float(__low2half(src_h24->y.y.x));
+    src_f24->x.y.y = __half2float(__high2half(src_h24->y.y.y));
+    src_f24->x.y.z = __half2float(__low2half(src_h24->z.x.y));
+    src_f24->x.y.w = __half2float(__high2half(src_h24->z.y.x));
+
+    src_f24->y.x.x = __half2float(__high2half(src_h24->x.x.x));    // 2nd channel: source halves 1, 4, 7, 10, 13, 16, 19, 22
+    src_f24->y.x.y = __half2float(__low2half(src_h24->x.y.x));
+    src_f24->y.x.z = __half2float(__high2half(src_h24->x.y.y));
+    src_f24->y.x.w = __half2float(__low2half(src_h24->y.x.y));
+    src_f24->y.y.x = __half2float(__high2half(src_h24->y.y.x));
+    src_f24->y.y.y = __half2float(__low2half(src_h24->z.x.x));
+    src_f24->y.y.z = __half2float(__high2half(src_h24->z.x.y));
+    src_f24->y.y.w = __half2float(__low2half(src_h24->z.y.y));
+
+    src_f24->z.x.x = __half2float(__low2half(src_h24->x.x.y));    // 3rd channel: source halves 2, 5, 8, 11, 14, 17, 20, 23
+    src_f24->z.x.y = __half2float(__high2half(src_h24->x.y.x));
+    src_f24->z.x.z = __half2float(__low2half(src_h24->y.x.x));
+    src_f24->z.x.w = __half2float(__high2half(src_h24->y.x.y));
+    src_f24->z.y.x = __half2float(__low2half(src_h24->y.y.y));
+    src_f24->z.y.y = __half2float(__high2half(src_h24->z.x.x));
+    src_f24->z.y.z = __half2float(__low2half(src_h24->z.y.x));
+    src_f24->z.y.w = __half2float(__high2half(src_h24->z.y.y));
+}
+
+// U8 loads with layout toggle PLN3 to PKD3 (8 pixels x 3 planes = 24 U8 values, interleaved into pixel-major float order)
+
+__device__ __forceinline__ void rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(uchar *srcPtr, uint srcIdx, uint increment, d_float24 *src_f24)
+{
+    d_uint6 src;    // src.x / src.y / src.z each hold 8 bytes of the 1st / 2nd / 3rd plane
+
+    src.x = *((uint2 *)(&srcPtr[srcIdx]));    // 1st plane
+    srcIdx += increment;    // increment = distance in elements from one plane to the next
+    src.y = *((uint2 *)(&srcPtr[srcIdx]));    // 2nd plane
+    srcIdx += increment;
+    src.z = *((uint2 *)(&srcPtr[srcIdx]));    // 3rd plane
+
+    src_f24->x.x = make_float4(rpp_hip_unpack0(src.x.x), rpp_hip_unpack0(src.y.x), rpp_hip_unpack0(src.z.x), rpp_hip_unpack1(src.x.x));    // pixel 0 (3 channels), pixel 1 channel 1
+    src_f24->x.y = make_float4(rpp_hip_unpack1(src.y.x), rpp_hip_unpack1(src.z.x), rpp_hip_unpack2(src.x.x), rpp_hip_unpack2(src.y.x));    // rest of pixel 1, pixel 2 channels 1-2
+    src_f24->y.x = make_float4(rpp_hip_unpack2(src.z.x), rpp_hip_unpack3(src.x.x), rpp_hip_unpack3(src.y.x), rpp_hip_unpack3(src.z.x));    // rest of pixel 2, pixel 3
+    src_f24->y.y = make_float4(rpp_hip_unpack0(src.x.y), rpp_hip_unpack0(src.y.y), rpp_hip_unpack0(src.z.y), rpp_hip_unpack1(src.x.y));    // pixel 4, pixel 5 channel 1
+    src_f24->z.x = make_float4(rpp_hip_unpack1(src.y.y), rpp_hip_unpack1(src.z.y), rpp_hip_unpack2(src.x.y), rpp_hip_unpack2(src.y.y));    // rest of pixel 5, pixel 6 channels 1-2
+    src_f24->z.y = make_float4(rpp_hip_unpack2(src.z.y), rpp_hip_unpack3(src.x.y), rpp_hip_unpack3(src.y.y), rpp_hip_unpack3(src.z.y));    // rest of pixel 6, pixel 7
+}
+
+// F32 loads with layout toggle PLN3 to PKD3 (8 pixels x 3 planes = 24 F32 values, interleaved into R,G,B,R,G,B,... order)
+
+__device__ __forceinline__ void rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(float *srcPtr, uint srcIdx, uint increment, d_float24 *src_f24)
+{
+    float *srcPtrR, *srcPtrG, *srcPtrB;
+    srcPtrR = srcPtr + srcIdx;    // 1st plane
+    srcPtrG = srcPtrR + increment;    // 2nd plane: increment floats further on
+    srcPtrB = srcPtrG + increment;    // 3rd plane: another increment floats further on
+
+    d_float8 *srcPtrR_f8, *srcPtrG_f8, *srcPtrB_f8;    // in-place 8-float views of each plane (no copy)
+
+    srcPtrR_f8 = (d_float8 *)srcPtrR;
+    srcPtrG_f8 = (d_float8 *)srcPtrG;
+    srcPtrB_f8 = (d_float8 *)srcPtrB;
+
+    src_f24->x.x.x = srcPtrR_f8->x.x;    // pixel 0
+    src_f24->x.x.y = srcPtrG_f8->x.x;
+    src_f24->x.x.z = srcPtrB_f8->x.x;
+
+    src_f24->x.x.w = srcPtrR_f8->x.y;    // pixel 1
+    src_f24->x.y.x = srcPtrG_f8->x.y;
+    src_f24->x.y.y = srcPtrB_f8->x.y;
+
+    src_f24->x.y.z = srcPtrR_f8->x.z;    // pixel 2
+    src_f24->x.y.w = srcPtrG_f8->x.z;
+    src_f24->y.x.x = srcPtrB_f8->x.z;
+
+    src_f24->y.x.y = srcPtrR_f8->x.w;    // pixel 3
+    src_f24->y.x.z = srcPtrG_f8->x.w;
+    src_f24->y.x.w = srcPtrB_f8->x.w;
+
+    src_f24->y.y.x = srcPtrR_f8->y.x;    // pixel 4
+    src_f24->y.y.y = srcPtrG_f8->y.x;
+    src_f24->y.y.z = srcPtrB_f8->y.x;
+
+    src_f24->y.y.w = srcPtrR_f8->y.y;    // pixel 5
+    src_f24->z.x.x = srcPtrG_f8->y.y;
+    src_f24->z.x.y = srcPtrB_f8->y.y;
+
+    src_f24->z.x.z = srcPtrR_f8->y.z;    // pixel 6
+    src_f24->z.x.w = srcPtrG_f8->y.z;
+    src_f24->z.y.x = srcPtrB_f8->y.z;
+
+    src_f24->z.y.y = srcPtrR_f8->y.w;    // pixel 7
+    src_f24->z.y.z = srcPtrG_f8->y.w;
+    src_f24->z.y.w = srcPtrB_f8->y.w;
+}
+
+// I8 loads with layout toggle PLN3 to PKD3 (8 pixels x 3 planes = 24 I8 values, interleaved into pixel-major float order)
+
+__device__ __forceinline__ void rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(signed char *srcPtr, uint srcIdx, uint increment, d_float24 *src_f24)
+{
+    d_int6 src;    // src.x / src.y / src.z each hold 8 bytes of the 1st / 2nd / 3rd plane; int members select the sign-interpreting unpack overloads
+
+    src.x = *((int2 *)(&srcPtr[srcIdx]));    // 1st plane
+    srcIdx += increment;    // increment = distance in elements from one plane to the next
+    src.y = *((int2 *)(&srcPtr[srcIdx]));    // 2nd plane
+    srcIdx += increment;
+    src.z = *((int2 *)(&srcPtr[srcIdx]));    // 3rd plane
+
+    src_f24->x.x = make_float4(rpp_hip_unpack0(src.x.x), rpp_hip_unpack0(src.y.x), rpp_hip_unpack0(src.z.x), rpp_hip_unpack1(src.x.x));    // pixel 0 (3 channels), pixel 1 channel 1
+    src_f24->x.y = make_float4(rpp_hip_unpack1(src.y.x), rpp_hip_unpack1(src.z.x), rpp_hip_unpack2(src.x.x), rpp_hip_unpack2(src.y.x));    // rest of pixel 1, pixel 2 channels 1-2
+    src_f24->y.x = make_float4(rpp_hip_unpack2(src.z.x), rpp_hip_unpack3(src.x.x), rpp_hip_unpack3(src.y.x), rpp_hip_unpack3(src.z.x));    // rest of pixel 2, pixel 3
+    src_f24->y.y = make_float4(rpp_hip_unpack0(src.x.y), rpp_hip_unpack0(src.y.y), rpp_hip_unpack0(src.z.y), rpp_hip_unpack1(src.x.y));    // pixel 4, pixel 5 channel 1
+    src_f24->z.x = make_float4(rpp_hip_unpack1(src.y.y), rpp_hip_unpack1(src.z.y), rpp_hip_unpack2(src.x.y), rpp_hip_unpack2(src.y.y));    // rest of pixel 5, pixel 6 channels 1-2
+    src_f24->z.y = make_float4(rpp_hip_unpack2(src.z.y), rpp_hip_unpack3(src.x.y), rpp_hip_unpack3(src.y.y), rpp_hip_unpack3(src.z.y));    // rest of pixel 6, pixel 7
+}
+
+// F16 loads with layout toggle PLN3 to PKD3 (8 pixels x 3 planes = 24 F16 values, converted to float and interleaved into R,G,B,R,G,B,... order)
+
+__device__ __forceinline__ void rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(half *srcPtr, uint srcIdx, uint increment, d_float24 *src_f24)
+{
+    half *srcPtrR, *srcPtrG, *srcPtrB;
+    srcPtrR = srcPtr + srcIdx;    // 1st plane
+    srcPtrG = srcPtrR + increment;    // 2nd plane: increment halves further on
+    srcPtrB = srcPtrG + increment;    // 3rd plane: another increment halves further on
+
+    d_half8 *srcR_h8, *srcG_h8, *srcB_h8;    // in-place 8-half views of each plane (no copy)
+    srcR_h8 = (d_half8 *)srcPtrR;
+    srcG_h8 = (d_half8 *)srcPtrG;
+    srcB_h8 = (d_half8 *)srcPtrB;
+
+    src_f24->x.x.x = __half2float(__low2half(srcR_h8->x.x));    // pixel 0 (low half of the first pair of each plane)
+    src_f24->x.x.y = __half2float(__low2half(srcG_h8->x.x));
+    src_f24->x.x.z = __half2float(__low2half(srcB_h8->x.x));
+
+    src_f24->x.x.w = __half2float(__high2half(srcR_h8->x.x));    // pixel 1
+    src_f24->x.y.x = __half2float(__high2half(srcG_h8->x.x));
+    src_f24->x.y.y = __half2float(__high2half(srcB_h8->x.x));
+
+    src_f24->x.y.z = __half2float(__low2half(srcR_h8->x.y));    // pixel 2
+    src_f24->x.y.w = __half2float(__low2half(srcG_h8->x.y));
+    src_f24->y.x.x = __half2float(__low2half(srcB_h8->x.y));
+
+    src_f24->y.x.y = __half2float(__high2half(srcR_h8->x.y));    // pixel 3
+    src_f24->y.x.z = __half2float(__high2half(srcG_h8->x.y));
+    src_f24->y.x.w = __half2float(__high2half(srcB_h8->x.y));
+
+    src_f24->y.y.x = __half2float(__low2half(srcR_h8->y.x));    // pixel 4
+    src_f24->y.y.y = __half2float(__low2half(srcG_h8->y.x));
+    src_f24->y.y.z = __half2float(__low2half(srcB_h8->y.x));
+
+    src_f24->y.y.w = __half2float(__high2half(srcR_h8->y.x));    // pixel 5
+    src_f24->z.x.x = __half2float(__high2half(srcG_h8->y.x));
+    src_f24->z.x.y = __half2float(__high2half(srcB_h8->y.x));
+
+    src_f24->z.x.z = __half2float(__low2half(srcR_h8->y.y));    // pixel 6
+    src_f24->z.x.w = __half2float(__low2half(srcG_h8->y.y));
+    src_f24->z.y.x = __half2float(__low2half(srcB_h8->y.y));
+
+    src_f24->z.y.y = __half2float(__high2half(srcR_h8->y.y));    // pixel 7
+    src_f24->z.y.z = __half2float(__high2half(srcG_h8->y.y));
+    src_f24->z.y.w = __half2float(__high2half(srcB_h8->y.y));
+}
+
+// -------------------- Set 4 - Stores --------------------
+
+// U8 store without layout toggle (8 U8 pixels): packs the x and y members of *dst_f8 with rpp_hip_pack() into a uint2 and writes it at dstPtr + dstIdx
+
+__device__ __forceinline__ void rpp_hip_pack_float8_and_store8(uchar *dstPtr, uint dstIdx, d_float8 *dst_f8)
+{
+    uint2 packed_u2;
+    packed_u2.x = rpp_hip_pack(dst_f8->x);
+    packed_u2.y = rpp_hip_pack(dst_f8->y);
+    *((uint2 *)(dstPtr + dstIdx)) = packed_u2;    // both packed words are written with one uint2 assignment
+}
+
+// F32 store without layout toggle (8 F32 pixels): copies *dst_f8 unchanged to dstPtr + dstIdx as a single d_float8 assignment
+
+__device__ __forceinline__ void rpp_hip_pack_float8_and_store8(float *dstPtr, uint dstIdx, d_float8 *dst_f8)
+{
+    *((d_float8 *)(dstPtr + dstIdx)) = dst_f8[0];    // F32 output needs no packing step
+}
+
+// I8 store without layout toggle (8 I8 pixels): packs the x and y members of *dst_f8 with rpp_hip_pack_i8() into a uint2 and writes it at dstPtr + dstIdx
+
+__device__ __forceinline__ void rpp_hip_pack_float8_and_store8(signed char *dstPtr, uint dstIdx, d_float8 *dst_f8)
+{
+    uint2 packed_u2;
+    packed_u2.x = rpp_hip_pack_i8(dst_f8->x);
+    packed_u2.y = rpp_hip_pack_i8(dst_f8->y);
+    *((uint2 *)(dstPtr + dstIdx)) = packed_u2;    // both packed words are written with one uint2 assignment
+}
+
+// F16 store without layout toggle (8 F16 pixels): converts *dst_f8 to four half2 values with __float22half2_rn() and writes them at dstPtr + dstIdx
+
+__device__ __forceinline__ void rpp_hip_pack_float8_and_store8(half *dstPtr, uint dstIdx, d_float8 *dst_f8)
+{
+    d_half8 packed_h8;
+    // Each half2 is built from two neighbouring components (x,y or z,w) of one source member
+    packed_h8.x.x = __float22half2_rn(make_float2(dst_f8->x.x, dst_f8->x.y));
+    packed_h8.x.y = __float22half2_rn(make_float2(dst_f8->x.z, dst_f8->x.w));
+    packed_h8.y.x = __float22half2_rn(make_float2(dst_f8->y.x, dst_f8->y.y));
+    packed_h8.y.y = __float22half2_rn(make_float2(dst_f8->y.z, dst_f8->y.w));
+    // Write the converted values with one d_half8 assignment
+    *((d_half8 *)(dstPtr + dstIdx)) = packed_h8;
+}
+
+// U8 store without layout toggle (24 U8 pixels): packs the six members of *dst_f24 with rpp_hip_pack() into a d_uint6 and writes it at dstPtr + dstIdx
+
+__device__ __forceinline__ void rpp_hip_pack_float24_and_store24(uchar *dstPtr, uint dstIdx, d_float24 *dst_f24)
+{
+    d_uint6 packed_u6;
+    // One packed word per source member, kept in the same x/y/z order as the input
+    packed_u6.x.x = rpp_hip_pack(dst_f24->x.x);
+    packed_u6.x.y = rpp_hip_pack(dst_f24->x.y);
+    packed_u6.y.x = rpp_hip_pack(dst_f24->y.x);
+    packed_u6.y.y = rpp_hip_pack(dst_f24->y.y);
+    packed_u6.z.x = rpp_hip_pack(dst_f24->z.x);
+    packed_u6.z.y = rpp_hip_pack(dst_f24->z.y);
+    // Write all six packed words with one d_uint6 assignment
+    *((d_uint6 *)(dstPtr + dstIdx)) = packed_u6;
+}
+
+// F32 store without layout toggle (24 F32 pixels): copies *dst_f24 unchanged to dstPtr + dstIdx as a single d_float24 assignment
+
+__device__ __forceinline__ void rpp_hip_pack_float24_and_store24(float *dstPtr, uint dstIdx, d_float24 *dst_f24)
+{
+    *((d_float24 *)(dstPtr + dstIdx)) = dst_f24[0];    // F32 output needs no packing step
+}
+
+// I8 store without layout toggle (24 I8 pixels): packs the six members of *dst_f24 with rpp_hip_pack_i8() into a d_uint6 and writes it at dstPtr + dstIdx
+
+__device__ __forceinline__ void rpp_hip_pack_float24_and_store24(signed char *dstPtr, uint dstIdx, d_float24 *dst_f24)
+{
+    d_uint6 packed_u6;
+    // One packed word per source member, kept in the same x/y/z order as the input
+    packed_u6.x.x = rpp_hip_pack_i8(dst_f24->x.x);
+    packed_u6.x.y = rpp_hip_pack_i8(dst_f24->x.y);
+    packed_u6.y.x = rpp_hip_pack_i8(dst_f24->y.x);
+    packed_u6.y.y = rpp_hip_pack_i8(dst_f24->y.y);
+    packed_u6.z.x = rpp_hip_pack_i8(dst_f24->z.x);
+    packed_u6.z.y = rpp_hip_pack_i8(dst_f24->z.y);
+    // Write all six packed words with one d_uint6 assignment
+    *((d_uint6 *)(dstPtr + dstIdx)) = packed_u6;
+}
+
+// F16 store without layout toggle (24 F16 pixels): converts *dst_f24 to twelve half2 values with __float22half2_rn() and writes them at dstPtr + dstIdx
+
+__device__ __forceinline__ void rpp_hip_pack_float24_and_store24(half *dstPtr, uint dstIdx, d_float24 *dst_f24)
+{
+    d_half24 packed_h24;
+    // Member x of the input -> member x of the output (two components per half2)
+    packed_h24.x.x.x = __float22half2_rn(make_float2(dst_f24->x.x.x, dst_f24->x.x.y));
+    packed_h24.x.x.y = __float22half2_rn(make_float2(dst_f24->x.x.z, dst_f24->x.x.w));
+    packed_h24.x.y.x = __float22half2_rn(make_float2(dst_f24->x.y.x, dst_f24->x.y.y));
+    packed_h24.x.y.y = __float22half2_rn(make_float2(dst_f24->x.y.z, dst_f24->x.y.w));
+    // Member y of the input -> member y of the output
+    packed_h24.y.x.x = __float22half2_rn(make_float2(dst_f24->y.x.x, dst_f24->y.x.y));
+    packed_h24.y.x.y = __float22half2_rn(make_float2(dst_f24->y.x.z, dst_f24->y.x.w));
+    packed_h24.y.y.x = __float22half2_rn(make_float2(dst_f24->y.y.x, dst_f24->y.y.y));
+    packed_h24.y.y.y = __float22half2_rn(make_float2(dst_f24->y.y.z, dst_f24->y.y.w));
+    // Member z of the input -> member z of the output
+    packed_h24.z.x.x = __float22half2_rn(make_float2(dst_f24->z.x.x, dst_f24->z.x.y));
+    packed_h24.z.x.y = __float22half2_rn(make_float2(dst_f24->z.x.z, dst_f24->z.x.w));
+    packed_h24.z.y.x = __float22half2_rn(make_float2(dst_f24->z.y.x, dst_f24->z.y.y));
+    packed_h24.z.y.y = __float22half2_rn(make_float2(dst_f24->z.y.z, dst_f24->z.y.w));
+    // Write the converted values with one d_half24 assignment
+    *((d_half24 *)(dstPtr + dstIdx)) = packed_h24;
+}
+
+// -------------------- Set 5 - Other --------------------
+
+// float4 pixel check for 0-255 range: returns src_f4 with each of its four components limited to [0, 255] via fmaxf/fminf
+__device__ __forceinline__ float4 rpp_hip_pixel_check(float4 src_f4)
+{
+    src_f4.x = fminf(fmaxf(src_f4.x, 0), 255);    // src_f4 is passed by value, so the caller's copy is untouched
+    src_f4.y = fminf(fmaxf(src_f4.y, 0), 255);
+    src_f4.z = fminf(fmaxf(src_f4.z, 0), 255);
+    src_f4.w = fminf(fmaxf(src_f4.w, 0), 255);
+    return src_f4;
+}
+
 #endif //RPP_HIP_COMMON_H
\ No newline at end of file
diff --git a/src/modules/CMakeLists.txt b/src/modules/CMakeLists.txt
index 812af6f86..3446c93b7 100644
--- a/src/modules/CMakeLists.txt
+++ b/src/modules/CMakeLists.txt
@@ -74,7 +74,7 @@ if( "${BACKEND}" STREQUAL "HIP")
 
     # Set HIP compiler and flags
     set(CMAKE_CXX_COMPILER ${COMPILER_FOR_HIP})
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_HIPCC_FLAGS} -fopenmp -Ofast -msse4.2 -msse4.1 -mssse3 -mavx2 -g3 -std=c++14")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_HIPCC_FLAGS} -fopenmp -Ofast -msse4.2 -msse4.1 -mssse3 -mavx2 -mfma -g3 -std=c++14")
 
     # Add HIP specific preprocessor flags
     add_definitions(-DHIP_COMPILE)
@@ -94,7 +94,7 @@ elseif( "${BACKEND}" STREQUAL "OCL")
 
     # Set OpenCL compiler and flags
     set(CMAKE_CXX_COMPILER ${COMPILER_FOR_OPENCL}) # GCC and G++ donst work for creating .so file
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -Ofast -msse4.2 -msse4.1 -mssse3 -mavx2 -g3 -std=c++14 -Wno-deprecated-declarations")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -Ofast -msse4.2 -msse4.1 -mssse3 -mavx2 -mfma -g3 -std=c++14 -Wno-deprecated-declarations")
 
     # Add OpenCL specific preprocessor flags
     add_definitions(-DOCL_COMPILE)
diff --git a/src/modules/cpu/host_tensor_augmentations.hpp b/src/modules/cpu/host_tensor_augmentations.hpp
new file mode 100644
index 000000000..1831b9463
--- /dev/null
+++ b/src/modules/cpu/host_tensor_augmentations.hpp
@@ -0,0 +1,1068 @@
+/*
+Copyright (c) 2019 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HOST_TENSOR_AUGMENTATIONS_HPP
+#define HOST_TENSOR_AUGMENTATIONS_HPP
+
+#include "cpu/rpp_cpu_simd.hpp"
+#include <cpu/rpp_cpu_common.hpp>
+#include <stdlib.h>
+#include <time.h>
+#include <algorithm>
+
+/************ brightness ************/
+
+// Brightness on a batch of U8 images: dst = src * alpha + beta, computed in float with FMA intrinsics on __m128 plus a scalar tail per row.
+// alphaTensor/betaTensor are indexed per image. Each image's ROI comes from roiTensorPtrSrc (LTRB is converted to XYWH; any other type is used as XYWH), or is the full image when roiTensorPtrSrc is NULL.
+// The ROI is read at its xy offset in the source image and written starting at the origin of the destination image.
+// For 3-channel NHWC <-> NCHW src/dst pairs the layout conversion is fused into the same pass. Always returns RPP_SUCCESS.
+RppStatus brightness_u8_u8_host_tensor(Rpp8u *srcPtr,
+                                       RpptDescPtr srcDescPtr,
+                                       Rpp8u *dstPtr,
+                                       RpptDescPtr dstDescPtr,
+                                       Rpp32f *alphaTensor,
+                                       Rpp32f *betaTensor,
+                                       RpptROIPtr roiTensorPtrSrc,
+                                       RpptRoiType roiType,
+                                       RppLayoutParams layoutParams)
+{
+    RpptROI roiDefault;    // full-image ROI built from srcDescPtr->w / srcDescPtr->h
+    RpptROIPtr roiPtrDefault = &roiDefault;
+    roiPtrDefault->xywhROI.xy.x = 0;
+    roiPtrDefault->xywhROI.xy.y = 0;
+    roiPtrDefault->xywhROI.roiWidth = srcDescPtr->w;
+    roiPtrDefault->xywhROI.roiHeight = srcDescPtr->h;
+
+    omp_set_dynamic(0);
+#pragma omp parallel for num_threads(srcDescPtr->n)
+    for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+    {
+        RpptROI roi;
+        RpptROIPtr roiPtr;
+
+        // Test the tensor pointer itself: &roiTensorPtrSrc[batchCount] can only be NULL when the tensor is NULL and batchCount is 0
+        if (roiTensorPtrSrc == NULL)
+        {
+            roiPtr = roiPtrDefault;
+        }
+        else
+        {
+            RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+
+            RpptROI roiImage;
+            RpptROIPtr roiPtrImage = roiPtrInput;    // XYWH input is used as given; initialised so it is never read unset
+
+            if (roiType == RpptRoiType::LTRB)
+            {
+                roiPtrImage = &roiImage;
+                compute_xywh_from_ltrb_host(roiPtrInput, roiPtrImage);
+            }
+
+            roiPtr = &roi;
+            compute_roi_boundary_check_host(roiPtrImage, roiPtr, roiPtrDefault);
+        }
+
+        Rpp32f alpha = alphaTensor[batchCount];
+        Rpp32f beta = betaTensor[batchCount];
+
+        Rpp8u *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+        Rpp8u *dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+        Rpp32u bufferLength = roiPtr->xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+        __m128 pMul = _mm_set1_ps(alpha);
+        __m128 pAdd = _mm_set1_ps(beta);
+
+        Rpp8u *srcPtrChannel = srcPtrImage + (roiPtr->xywhROI.xy.y * srcDescPtr->strides.hStride) + (roiPtr->xywhROI.xy.x * layoutParams.bufferMultiplier);
+        Rpp8u *dstPtrChannel = dstPtrImage;
+
+        // Brightness with fused output-layout toggle (NHWC -> NCHW)
+        if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+        {
+            // Largest multiple of 48 <= bufferLength. (bufferLength & ~47) is not equivalent since 47 is not 2^k - 1, and let the 48-wide loop run past the ROI (e.g. 30 -> 16)
+            Rpp32u alignedLength = (bufferLength / 48) * 48;
+
+            Rpp8u *srcPtrRow, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+            srcPtrRow = srcPtrChannel;
+            dstPtrRowR = dstPtrChannel;
+            dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+            dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+            for(int i = 0; i < roiPtr->xywhROI.roiHeight; i++)
+            {
+                Rpp8u *srcPtrTemp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+                srcPtrTemp = srcPtrRow;
+                dstPtrTempR = dstPtrRowR;
+                dstPtrTempG = dstPtrRowG;
+                dstPtrTempB = dstPtrRowB;
+
+                int vectorLoopCount = 0;
+                for (; vectorLoopCount < alignedLength; vectorLoopCount+=48)
+                {
+                    __m128 p[12];
+
+                    rpp_simd_load(rpp_load48_u8pkd3_to_f32pln3, srcPtrTemp, p);
+
+                    p[0] = _mm_fmadd_ps(p[0], pMul, pAdd);    // brightness adjustment
+                    p[1] = _mm_fmadd_ps(p[1], pMul, pAdd);    // brightness adjustment
+                    p[2] = _mm_fmadd_ps(p[2], pMul, pAdd);    // brightness adjustment
+                    p[3] = _mm_fmadd_ps(p[3], pMul, pAdd);    // brightness adjustment
+
+                    p[4] = _mm_fmadd_ps(p[4], pMul, pAdd);    // brightness adjustment
+                    p[5] = _mm_fmadd_ps(p[5], pMul, pAdd);    // brightness adjustment
+                    p[6] = _mm_fmadd_ps(p[6], pMul, pAdd);    // brightness adjustment
+                    p[7] = _mm_fmadd_ps(p[7], pMul, pAdd);    // brightness adjustment
+
+                    p[8] = _mm_fmadd_ps(p[8], pMul, pAdd);    // brightness adjustment
+                    p[9] = _mm_fmadd_ps(p[9], pMul, pAdd);    // brightness adjustment
+                    p[10] = _mm_fmadd_ps(p[10], pMul, pAdd);    // brightness adjustment
+                    p[11] = _mm_fmadd_ps(p[11], pMul, pAdd);    // brightness adjustment
+
+                    rpp_simd_store(rpp_store48_f32pln3_to_u8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p);
+
+                    srcPtrTemp += 48;
+                    dstPtrTempR += 16;
+                    dstPtrTempG += 16;
+                    dstPtrTempB += 16;
+                }
+                for (; vectorLoopCount < bufferLength; vectorLoopCount+=3)    // scalar tail: one packed RGB pixel per iteration
+                {
+                    *dstPtrTempR = (Rpp8u) RPPPIXELCHECK((((Rpp32f) (*srcPtrTemp)) * alpha) + beta);
+                    dstPtrTempR++;
+                    srcPtrTemp++;
+
+                    *dstPtrTempG = (Rpp8u) RPPPIXELCHECK((((Rpp32f) (*srcPtrTemp)) * alpha) + beta);
+                    dstPtrTempG++;
+                    srcPtrTemp++;
+
+                    *dstPtrTempB = (Rpp8u) RPPPIXELCHECK((((Rpp32f) (*srcPtrTemp)) * alpha) + beta);
+                    dstPtrTempB++;
+                    srcPtrTemp++;
+                }
+
+                srcPtrRow += srcDescPtr->strides.hStride;
+                dstPtrRowR += dstDescPtr->strides.hStride;
+                dstPtrRowG += dstDescPtr->strides.hStride;
+                dstPtrRowB += dstDescPtr->strides.hStride;
+            }
+        }
+
+        // Brightness with fused output-layout toggle (NCHW -> NHWC)
+        else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+        {
+            // In this branch the loop counters advance per pixel (16 pixels of each plane per vector iteration), so round down to a multiple of 16
+            Rpp32u alignedLength = bufferLength & ~15;
+
+            Rpp8u *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow;
+            srcPtrRowR = srcPtrChannel;
+            srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+            srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+            dstPtrRow = dstPtrChannel;
+
+            for(int i = 0; i < roiPtr->xywhROI.roiHeight; i++)
+            {
+                Rpp8u *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTemp;
+                srcPtrTempR = srcPtrRowR;
+                srcPtrTempG = srcPtrRowG;
+                srcPtrTempB = srcPtrRowB;
+                dstPtrTemp = dstPtrRow;
+
+                int vectorLoopCount = 0;
+                for (; vectorLoopCount < alignedLength; vectorLoopCount+=16)
+                {
+                    __m128 p[12];
+
+                    rpp_simd_load(rpp_load48_u8pln3_to_f32pln3, srcPtrTempR, srcPtrTempG, srcPtrTempB, p);
+
+                    p[0] = _mm_fmadd_ps(p[0], pMul, pAdd);    // brightness adjustment
+                    p[1] = _mm_fmadd_ps(p[1], pMul, pAdd);    // brightness adjustment
+                    p[2] = _mm_fmadd_ps(p[2], pMul, pAdd);    // brightness adjustment
+                    p[3] = _mm_fmadd_ps(p[3], pMul, pAdd);    // brightness adjustment
+
+                    p[4] = _mm_fmadd_ps(p[4], pMul, pAdd);    // brightness adjustment
+                    p[5] = _mm_fmadd_ps(p[5], pMul, pAdd);    // brightness adjustment
+                    p[6] = _mm_fmadd_ps(p[6], pMul, pAdd);    // brightness adjustment
+                    p[7] = _mm_fmadd_ps(p[7], pMul, pAdd);    // brightness adjustment
+
+                    p[8] = _mm_fmadd_ps(p[8], pMul, pAdd);    // brightness adjustment
+                    p[9] = _mm_fmadd_ps(p[9], pMul, pAdd);    // brightness adjustment
+                    p[10] = _mm_fmadd_ps(p[10], pMul, pAdd);    // brightness adjustment
+                    p[11] = _mm_fmadd_ps(p[11], pMul, pAdd);    // brightness adjustment
+
+                    rpp_simd_store(rpp_store48_f32pln3_to_u8pkd3, dstPtrTemp, p);
+
+                    srcPtrTempR += 16;
+                    srcPtrTempG += 16;
+                    srcPtrTempB += 16;
+                    dstPtrTemp += 48;
+                }
+                for (; vectorLoopCount < bufferLength; vectorLoopCount++)    // scalar tail: one pixel (R, G, B) per iteration
+                {
+                    *dstPtrTemp = (Rpp8u) RPPPIXELCHECK((((Rpp32f) (*srcPtrTempR)) * alpha) + beta);
+                    dstPtrTemp++;
+                    srcPtrTempR++;
+
+                    *dstPtrTemp = (Rpp8u) RPPPIXELCHECK((((Rpp32f) (*srcPtrTempG)) * alpha) + beta);
+                    dstPtrTemp++;
+                    srcPtrTempG++;
+
+                    *dstPtrTemp = (Rpp8u) RPPPIXELCHECK((((Rpp32f) (*srcPtrTempB)) * alpha) + beta);
+                    dstPtrTemp++;
+                    srcPtrTempB++;
+                }
+
+                srcPtrRowR += srcDescPtr->strides.hStride;
+                srcPtrRowG += srcDescPtr->strides.hStride;
+                srcPtrRowB += srcDescPtr->strides.hStride;
+                dstPtrRow += dstDescPtr->strides.hStride;
+            }
+        }
+
+        // Brightness without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW)
+        else
+        {
+            Rpp32u alignedLength = bufferLength & ~15;
+
+            for(int c = 0; c < layoutParams.channelParam; c++)
+            {
+                Rpp8u *srcPtrRow, *dstPtrRow;
+                srcPtrRow = srcPtrChannel;
+                dstPtrRow = dstPtrChannel;
+
+                for(int i = 0; i < roiPtr->xywhROI.roiHeight; i++)
+                {
+                    Rpp8u *srcPtrTemp, *dstPtrTemp;
+                    srcPtrTemp = srcPtrRow;
+                    dstPtrTemp = dstPtrRow;
+
+                    int vectorLoopCount = 0;
+                    for (; vectorLoopCount < alignedLength; vectorLoopCount+=16)
+                    {
+                        __m128 p[4];
+
+                        rpp_simd_load(rpp_load16_u8_to_f32, srcPtrTemp, p);
+
+                        p[0] = _mm_fmadd_ps(p[0], pMul, pAdd);    // brightness adjustment
+                        p[1] = _mm_fmadd_ps(p[1], pMul, pAdd);    // brightness adjustment
+                        p[2] = _mm_fmadd_ps(p[2], pMul, pAdd);    // brightness adjustment
+                        p[3] = _mm_fmadd_ps(p[3], pMul, pAdd);    // brightness adjustment
+
+                        rpp_simd_store(rpp_store16_f32_to_u8, dstPtrTemp, p);
+
+                        srcPtrTemp +=16;
+                        dstPtrTemp +=16;
+                    }
+                    for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+                    {
+                        *dstPtrTemp = (Rpp8u) RPPPIXELCHECK((((Rpp32f) (*srcPtrTemp)) * alpha) + beta);
+
+                        dstPtrTemp++;
+                        srcPtrTemp++;
+                    }
+
+                    srcPtrRow += srcDescPtr->strides.hStride;
+                    dstPtrRow += dstDescPtr->strides.hStride;
+                }
+
+                srcPtrChannel += srcDescPtr->strides.cStride;
+                dstPtrChannel += dstDescPtr->strides.cStride;
+            }
+        }
+    }
+
+    return RPP_SUCCESS;
+}
+
+// Brightness on a batch of F32 images: dst = src * alpha + beta * 0.0039216, computed with FMA intrinsics on __m128 plus a scalar tail per row.
+// alphaTensor/betaTensor are indexed per image. Each image's ROI comes from roiTensorPtrSrc (LTRB is converted to XYWH; any other type is used as XYWH), or is the full image when roiTensorPtrSrc is NULL.
+// The ROI is read at its xy offset in the source image and written starting at the origin of the destination image.
+// For 3-channel NHWC <-> NCHW src/dst pairs the layout conversion is fused into the same pass. Always returns RPP_SUCCESS.
+RppStatus brightness_f32_f32_host_tensor(Rpp32f *srcPtr,
+                                         RpptDescPtr srcDescPtr,
+                                         Rpp32f *dstPtr,
+                                         RpptDescPtr dstDescPtr,
+                                         Rpp32f *alphaTensor,
+                                         Rpp32f *betaTensor,
+                                         RpptROIPtr roiTensorPtrSrc,
+                                         RpptRoiType roiType,
+                                         RppLayoutParams layoutParams)
+{
+    RpptROI roiDefault;    // full-image ROI built from srcDescPtr->w / srcDescPtr->h
+    RpptROIPtr roiPtrDefault = &roiDefault;
+    roiPtrDefault->xywhROI.xy.x = 0;
+    roiPtrDefault->xywhROI.xy.y = 0;
+    roiPtrDefault->xywhROI.roiWidth = srcDescPtr->w;
+    roiPtrDefault->xywhROI.roiHeight = srcDescPtr->h;
+
+    omp_set_dynamic(0);
+#pragma omp parallel for num_threads(srcDescPtr->n)
+    for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+    {
+        RpptROI roi;
+        RpptROIPtr roiPtr;
+
+        // Test the tensor pointer itself: &roiTensorPtrSrc[batchCount] can only be NULL when the tensor is NULL and batchCount is 0
+        if (roiTensorPtrSrc == NULL)
+        {
+            roiPtr = roiPtrDefault;
+        }
+        else
+        {
+            RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+
+            RpptROI roiImage;
+            RpptROIPtr roiPtrImage = roiPtrInput;    // XYWH input is used as given; initialised so it is never read unset
+
+            if (roiType == RpptRoiType::LTRB)
+            {
+                roiPtrImage = &roiImage;
+                compute_xywh_from_ltrb_host(roiPtrInput, roiPtrImage);
+            }
+
+            roiPtr = &roi;
+            compute_roi_boundary_check_host(roiPtrImage, roiPtr, roiPtrDefault);
+        }
+
+        Rpp32f alpha = alphaTensor[batchCount];
+        Rpp32f beta = betaTensor[batchCount] * 0.0039216; // 1/255 (presumably because F32 pixel data is normalised to 0-1 - confirm)
+
+        Rpp32f *srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+        Rpp32f *dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+        Rpp32u bufferLength = roiPtr->xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+        __m128 pMul = _mm_set1_ps(alpha);
+        __m128 pAdd = _mm_set1_ps(beta);
+
+        Rpp32f *srcPtrChannel = srcPtrImage + (roiPtr->xywhROI.xy.y * srcDescPtr->strides.hStride) + (roiPtr->xywhROI.xy.x * layoutParams.bufferMultiplier);
+        Rpp32f *dstPtrChannel = dstPtrImage;
+
+        // Brightness with fused output-layout toggle (NHWC -> NCHW)
+        if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+        {
+            // Largest multiple of 12 <= bufferLength. (bufferLength & ~11) is not equivalent since 11 is not 2^k - 1, and let the 12-wide loop run past the ROI (e.g. 6 -> 4)
+            Rpp32u alignedLength = (bufferLength / 12) * 12;
+
+            Rpp32f *srcPtrRow, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+            srcPtrRow = srcPtrChannel;
+            dstPtrRowR = dstPtrChannel;
+            dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+            dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+            for(int i = 0; i < roiPtr->xywhROI.roiHeight; i++)
+            {
+                Rpp32f *srcPtrTemp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+                srcPtrTemp = srcPtrRow;
+                dstPtrTempR = dstPtrRowR;
+                dstPtrTempG = dstPtrRowG;
+                dstPtrTempB = dstPtrRowB;
+
+                int vectorLoopCount = 0;
+                for (; vectorLoopCount < alignedLength; vectorLoopCount+=12)
+                {
+                    __m128 p[4];
+
+                    rpp_simd_load(rpp_load12_f32pkd3_to_f32pln3, srcPtrTemp, p);
+
+                    p[0] = _mm_fmadd_ps(p[0], pMul, pAdd);    // brightness adjustment
+                    p[1] = _mm_fmadd_ps(p[1], pMul, pAdd);    // brightness adjustment
+                    p[2] = _mm_fmadd_ps(p[2], pMul, pAdd);    // brightness adjustment
+
+                    rpp_simd_store(rpp_store12_f32pln3_to_f32pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p);
+
+                    srcPtrTemp += 12;
+                    dstPtrTempR += 4;
+                    dstPtrTempG += 4;
+                    dstPtrTempB += 4;
+                }
+                for (; vectorLoopCount < bufferLength; vectorLoopCount+=3)    // scalar tail: one packed RGB pixel per iteration
+                {
+                    *dstPtrTempR = RPPPIXELCHECKF32(*srcPtrTemp * alpha + beta);
+                    dstPtrTempR++;
+                    srcPtrTemp++;
+
+                    *dstPtrTempG = RPPPIXELCHECKF32(*srcPtrTemp * alpha + beta);
+                    dstPtrTempG++;
+                    srcPtrTemp++;
+
+                    *dstPtrTempB = RPPPIXELCHECKF32(*srcPtrTemp * alpha + beta);
+                    dstPtrTempB++;
+                    srcPtrTemp++;
+                }
+
+                srcPtrRow += srcDescPtr->strides.hStride;
+                dstPtrRowR += dstDescPtr->strides.hStride;
+                dstPtrRowG += dstDescPtr->strides.hStride;
+                dstPtrRowB += dstDescPtr->strides.hStride;
+            }
+        }
+
+        // Brightness with fused output-layout toggle (NCHW -> NHWC)
+        else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+        {
+            // In this branch the loop counters advance per pixel (4 pixels of each plane per vector iteration), so round down to a multiple of 4
+            Rpp32u alignedLength = bufferLength & ~3;
+
+            Rpp32f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow;
+            srcPtrRowR = srcPtrChannel;
+            srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+            srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+            dstPtrRow = dstPtrChannel;
+
+            for(int i = 0; i < roiPtr->xywhROI.roiHeight; i++)
+            {
+                Rpp32f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTemp;
+                srcPtrTempR = srcPtrRowR;
+                srcPtrTempG = srcPtrRowG;
+                srcPtrTempB = srcPtrRowB;
+                dstPtrTemp = dstPtrRow;
+
+                int vectorLoopCount = 0;
+                for (; vectorLoopCount < alignedLength; vectorLoopCount+=4)
+                {
+                    __m128 p[4];
+
+                    rpp_simd_load(rpp_load12_f32pln3_to_f32pln3, srcPtrTempR, srcPtrTempG, srcPtrTempB, p);
+
+                    p[0] = _mm_fmadd_ps(p[0], pMul, pAdd);    // brightness adjustment
+                    p[1] = _mm_fmadd_ps(p[1], pMul, pAdd);    // brightness adjustment
+                    p[2] = _mm_fmadd_ps(p[2], pMul, pAdd);    // brightness adjustment
+
+                    rpp_simd_store(rpp_store12_f32pln3_to_f32pkd3, dstPtrTemp, p);
+
+                    srcPtrTempR += 4;
+                    srcPtrTempG += 4;
+                    srcPtrTempB += 4;
+                    dstPtrTemp += 12;
+                }
+                for (; vectorLoopCount < bufferLength; vectorLoopCount++)    // scalar tail: one pixel (R, G, B) per iteration
+                {
+                    *dstPtrTemp = RPPPIXELCHECKF32(*srcPtrTempR * alpha + beta);
+                    dstPtrTemp++;
+                    srcPtrTempR++;
+
+                    *dstPtrTemp = RPPPIXELCHECKF32(*srcPtrTempG * alpha + beta);
+                    dstPtrTemp++;
+                    srcPtrTempG++;
+
+                    *dstPtrTemp = RPPPIXELCHECKF32(*srcPtrTempB * alpha + beta);
+                    dstPtrTemp++;
+                    srcPtrTempB++;
+                }
+
+                srcPtrRowR += srcDescPtr->strides.hStride;
+                srcPtrRowG += srcDescPtr->strides.hStride;
+                srcPtrRowB += srcDescPtr->strides.hStride;
+                dstPtrRow += dstDescPtr->strides.hStride;
+            }
+        }
+
+        // Brightness without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW)
+        else
+        {
+            Rpp32u alignedLength = bufferLength & ~3;
+
+            for(int c = 0; c < layoutParams.channelParam; c++)
+            {
+                Rpp32f *srcPtrRow, *dstPtrRow;
+                srcPtrRow = srcPtrChannel;
+                dstPtrRow = dstPtrChannel;
+
+                for(int i = 0; i < roiPtr->xywhROI.roiHeight; i++)
+                {
+                    Rpp32f *srcPtrTemp, *dstPtrTemp;
+                    srcPtrTemp = srcPtrRow;
+                    dstPtrTemp = dstPtrRow;
+
+                    int vectorLoopCount = 0;
+                    for (; vectorLoopCount < alignedLength; vectorLoopCount+=4)
+                    {
+                        __m128 p[1];
+
+                        rpp_simd_load(rpp_load4_f32_to_f32, srcPtrTemp, p);
+
+                        p[0] = _mm_fmadd_ps(p[0], pMul, pAdd);    // brightness adjustment
+
+                        rpp_simd_store(rpp_store4_f32_to_f32, dstPtrTemp, p);
+
+                        srcPtrTemp += 4;
+                        dstPtrTemp += 4;
+                    }
+                    for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+                    {
+                        *dstPtrTemp = RPPPIXELCHECKF32(*srcPtrTemp * alpha + beta);
+
+                        dstPtrTemp++;
+                        srcPtrTemp++;
+                    }
+
+                    srcPtrRow += srcDescPtr->strides.hStride;
+                    dstPtrRow += dstDescPtr->strides.hStride;
+                }
+
+                srcPtrChannel += srcDescPtr->strides.cStride;
+                dstPtrChannel += dstDescPtr->strides.cStride;
+            }
+        }
+    }
+
+    return RPP_SUCCESS;
+}
+
+RppStatus brightness_f16_f16_host_tensor(Rpp16f *srcPtr,
+                                         RpptDescPtr srcDescPtr,
+                                         Rpp16f *dstPtr,
+                                         RpptDescPtr dstDescPtr,
+                                         Rpp32f *alphaTensor,
+                                         Rpp32f *betaTensor,
+                                         RpptROIPtr roiTensorPtrSrc,
+                                         RpptRoiType roiType,
+                                         RppLayoutParams layoutParams)
+{
+    RpptROI roiDefault;
+    RpptROIPtr roiPtrDefault;
+    roiPtrDefault = &roiDefault;
+    roiPtrDefault->xywhROI.xy.x = 0;
+    roiPtrDefault->xywhROI.xy.y = 0;
+    roiPtrDefault->xywhROI.roiWidth = srcDescPtr->w;
+    roiPtrDefault->xywhROI.roiHeight = srcDescPtr->h;
+
+    omp_set_dynamic(0);
+#pragma omp parallel for num_threads(srcDescPtr->n)
+    for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+    {
+        RpptROI roi;
+        RpptROIPtr roiPtr;
+
+        if (&roiTensorPtrSrc[batchCount] == NULL)
+        {
+            roiPtr = roiPtrDefault;
+        }
+        else
+        {
+            RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+
+            RpptROI roiImage;
+            RpptROIPtr roiPtrImage;
+
+            if (roiType == RpptRoiType::LTRB)
+            {
+                roiPtrImage = &roiImage;
+                compute_xywh_from_ltrb_host(roiPtrInput, roiPtrImage);
+            }
+            else if (roiType == RpptRoiType::XYWH)
+            {
+                roiPtrImage = roiPtrInput;
+            }
+
+            roiPtr = &roi;
+            compute_roi_boundary_check_host(roiPtrImage, roiPtr, roiPtrDefault);
+        }
+
+        Rpp32f alpha = alphaTensor[batchCount];
+        Rpp32f beta = betaTensor[batchCount] * 0.0039216; // 1/255
+
+        Rpp16f *srcPtrImage, *dstPtrImage;
+        srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+        dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+        Rpp32u bufferLength = roiPtr->xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+        __m128 pMul = _mm_set1_ps(alpha);
+        __m128 pAdd = _mm_set1_ps(beta);
+
+        Rpp16f *srcPtrChannel, *dstPtrChannel;
+        srcPtrChannel = srcPtrImage + (roiPtr->xywhROI.xy.y * srcDescPtr->strides.hStride) + (roiPtr->xywhROI.xy.x * layoutParams.bufferMultiplier);
+        dstPtrChannel = dstPtrImage;
+
+        // Brightness with fused output-layout toggle (NHWC -> NCHW)
+        if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+        {
+            Rpp32u alignedLength = bufferLength & ~11;
+
+            Rpp16f *srcPtrRow, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+            srcPtrRow = srcPtrChannel;
+            dstPtrRowR = dstPtrChannel;
+            dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+            dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+            for(int i = 0; i < roiPtr->xywhROI.roiHeight; i++)
+            {
+                Rpp16f *srcPtrTemp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+                srcPtrTemp = srcPtrRow;
+                dstPtrTempR = dstPtrRowR;
+                dstPtrTempG = dstPtrRowG;
+                dstPtrTempB = dstPtrRowB;
+
+                int vectorLoopCount = 0;
+                for (; vectorLoopCount < alignedLength; vectorLoopCount+=12)
+                {
+                    Rpp32f srcPtrTemp_ps[12], dstPtrTemp_ps[12];
+
+                    for(int cnt = 0; cnt < 12; cnt++)
+                    {
+                        *(srcPtrTemp_ps + cnt) = (Rpp32f) *(srcPtrTemp + cnt);
+                    }
+
+                    __m128 p[4];
+
+                    rpp_simd_load(rpp_load12_f32pkd3_to_f32pln3, srcPtrTemp_ps, p);
+
+                    p[0] = _mm_fmadd_ps(p[0], pMul, pAdd);    // brightness adjustment
+                    p[1] = _mm_fmadd_ps(p[1], pMul, pAdd);    // brightness adjustment
+                    p[2] = _mm_fmadd_ps(p[2], pMul, pAdd);    // brightness adjustment
+
+                    rpp_simd_store(rpp_store12_f32pln3_to_f32pln3, dstPtrTemp_ps, dstPtrTemp_ps + 4, dstPtrTemp_ps + 8, p);
+
+                    for(int cnt = 0; cnt < 4; cnt++)
+                    {
+                        *(dstPtrTempR + cnt) = (Rpp16f) *(dstPtrTemp_ps + cnt);
+                        *(dstPtrTempG + cnt) = (Rpp16f) *(dstPtrTemp_ps + 4 + cnt);
+                        *(dstPtrTempB + cnt) = (Rpp16f) *(dstPtrTemp_ps + 8 + cnt);
+                    }
+
+                    srcPtrTemp += 12;
+                    dstPtrTempR += 4;
+                    dstPtrTempG += 4;
+                    dstPtrTempB += 4;
+                }
+                for (; vectorLoopCount < bufferLength; vectorLoopCount+=3)
+                {
+                    *dstPtrTempR = (Rpp16f) RPPPIXELCHECKF32((Rpp32f)*srcPtrTemp * alpha + beta);
+                    dstPtrTempR++;
+                    srcPtrTemp++;
+
+                    *dstPtrTempG = (Rpp16f) RPPPIXELCHECKF32((Rpp32f)*srcPtrTemp * alpha + beta);
+                    dstPtrTempG++;
+                    srcPtrTemp++;
+
+                    *dstPtrTempB = (Rpp16f) RPPPIXELCHECKF32((Rpp32f)*srcPtrTemp * alpha + beta);
+                    dstPtrTempB++;
+                    srcPtrTemp++;
+                }
+
+                srcPtrRow += srcDescPtr->strides.hStride;
+                dstPtrRowR += dstDescPtr->strides.hStride;
+                dstPtrRowG += dstDescPtr->strides.hStride;
+                dstPtrRowB += dstDescPtr->strides.hStride;
+            }
+        }
+
+        // Brightness with fused output-layout toggle (NCHW -> NHWC)
+        else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+        {
+            Rpp32u alignedLength = bufferLength & ~11;
+
+            Rpp16f *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow;
+            srcPtrRowR = srcPtrChannel;
+            srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+            srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+            dstPtrRow = dstPtrChannel;
+
+            for(int i = 0; i < roiPtr->xywhROI.roiHeight; i++)
+            {
+                Rpp16f *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTemp;
+                srcPtrTempR = srcPtrRowR;
+                srcPtrTempG = srcPtrRowG;
+                srcPtrTempB = srcPtrRowB;
+                dstPtrTemp = dstPtrRow;
+
+                int vectorLoopCount = 0;
+                for (; vectorLoopCount < alignedLength; vectorLoopCount+=4)
+                {
+                    Rpp32f srcPtrTemp_ps[12], dstPtrTemp_ps[13];
+
+                    for(int cnt = 0; cnt < 4; cnt++)
+                    {
+                        *(srcPtrTemp_ps + cnt) = (Rpp32f) *(srcPtrTempR + cnt);
+                        *(srcPtrTemp_ps + 4 + cnt) = (Rpp32f) *(srcPtrTempG + cnt);
+                        *(srcPtrTemp_ps + 8 + cnt) = (Rpp32f) *(srcPtrTempB + cnt);
+                    }
+
+                    __m128 p[4];
+
+                    rpp_simd_load(rpp_load12_f32pln3_to_f32pln3, srcPtrTemp_ps, srcPtrTemp_ps + 4, srcPtrTemp_ps + 8, p);
+
+                    p[0] = _mm_fmadd_ps(p[0], pMul, pAdd);    // brightness adjustment
+                    p[1] = _mm_fmadd_ps(p[1], pMul, pAdd);    // brightness adjustment
+                    p[2] = _mm_fmadd_ps(p[2], pMul, pAdd);    // brightness adjustment
+
+                    rpp_simd_store(rpp_store12_f32pln3_to_f32pkd3, dstPtrTemp_ps, p);
+
+                    for(int cnt = 0; cnt < 12; cnt++)
+                    {
+                        *(dstPtrTemp + cnt) = (Rpp16f) *(dstPtrTemp_ps + cnt);
+                    }
+
+                    srcPtrTempR += 4;
+                    srcPtrTempG += 4;
+                    srcPtrTempB += 4;
+                    dstPtrTemp += 12;
+                }
+                for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+                {
+                    *dstPtrTemp = (Rpp16f) RPPPIXELCHECKF32((Rpp32f)*srcPtrTempR * alpha + beta);
+                    dstPtrTemp++;
+                    srcPtrTempR++;
+
+                    *dstPtrTemp = (Rpp16f) RPPPIXELCHECKF32((Rpp32f)*srcPtrTempG * alpha + beta);
+                    dstPtrTemp++;
+                    srcPtrTempG++;
+
+                    *dstPtrTemp = (Rpp16f) RPPPIXELCHECKF32((Rpp32f)*srcPtrTempB * alpha + beta);
+                    dstPtrTemp++;
+                    srcPtrTempB++;
+                }
+
+                srcPtrRowR += srcDescPtr->strides.hStride;
+                srcPtrRowG += srcDescPtr->strides.hStride;
+                srcPtrRowB += srcDescPtr->strides.hStride;
+                dstPtrRow += dstDescPtr->strides.hStride;
+            }
+        }
+
+        // Brightness without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW)
+        else
+        {
+            Rpp32u alignedLength = bufferLength & ~3;
+
+            for(int c = 0; c < layoutParams.channelParam; c++)
+            {
+                Rpp16f *srcPtrRow, *dstPtrRow;
+                srcPtrRow = srcPtrChannel;
+                dstPtrRow = dstPtrChannel;
+
+                for(int i = 0; i < roiPtr->xywhROI.roiHeight; i++)
+                {
+                    Rpp16f *srcPtrTemp, *dstPtrTemp;
+                    srcPtrTemp = srcPtrRow;
+                    dstPtrTemp = dstPtrRow;
+
+                    int vectorLoopCount = 0;
+                    for (; vectorLoopCount < alignedLength; vectorLoopCount+=4)
+                    {
+                        Rpp32f srcPtrTemp_ps[4], dstPtrTemp_ps[4];
+
+                        for(int cnt = 0; cnt < 4; cnt++)
+                        {
+                            *(srcPtrTemp_ps + cnt) = (Rpp16f) *(srcPtrTemp + cnt);
+                        }
+
+                        __m128 p[1];
+
+                        rpp_simd_load(rpp_load4_f32_to_f32, srcPtrTemp_ps, p);
+
+                        p[0] = _mm_fmadd_ps(p[0], pMul, pAdd);    // brightness adjustment
+
+                        rpp_simd_store(rpp_store4_f32_to_f32, dstPtrTemp_ps, p);
+
+                        for(int cnt = 0; cnt < 4; cnt++)
+                        {
+                            *(dstPtrTemp + cnt) = (Rpp16f) *(dstPtrTemp_ps + cnt);
+                        }
+
+                        srcPtrTemp += 4;
+                        dstPtrTemp += 4;
+                    }
+                    for (; vectorLoopCount < bufferLength; vectorLoopCount++)
+                    {
+                        *dstPtrTemp = (Rpp16f) RPPPIXELCHECKF32((Rpp32f)*srcPtrTemp * alpha + beta);
+
+                        dstPtrTemp++;
+                        srcPtrTemp++;
+                    }
+
+                    srcPtrRow += srcDescPtr->strides.hStride;
+                    dstPtrRow += dstDescPtr->strides.hStride;
+                }
+
+                srcPtrChannel += srcDescPtr->strides.cStride;
+                dstPtrChannel += dstDescPtr->strides.cStride;
+            }
+        }
+    }
+
+    return RPP_SUCCESS;
+}
+
+// Brightness for a batch of signed 8-bit images, one alpha/beta pair per image. The ROI of each image is
+// read from srcPtr at its (x, y) offset and written to dstPtr from the image origin, optionally toggling
+// NHWC <-> NCHW for 3-channel input. The scalar tails compute RPPPIXELCHECKI8((p + 128) * alpha + beta - 128).
+// A NULL roiTensorPtrSrc selects the whole srcDescPtr->w x srcDescPtr->h image. Always returns RPP_SUCCESS.
+RppStatus brightness_i8_i8_host_tensor(Rpp8s *srcPtr, RpptDescPtr srcDescPtr,
+                                       Rpp8s *dstPtr, RpptDescPtr dstDescPtr,
+                                       Rpp32f *alphaTensor, Rpp32f *betaTensor,
+                                       RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType,
+                                       RppLayoutParams layoutParams)
+{
+    RpptROI roiDefault;    // whole-image ROI: fallback when no ROI tensor is given, also passed to the boundary check
+    RpptROIPtr roiPtrDefault;
+    roiPtrDefault = &roiDefault;
+    roiPtrDefault->xywhROI.xy.x = 0;
+    roiPtrDefault->xywhROI.xy.y = 0;
+    roiPtrDefault->xywhROI.roiWidth = srcDescPtr->w;
+    roiPtrDefault->xywhROI.roiHeight = srcDescPtr->h;
+
+    omp_set_dynamic(0);    // disable OpenMP's dynamic adjustment of the thread count
+#pragma omp parallel for num_threads(srcDescPtr->n)
+    for(int batchCount = 0; batchCount < srcDescPtr->n; batchCount++)
+    {
+        RpptROI roi;
+        RpptROIPtr roiPtr;
+
+        if (roiTensorPtrSrc == NULL)    // test the tensor itself: the address of an element is never NULL for batchCount > 0
+        {
+            roiPtr = roiPtrDefault;
+        }
+        else
+        {
+            RpptROIPtr roiPtrInput = &roiTensorPtrSrc[batchCount];
+
+            RpptROI roiImage;
+            RpptROIPtr roiPtrImage;
+
+            if (roiType == RpptRoiType::LTRB)
+            {
+                roiPtrImage = &roiImage;
+                compute_xywh_from_ltrb_host(roiPtrInput, roiPtrImage);
+            }
+            else    // RpptRoiType::XYWH; a plain else so roiPtrImage is never left uninitialised
+            {
+                roiPtrImage = roiPtrInput;
+            }
+
+            roiPtr = &roi;
+            compute_roi_boundary_check_host(roiPtrImage, roiPtr, roiPtrDefault);
+        }
+
+        Rpp32f alpha = alphaTensor[batchCount];
+        Rpp32f beta = betaTensor[batchCount];
+
+        Rpp8s *srcPtrImage, *dstPtrImage;
+        srcPtrImage = srcPtr + batchCount * srcDescPtr->strides.nStride;
+        dstPtrImage = dstPtr + batchCount * dstDescPtr->strides.nStride;
+
+        Rpp32u bufferLength = roiPtr->xywhROI.roiWidth * layoutParams.bufferMultiplier;
+
+        __m128 pMul = _mm_set1_ps(alpha);
+        __m128 pAdd = _mm_set1_ps(beta);
+
+        Rpp8s *srcPtrChannel, *dstPtrChannel;
+        srcPtrChannel = srcPtrImage + (roiPtr->xywhROI.xy.y * srcDescPtr->strides.hStride) + (roiPtr->xywhROI.xy.x * layoutParams.bufferMultiplier);
+        dstPtrChannel = dstPtrImage;
+
+        // Brightness with fused output-layout toggle (NHWC -> NCHW)
+        if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
+        {
+            Rpp32u alignedLength = (bufferLength / 48) * 48;    // 48 is not a power of two, so "& ~47" is not a multiple of 48 and let the vector loop overrun the row
+
+            Rpp8s *srcPtrRow, *dstPtrRowR, *dstPtrRowG, *dstPtrRowB;
+            srcPtrRow = srcPtrChannel;
+            dstPtrRowR = dstPtrChannel;
+            dstPtrRowG = dstPtrRowR + dstDescPtr->strides.cStride;
+            dstPtrRowB = dstPtrRowG + dstDescPtr->strides.cStride;
+
+            for(int i = 0; i < roiPtr->xywhROI.roiHeight; i++)
+            {
+                Rpp8s *srcPtrTemp, *dstPtrTempR, *dstPtrTempG, *dstPtrTempB;
+                srcPtrTemp = srcPtrRow;
+                dstPtrTempR = dstPtrRowR;
+                dstPtrTempG = dstPtrRowG;
+                dstPtrTempB = dstPtrRowB;
+
+                int vectorLoopCount = 0;
+                for (; vectorLoopCount < alignedLength; vectorLoopCount+=48)    // 48 interleaved values = 16 pixels per step
+                {
+                    __m128 p[12];
+
+                    rpp_simd_load(rpp_load48_i8pkd3_to_f32pln3, srcPtrTemp, p);
+
+                    p[0] = _mm_fmadd_ps(p[0], pMul, pAdd);    // brightness adjustment
+                    p[1] = _mm_fmadd_ps(p[1], pMul, pAdd);    // brightness adjustment
+                    p[2] = _mm_fmadd_ps(p[2], pMul, pAdd);    // brightness adjustment
+                    p[3] = _mm_fmadd_ps(p[3], pMul, pAdd);    // brightness adjustment
+
+                    p[4] = _mm_fmadd_ps(p[4], pMul, pAdd);    // brightness adjustment
+                    p[5] = _mm_fmadd_ps(p[5], pMul, pAdd);    // brightness adjustment
+                    p[6] = _mm_fmadd_ps(p[6], pMul, pAdd);    // brightness adjustment
+                    p[7] = _mm_fmadd_ps(p[7], pMul, pAdd);    // brightness adjustment
+
+                    p[8] = _mm_fmadd_ps(p[8], pMul, pAdd);    // brightness adjustment
+                    p[9] = _mm_fmadd_ps(p[9], pMul, pAdd);    // brightness adjustment
+                    p[10] = _mm_fmadd_ps(p[10], pMul, pAdd);    // brightness adjustment
+                    p[11] = _mm_fmadd_ps(p[11], pMul, pAdd);    // brightness adjustment
+
+                    rpp_simd_store(rpp_store48_f32pln3_to_i8pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p);
+
+                    srcPtrTemp += 48;
+                    dstPtrTempR += 16;
+                    dstPtrTempG += 16;
+                    dstPtrTempB += 16;
+                }
+                for (; vectorLoopCount < bufferLength; vectorLoopCount+=3)    // scalar tail, one interleaved pixel per step
+                {
+                    *dstPtrTempR = (Rpp8s) RPPPIXELCHECKI8((((Rpp32f) (*srcPtrTemp) + 128) * alpha) + beta - 128);
+                    dstPtrTempR++;
+                    srcPtrTemp++;
+
+                    *dstPtrTempG = (Rpp8s) RPPPIXELCHECKI8((((Rpp32f) (*srcPtrTemp) + 128) * alpha) + beta - 128);
+                    dstPtrTempG++;
+                    srcPtrTemp++;
+
+                    *dstPtrTempB = (Rpp8s) RPPPIXELCHECKI8((((Rpp32f) (*srcPtrTemp) + 128) * alpha) + beta - 128);
+                    dstPtrTempB++;
+                    srcPtrTemp++;
+                }
+
+                srcPtrRow += srcDescPtr->strides.hStride;
+                dstPtrRowR += dstDescPtr->strides.hStride;
+                dstPtrRowG += dstDescPtr->strides.hStride;
+                dstPtrRowB += dstDescPtr->strides.hStride;
+            }
+        }
+
+        // Brightness with fused output-layout toggle (NCHW -> NHWC)
+        else if ((srcDescPtr->c == 3) && (srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
+        {
+            Rpp32u alignedLength = bufferLength & ~15;    // the loop below advances 16 pixels per plane, so align to 16 (was "& ~47")
+
+            Rpp8s *srcPtrRowR, *srcPtrRowG, *srcPtrRowB, *dstPtrRow;
+            srcPtrRowR = srcPtrChannel;
+            srcPtrRowG = srcPtrRowR + srcDescPtr->strides.cStride;
+            srcPtrRowB = srcPtrRowG + srcDescPtr->strides.cStride;
+            dstPtrRow = dstPtrChannel;
+
+            for(int i = 0; i < roiPtr->xywhROI.roiHeight; i++)
+            {
+                Rpp8s *srcPtrTempR, *srcPtrTempG, *srcPtrTempB, *dstPtrTemp;
+                srcPtrTempR = srcPtrRowR;
+                srcPtrTempG = srcPtrRowG;
+                srcPtrTempB = srcPtrRowB;
+                dstPtrTemp = dstPtrRow;
+
+                int vectorLoopCount = 0;
+                for (; vectorLoopCount < alignedLength; vectorLoopCount+=16)    // 16 pixels from each of the three planes per step
+                {
+                    __m128 p[12];
+
+                    rpp_simd_load(rpp_load48_i8pln3_to_f32pln3, srcPtrTempR, srcPtrTempG, srcPtrTempB, p);
+
+                    p[0] = _mm_fmadd_ps(p[0], pMul, pAdd);    // brightness adjustment
+                    p[1] = _mm_fmadd_ps(p[1], pMul, pAdd);    // brightness adjustment
+                    p[2] = _mm_fmadd_ps(p[2], pMul, pAdd);    // brightness adjustment
+                    p[3] = _mm_fmadd_ps(p[3], pMul, pAdd);    // brightness adjustment
+
+                    p[4] = _mm_fmadd_ps(p[4], pMul, pAdd);    // brightness adjustment
+                    p[5] = _mm_fmadd_ps(p[5], pMul, pAdd);    // brightness adjustment
+                    p[6] = _mm_fmadd_ps(p[6], pMul, pAdd);    // brightness adjustment
+                    p[7] = _mm_fmadd_ps(p[7], pMul, pAdd);    // brightness adjustment
+
+                    p[8] = _mm_fmadd_ps(p[8], pMul, pAdd);    // brightness adjustment
+                    p[9] = _mm_fmadd_ps(p[9], pMul, pAdd);    // brightness adjustment
+                    p[10] = _mm_fmadd_ps(p[10], pMul, pAdd);    // brightness adjustment
+                    p[11] = _mm_fmadd_ps(p[11], pMul, pAdd);    // brightness adjustment
+
+                    rpp_simd_store(rpp_store48_f32pln3_to_i8pkd3, dstPtrTemp, p);
+
+                    srcPtrTempR += 16;
+                    srcPtrTempG += 16;
+                    srcPtrTempB += 16;
+                    dstPtrTemp += 48;
+                }
+                for (; vectorLoopCount < bufferLength; vectorLoopCount++)    // scalar tail, one pixel (three planes) per step
+                {
+                    *dstPtrTemp = (Rpp8s) RPPPIXELCHECKI8((((Rpp32f) (*srcPtrTempR) + 128) * alpha) + beta - 128);
+                    dstPtrTemp++;
+                    srcPtrTempR++;
+
+                    *dstPtrTemp = (Rpp8s) RPPPIXELCHECKI8((((Rpp32f) (*srcPtrTempG) + 128) * alpha) + beta - 128);
+                    dstPtrTemp++;
+                    srcPtrTempG++;
+
+                    *dstPtrTemp = (Rpp8s) RPPPIXELCHECKI8((((Rpp32f) (*srcPtrTempB) + 128) * alpha) + beta - 128);
+                    dstPtrTemp++;
+                    srcPtrTempB++;
+                }
+
+                srcPtrRowR += srcDescPtr->strides.hStride;
+                srcPtrRowG += srcDescPtr->strides.hStride;
+                srcPtrRowB += srcDescPtr->strides.hStride;
+                dstPtrRow += dstDescPtr->strides.hStride;
+            }
+        }
+
+        // Brightness without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW)
+        else
+        {
+            Rpp32u alignedLength = bufferLength & ~15;    // 16 values per vector step
+
+            for(int c = 0; c < layoutParams.channelParam; c++)
+            {
+                Rpp8s *srcPtrRow, *dstPtrRow;
+                srcPtrRow = srcPtrChannel;
+                dstPtrRow = dstPtrChannel;
+
+                for(int i = 0; i < roiPtr->xywhROI.roiHeight; i++)
+                {
+                    Rpp8s *srcPtrTemp, *dstPtrTemp;
+                    srcPtrTemp = srcPtrRow;
+                    dstPtrTemp = dstPtrRow;
+
+                    int vectorLoopCount = 0;
+                    for (; vectorLoopCount < alignedLength; vectorLoopCount+=16)
+                    {
+                        __m128 p[4];
+
+                        rpp_simd_load(rpp_load16_i8_to_f32, srcPtrTemp, p);
+
+                        p[0] = _mm_fmadd_ps(p[0], pMul, pAdd);    // brightness adjustment
+                        p[1] = _mm_fmadd_ps(p[1], pMul, pAdd);    // brightness adjustment
+                        p[2] = _mm_fmadd_ps(p[2], pMul, pAdd);    // brightness adjustment
+                        p[3] = _mm_fmadd_ps(p[3], pMul, pAdd);    // brightness adjustment
+
+                        rpp_simd_store(rpp_store16_f32_to_i8, dstPtrTemp, p);
+
+                        srcPtrTemp +=16;
+                        dstPtrTemp +=16;
+                    }
+                    for (; vectorLoopCount < bufferLength; vectorLoopCount++)    // scalar tail
+                    {
+                        *dstPtrTemp = (Rpp8s) RPPPIXELCHECKI8((((Rpp32f) (*srcPtrTemp) + 128) * alpha) + beta - 128);
+
+                        dstPtrTemp++;
+                        srcPtrTemp++;
+                    }
+
+                    srcPtrRow += srcDescPtr->strides.hStride;
+                    dstPtrRow += dstDescPtr->strides.hStride;
+                }
+
+                srcPtrChannel += srcDescPtr->strides.cStride;
+                dstPtrChannel += dstDescPtr->strides.cStride;
+            }
+        }
+    }
+
+    return RPP_SUCCESS;
+}
+
+#endif // HOST_TENSOR_AUGMENTATIONS_HPP
diff --git a/src/modules/hip/hip_tensor_augmentations.hpp b/src/modules/hip/hip_tensor_augmentations.hpp
new file mode 100644
index 000000000..d91f0281e
--- /dev/null
+++ b/src/modules/hip/hip_tensor_augmentations.hpp
@@ -0,0 +1,30 @@
+#include "hip/hip_runtime_api.h"
+#include "kernel/brightness.hpp"
+#include "kernel/roi_conversion.hpp"
+
+/******************** brightness ********************/
+
+// Host entry point for the HIP brightness tensor augmentation. The kernels index ROIs through their
+// xywhROI fields, so an LTRB ROI tensor is first rewritten to XYWH in place (the caller's buffer is
+// modified) before the layout-specific brightness kernel is launched.
+// Always returns RPP_SUCCESS; the statuses returned by the two hip_exec_* calls are not inspected.
+template <typename T>
+RppStatus brightness_hip_tensor(T *srcPtr, RpptDescPtr srcDescPtr,
+                                T *dstPtr, RpptDescPtr dstDescPtr,
+                                RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType,
+                                rpp::Handle& handle)
+{
+    const bool roiIsLtrb = (roiType == RpptRoiType::LTRB);
+
+    // Any other ROI type is passed through untouched.
+    if (roiIsLtrb)
+    {
+        hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle);
+    }
+
+    // alpha/beta are not parameters here: the launcher reads them from the handle.
+    hip_exec_brightness_tensor(srcPtr, srcDescPtr, dstPtr, dstDescPtr,
+                               roiTensorPtrSrc, handle);
+
+    return RPP_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/modules/hip/kernel/brightness.hpp b/src/modules/hip/kernel/brightness.hpp
new file mode 100644
index 000000000..6426dc039
--- /dev/null
+++ b/src/modules/hip/kernel/brightness.hpp
@@ -0,0 +1,288 @@
+#include <hip/hip_runtime.h>
+#include "hip/rpp_hip_common.hpp"
+
+__device__ void brightness_hip_compute(uchar *srcPtr, d_float8 *src_f8, d_float8 *dst_f8, float4 *alpha_f4, float4 *beta_f4)
+{   // uchar overload (srcPtr is unused and only selects it): dst = src * alpha + beta, with no clamping done here
+    dst_f8->x = (src_f8->x * alpha_f4[0]) + beta_f4[0];    // x half of the 8 values
+    dst_f8->y = (src_f8->y * alpha_f4[0]) + beta_f4[0];    // y half of the 8 values
+}
+
+__device__ void brightness_hip_compute(float *srcPtr, d_float8 *src_f8, d_float8 *dst_f8, float4 *alpha_f4, float4 *beta_f4)
+{   // float overload (srcPtr is unused and only selects it): dst = src * alpha + beta * 0.0039216 (about 1/255)
+    dst_f8->x = (src_f8->x * alpha_f4[0]) + (beta_f4[0] * (float4)0.0039216);    // x half of the 8 values
+    dst_f8->y = (src_f8->y * alpha_f4[0]) + (beta_f4[0] * (float4)0.0039216);    // y half of the 8 values
+}
+
+__device__ void brightness_hip_compute(signed char *srcPtr, d_float8 *src_f8, d_float8 *dst_f8, float4 *alpha_f4, float4 *beta_f4)
+{   // signed char overload (srcPtr only selects it): add 128, apply alpha/beta, run rpp_hip_pixel_check, subtract 128
+    dst_f8->x = rpp_hip_pixel_check(((src_f8->x + (float4)128) * alpha_f4[0]) + beta_f4[0]) - (float4)128;    // x half
+    dst_f8->y = rpp_hip_pixel_check(((src_f8->y + (float4)128) * alpha_f4[0]) + beta_f4[0]) - (float4)128;    // y half
+}
+
+__device__ void brightness_hip_compute(half *srcPtr, d_float8 *src_f8, d_float8 *dst_f8, float4 *alpha_f4, float4 *beta_f4)
+{   // half overload (srcPtr is unused and only selects it): dst = src * alpha + beta * 0.0039216 (about 1/255)
+    dst_f8->x = (src_f8->x * alpha_f4[0]) + (beta_f4[0] * (float4)0.0039216);    // x half of the 8 values
+    dst_f8->y = (src_f8->y * alpha_f4[0]) + (beta_f4[0] * (float4)0.0039216);    // y half of the 8 values
+}
+
+// Brightness kernel for packed 3-channel tensors (NHWC -> NHWC). Each thread handles 8 consecutive
+// interleaved elements of one ROI row of image id_z, using that image's alpha[id_z] / beta[id_z].
+template <typename T>
+__global__ void brightness_pkd_tensor(T *srcPtr, int nStrideSrc, int hStrideSrc,
+                                      T *dstPtr, int nStrideDst, int hStrideDst,
+                                      float *alpha, float *beta,
+                                      RpptROIPtr roiTensorPtrSrc)
+{
+    int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;    // first element of this thread's 8-wide span
+    int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;          // row within the ROI
+    int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;          // image index within the batch
+    RpptROIPtr roiPtr = &roiTensorPtrSrc[id_z];
+
+    // A row holds roiWidth * 3 interleaved elements; spans starting outside the ROI are skipped.
+    bool insideRoi = (id_y < roiPtr->xywhROI.roiHeight) && (id_x < roiPtr->xywhROI.roiWidth * 3);
+    if (!insideRoi)
+    {
+        return;
+    }
+
+    // The source index is offset by the ROI origin; the destination is written from its own origin.
+    uint srcIdx = (id_z * nStrideSrc) + ((id_y + roiPtr->xywhROI.xy.y) * hStrideSrc) + (id_x + roiPtr->xywhROI.xy.x * 3);
+    uint dstIdx = (id_z * nStrideDst) + (id_y * hStrideDst) + id_x;
+
+    float4 alpha_f4 = (float4)alpha[id_z];
+    float4 beta_f4 = (float4)beta[id_z];
+    d_float8 src_f8, dst_f8;
+
+    rpp_hip_load8_and_unpack_to_float8(srcPtr, srcIdx, &src_f8);
+    brightness_hip_compute(srcPtr, &src_f8, &dst_f8, &alpha_f4, &beta_f4);
+    rpp_hip_pack_float8_and_store8(dstPtr, dstIdx, &dst_f8);
+}
+
+// Brightness kernel for planar tensors (NCHW -> NCHW). Each thread handles 8 consecutive pixels of
+// one ROI row of image id_z; when channelsDst == 3 the same span is repeated in the next two planes.
+// The source is read at the ROI origin offset, the destination is written from its own origin.
+// Parameters:
+//   srcPtr, dstPtr      - base pointers of the source / destination batch
+//   nStrideSrc/Dst      - index step between consecutive images
+//   cStrideSrc/Dst      - index step between consecutive planes of one image
+//   hStrideSrc/Dst      - index step between consecutive rows
+//   channelsDst         - 3 selects the three-plane path, any other value a single plane
+//   alpha, beta         - per-image multiplier and offset, indexed by id_z
+//   roiTensorPtrSrc     - per-image ROI, read through its xywhROI fields
+template <typename T>
+__global__ void brightness_pln_tensor(T *srcPtr, int nStrideSrc, int cStrideSrc, int hStrideSrc,
+                                      T *dstPtr, int nStrideDst, int cStrideDst, int hStrideDst,
+                                      int channelsDst,
+                                      float *alpha, float *beta,
+                                      RpptROIPtr roiTensorPtrSrc)
+{
+    int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;    // first pixel of this thread's 8-wide span
+    int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;          // row within the ROI
+    int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;          // image index within the batch
+    RpptROIPtr roiPtr = &roiTensorPtrSrc[id_z];
+
+    // Spans starting outside the ROI are skipped.
+    bool insideRoi = (id_y < roiPtr->xywhROI.roiHeight) && (id_x < roiPtr->xywhROI.roiWidth);
+    if (!insideRoi)
+    {
+        return;
+    }
+
+    // Index of the span in the first plane of the source (ROI-offset) and of the destination.
+    uint srcIdx = (id_z * nStrideSrc) + ((id_y + roiPtr->xywhROI.xy.y) * hStrideSrc) + (id_x + roiPtr->xywhROI.xy.x);
+    uint dstIdx = (id_z * nStrideDst) + (id_y * hStrideDst) + id_x;
+
+    float4 alpha_f4 = (float4)(alpha[id_z]);
+    float4 beta_f4 = (float4)(beta[id_z]);
+    d_float8 src_f8, dst_f8;
+
+    // Every plane reuses the same alpha/beta; only the indices advance.
+    int planeCount = (channelsDst == 3) ? 3 : 1;
+
+    for (int plane = 0; plane < planeCount; plane++)
+    {
+        rpp_hip_load8_and_unpack_to_float8(srcPtr, srcIdx, &src_f8);
+        brightness_hip_compute(srcPtr, &src_f8, &dst_f8, &alpha_f4, &beta_f4);
+        rpp_hip_pack_float8_and_store8(dstPtr, dstIdx, &dst_f8);
+
+        // Move both indices to the same span in the next plane.
+        srcIdx += cStrideSrc;
+        dstIdx += cStrideDst;
+    }
+}
+
+// Brightness kernel with a fused layout change: packed 3-channel source (NHWC) -> planar
+// destination (NCHW). Each thread loads 8 interleaved pixels (24 values) of one ROI row of image
+// id_z and stores one 8-wide vector into each of the three destination planes.
+//   nStrideSrc/Dst, hStrideSrc/Dst - index steps between images and between rows
+//   cStrideDst                     - index step between destination planes
+//   alpha, beta                    - per-image multiplier and offset, indexed by id_z
+//   roiTensorPtrSrc                - per-image ROI, read through its xywhROI fields
+template <typename T>
+__global__ void brightness_pkd3_pln3_tensor(T *srcPtr, int nStrideSrc, int hStrideSrc,
+                                            T *dstPtr, int nStrideDst, int cStrideDst, int hStrideDst,
+                                            float *alpha, float *beta,
+                                            RpptROIPtr roiTensorPtrSrc)
+{
+    int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;    // first pixel of this thread's 8-wide span
+    int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;          // row within the ROI
+    int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;          // image index within the batch
+    RpptROIPtr roiPtr = &roiTensorPtrSrc[id_z];
+
+    // Spans starting outside the ROI are skipped (id_x counts pixels here).
+    bool insideRoi = (id_y < roiPtr->xywhROI.roiHeight) && (id_x < roiPtr->xywhROI.roiWidth);
+    if (!insideRoi)
+    {
+        return;
+    }
+
+    // The source column is scaled by 3 because the source is interleaved; the destination is planar.
+    uint srcIdx = (id_z * nStrideSrc) + ((id_y + roiPtr->xywhROI.xy.y) * hStrideSrc) + ((id_x + roiPtr->xywhROI.xy.x) * 3);
+    uint dstIdx = (id_z * nStrideDst) + (id_y * hStrideDst) + id_x;
+
+    float4 alpha_f4 = (float4)alpha[id_z];
+    float4 beta_f4 = (float4)beta[id_z];
+    d_float24 src_f24, dst_f24;
+
+    rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr, srcIdx, &src_f24);
+    brightness_hip_compute(srcPtr, &src_f24.x, &dst_f24.x, &alpha_f4, &beta_f4);
+    brightness_hip_compute(srcPtr, &src_f24.y, &dst_f24.y, &alpha_f4, &beta_f4);
+    brightness_hip_compute(srcPtr, &src_f24.z, &dst_f24.z, &alpha_f4, &beta_f4);
+
+    // One 8-wide store per destination plane, cStrideDst apart.
+    rpp_hip_pack_float8_and_store8(dstPtr, dstIdx, &dst_f24.x);
+    rpp_hip_pack_float8_and_store8(dstPtr, dstIdx + cStrideDst, &dst_f24.y);
+    rpp_hip_pack_float8_and_store8(dstPtr, dstIdx + cStrideDst + cStrideDst, &dst_f24.z);
+}
+
+// Brightness kernel with a fused layout change: planar 3-channel source (NCHW) -> packed
+// destination (NHWC). Each thread reads 8 pixels of one ROI row from each of the three source planes
+// (cStrideSrc apart) of image id_z and writes the 24 results as interleaved values.
+template <typename T>
+__global__ void brightness_pln3_pkd3_tensor(T *srcPtr, int nStrideSrc, int cStrideSrc, int hStrideSrc,
+                                            T *dstPtr, int nStrideDst, int hStrideDst,
+                                            float *alpha, float *beta,
+                                            RpptROIPtr roiTensorPtrSrc)
+{
+    int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;    // first pixel of this thread's 8-wide span
+    int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;          // row within the ROI
+    int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;          // image index within the batch
+    RpptROIPtr roiPtr = &roiTensorPtrSrc[id_z];
+
+    // Spans starting outside the ROI are skipped (id_x counts pixels here).
+    bool insideRoi = (id_y < roiPtr->xywhROI.roiHeight) && (id_x < roiPtr->xywhROI.roiWidth);
+    if (!insideRoi)
+    {
+        return;
+    }
+
+    // The source is planar and ROI-offset; the destination column is scaled by 3 because it is interleaved.
+    uint srcIdx = (id_z * nStrideSrc) + ((id_y + roiPtr->xywhROI.xy.y) * hStrideSrc) + (id_x + roiPtr->xywhROI.xy.x);
+    uint dstIdx = (id_z * nStrideDst) + (id_y * hStrideDst) + id_x * 3;
+
+    float4 alpha_f4 = (float4)(alpha[id_z]);
+    float4 beta_f4 = (float4)(beta[id_z]);
+    d_float24 src_f24, dst_f24;
+
+    rpp_hip_load24_pln3_and_unpack_to_float24_pkd3(srcPtr, srcIdx, cStrideSrc, &src_f24);
+    brightness_hip_compute(srcPtr, &src_f24.x, &dst_f24.x, &alpha_f4, &beta_f4);
+    brightness_hip_compute(srcPtr, &src_f24.y, &dst_f24.y, &alpha_f4, &beta_f4);
+    brightness_hip_compute(srcPtr, &src_f24.z, &dst_f24.z, &alpha_f4, &beta_f4);
+    rpp_hip_pack_float24_and_store24(dstPtr, dstIdx, &dst_f24);
+}
+
+// Launches, on the handle's stream, the brightness kernel that matches the source/destination layouts:
+//   NHWC -> NHWC : brightness_pkd_tensor
+//   NCHW -> NCHW : brightness_pln_tensor (three planes when dstDescPtr->c == 3, otherwise one)
+//   NHWC -> NCHW : brightness_pkd3_pln3_tensor (only when both descriptors have c == 3)
+//   NCHW -> NHWC : brightness_pln3_pkd3_tensor (only when both descriptors have c == 3)
+// Any other combination launches nothing. The launch itself is not checked for errors, so
+// RPP_SUCCESS is returned in every case.
+// Parameters:
+//   srcPtr, dstPtr          - buffers handed to the kernel as source and destination
+//   srcDescPtr, dstDescPtr  - provide layout, channel count (c), strides and the grid height (dstDescPtr->h)
+//   roiTensorPtrSrc         - per-image ROIs, forwarded to the kernel, which reads their xywhROI fields
+//   handle                  - provides the stream, the batch size and the alpha / beta buffers
+//                             (mem.mgpu.floatArr[0] and floatArr[1])
+template <typename T>
+RppStatus hip_exec_brightness_tensor(T *srcPtr, RpptDescPtr srcDescPtr,
+                                     T *dstPtr, RpptDescPtr dstDescPtr,
+                                     RpptROIPtr roiTensorPtrSrc,
+                                     rpp::Handle& handle)
+{
+    const bool srcIsNhwc = (srcDescPtr->layout == RpptLayout::NHWC);
+    const bool srcIsNchw = (srcDescPtr->layout == RpptLayout::NCHW);
+    const bool dstIsNhwc = (dstDescPtr->layout == RpptLayout::NHWC);
+    const bool dstIsNchw = (dstDescPtr->layout == RpptLayout::NCHW);
+    const bool bothThreeChannel = (srcDescPtr->c == 3) && (dstDescPtr->c == 3);
+    const bool plnToPkd3 = bothThreeChannel && srcIsNchw && dstIsNhwc;
+
+    // Every thread covers 8 values along x, so the x extent is a row stride divided by 8, rounded up.
+    // The planar -> packed kernel counts x in source pixels, so it is sized from the source row stride.
+    int globalThreads_x = ((plnToPkd3 ? srcDescPtr->strides.hStride : dstDescPtr->strides.hStride) + 7) >> 3;
+    int globalThreads_y = dstDescPtr->h;
+    int globalThreads_z = handle.GetBatchSize();
+
+    const int localThreads_x = 16;
+    const int localThreads_y = 16;
+    const int localThreads_z = 1;
+
+    dim3 gridSize(ceil((float)globalThreads_x/localThreads_x),
+                  ceil((float)globalThreads_y/localThreads_y),
+                  ceil((float)globalThreads_z/localThreads_z));
+    dim3 blockSize(localThreads_x, localThreads_y, localThreads_z);
+
+    // Packed -> packed
+    if (srcIsNhwc && dstIsNhwc)
+    {
+        hipLaunchKernelGGL(brightness_pkd_tensor, gridSize, blockSize, 0, handle.GetStream(),
+                           srcPtr, srcDescPtr->strides.nStride, srcDescPtr->strides.hStride,
+                           dstPtr, dstDescPtr->strides.nStride, dstDescPtr->strides.hStride,
+                           handle.GetInitHandle()->mem.mgpu.floatArr[0].floatmem,
+                           handle.GetInitHandle()->mem.mgpu.floatArr[1].floatmem,
+                           roiTensorPtrSrc);
+        return RPP_SUCCESS;
+    }
+
+    // Planar -> planar
+    if (srcIsNchw && dstIsNchw)
+    {
+        hipLaunchKernelGGL(brightness_pln_tensor, gridSize, blockSize, 0, handle.GetStream(),
+                           srcPtr, srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride,
+                           dstPtr, dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride,
+                           dstDescPtr->c,
+                           handle.GetInitHandle()->mem.mgpu.floatArr[0].floatmem,
+                           handle.GetInitHandle()->mem.mgpu.floatArr[1].floatmem,
+                           roiTensorPtrSrc);
+        return RPP_SUCCESS;
+    }
+
+    // The layout-converting kernels are only used when both sides have exactly three channels.
+    if (!bothThreeChannel)
+    {
+        return RPP_SUCCESS;
+    }
+
+    if (srcIsNhwc && dstIsNchw)
+    {
+        // Packed -> planar
+        hipLaunchKernelGGL(brightness_pkd3_pln3_tensor, gridSize, blockSize, 0, handle.GetStream(),
+                           srcPtr, srcDescPtr->strides.nStride, srcDescPtr->strides.hStride,
+                           dstPtr, dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride,
+                           handle.GetInitHandle()->mem.mgpu.floatArr[0].floatmem,
+                           handle.GetInitHandle()->mem.mgpu.floatArr[1].floatmem,
+                           roiTensorPtrSrc);
+    }
+    else if (plnToPkd3)
+    {
+        // Planar -> packed
+        hipLaunchKernelGGL(brightness_pln3_pkd3_tensor, gridSize, blockSize, 0, handle.GetStream(),
+                           srcPtr, srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride,
+                           dstPtr, dstDescPtr->strides.nStride, dstDescPtr->strides.hStride,
+                           handle.GetInitHandle()->mem.mgpu.floatArr[0].floatmem,
+                           handle.GetInitHandle()->mem.mgpu.floatArr[1].floatmem,
+                           roiTensorPtrSrc);
+    }
+
+    return RPP_SUCCESS;
+}
diff --git a/src/modules/hip/kernel/roi_conversion.hpp b/src/modules/hip/kernel/roi_conversion.hpp
new file mode 100644
index 000000000..8f61e4cc2
--- /dev/null
+++ b/src/modules/hip/kernel/roi_conversion.hpp
@@ -0,0 +1,32 @@
+#include <hip/hip_runtime.h>
+
+// Rewrites one ROI per thread in place: viewed as an int4, z and w become (z - x + 1) and (w - y + 1), i.e. LTRB -> XYWH.
+extern "C" __global__ void roi_converison_ltrb_to_xywh(int *roiTensorPtrSrc, int batchSize)
+{
+    int id_roi = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
+    if (id_roi >= batchSize) // the grid is rounded up to whole blocks; surplus threads must not touch memory past the last ROI
+        return;
+    int4 *roiTensorPtrSrc_i4 = (int4 *)&roiTensorPtrSrc[id_roi * 4]; // each ROI occupies 4 consecutive ints
+    roiTensorPtrSrc_i4->z -= (roiTensorPtrSrc_i4->x - 1);
+    roiTensorPtrSrc_i4->w -= (roiTensorPtrSrc_i4->y - 1);
+}
+
+// Launches the in-place ROI conversion for handle.GetBatchSize() ROIs on the handle's stream; always returns RPP_SUCCESS.
+RppStatus hip_exec_roi_converison_ltrb_to_xywh(RpptROIPtr roiTensorPtrSrc, rpp::Handle& handle)
+{
+    int localThreads_x = 256;
+    int localThreads_y = 1;
+    int localThreads_z = 1;
+    int globalThreads_x = handle.GetBatchSize();
+    int globalThreads_y = 1;
+    int globalThreads_z = 1;
+
+    hipLaunchKernelGGL(roi_converison_ltrb_to_xywh,
+                       dim3(ceil((float)globalThreads_x/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
+                       dim3(localThreads_x, localThreads_y, localThreads_z),
+                       0,
+                       handle.GetStream(),
+                       (int *) roiTensorPtrSrc, globalThreads_x); // batch size lets the kernel skip the surplus threads
+
+    return RPP_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/modules/hip/kernel/rpp_hip_host_decls.hpp b/src/modules/hip/kernel/rpp_hip_host_decls.hpp
index 3d8f77e64..0cd13a1d6 100644
--- a/src/modules/hip/kernel/rpp_hip_host_decls.hpp
+++ b/src/modules/hip/kernel/rpp_hip_host_decls.hpp
@@ -187,5 +187,9 @@ RppStatus hip_exec_thresholding_batch(Rpp8u *srcPtr, Rpp8u *dstPtr, rpp::Handle&
 RppStatus hip_exec_min_batch(Rpp8u *srcPtr1, Rpp8u *srcPtr2, Rpp8u *dstPtr, rpp::Handle& handle, RppiChnFormat chnFormat, Rpp32u channel, Rpp32s plnpkdind, Rpp32u max_height, Rpp32u max_width);
 RppStatus hip_exec_max_batch(Rpp8u *srcPtr1, Rpp8u *srcPtr2, Rpp8u *dstPtr, rpp::Handle& handle, RppiChnFormat chnFormat, Rpp32u channel, Rpp32s plnpkdind, Rpp32u max_height, Rpp32u max_width);
 
+// helpers
+
+RppStatus hip_exec_roi_converison_ltrb_to_xywh(RpptROIPtr roiTensorPtrSrc, rpp::Handle& handle);
+
 
 #endif //RPP_HIP_HOST_DECLS_H
\ No newline at end of file
diff --git a/src/modules/rppi_validate.hpp b/src/modules/rppi_validate.hpp
index 62f1f1fb5..8ec8071a9 100644
--- a/src/modules/rppi_validate.hpp
+++ b/src/modules/rppi_validate.hpp
@@ -12,82 +12,109 @@
 #include <hip/hip_runtime_api.h>
 #endif
 
+// Maps a tensor layout and channel count to the RppLayoutParams for that arrangement:
+//   NCHW, 1 channel  (PLN1) -> channelParam 1, bufferMultiplier 1
+//   NCHW, 3 channels (PLN3) -> channelParam 3, bufferMultiplier 1
+//   NHWC, 3 channels (PKD3) -> channelParam 1, bufferMultiplier 3
+// Any other combination is not handled and yields a zero-initialized result rather than indeterminate values.
+inline RppLayoutParams get_layout_params(RpptLayout layout, Rpp32u channels)
+{
+    RppLayoutParams layoutParams = {};
+    if (layout == RpptLayout::NCHW)
+    {
+        if ((channels == 1) || (channels == 3)) // PLN1 / PLN3
+        {
+            layoutParams.channelParam = channels;
+            layoutParams.bufferMultiplier = 1;
+        }
+    }
+    else if (layout == RpptLayout::NHWC)
+    {
+        if (channels == 3) // PKD3
+        {
+            layoutParams.channelParam = 1;
+            layoutParams.bufferMultiplier = 3;
+        }
+    }
+
+    return layoutParams;
+}
+
 inline void copy_srcSize(RppiSize srcSize, rpp::Handle& handle)
 {
     for(int i = 0; i < handle.GetBatchSize() ; i++)
     {
- 	 	 handle.GetInitHandle()->mem.mgpu.csrcSize.height[i] = srcSize.height;
- 	 	 handle.GetInitHandle()->mem.mgpu.csrcSize.width[i] = srcSize.width;
+           handle.GetInitHandle()->mem.mgpu.csrcSize.height[i] = srcSize.height;
+           handle.GetInitHandle()->mem.mgpu.csrcSize.width[i] = srcSize.width;
     }
-    #ifdef OCL_COMPILE
+#ifdef OCL_COMPILE
     {
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.srcSize.height, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.csrcSize.height, 0, NULL, NULL);
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.srcSize.width, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.csrcSize.width, 0, NULL, NULL);
     }
-    #elif defined(HIP_COMPILE)
+#elif defined(HIP_COMPILE)
     {
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.srcSize.height, handle.GetInitHandle()->mem.mgpu.csrcSize.height, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.srcSize.width, handle.GetInitHandle()->mem.mgpu.csrcSize.width, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
     }
-    #endif
+#endif
 }
 
 inline void copy_srcSize(RppiSize *srcSize, rpp::Handle& handle)
 {
     for(int i = 0; i < handle.GetBatchSize() ; i++)
     {
- 	 	 handle.GetInitHandle()->mem.mgpu.csrcSize.height[i] = srcSize[i].height;
- 	 	 handle.GetInitHandle()->mem.mgpu.csrcSize.width[i] = srcSize[i].width;
+           handle.GetInitHandle()->mem.mgpu.csrcSize.height[i] = srcSize[i].height;
+           handle.GetInitHandle()->mem.mgpu.csrcSize.width[i] = srcSize[i].width;
     }
-    #ifdef OCL_COMPILE
+#ifdef OCL_COMPILE
     {
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.srcSize.height, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.csrcSize.height, 0, NULL, NULL);
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.srcSize.width, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.csrcSize.width, 0, NULL, NULL);
     }
-    #elif defined(HIP_COMPILE)
+#elif defined(HIP_COMPILE)
     {
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.srcSize.height, handle.GetInitHandle()->mem.mgpu.csrcSize.height, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.srcSize.width, handle.GetInitHandle()->mem.mgpu.csrcSize.width, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
     }
-    #endif
+#endif
 }
 
 inline void copy_dstSize(RppiSize *dstSize, rpp::Handle& handle)
 {
     for(int i = 0; i < handle.GetBatchSize() ; i++)
     {
- 	 	 handle.GetInitHandle()->mem.mgpu.cdstSize.height[i] = dstSize[i].height;
- 	 	 handle.GetInitHandle()->mem.mgpu.cdstSize.width[i] = dstSize[i].width;
+           handle.GetInitHandle()->mem.mgpu.cdstSize.height[i] = dstSize[i].height;
+           handle.GetInitHandle()->mem.mgpu.cdstSize.width[i] = dstSize[i].width;
     }
-    #ifdef OCL_COMPILE
+#ifdef OCL_COMPILE
     {
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.dstSize.height, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.cdstSize.height, 0, NULL, NULL);
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.dstSize.width, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.cdstSize.width, 0, NULL, NULL);
     }
-    #elif defined(HIP_COMPILE)
+#elif defined(HIP_COMPILE)
     {
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.dstSize.height, handle.GetInitHandle()->mem.mgpu.cdstSize.height, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.dstSize.width, handle.GetInitHandle()->mem.mgpu.cdstSize.width, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
     }
-    #endif
+#endif
 }
 
-
 inline void copy_host_srcSize(RppiSize srcSize, rpp::Handle& handle)
 {
     for(int i = 0; i < handle.GetBatchSize() ; i++)
     {
- 	 	 handle.GetInitHandle()->mem.mcpu.srcSize[i].height = srcSize.height;
- 	 	 handle.GetInitHandle()->mem.mcpu.srcSize[i].width = srcSize.width;
-    }   
+           handle.GetInitHandle()->mem.mcpu.srcSize[i].height = srcSize.height;
+           handle.GetInitHandle()->mem.mcpu.srcSize[i].width = srcSize.width;
+    }
 }
 
 inline void copy_host_dstSize(RppiSize dstSize, rpp::Handle& handle)
 {
     for(int i = 0; i < handle.GetBatchSize() ; i++)
     {
- 	 	 handle.GetInitHandle()->mem.mcpu.dstSize[i].height = dstSize.height;
- 	 	 handle.GetInitHandle()->mem.mcpu.dstSize[i].width = dstSize.width;
+           handle.GetInitHandle()->mem.mcpu.dstSize[i].height = dstSize.height;
+           handle.GetInitHandle()->mem.mcpu.dstSize[i].width = dstSize.width;
     }
 }
 
@@ -95,38 +122,39 @@ inline void copy_host_maxSrcSize(RppiSize maxSrcSize, rpp::Handle& handle)
 {
     for(int i = 0; i < handle.GetBatchSize() ; i++)
     {
- 	 	 handle.GetInitHandle()->mem.mcpu.maxSrcSize[i].height = maxSrcSize.height;
- 	 	 handle.GetInitHandle()->mem.mcpu.maxSrcSize[i].width = maxSrcSize.width;
+           handle.GetInitHandle()->mem.mcpu.maxSrcSize[i].height = maxSrcSize.height;
+           handle.GetInitHandle()->mem.mcpu.maxSrcSize[i].width = maxSrcSize.width;
     }
 }
+
 inline void copy_host_maxDstSize(RppiSize maxDstSize, rpp::Handle& handle)
 {
     for(int i = 0; i < handle.GetBatchSize() ; i++)
     {
- 	 	 handle.GetInitHandle()->mem.mcpu.maxDstSize[i].height = maxDstSize.height;
- 	 	 handle.GetInitHandle()->mem.mcpu.maxDstSize[i].width = maxDstSize.width;
+           handle.GetInitHandle()->mem.mcpu.maxDstSize[i].height = maxDstSize.height;
+           handle.GetInitHandle()->mem.mcpu.maxDstSize[i].width = maxDstSize.width;
     }
 }
+
 inline void copy_dstSize(RppiSize dstSize, rpp::Handle& handle)
 {
     for(int i = 0; i < handle.GetBatchSize() ; i++)
     {
- 	 	 handle.GetInitHandle()->mem.mgpu.cdstSize.height[i] = dstSize.height;
- 	 	 handle.GetInitHandle()->mem.mgpu.cdstSize.width[i] = dstSize.width;
+           handle.GetInitHandle()->mem.mgpu.cdstSize.height[i] = dstSize.height;
+           handle.GetInitHandle()->mem.mgpu.cdstSize.width[i] = dstSize.width;
     }
-    #ifdef OCL_COMPILE
+#ifdef OCL_COMPILE
     {
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.dstSize.height, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.cdstSize.height, 0, NULL, NULL);
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.dstSize.width, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.cdstSize.width, 0, NULL, NULL);
     }
-    // for hip
-    #elif defined(HIP_COMPILE)
+#elif defined(HIP_COMPILE)
     {
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.dstSize.height, handle.GetInitHandle()->mem.mgpu.cdstSize.height, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.dstSize.width, handle.GetInitHandle()->mem.mgpu.cdstSize.width, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
 
     }
-    #endif
+#endif
 }
 
 inline void copy_host_roi(RppiROI roiPoints, rpp::Handle& handle)
@@ -155,7 +183,7 @@ inline void copy_roi(RppiROI roiPoints, rpp::Handle& handle)
 {
     for(int i = 0; i < handle.GetBatchSize(); i++)
     {
-        #if defined(OCL_COMPILE) || defined (HIP_COMPILE)
+#if defined(OCL_COMPILE) || defined (HIP_COMPILE)
         {
             if(roiPoints.roiHeight == 0 && roiPoints.roiWidth == 0)
             {
@@ -168,24 +196,23 @@ inline void copy_roi(RppiROI roiPoints, rpp::Handle& handle)
                 handle.GetInitHandle()->mem.mgpu.croiPoints.roiWidth[i] = roiPoints.roiWidth + roiPoints.x;
             }
         }
-        #else
+#else
         {
             handle.GetInitHandle()->mem.mgpu.croiPoints.roiHeight[i] = roiPoints.roiHeight;
             handle.GetInitHandle()->mem.mgpu.croiPoints.roiWidth[i] = roiPoints.roiWidth;
         }
-        #endif
+#endif
         handle.GetInitHandle()->mem.mgpu.croiPoints.x[i] = roiPoints.x;
         handle.GetInitHandle()->mem.mgpu.croiPoints.y[i] = roiPoints.y;
     }
-    #ifdef OCL_COMPILE
+#ifdef OCL_COMPILE
     {
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.roiPoints.roiHeight, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.croiPoints.roiHeight, 0, NULL, NULL);
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.roiPoints.roiWidth, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.croiPoints.roiWidth, 0, NULL, NULL);
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.roiPoints.x, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.croiPoints.x, 0, NULL, NULL);
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.roiPoints.y, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.croiPoints.y, 0, NULL, NULL);
     }
-    // for hip
-    #elif defined(HIP_COMPILE)
+#elif defined(HIP_COMPILE)
     {
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.roiPoints.roiHeight, handle.GetInitHandle()->mem.mgpu.croiPoints.roiHeight, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.roiPoints.roiWidth, handle.GetInitHandle()->mem.mgpu.croiPoints.roiWidth, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
@@ -193,14 +220,14 @@ inline void copy_roi(RppiROI roiPoints, rpp::Handle& handle)
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.roiPoints.y, handle.GetInitHandle()->mem.mgpu.croiPoints.y, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
 
     }
-    #endif
+#endif
 }
 
 inline void copy_roi(RppiROI *roiPoints, rpp::Handle& handle)
 {
     for(int i = 0; i < handle.GetBatchSize(); i++)
     {
-        #if defined(OCL_COMPILE) || defined (HIP_COMPILE)
+#if defined(OCL_COMPILE) || defined (HIP_COMPILE)
         {
             if(roiPoints[i].roiHeight == 0 && roiPoints[i].roiWidth == 0)
             {
@@ -213,31 +240,30 @@ inline void copy_roi(RppiROI *roiPoints, rpp::Handle& handle)
                 handle.GetInitHandle()->mem.mgpu.croiPoints.roiWidth[i] = roiPoints[i].roiWidth + roiPoints[i].x;
             }
         }
-        #else
+#else
         {
             handle.GetInitHandle()->mem.mgpu.croiPoints.roiHeight[i] = roiPoints[i].roiHeight;
             handle.GetInitHandle()->mem.mgpu.croiPoints.roiWidth[i] = roiPoints[i].roiWidth;
         }
-        #endif
+#endif
         handle.GetInitHandle()->mem.mgpu.croiPoints.x[i] = roiPoints[i].x;
         handle.GetInitHandle()->mem.mgpu.croiPoints.y[i] = roiPoints[i].y;
     }
-    #ifdef OCL_COMPILE
+#ifdef OCL_COMPILE
     {
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.roiPoints.roiHeight, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.croiPoints.roiHeight, 0, NULL, NULL);
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.roiPoints.roiWidth, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.croiPoints.roiWidth, 0, NULL, NULL);
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.roiPoints.x, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.croiPoints.x, 0, NULL, NULL);
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.roiPoints.y, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.croiPoints.y, 0, NULL, NULL);
     }
-    // for hip
-    #elif defined(HIP_COMPILE)
+#elif defined(HIP_COMPILE)
     {
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.roiPoints.roiHeight, handle.GetInitHandle()->mem.mgpu.croiPoints.roiHeight, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.roiPoints.roiWidth, handle.GetInitHandle()->mem.mgpu.croiPoints.roiWidth, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.roiPoints.x, handle.GetInitHandle()->mem.mgpu.croiPoints.x, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.roiPoints.y, handle.GetInitHandle()->mem.mgpu.croiPoints.y, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
     }
-    #endif
+#endif
 }
 
 inline void copy_param_float(float param, rpp::Handle& handle, Rpp32u paramIndex)
@@ -246,16 +272,15 @@ inline void copy_param_float(float param, rpp::Handle& handle, Rpp32u paramIndex
     {
         handle.GetInitHandle()->mem.mcpu.floatArr[paramIndex].floatmem[i] = param;
     }
-    #ifdef OCL_COMPILE
+#ifdef OCL_COMPILE
     {
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.floatArr[paramIndex].floatmem, CL_FALSE, 0, sizeof(Rpp32f) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mcpu.floatArr[paramIndex].floatmem, 0, NULL, NULL);
     }
-    // for hip
-    #elif defined(HIP_COMPILE)
+#elif defined(HIP_COMPILE)
     {
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.floatArr[paramIndex].floatmem, handle.GetInitHandle()->mem.mcpu.floatArr[paramIndex].floatmem, sizeof(Rpp32f) * handle.GetBatchSize(), hipMemcpyHostToDevice);
     }
-    #endif
+#endif
 }
 
 inline void copy_param_float(float *param, rpp::Handle& handle, Rpp32u paramIndex)
@@ -264,16 +289,15 @@ inline void copy_param_float(float *param, rpp::Handle& handle, Rpp32u paramInde
     {
         handle.GetInitHandle()->mem.mcpu.floatArr[paramIndex].floatmem[i] = param[i];
     }
-    #ifdef OCL_COMPILE
+#ifdef OCL_COMPILE
     {
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.floatArr[paramIndex].floatmem, CL_FALSE, 0, sizeof(Rpp32f) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mcpu.floatArr[paramIndex].floatmem, 0, NULL, NULL);
     }
-    // for hip
-    #elif defined(HIP_COMPILE)
+#elif defined(HIP_COMPILE)
     {
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.floatArr[paramIndex].floatmem, handle.GetInitHandle()->mem.mcpu.floatArr[paramIndex].floatmem, sizeof(Rpp32f) * handle.GetBatchSize(), hipMemcpyHostToDevice);
     }
-    #endif
+#endif
 }
 
 inline void copy_param_uint(uint param, rpp::Handle& handle, Rpp32u paramIndex)
@@ -282,16 +306,15 @@ inline void copy_param_uint(uint param, rpp::Handle& handle, Rpp32u paramIndex)
     {
         handle.GetInitHandle()->mem.mcpu.uintArr[paramIndex].uintmem[i] = param;
     }
-    #ifdef OCL_COMPILE
+#ifdef OCL_COMPILE
     {
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.uintArr[paramIndex].uintmem, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mcpu.uintArr[paramIndex].uintmem, 0, NULL, NULL);
     }
-    // for hip
-    #elif defined(HIP_COMPILE)
+#elif defined(HIP_COMPILE)
     {
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.uintArr[paramIndex].uintmem, handle.GetInitHandle()->mem.mcpu.uintArr[paramIndex].uintmem, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
     }
-    #endif
+#endif
 }
 
 inline void copy_param_uint(uint *param, rpp::Handle& handle, Rpp32u paramIndex)
@@ -300,16 +323,15 @@ inline void copy_param_uint(uint *param, rpp::Handle& handle, Rpp32u paramIndex)
     {
         handle.GetInitHandle()->mem.mcpu.uintArr[paramIndex].uintmem[i] = param[i];
     }
-    #ifdef OCL_COMPILE
+#ifdef OCL_COMPILE
     {
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.uintArr[paramIndex].uintmem, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mcpu.uintArr[paramIndex].uintmem, 0, NULL, NULL);
     }
-    // for hip
-    #elif defined(HIP_COMPILE)
+#elif defined(HIP_COMPILE)
     {
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.uintArr[paramIndex].uintmem, handle.GetInitHandle()->mem.mcpu.uintArr[paramIndex].uintmem, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
     }
-    #endif
+#endif
 }
 
 inline void copy_param_int(int param, rpp::Handle& handle, Rpp32u paramIndex)
@@ -318,16 +340,15 @@ inline void copy_param_int(int param, rpp::Handle& handle, Rpp32u paramIndex)
     {
         handle.GetInitHandle()->mem.mcpu.intArr[paramIndex].intmem[i] = param;
     }
-    #ifdef OCL_COMPILE
+#ifdef OCL_COMPILE
     {
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.intArr[paramIndex].intmem, CL_FALSE, 0, sizeof(Rpp32s) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mcpu.intArr[paramIndex].intmem, 0, NULL, NULL);
     }
-    // for hip
-    #elif defined(HIP_COMPILE)
+#elif defined(HIP_COMPILE)
     {
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.intArr[paramIndex].intmem, handle.GetInitHandle()->mem.mcpu.intArr[paramIndex].intmem, sizeof(Rpp32s) * handle.GetBatchSize(), hipMemcpyHostToDevice);
     }
-    #endif
+#endif
 }
 
 inline void copy_param_int(int *param, rpp::Handle& handle, Rpp32u paramIndex)
@@ -336,16 +357,15 @@ inline void copy_param_int(int *param, rpp::Handle& handle, Rpp32u paramIndex)
     {
         handle.GetInitHandle()->mem.mcpu.intArr[paramIndex].intmem[i] = param[i];
     }
-    #ifdef OCL_COMPILE
+#ifdef OCL_COMPILE
     {
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.intArr[paramIndex].intmem, CL_FALSE, 0, sizeof(Rpp32s) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mcpu.intArr[paramIndex].intmem, 0, NULL, NULL);
     }
-    // for hip
-    #elif defined(HIP_COMPILE)
+#elif defined(HIP_COMPILE)
     {
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.intArr[paramIndex].intmem, handle.GetInitHandle()->mem.mcpu.intArr[paramIndex].intmem, sizeof(Rpp32s) * handle.GetBatchSize(), hipMemcpyHostToDevice);
     }
-    #endif
+#endif
 }
 
 inline void copy_param_uchar(Rpp8u param, rpp::Handle& handle, Rpp32u paramIndex)
@@ -354,16 +374,15 @@ inline void copy_param_uchar(Rpp8u param, rpp::Handle& handle, Rpp32u paramIndex
     {
         handle.GetInitHandle()->mem.mcpu.ucharArr[paramIndex].ucharmem[i] = param;
     }
-    #ifdef OCL_COMPILE
+#ifdef OCL_COMPILE
     {
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.ucharArr[paramIndex].ucharmem, CL_FALSE, 0, sizeof(Rpp8u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mcpu.ucharArr[paramIndex].ucharmem, 0, NULL, NULL);
     }
-    // for hip
-    #elif defined(HIP_COMPILE)
+#elif defined(HIP_COMPILE)
     {
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.ucharArr[paramIndex].ucharmem, handle.GetInitHandle()->mem.mcpu.ucharArr[paramIndex].ucharmem, sizeof(Rpp8u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
     }
-    #endif
+#endif
 }
 
 inline void copy_param_uchar(Rpp8u *param, rpp::Handle& handle, Rpp32u paramIndex)
@@ -372,16 +391,15 @@ inline void copy_param_uchar(Rpp8u *param, rpp::Handle& handle, Rpp32u paramInde
     {
         handle.GetInitHandle()->mem.mcpu.ucharArr[paramIndex].ucharmem[i] = param[i];
     }
-    #ifdef OCL_COMPILE
+#ifdef OCL_COMPILE
     {
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.ucharArr[paramIndex].ucharmem, CL_FALSE, 0, sizeof(Rpp8u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mcpu.ucharArr[paramIndex].ucharmem, 0, NULL, NULL);
     }
-    // for hip
-    #elif defined(HIP_COMPILE)
+#elif defined(HIP_COMPILE)
     {
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.ucharArr[paramIndex].ucharmem, handle.GetInitHandle()->mem.mcpu.ucharArr[paramIndex].ucharmem, sizeof(Rpp8u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
     }
-    #endif
+#endif
 }
 
 inline void copy_param_char(char param, rpp::Handle& handle, Rpp32u paramIndex)
@@ -390,16 +408,15 @@ inline void copy_param_char(char param, rpp::Handle& handle, Rpp32u paramIndex)
     {
         handle.GetInitHandle()->mem.mcpu.charArr[paramIndex].charmem[i] = param;
     }
-    #ifdef OCL_COMPILE
+#ifdef OCL_COMPILE
     {
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.charArr[paramIndex].charmem, CL_FALSE, 0, sizeof(Rpp8s) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mcpu.charArr[paramIndex].charmem, 0, NULL, NULL);
     }
-    // for hip
-    #elif defined(HIP_COMPILE)
+#elif defined(HIP_COMPILE)
     {
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.charArr[paramIndex].charmem, handle.GetInitHandle()->mem.mcpu.charArr[paramIndex].charmem, sizeof(Rpp8s) * handle.GetBatchSize(), hipMemcpyHostToDevice);
     }
-    #endif
+#endif
 }
 
 inline void copy_param_char(char *param, rpp::Handle& handle, Rpp32u paramIndex)
@@ -408,19 +425,15 @@ inline void copy_param_char(char *param, rpp::Handle& handle, Rpp32u paramIndex)
     {
         handle.GetInitHandle()->mem.mcpu.charArr[paramIndex].charmem[i] = param[i];
     }
-    #ifdef OCL_COMPILE
+#ifdef OCL_COMPILE
     {
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.charArr[paramIndex].charmem, CL_FALSE, 0, sizeof(Rpp8s) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mcpu.charArr[paramIndex].charmem, 0, NULL, NULL);
     }
-    // for hip
-    #elif defined(HIP_COMPILE)
+#elif defined(HIP_COMPILE)
     {
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.charArr[paramIndex].charmem, handle.GetInitHandle()->mem.mcpu.charArr[paramIndex].charmem, sizeof(Rpp8s) * handle.GetBatchSize(), hipMemcpyHostToDevice);
     }
-    #endif
-}
-inline void copy_host_srcMaxSize(rpp::Handle& handle){
-
+#endif
 }
 
 inline void copy_srcMaxSize(rpp::Handle& handle)
@@ -430,20 +443,17 @@ inline void copy_srcMaxSize(rpp::Handle& handle)
         handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.height[i] = handle.GetInitHandle()->mem.mgpu.csrcSize.height[i];
         handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.width[i] = handle.GetInitHandle()->mem.mgpu.csrcSize.width[i];
     }
-    // memcpy(handle.GetInitHandle()->mem.mcpu.maxSrcSize.height, handle.GetInitHandle()->mem.mcpu.srcSize.height, sizeof(Rpp32u) * handle.GetBatchSize());
-    // memcpy(handle.GetInitHandle()->mem.mcpu.maxSrcSize.width, handle.GetInitHandle()->mem.mcpu.srcSize.width, sizeof(Rpp32u) * handle.GetBatchSize());
-    #ifdef OCL_COMPILE
+#ifdef OCL_COMPILE
     {
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.maxSrcSize.height, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.height, 0, NULL, NULL);
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.maxSrcSize.width, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.width, 0, NULL, NULL);
     }
-    // for hip
-    #elif defined(HIP_COMPILE)
+#elif defined(HIP_COMPILE)
     {
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.maxSrcSize.height, handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.height, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.maxSrcSize.width, handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.width, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
     }
-    #endif
+#endif
 }
 
 inline void copy_dstMaxSize(rpp::Handle& handle)
@@ -453,23 +463,19 @@ inline void copy_dstMaxSize(rpp::Handle& handle)
         handle.GetInitHandle()->mem.mgpu.cmaxDstSize.height[i] = handle.GetInitHandle()->mem.mgpu.cdstSize.height[i];
         handle.GetInitHandle()->mem.mgpu.cmaxDstSize.width[i] = handle.GetInitHandle()->mem.mgpu.cdstSize.width[i];
     }
-    // memcpy(handle.GetInitHandle()->mem.mcpu.maxSrcSize.height, handle.GetInitHandle()->mem.mcpu.srcSize.height, sizeof(Rpp32u) * handle.GetBatchSize());
-    // memcpy(handle.GetInitHandle()->mem.mcpu.maxSrcSize.width, handle.GetInitHandle()->mem.mcpu.srcSize.width, sizeof(Rpp32u) * handle.GetBatchSize());
-    #ifdef OCL_COMPILE
+#ifdef OCL_COMPILE
     {
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.maxDstSize.height, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.cmaxDstSize.height, 0, NULL, NULL);
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.maxDstSize.width, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.cmaxDstSize.width, 0, NULL, NULL);
     }
-    // for hip
-    #elif defined(HIP_COMPILE)
+#elif defined(HIP_COMPILE)
     {
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.maxDstSize.height, handle.GetInitHandle()->mem.mgpu.cmaxDstSize.height, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.maxDstSize.width, handle.GetInitHandle()->mem.mgpu.cmaxDstSize.width, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
     }
-    #endif
+#endif
 }
 
-
 inline void copy_srcMaxSize(RppiSize maxSrcSize, rpp::Handle& handle)
 {
     for(int i = 0; i < handle.GetBatchSize(); i++)
@@ -477,18 +483,17 @@ inline void copy_srcMaxSize(RppiSize maxSrcSize, rpp::Handle& handle)
         handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.height[i] = maxSrcSize.height;
         handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.width[i] = maxSrcSize.width;
     }
-    #ifdef OCL_COMPILE
+#ifdef OCL_COMPILE
     {
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.maxSrcSize.height, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.height, 0, NULL, NULL);
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.maxSrcSize.width, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.width, 0, NULL, NULL);
     }
-    // for hip
-    #elif defined(HIP_COMPILE)
+#elif defined(HIP_COMPILE)
     {
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.maxSrcSize.height, handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.height, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.maxSrcSize.width, handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.width, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
     }
-    #endif
+#endif
 }
 
 inline void copy_dstMaxSize(RppiSize maxDstSize, rpp::Handle& handle)
@@ -498,28 +503,26 @@ inline void copy_dstMaxSize(RppiSize maxDstSize, rpp::Handle& handle)
         handle.GetInitHandle()->mem.mgpu.cmaxDstSize.height[i] = maxDstSize.height;
         handle.GetInitHandle()->mem.mgpu.cmaxDstSize.width[i] = maxDstSize.width;
     }
-    #ifdef OCL_COMPILE
+#ifdef OCL_COMPILE
     {
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.maxDstSize.height, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.cmaxDstSize.height, 0, NULL, NULL);
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.maxDstSize.width, CL_FALSE, 0, sizeof(Rpp32u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mgpu.cmaxDstSize.width, 0, NULL, NULL);
     }
-    // for hip
-    #elif defined(HIP_COMPILE)
+#elif defined(HIP_COMPILE)
     {
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.maxDstSize.height, handle.GetInitHandle()->mem.mgpu.cmaxDstSize.height, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
         hipMemcpy(handle.GetInitHandle()->mem.mgpu.maxDstSize.width, handle.GetInitHandle()->mem.mgpu.cmaxDstSize.width, sizeof(Rpp32u) * handle.GetBatchSize(), hipMemcpyHostToDevice);
     }
-    #endif
+#endif
 }
 
-
 inline void get_srcBatchIndex(rpp::Handle& handle, unsigned int channel, RppiChnFormat chnFormat, bool is_padded = true)
 {
     int i;
     handle.GetInitHandle()->mem.mcpu.srcBatchIndex[0] = 0;
     for(i =0; i < handle.GetBatchSize() - 1 ; i++)
     {
-       handle.GetInitHandle()->mem.mcpu.srcBatchIndex[i+1] = handle.GetInitHandle()->mem.mcpu.srcBatchIndex[i] + handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.height[i] * handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.width[i] * channel;
+        handle.GetInitHandle()->mem.mcpu.srcBatchIndex[i+1] = handle.GetInitHandle()->mem.mcpu.srcBatchIndex[i] + handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.height[i] * handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.width[i] * channel;
     }
     for(i =0; i < handle.GetBatchSize() ; i++)
     {
@@ -535,7 +538,6 @@ inline void get_srcBatchIndex(rpp::Handle& handle, unsigned int channel, RppiChn
                 handle.GetInitHandle()->mem.mcpu.inc[i] = handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.height[i] * handle.GetInitHandle()->mem.mgpu.cmaxSrcSize.width[i];
         }
     }
-
 #ifdef OCL_COMPILE
     {
 
@@ -572,7 +574,6 @@ inline void get_dstBatchIndex(rpp::Handle& handle, unsigned int channel, RppiChn
                 handle.GetInitHandle()->mem.mcpu.dstInc[i] = handle.GetInitHandle()->mem.mgpu.cmaxDstSize.height[i] * handle.GetInitHandle()->mem.mgpu.cmaxDstSize.width[i];
         }
     }
-
 #ifdef OCL_COMPILE
     {
         clEnqueueWriteBuffer(handle.GetStream(), handle.GetInitHandle()->mem.mgpu.dstBatchIndex, CL_FALSE, 0, sizeof(Rpp64u) * handle.GetBatchSize(), handle.GetInitHandle()->mem.mcpu.dstBatchIndex, 0, NULL, NULL);
@@ -586,7 +587,6 @@ inline void get_dstBatchIndex(rpp::Handle& handle, unsigned int channel, RppiChn
 #endif
 }
 
-
 template <typename T>
 inline void copy_luptr(Rpp8u *luptr,Rpp8u * batch_luptr,Rpp32u nbatchSize, int channel)
 {
@@ -601,7 +601,6 @@ inline void copy_luptr(Rpp8u *luptr,Rpp8u * batch_luptr,Rpp32u nbatchSize, int c
     }
 }
 
-
 template <typename T>
 inline void copy_kernel(Rpp32f *kernel,Rpp32f * batch_kernel, Rpp32u nbatchSize, unsigned int size)
 {
@@ -616,167 +615,165 @@ inline void copy_kernel(Rpp32f *kernel,Rpp32f * batch_kernel, Rpp32u nbatchSize,
     }
 }
 
-
-inline void validate_image_size(RppiSize imgSize){
-    if(!(imgSize.width >= 0) || !(imgSize.height >= 0)){
-       // std::cerr<<"\nImage width and height should be positive "<<std::endl;
+inline void validate_image_size(RppiSize imgSize) // Terminates the process when width or height fails the >= 0 check.
+{
+    if(!(imgSize.width >= 0) || !(imgSize.height >= 0)) // NOTE(review): can never fire if the RppiSize fields are unsigned - confirm their type
+    {
-        exit(0);
+        exit(EXIT_FAILURE); // a rejected size must not end the process with the success status 0
     }
 }
 
-inline void validate_float_range(Rpp32f min, Rpp32f max, Rpp32f *value) {
-    if( !(*value <= max) || !(*value >= min)){
-        //std::cerr<<"\nOut of bounds: "<<*value<<std::endl;
-        //std::cerr<<"\nValue should be between "<<min<<" and "<<max<<std::endl;
+inline void validate_float_range(Rpp32f min, Rpp32f max, Rpp32f *value) // Overwrites *value with max whenever it is not within [min, max].
+{
+    if(!(*value <= max) || !(*value >= min)) // values below min are also replaced by max (not min); the negated form is true for NaN as well
+    {
         *value = max;
-        //std::cerr<<"\nSetting the value to "<<max<<std::endl;
     }
 }
 
-inline void validate_double_range(Rpp64f min, Rpp64f max, Rpp64f *value) {
-    if( !(*value <= max) || !(*value >= min)){
-        //std::cerr<<"\nOut of bounds: "<<*value<<std::endl;
-        //std::cerr<<"\nValue should be between "<<min<<" and "<<max<<std::endl;
+inline void validate_double_range(Rpp64f min, Rpp64f max, Rpp64f *value) // Overwrites *value with max whenever it is not within [min, max].
+{
+    if(!(*value <= max) || !(*value >= min)) // values below min are also replaced by max (not min); the negated form is true for NaN as well
+    {
         *value = max;
-        //std::cerr<<"\nSetting the value to "<<max<<std::endl;
     }
 }
 
-inline void validate_int_range(Rpp32s min, Rpp32s max, Rpp32s *value) {
-    if( !(*value <= max) || !(*value >= min)){
-        //std::cerr<<"\nOut of bounds: "<<*value<<std::endl;
-        //std::cerr<<"\nValue should be between "<<min<<" and "<<max<<std::endl;
+inline void validate_int_range(Rpp32s min, Rpp32s max, Rpp32s *value) // Overwrites *value with max whenever it is not within [min, max].
+{
+    if(!(*value <= max) || !(*value >= min)) // note: values below min are also replaced by max, not by min
+    {
         *value = max;
-        //std::cerr<<"\nSetting the value to "<<max<<std::endl;
     }
 }
-inline void validate_unsigned_int_range(Rpp32u min, Rpp32u max, Rpp32u *value) {
-    if( !(*value <= max) || !(*value >= min)){
-        //std::cerr<<"\nOut of bounds: "<<*value<<std::endl;
-        //std::cerr<<"\nValue should be between "<<min<<" and "<<max<<std::endl;
+
+inline void validate_unsigned_int_range(Rpp32u min, Rpp32u max, Rpp32u *value) // Overwrites *value with max whenever it is not within [min, max].
+{
+    if(!(*value <= max) || !(*value >= min)) // note: values below min are also replaced by max, not by min
+    {
         *value = max;
-        //std::cerr<<"\nSetting the value to "<<max<<std::endl;
     }
 }
 
-inline void validate_int_max(Rpp32s max, Rpp32s *value) {
-    if( !(*value <= max) ){
-       //std::cerr<<"\nOut of bounds: "<<*value<<std::endl;
-       //std::cerr<<"\nValue should be less than "<<max<<std::endl;
+inline void validate_int_max(Rpp32s max, Rpp32s *value) // Caps *value at max; values already <= max are left unchanged.
+{
+    if(!(*value <= max)) // i.e. *value is greater than max
+    {
        *value = max;
-       //std::cerr<<"\nSetting the value to "<<max<<std::endl;
     }
 }
 
-inline void validate_unsigned_int_max(Rpp32u max, Rpp32u *value) {
-    if( !(*value <= max) ){
-       //std::cerr<<"\nOut of bounds: "<<*value<<std::endl;
-       //std::cerr<<"\nValue should be less than "<<max<<std::endl;
+inline void validate_unsigned_int_max(Rpp32u max, Rpp32u *value) // Caps *value at max; values already <= max are left unchanged.
+{
+    if(!(*value <= max)) // i.e. *value is greater than max
+    {
        *value = max;
-       //std::cerr<<"\nSetting the value to "<<max<<std::endl;
     }
 }
 
-inline void validate_int_min(Rpp32s min, Rpp32s *value) {
-    if( !(*value >= min) ){
-       //std::cerr<<"\nOut of bounds: "<<*value<<std::endl;
-       //std::cerr<<"\nValue should be greater than "<<min<<std::endl;
+inline void validate_int_min(Rpp32s min, Rpp32s *value) // Raises *value to min; values already >= min are left unchanged.
+{
+    if(!(*value >= min)) // i.e. *value is less than min
+    {
        *value = min;
-       //std::cerr<<"\nSetting the value to "<<min<<std::endl;
     }
 }
-inline void validate_unsigned_int_min(Rpp32u min, Rpp32u *value) {
-    if( !(*value >= min) ){
-       //std::cerr<<"\nOut of bounds: "<<*value<<std::endl;
-       //std::cerr<<"\nValue should be greater than "<<min<<std::endl;
+
+inline void validate_unsigned_int_min(Rpp32u min, Rpp32u *value) // Raises *value to min; values already >= min are left unchanged.
+{
+    if(!(*value >= min)) // i.e. *value is less than min
+    {
        *value = min;
-       //std::cerr<<"\nSetting the value to "<<min<<std::endl;
     }
 }
-inline void validate_float_max(Rpp32f max, Rpp32f *value) {
-    if( !(*value <= max) ){
-       //std::cerr<<"\nOut of bounds: "<<*value<<std::endl;
-       //std::cerr<<"\nValue should be less than "<<max<<std::endl;
+
+inline void validate_float_max(Rpp32f max, Rpp32f *value) // Caps *value at max; values already <= max are left unchanged.
+{
+    if(!(*value <= max)) // the negated comparison is also true for NaN, so NaN is replaced by max
+    {
        *value = max;
-       //std::cerr<<"\nSetting the value to "<<max<<std::endl;
     }
 }
 
-inline void validate_float_min(Rpp32f min, Rpp32f *value) {
-    if( !(*value >= min) ){
-       //std::cerr<<"\nOut of bounds: "<<*value<<std::endl;
-       //std::cerr<<"\nValue should be greater than "<<min<<std::endl;
+inline void validate_float_min(Rpp32f min, Rpp32f *value) // Raises *value to min; values already >= min are left unchanged.
+{
+    if(!(*value >= min)) // the negated comparison is also true for NaN, so NaN is replaced by min
+    {
        *value = min;
-       //std::cerr<<"\nSetting the value to "<<min<<std::endl;
     }
 }
 
-inline void validate_affine_matrix(Rpp32f* affine){
-    if((affine[0] * affine[4] - affine[1] * affine[3]) == 0){
-        //std::cerr<<"\n Affine matrix is not valid--\n Identity matrix is considered instead"
-                                   // << std::endl;
+inline void validate_affine_matrix(Rpp32f* affine) // Resets elements 0, 1, 3, 4 to 1, 0, 0, 1 when the test below is exactly zero; other elements are untouched.
+{
+    if((affine[0] * affine[4] - affine[1] * affine[3]) == 0) // presumably the determinant of the 2x2 part of a row-major 2x3 matrix - confirm layout
+    {
         affine[0] = 1;
         affine[1] = 0;
         affine[3] = 0;
         affine[4] = 1;
     }
 }
-inline void brightness_validate(RppiSize srcSize ,Rpp32f alpha ,Rpp32f beta )
+
+inline void brightness_validate(RppiSize srcSize, Rpp32f alpha, Rpp32f beta) // Checks srcSize, then range-checks alpha against [0, 2] and beta against [0, 255].
 {
-	validate_image_size(srcSize);
-	validate_float_range(0, 2, &alpha);
-	validate_float_range(0, 255, &beta);
+    validate_image_size(srcSize);
+    validate_float_range(0, 2, &alpha);     // alpha is a by-value parameter: any correction only changes this local copy
+    validate_float_range(0, 255, &beta);    // same for beta - the caller's value is never modified
 }
-inline void brightness_validate(RppiSize srcSize ,Rpp32f alpha ,Rpp32f beta ,Rpp32u nbatchSize)
+
+inline void brightness_validate(RppiSize srcSize, Rpp32f alpha, Rpp32f beta, Rpp32u nbatchSize) // Same checks as the three-argument overload; nbatchSize is not used.
 {
     validate_image_size(srcSize);
-	validate_float_range(0, 2, &alpha);
-	validate_float_range(0, 255, &beta);
+    validate_float_range(0, 2, &alpha);     // alpha is a by-value parameter: any correction only changes this local copy
+    validate_float_range(0, 255, &beta);    // same for beta - the caller's value is never modified
 }
 
-inline void brightness_validate(RppiSize *srcSize ,Rpp32f alpha ,Rpp32f beta ,Rpp32u nbatchSize)
+inline void brightness_validate(RppiSize *srcSize, Rpp32f alpha, Rpp32f beta, Rpp32u nbatchSize) // Checks srcSize[0..nbatchSize-1], then range-checks alpha ([0, 2]) and beta ([0, 255]).
 {
-	for(int i = 0; i < nbatchSize; i++)
-	{
- 	validate_image_size(srcSize[i]);
-	 }
-	validate_float_range(0, 2, &alpha);
-	validate_float_range(0, 255, &beta);
+    for(Rpp32u i = 0; i < nbatchSize; i++) // index type matches nbatchSize, avoiding a signed/unsigned comparison
+    {
+        validate_image_size(srcSize[i]);
+    }
+    validate_float_range(0, 2, &alpha);     // alpha is a by-value parameter: any correction only changes this local copy
+    validate_float_range(0, 255, &beta);    // same for beta - the caller's value is never modified
 }
 
-inline void brightness_validate(RppiSize srcSize ,Rpp32f *alpha ,Rpp32f *beta ,Rpp32u nbatchSize)
+inline void brightness_validate(RppiSize srcSize, Rpp32f *alpha, Rpp32f *beta, Rpp32u nbatchSize) // Checks srcSize once, then range-checks alpha[i] ([0, 2]) and beta[i] ([0, 255]) in place for each i < nbatchSize.
 {
     validate_image_size(srcSize);
-	for(int i = 0; i < nbatchSize; i++)
-	{
+    for(Rpp32u i = 0; i < nbatchSize; i++) // index type matches nbatchSize, avoiding a signed/unsigned comparison
+    {
         validate_float_range(0, 2, &alpha[i]);
         validate_float_range(0, 255, &beta[i]);
     }
 }
-inline void brightness_validate(RppiSize *srcSize ,Rpp32f *alpha ,Rpp32f *beta ,Rpp32u nbatchSize)
+
+inline void brightness_validate(RppiSize *srcSize, Rpp32f *alpha, Rpp32f *beta, Rpp32u nbatchSize) // For each i < nbatchSize: checks srcSize[i] and range-checks alpha[i] ([0, 2]) and beta[i] ([0, 255]) in place.
 {
-	for(int i = 0; i < nbatchSize; i++)
-	{
+    for(Rpp32u i = 0; i < nbatchSize; i++) // index type matches nbatchSize, avoiding a signed/unsigned comparison
+    {
         validate_image_size(srcSize[i]);
         validate_float_range(0, 2, &alpha[i]);
         validate_float_range(0, 255, &beta[i]);
-	 }
+    }
 }
-inline void histogram_equalize_validate(RppiSize srcSize )
+
+inline void histogram_equalize_validate(RppiSize srcSize) // The only check performed is validate_image_size on srcSize.
 {
-	validate_image_size(srcSize);
+    validate_image_size(srcSize);
 }
 
-inline void histogram_equalize_validate(RppiSize *srcSize ,Rpp32u nbatchSize)
+inline void histogram_equalize_validate(RppiSize *srcSize, Rpp32u nbatchSize) // Runs validate_image_size on srcSize[0..nbatchSize-1].
 {
-	for(int i = 0; i < nbatchSize; i++)
-	{
- 	validate_image_size(srcSize[i]);
-	 }
+    for(Rpp32u i = 0; i < nbatchSize; i++) // index type matches nbatchSize, avoiding a signed/unsigned comparison
+    {
+        validate_image_size(srcSize[i]);
+    }
 }
-inline void histogram_equalize_validate(RppiSize srcSize ,Rpp32u nbatchSize)
+
+inline void histogram_equalize_validate(RppiSize srcSize, Rpp32u nbatchSize) // Checks the single srcSize via validate_image_size; nbatchSize is not used.
 {
- 	validate_image_size(srcSize);
+    validate_image_size(srcSize);
 }
 
 #endif
diff --git a/src/modules/rppt_tensor_augmentations.cpp b/src/modules/rppt_tensor_augmentations.cpp
new file mode 100644
index 000000000..ed80b8ec0
--- /dev/null
+++ b/src/modules/rppt_tensor_augmentations.cpp
@@ -0,0 +1,182 @@
+/*
+Copyright (c) 2019 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <rppt_tensor_augmentations.h>
+#include <rppdefs.h>
+#include "rppi_validate.hpp"
+
+#ifdef HIP_COMPILE
+    #include "hip/hip_tensor_augmentations.hpp"
+#elif defined(OCL_COMPILE)
+    #include <cl/rpp_cl_common.hpp>
+    #include "cl/cl_declarations.hpp"
+#endif //backend
+
+#include <stdio.h>
+#include <iostream>
+#include <fstream>
+#include <chrono>
+using namespace std::chrono;
+
+#include "cpu/host_tensor_augmentations.hpp"
+
+// Brightness augmentation for a tensor batch on the GPU backend.
+// When the HIP_COMPILE branch is built, alphaTensor and betaTensor are handed to
+// copy_param_float with parameter indices 0 and 1, then brightness_hip_tensor is
+// invoked for the matching src/dst data type pair (U8, F16, F32 or I8; src == dst).
+// NOTE(review): any other data type pair, and any build that does not take the
+// HIP_COMPILE branch, does no processing yet still returns RPP_SUCCESS - confirm.
+RppStatus
+rppt_brightness_gpu(RppPtr_t srcPtr,
+                    RpptDescPtr srcDescPtr,
+                    RppPtr_t dstPtr,
+                    RpptDescPtr dstDescPtr,
+                    Rpp32f *alphaTensor,
+                    Rpp32f *betaTensor,
+                    RpptROIPtr roiTensorPtrSrc,
+                    RpptRoiType roiType,
+                    rppHandle_t rppHandle)
+{
+#ifdef OCL_COMPILE
+
+#elif defined (HIP_COMPILE)
+
+    // alpha goes to parameter index 0, beta to parameter index 1.
+    Rpp32u paramSlot = 0;
+    copy_param_float(alphaTensor, rpp::deref(rppHandle), paramSlot++);
+    copy_param_float(betaTensor, rpp::deref(rppHandle), paramSlot++);
+
+    // Dispatch on the element type; only same-type src/dst pairs are handled.
+    // The descriptor offsets are applied in elements of the cast pointer type.
+    const auto srcType = srcDescPtr->dataType;
+    const auto dstType = dstDescPtr->dataType;
+
+    if ((srcType == RpptDataType::U8) && (dstType == RpptDataType::U8))
+    {
+        brightness_hip_tensor(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offset,
+                              srcDescPtr,
+                              static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offset,
+                              dstDescPtr,
+                              roiTensorPtrSrc,
+                              roiType,
+                              rpp::deref(rppHandle));
+    }
+    else if ((srcType == RpptDataType::F16) && (dstType == RpptDataType::F16))
+    {
+        brightness_hip_tensor(static_cast<half*>(srcPtr) + srcDescPtr->offset,
+                              srcDescPtr,
+                              static_cast<half*>(dstPtr) + dstDescPtr->offset,
+                              dstDescPtr,
+                              roiTensorPtrSrc,
+                              roiType,
+                              rpp::deref(rppHandle));
+    }
+    else if ((srcType == RpptDataType::F32) && (dstType == RpptDataType::F32))
+    {
+        brightness_hip_tensor(static_cast<Rpp32f*>(srcPtr) + srcDescPtr->offset,
+                              srcDescPtr,
+                              static_cast<Rpp32f*>(dstPtr) + dstDescPtr->offset,
+                              dstDescPtr,
+                              roiTensorPtrSrc,
+                              roiType,
+                              rpp::deref(rppHandle));
+    }
+    else if ((srcType == RpptDataType::I8) && (dstType == RpptDataType::I8))
+    {
+        brightness_hip_tensor(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offset,
+                              srcDescPtr,
+                              static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offset,
+                              dstDescPtr,
+                              roiTensorPtrSrc,
+                              roiType,
+                              rpp::deref(rppHandle));
+    }
+
+#endif //BACKEND
+
+    return RPP_SUCCESS;
+}
+
+// Brightness augmentation for a tensor batch on the host (CPU) backend.
+// Layout parameters are derived from the source descriptor's layout and channel
+// count, then the call is forwarded to the brightness_*_host_tensor routine that
+// matches the src/dst data types (U8, F16, F32 or I8; src and dst must agree).
+// srcPtr/dstPtr are advanced by the descriptor offsets, counted in elements of the
+// selected data type, before being passed on. rppHandle is not used here.
+// NOTE(review): a data type pair outside the four above is skipped without any
+// processing and RPP_SUCCESS is still returned - confirm this is intended.
+RppStatus
+rppt_brightness_host(RppPtr_t srcPtr,
+                     RpptDescPtr srcDescPtr,
+                     RppPtr_t dstPtr,
+                     RpptDescPtr dstDescPtr,
+                     Rpp32f *alphaTensor,
+                     Rpp32f *betaTensor,
+                     RpptROIPtr roiTensorPtrSrc,
+                     RpptRoiType roiType,
+                     rppHandle_t rppHandle)
+{
+    // Derived from the source descriptor only; the destination layout is not consulted here.
+    RppLayoutParams layoutParams = get_layout_params(srcDescPtr->layout, srcDescPtr->c);
+    // Read both element types once for the dispatch below.
+    const auto srcType = srcDescPtr->dataType;
+    const auto dstType = dstDescPtr->dataType;
+
+    if ((srcType == RpptDataType::U8) && (dstType == RpptDataType::U8))
+    {
+        // U8 source to U8 destination
+        brightness_u8_u8_host_tensor(static_cast<Rpp8u*>(srcPtr) + srcDescPtr->offset, srcDescPtr,
+                                     static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offset, dstDescPtr,
+                                     alphaTensor, betaTensor,
+                                     roiTensorPtrSrc, roiType,
+                                     layoutParams);
+    }
+    else if ((srcType == RpptDataType::F16) && (dstType == RpptDataType::F16))
+    {
+        // F16 source to F16 destination
+        brightness_f16_f16_host_tensor(static_cast<Rpp16f*>(srcPtr) + srcDescPtr->offset, srcDescPtr,
+                                       static_cast<Rpp16f*>(dstPtr) + dstDescPtr->offset, dstDescPtr,
+                                       alphaTensor, betaTensor,
+                                       roiTensorPtrSrc, roiType,
+                                       layoutParams);
+    }
+    else if ((srcType == RpptDataType::F32) && (dstType == RpptDataType::F32))
+    {
+        // F32 source to F32 destination
+        brightness_f32_f32_host_tensor(static_cast<Rpp32f*>(srcPtr) + srcDescPtr->offset, srcDescPtr,
+                                       static_cast<Rpp32f*>(dstPtr) + dstDescPtr->offset, dstDescPtr,
+                                       alphaTensor, betaTensor,
+                                       roiTensorPtrSrc, roiType,
+                                       layoutParams);
+    }
+    else if ((srcType == RpptDataType::I8) && (dstType == RpptDataType::I8))
+    {
+        // I8 source to I8 destination
+        brightness_i8_i8_host_tensor(static_cast<Rpp8s*>(srcPtr) + srcDescPtr->offset, srcDescPtr,
+                                     static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offset, dstDescPtr,
+                                     alphaTensor, betaTensor,
+                                     roiTensorPtrSrc, roiType,
+                                     layoutParams);
+    }
+
+    return RPP_SUCCESS;
+}
diff --git a/utilities/rpp-performancetests/HIP_NEW/CMakeLists.txt b/utilities/rpp-performancetests/HIP_NEW/CMakeLists.txt
index bd5254b85..917c9be34 100644
--- a/utilities/rpp-performancetests/HIP_NEW/CMakeLists.txt
+++ b/utilities/rpp-performancetests/HIP_NEW/CMakeLists.txt
@@ -23,12 +23,18 @@ if (hip_FOUND)
     add_executable(BatchPD_hip_pkd3 BatchPD_hip_pkd3.cpp)
     add_executable(BatchPD_hip_pln1 BatchPD_hip_pln1.cpp)
     add_executable(BatchPD_hip_pln3 BatchPD_hip_pln3.cpp)
+    add_executable(Tensor_hip_pkd3 Tensor_hip_pkd3.cpp)
+    add_executable(Tensor_hip_pln1 Tensor_hip_pln1.cpp)
+    add_executable(Tensor_hip_pln3 Tensor_hip_pln3.cpp)
     # add_executable(Single_hip Single_hip.cpp)
     add_executable(uniqueFunctionalities_hip uniqueFunctionalities_hip.cpp)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -DHIP_COMPILE=1 -DRPP_BACKEND_HIP=1 -std=c++11")
     target_link_libraries(BatchPD_hip_pkd3 ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host)
     target_link_libraries(BatchPD_hip_pln1 ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host)
     target_link_libraries(BatchPD_hip_pln3 ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host)
+    target_link_libraries(Tensor_hip_pkd3 ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host)
+    target_link_libraries(Tensor_hip_pln1 ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host)
+    target_link_libraries(Tensor_hip_pln3 ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host)
     # target_link_libraries(Single_hip ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host)
     target_link_libraries(uniqueFunctionalities_hip ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host)
 endif()
\ No newline at end of file
diff --git a/utilities/rpp-performancetests/HIP_NEW/Tensor_hip_pkd3.cpp b/utilities/rpp-performancetests/HIP_NEW/Tensor_hip_pkd3.cpp
new file mode 100644
index 000000000..5742e925c
--- /dev/null
+++ b/utilities/rpp-performancetests/HIP_NEW/Tensor_hip_pkd3.cpp
@@ -0,0 +1,635 @@
+#include <stdio.h>
+#include <dirent.h>
+#include <string.h>
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/opencv.hpp>
+#include <iostream>
+#include "/opt/rocm/rpp/include/rpp.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <time.h>
+#include <omp.h>
+#include <hip/hip_fp16.h>
+#include <fstream>
+
+using namespace cv;
+using namespace std;
+
+#define RPPPIXELCHECK(pixel) (((pixel) < (Rpp32f)0) ? ((Rpp32f)0) : (((pixel) < (Rpp32f)255) ? (pixel) : ((Rpp32f)255))) // clamps to [0, 255]; the argument is evaluated more than once
+#define RPPMAX2(a,b) (((a) > (b)) ? (a) : (b)) // larger of a and b; arguments are evaluated more than once, so pass side-effect-free expressions
+#define RPPMIN2(a,b) (((a) < (b)) ? (a) : (b)) // smaller of a and b; arguments are evaluated more than once, so pass side-effect-free expressions
+
+int main(int argc, char **argv)
+{
+    // Handle inputs
+
+    const int MIN_ARG_COUNT = 7;
+
+    if (argc < MIN_ARG_COUNT)
+    {
+        printf("\nImproper Usage! Needs all arguments!\n");
+        printf("\nUsage: ./Tensor_hip_pkd3 <src1 folder> <src2 folder (place same as src1 folder for single image functionalities)> <u8 = 0 / f16 = 1 / f32 = 2 / u8->f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> <outputFormatToggle (pkd->pkd = 0 / pkd->pln = 1)> <case number = 0:81> <verbosity = 0/1>\n");
+        return -1;
+    }
+
+    if (atoi(argv[6]) == 1)
+    {
+        printf("\nInputs for this test case are:");
+        printf("\nsrc1 = %s", argv[1]);
+        printf("\nsrc2 = %s", argv[2]);
+        printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[3]);
+        printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[4]);
+        printf("\ncase number (1:7) = %s", argv[5]);
+    }
+
+    char *src = argv[1];
+    char *src_second = argv[2];
+    int ip_bitDepth = atoi(argv[3]);
+    unsigned int outputFormatToggle = atoi(argv[4]);
+    int test_case = atoi(argv[5]);
+
+    int ip_channel = 3;
+
+    // Set case names
+
+    char funcType[1000] = {"Tensor_HIP_PKD3"};
+
+    char funcName[1000];
+    switch (test_case)
+    {
+    case 0:
+        strcpy(funcName, "brightness");
+        // outputFormatToggle = 0;
+        break;
+    }
+
+    // Initialize tensor descriptors
+
+    RpptDesc srcDesc, dstDesc;
+    RpptDescPtr srcDescPtr, dstDescPtr;
+    srcDescPtr = &srcDesc;
+    dstDescPtr = &dstDesc;
+
+    // Set src/dst layouts in tensor descriptors
+
+    if (outputFormatToggle == 0)
+    {
+        strcat(funcType, "_toPKD3");
+        srcDescPtr->layout = RpptLayout::NHWC;
+        dstDescPtr->layout = RpptLayout::NHWC;
+    }
+    else if (outputFormatToggle == 1)
+    {
+        strcat(funcType, "_toPLN3");
+        srcDescPtr->layout = RpptLayout::NHWC;
+        dstDescPtr->layout = RpptLayout::NCHW;
+    }
+
+    // Set src/dst data types in tensor descriptors
+
+    if (ip_bitDepth == 0)
+    {
+        strcat(funcName, "_u8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::U8;
+    }
+    else if (ip_bitDepth == 1)
+    {
+        strcat(funcName, "_f16_");
+        srcDescPtr->dataType = RpptDataType::F16;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 2)
+    {
+        strcat(funcName, "_f32_");
+        srcDescPtr->dataType = RpptDataType::F32;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 3)
+    {
+        strcat(funcName, "_u8_f16_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 4)
+    {
+        strcat(funcName, "_u8_f32_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 5)
+    {
+        strcat(funcName, "_i8_");
+        srcDescPtr->dataType = RpptDataType::I8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+    else if (ip_bitDepth == 6)
+    {
+        strcat(funcName, "_u8_i8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+
+    // Other initializations
+
+    int missingFuncFlag = 0;
+    int i = 0, j = 0;
+    int maxHeight = 0, maxWidth = 0;
+    int maxDstHeight = 0, maxDstWidth = 0;
+    unsigned long long count = 0;
+    unsigned long long ioBufferSize = 0;
+    unsigned long long oBufferSize = 0;
+    static int noOfImages = 0;
+    Mat image, image_second;
+
+    // String ops on function name
+
+    char func[1000];
+    strcpy(func, funcName);
+    strcat(func, funcType);
+
+    char src1[1000];
+    strcpy(src1, src);
+    strcat(src1, "/");
+
+    char src1_second[1000];
+    strcpy(src1_second, src_second);
+    strcat(src1_second, "/");
+
+    // Get number of images
+
+    struct dirent *de;
+    DIR *dr = opendir(src);
+    while ((de = readdir(dr)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        noOfImages += 1;
+    }
+    closedir(dr);
+
+    // Initialize ROI tensors for src/dst
+
+    RpptROI *roiTensorPtrSrc = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+    RpptROI *roiTensorPtrDst = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+
+    RpptROI *d_roiTensorPtrSrc, *d_roiTensorPtrDst;
+    hipMalloc(&d_roiTensorPtrSrc, noOfImages * sizeof(RpptROI));
+    hipMalloc(&d_roiTensorPtrDst, noOfImages * sizeof(RpptROI));
+
+    // Set ROI tensors types for src/dst
+
+    RpptRoiType roiTypeSrc, roiTypeDst;
+    roiTypeSrc = RpptRoiType::XYWH;
+    roiTypeDst = RpptRoiType::XYWH;
+
+    // Set maxHeight, maxWidth and ROIs for src/dst
+
+    const int images = noOfImages;
+    char imageNames[images][1000];
+
+    DIR *dr1 = opendir(src);
+    while ((de = readdir(dr1)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        strcpy(imageNames[count], de->d_name);
+        char temp[1000];
+        strcpy(temp, src1);
+        strcat(temp, imageNames[count]);
+
+        image = imread(temp, 1);
+
+        roiTensorPtrSrc[count].xywhROI.xy.x = 0;
+        roiTensorPtrSrc[count].xywhROI.xy.y = 0;
+        roiTensorPtrSrc[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrSrc[count].xywhROI.roiHeight = image.rows;
+
+        roiTensorPtrDst[count].xywhROI.xy.x = 0;
+        roiTensorPtrDst[count].xywhROI.xy.y = 0;
+        roiTensorPtrDst[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrDst[count].xywhROI.roiHeight = image.rows;
+
+        maxHeight = RPPMAX2(maxHeight, roiTensorPtrSrc[count].xywhROI.roiHeight);
+        maxWidth = RPPMAX2(maxWidth, roiTensorPtrSrc[count].xywhROI.roiWidth);
+        maxDstHeight = RPPMAX2(maxDstHeight, roiTensorPtrDst[count].xywhROI.roiHeight);
+        maxDstWidth = RPPMAX2(maxDstWidth, roiTensorPtrDst[count].xywhROI.roiWidth);
+
+        count++;
+    }
+    closedir(dr1);
+
+    // Set numDims, offset, n/c/h/w values for src/dst
+
+    srcDescPtr->numDims = 4;
+    dstDescPtr->numDims = 4;
+
+    srcDescPtr->offset = 0;
+    dstDescPtr->offset = 0;
+
+    srcDescPtr->n = noOfImages;
+    srcDescPtr->h = maxHeight;
+    srcDescPtr->w = maxWidth;
+    srcDescPtr->c = ip_channel;
+
+    dstDescPtr->n = noOfImages;
+    dstDescPtr->h = maxDstHeight;
+    dstDescPtr->w = maxDstWidth;
+    dstDescPtr->c = ip_channel;
+
+    // Optionally set w stride as a multiple of 8 for src/dst
+
+    srcDescPtr->w = ((srcDescPtr->w / 8) * 8) + 8;
+    dstDescPtr->w = ((dstDescPtr->w / 8) * 8) + 8;
+
+    // Set n/c/h/w strides for src/dst
+
+    srcDescPtr->strides.nStride = ip_channel * srcDescPtr->w * srcDescPtr->h;
+    srcDescPtr->strides.hStride = ip_channel * srcDescPtr->w;
+    srcDescPtr->strides.wStride = ip_channel;
+    srcDescPtr->strides.cStride = 1;
+
+    if (dstDescPtr->layout == RpptLayout::NHWC)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = ip_channel * dstDescPtr->w;
+        dstDescPtr->strides.wStride = ip_channel;
+        dstDescPtr->strides.cStride = 1;
+    }
+    else if (dstDescPtr->layout == RpptLayout::NCHW)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.cStride = dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = dstDescPtr->w;
+        dstDescPtr->strides.wStride = 1;
+    }
+
+    // Set buffer sizes for src/dst
+
+    ioBufferSize = (unsigned long long)srcDescPtr->h * (unsigned long long)srcDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+    oBufferSize = (unsigned long long)dstDescPtr->h * (unsigned long long)dstDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+
+    // Initialize 8u host buffers for src/dst
+
+    Rpp8u *input = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *input_second = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *output = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u));
+
+    // Set 8u host buffers for src/dst
+
+    DIR *dr2 = opendir(src);
+    DIR *dr2_second = opendir(src_second);
+    count = 0;
+    i = 0;
+
+    while ((de = readdir(dr2)) != NULL)
+    {
+        Rpp8u *input_temp, *input_second_temp;
+        input_temp = input + (i * srcDescPtr->strides.nStride);
+        input_second_temp = input_second + (i * srcDescPtr->strides.nStride);
+
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+
+        char temp[1000];
+        strcpy(temp, src1);
+        strcat(temp, de->d_name);
+
+        char temp_second[1000];
+        strcpy(temp_second, src1_second);
+        strcat(temp_second, de->d_name);
+
+        image = imread(temp, 1);
+        image_second = imread(temp_second, 1);
+
+        Rpp8u *ip_image = image.data;
+        Rpp8u *ip_image_second = image_second.data;
+
+        Rpp32u elementsInRow = roiTensorPtrSrc[i].xywhROI.roiWidth * ip_channel;
+
+        for (j = 0; j < roiTensorPtrSrc[i].xywhROI.roiHeight; j++)
+        {
+            memcpy(input_temp, ip_image, elementsInRow * sizeof (Rpp8u));
+            memcpy(input_second_temp, ip_image_second, elementsInRow * sizeof (Rpp8u));
+            ip_image += elementsInRow;
+            ip_image_second += elementsInRow;
+            input_temp += srcDescPtr->strides.hStride;
+            input_second_temp += srcDescPtr->strides.hStride;
+        }
+        i++;
+        count += srcDescPtr->strides.nStride;
+    }
+    closedir(dr2);
+
+    // Convert inputs to test various other bit depths and copy to hip buffers
+
+    half *inputf16, *inputf16_second, *outputf16;
+    Rpp32f *inputf32, *inputf32_second, *outputf32;
+    Rpp8s *inputi8, *inputi8_second, *outputi8;
+    int *d_input, *d_input_second, *d_inputf16, *d_inputf16_second, *d_inputf32, *d_inputf32_second, *d_inputi8, *d_inputi8_second;
+    int *d_output, *d_outputf16, *d_outputf32, *d_outputi8;
+
+    if (ip_bitDepth == 0)
+    {
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_output, oBufferSize * sizeof(Rpp8u));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_output, output, oBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 1)
+    {
+        inputf16 = (half *)calloc(ioBufferSize, sizeof(half));
+        inputf16_second = (half *)calloc(ioBufferSize, sizeof(half));
+        outputf16 = (half *)calloc(oBufferSize, sizeof(half));
+
+        Rpp8u *inputTemp, *input_secondTemp;
+        half *inputf16Temp, *inputf16_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputf16Temp = inputf16;
+        inputf16_secondTemp = inputf16_second;
+
+        for (int i = 0; i < ioBufferSize; i++)
+        {
+            *inputf16Temp = (half)(((float)*inputTemp) / 255.0);
+            *inputf16_secondTemp = (half)(((float)*input_secondTemp) / 255.0);
+            inputTemp++;
+            inputf16Temp++;
+            input_secondTemp++;
+            inputf16_secondTemp++;
+        }
+
+        hipMalloc(&d_inputf16, ioBufferSize * sizeof(half));
+        hipMalloc(&d_inputf16_second, ioBufferSize * sizeof(half));
+        hipMalloc(&d_outputf16, oBufferSize * sizeof(half));
+        hipMemcpy(d_inputf16, inputf16, ioBufferSize * sizeof(half), hipMemcpyHostToDevice);
+        hipMemcpy(d_inputf16_second, inputf16_second, ioBufferSize * sizeof(half), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf16, outputf16, oBufferSize * sizeof(half), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 2)
+    {
+        inputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+        inputf32_second = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+        outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f));
+
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp32f *inputf32Temp, *inputf32_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputf32Temp = inputf32;
+        inputf32_secondTemp = inputf32_second;
+
+        for (int i = 0; i < ioBufferSize; i++)
+        {
+            *inputf32Temp = ((Rpp32f)*inputTemp) / 255.0;
+            *inputf32_secondTemp = ((Rpp32f)*input_secondTemp) / 255.0;
+            inputTemp++;
+            inputf32Temp++;
+            input_secondTemp++;
+            inputf32_secondTemp++;
+        }
+
+        hipMalloc(&d_inputf32, ioBufferSize * sizeof(Rpp32f));
+        hipMalloc(&d_inputf32_second, ioBufferSize * sizeof(Rpp32f));
+        hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f));
+        hipMemcpy(d_inputf32, inputf32, ioBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+        hipMemcpy(d_inputf32_second, inputf32_second, ioBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf32, outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 3)
+    {
+        outputf16 = (half *)calloc(oBufferSize, sizeof(half));
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_outputf16, oBufferSize * sizeof(half));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf16, outputf16, oBufferSize * sizeof(half), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 4)
+    {
+        outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f));
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf32, outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 5)
+    {
+        inputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+        inputi8_second = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+        outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s));
+
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp8s *inputi8Temp, *inputi8_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputi8Temp = inputi8;
+        inputi8_secondTemp = inputi8_second;
+
+        for (int i = 0; i < ioBufferSize; i++)
+        {
+            *inputi8Temp = (Rpp8s) (((Rpp32s) *inputTemp) - 128);
+            *inputi8_secondTemp = (Rpp8s) (((Rpp32s) *input_secondTemp) - 128);
+            inputTemp++;
+            inputi8Temp++;
+            input_secondTemp++;
+            inputi8_secondTemp++;
+        }
+
+        hipMalloc(&d_inputi8, ioBufferSize * sizeof(Rpp8s));
+        hipMalloc(&d_inputi8_second, ioBufferSize * sizeof(Rpp8s));
+        hipMalloc(&d_outputi8, oBufferSize * sizeof(Rpp8s));
+        hipMemcpy(d_inputi8, inputi8, ioBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+        hipMemcpy(d_inputi8_second, inputi8_second, ioBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputi8, outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 6)
+    {
+        outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s));
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_outputi8, oBufferSize * sizeof(Rpp8s));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputi8, outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+    }
+
+    // Run case-wise RPP API and measure time
+
+    rppHandle_t handle;
+    hipStream_t stream;
+    hipStreamCreate(&stream);
+    rppCreateWithStreamAndBatchSize(&handle, stream, noOfImages);
+
+    clock_t start, end;
+    double max_time_used = 0, min_time_used = 500, avg_time_used = 0;
+
+    string test_case_name;
+
+    printf("\nRunning %s 100 times (each time with a batch size of %d images) and computing mean statistics...", func, noOfImages);
+
+    for (int perfRunCount = 0; perfRunCount < 100; perfRunCount++)
+    {
+        double gpu_time_used;
+        switch (test_case)
+        {
+        case 0:
+        {
+            test_case_name = "brightness";
+
+            Rpp32f alpha[images];
+            Rpp32f beta[images];
+            for (i = 0; i < images; i++)
+            {
+                alpha[i] = 1.75;
+                beta[i] = 50;
+
+                // xywhROI override sample
+                // roiTensorPtrSrc[i].xywhROI.xy.x = 0;
+                // roiTensorPtrSrc[i].xywhROI.xy.y = 0;
+                // roiTensorPtrSrc[i].xywhROI.roiWidth = 100;
+                // roiTensorPtrSrc[i].xywhROI.roiHeight = 180;
+
+                // ltrbROI override sample
+                // roiTensorPtrSrc[i].ltrbROI.lt.x = 50;
+                // roiTensorPtrSrc[i].ltrbROI.lt.y = 30;
+                // roiTensorPtrSrc[i].ltrbROI.rb.x = 210;
+                // roiTensorPtrSrc[i].ltrbROI.rb.y = 210;
+            }
+
+            // Change RpptRoiType for ltrbROI override sample
+            // roiTypeSrc = RpptRoiType::LTRB;
+            // roiTypeDst = RpptRoiType::LTRB;
+
+
+            hipMemcpy(d_roiTensorPtrSrc, roiTensorPtrSrc, images * sizeof(RpptROI), hipMemcpyHostToDevice);
+
+            start = clock();
+
+            if (ip_bitDepth == 0)
+                rppt_brightness_gpu(d_input, srcDescPtr, d_output, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 1)
+                rppt_brightness_gpu(d_inputf16, srcDescPtr, d_outputf16, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 2)
+                rppt_brightness_gpu(d_inputf32, srcDescPtr, d_outputf32, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 3)
+                missingFuncFlag = 1;
+            else if (ip_bitDepth == 4)
+                missingFuncFlag = 1;
+            else if (ip_bitDepth == 5)
+                rppt_brightness_gpu(d_inputi8, srcDescPtr, d_outputi8, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 6)
+                missingFuncFlag = 1;
+            else
+                missingFuncFlag = 1;
+
+            end = clock();
+
+            break;
+        }
+        default:
+            missingFuncFlag = 1;
+            break;
+        }
+
+        if (missingFuncFlag == 1)
+        {
+            printf("\nThe functionality %s doesn't yet exist in RPP\n", func);
+            return -1;
+        }
+
+        // Display measured times
+
+        gpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
+        if (gpu_time_used > max_time_used)
+            max_time_used = gpu_time_used;
+        if (gpu_time_used < min_time_used)
+            min_time_used = gpu_time_used;
+        avg_time_used += gpu_time_used;
+    }
+
+    avg_time_used /= 100;
+    cout << fixed << "\nmax,min,avg = " << max_time_used << "," << min_time_used << "," << avg_time_used << endl;
+
+    rppDestroyGPU(handle);
+
+    // Free memory
+
+    free(roiTensorPtrSrc);
+    free(roiTensorPtrDst);
+    hipFree(d_roiTensorPtrSrc);
+    hipFree(d_roiTensorPtrDst);
+    free(input);
+    free(input_second);
+    free(output);
+
+    if (ip_bitDepth == 0)
+    {
+        hipFree(d_input);
+        hipFree(d_input_second);
+        hipFree(d_output);
+    }
+    else if (ip_bitDepth == 1)
+    {
+        free(inputf16);
+        free(inputf16_second);
+        free(outputf16);
+        hipFree(d_inputf16);
+        hipFree(d_inputf16_second);
+        hipFree(d_outputf16);
+    }
+    else if (ip_bitDepth == 2)
+    {
+        free(inputf32);
+        free(inputf32_second);
+        free(outputf32);
+        hipFree(d_inputf32);
+        hipFree(d_inputf32_second);
+        hipFree(d_outputf32);
+    }
+    else if (ip_bitDepth == 3)
+    {
+        free(outputf16);
+        hipFree(d_input);
+        hipFree(d_input_second);
+        hipFree(d_outputf16);
+    }
+    else if (ip_bitDepth == 4)
+    {
+        free(outputf32);
+        hipFree(d_input);
+        hipFree(d_input_second);
+        hipFree(d_outputf32);
+    }
+    else if (ip_bitDepth == 5)
+    {
+        free(inputi8);
+        free(inputi8_second);
+        free(outputi8);
+        hipFree(d_inputi8);
+        hipFree(d_inputi8_second);
+        hipFree(d_outputi8);
+    }
+    else if (ip_bitDepth == 6)
+    {
+        free(outputi8);
+        hipFree(d_input);
+        hipFree(d_input_second);
+        hipFree(d_outputi8);
+    }
+
+    return 0;
+}
diff --git a/utilities/rpp-performancetests/HIP_NEW/Tensor_hip_pln1.cpp b/utilities/rpp-performancetests/HIP_NEW/Tensor_hip_pln1.cpp
new file mode 100644
index 000000000..8b402ce2c
--- /dev/null
+++ b/utilities/rpp-performancetests/HIP_NEW/Tensor_hip_pln1.cpp
@@ -0,0 +1,632 @@
+#include <stdio.h>
+#include <dirent.h>
+#include <string.h>
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/opencv.hpp>
+#include <iostream>
+#include "/opt/rocm/rpp/include/rpp.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <time.h>
+#include <omp.h>
+#include <hip/hip_fp16.h>
+#include <fstream>
+#include "helpers/testSuite_helper.hpp"
+
+using namespace cv;
+using namespace std;
+
+// Helper macros. Every argument and every expansion is parenthesized so that operands containing
+// lower-precedence operators (e.g. `a & b`, `c ? x : y`) are compared as a whole.
+// NOTE: each macro evaluates its arguments more than once - do not pass expressions with side effects.
+
+// Clamps pixel to the range [0, 255]; the two bounds are produced as Rpp32f values.
+#define RPPPIXELCHECK(pixel) (((pixel) < (Rpp32f)0) ? ((Rpp32f)0) : (((pixel) < (Rpp32f)255) ? (pixel) : ((Rpp32f)255)))
+// Larger of a and b (b when they compare equal).
+#define RPPMAX2(a,b) (((a) > (b)) ? (a) : (b))
+// Smaller of a and b (b when they compare equal).
+#define RPPMIN2(a,b) (((a) < (b)) ? (a) : (b))
+
+// Performance-test driver: times rppt_brightness_gpu on a batch of single-channel (PLN1) images.
+//
+// Arguments, by the argv index this function reads them from:
+//   argv[1] src1 folder, argv[2] src2 folder, argv[3] bit-depth selector (0..6),
+//   argv[4] outputFormatToggle (must be 0 here), argv[5] case number, argv[6] verbosity (0/1).
+//
+// Steps: count the entries of src1, read every image once to size the ROIs and the padded batch
+// tensor, load the 8-bit pixels of src1/src2 into host buffers, convert them to the requested
+// bit depth, upload them to the device, run the selected case 100 times and print max/min/avg time.
+//
+// Returns 0 on success; -1 on bad usage, an unreadable or empty folder, an image that cannot be
+// loaded at its recorded size, or a case / bit-depth combination that is not implemented.
+int main(int argc, char **argv)
+{
+    // Handle inputs
+
+    const int MIN_ARG_COUNT = 7;
+
+    if (argc < MIN_ARG_COUNT)
+    {
+        printf("\nImproper Usage! Needs all arguments!\n");
+        printf("\nUsage: ./Tensor_host_pln1 <src1 folder> <src2 folder (place same as src1 folder for single image functionalities)> <u8 = 0 / f16 = 1 / f32 = 2 / u8->f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> <outputFormatToggle (pkd->pkd = 0 / pkd->pln = 1)> <case number = 0:81> <verbosity = 0/1>\n");
+        return -1;
+    }
+
+    // outputFormatToggle is argv[4] (argv[5] is the case number), so argv[4] is the argument to validate
+    if (atoi(argv[4]) != 0)
+    {
+        printf("\nPLN1 cases don't have outputFormatToggle! Please input outputFormatToggle = 0\n");
+        return -1;
+    }
+
+    if (atoi(argv[6]) == 1)
+    {
+        printf("\nInputs for this test case are:");
+        printf("\nsrc1 = %s", argv[1]);
+        printf("\nsrc2 = %s", argv[2]);
+        printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[3]);
+        printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[4]);
+        printf("\ncase number (1:7) = %s", argv[5]);
+    }
+
+    char *src = argv[1];
+    char *src_second = argv[2];
+    int ip_bitDepth = atoi(argv[3]);
+    int test_case = atoi(argv[5]);
+
+    int ip_channel = 1;
+
+    // Set case names
+
+    char funcType[1000] = {"Tensor_HOST_PLN1_toPLN1"};
+
+    // Zero-initialized so the strcat calls below are well defined even for an unknown case number
+    char funcName[1000] = {0};
+    switch (test_case)
+    {
+    case 0:
+        strcpy(funcName, "brightness");
+        break;
+    default:
+        // Unknown case numbers stay unnamed here and are rejected in the timing loop
+        break;
+    }
+
+    // Initialize tensor descriptors
+
+    RpptDesc srcDesc, dstDesc;
+    RpptDescPtr srcDescPtr, dstDescPtr;
+    srcDescPtr = &srcDesc;
+    dstDescPtr = &dstDesc;
+
+    // Set src/dst layouts in tensor descriptors
+
+    srcDescPtr->layout = RpptLayout::NCHW;
+    dstDescPtr->layout = RpptLayout::NCHW;
+
+    // Set src/dst data types in tensor descriptors (selectors 3, 4 and 6 keep a U8 source and change only the destination type)
+
+    if (ip_bitDepth == 0)
+    {
+        strcat(funcName, "_u8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::U8;
+    }
+    else if (ip_bitDepth == 1)
+    {
+        strcat(funcName, "_f16_");
+        srcDescPtr->dataType = RpptDataType::F16;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 2)
+    {
+        strcat(funcName, "_f32_");
+        srcDescPtr->dataType = RpptDataType::F32;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 3)
+    {
+        strcat(funcName, "_u8_f16_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 4)
+    {
+        strcat(funcName, "_u8_f32_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 5)
+    {
+        strcat(funcName, "_i8_");
+        srcDescPtr->dataType = RpptDataType::I8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+    else if (ip_bitDepth == 6)
+    {
+        strcat(funcName, "_u8_i8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+
+    // Other initializations
+
+    int missingFuncFlag = 0;
+    int maxHeight = 0, maxWidth = 0;
+    int maxDstHeight = 0, maxDstWidth = 0;
+    unsigned long long ioBufferSize = 0;
+    unsigned long long oBufferSize = 0;
+    int noOfImages = 0;
+
+    // String ops on function name
+
+    char func[1000];
+    strcpy(func, funcName);
+    strcat(func, funcType);
+
+    // Folder prefixes are built as std::string so that long paths cannot overflow a fixed buffer
+    const string srcDir = string(src) + "/";
+    const string srcDirSecond = string(src_second) + "/";
+
+    // Get number of images (every directory entry except "." and ".." is counted)
+
+    struct dirent *de;
+    DIR *dr = opendir(src);
+    if (dr == NULL)
+    {
+        printf("\nUnable to open src1 folder %s\n", src);
+        return -1;
+    }
+    while ((de = readdir(dr)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        noOfImages += 1;
+    }
+    closedir(dr);
+
+    if (noOfImages == 0)
+    {
+        printf("\nNo images found in src1 folder %s\n", src);
+        return -1;
+    }
+
+    // Every buffer this program owns, NULL until allocated, so one routine can release them on any exit path
+
+    RpptROI *roiTensorPtrSrc = NULL, *roiTensorPtrDst = NULL;
+    RpptROI *d_roiTensorPtrSrc = NULL, *d_roiTensorPtrDst = NULL;
+    Rpp8u *input = NULL, *input_second = NULL, *output = NULL;
+    half *inputf16 = NULL, *inputf16_second = NULL, *outputf16 = NULL;
+    Rpp32f *inputf32 = NULL, *inputf32_second = NULL, *outputf32 = NULL;
+    Rpp8s *inputi8 = NULL, *inputi8_second = NULL, *outputi8 = NULL;
+    int *d_input = NULL, *d_input_second = NULL, *d_inputf16 = NULL, *d_inputf16_second = NULL, *d_inputf32 = NULL, *d_inputf32_second = NULL, *d_inputi8 = NULL, *d_inputi8_second = NULL;
+    int *d_output = NULL, *d_outputf16 = NULL, *d_outputf32 = NULL, *d_outputi8 = NULL;
+
+    // Releases all host and device buffers; pointers that were never allocated are still NULL
+    auto releaseBuffers = [&]()
+    {
+        free(roiTensorPtrSrc);
+        free(roiTensorPtrDst);
+        free(input);
+        free(input_second);
+        free(output);
+        free(inputf16);
+        free(inputf16_second);
+        free(outputf16);
+        free(inputf32);
+        free(inputf32_second);
+        free(outputf32);
+        free(inputi8);
+        free(inputi8_second);
+        free(outputi8);
+        hipFree(d_roiTensorPtrSrc);
+        hipFree(d_roiTensorPtrDst);
+        hipFree(d_input);
+        hipFree(d_input_second);
+        hipFree(d_output);
+        hipFree(d_inputf16);
+        hipFree(d_inputf16_second);
+        hipFree(d_outputf16);
+        hipFree(d_inputf32);
+        hipFree(d_inputf32_second);
+        hipFree(d_outputf32);
+        hipFree(d_inputi8);
+        hipFree(d_inputi8_second);
+        hipFree(d_outputi8);
+    };
+
+    // Initialize ROI tensors for src/dst
+
+    roiTensorPtrSrc = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+    roiTensorPtrDst = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+
+    hipMalloc(&d_roiTensorPtrSrc, noOfImages * sizeof(RpptROI));
+    hipMalloc(&d_roiTensorPtrDst, noOfImages * sizeof(RpptROI));
+
+    // Set ROI tensors types for src/dst
+
+    RpptRoiType roiTypeSrc, roiTypeDst;
+    roiTypeSrc = RpptRoiType::XYWH;
+    roiTypeDst = RpptRoiType::XYWH;
+
+    // Set maxHeight, maxWidth and ROIs for src/dst (each ROI covers the whole image as read from src1)
+
+    const int images = noOfImages;
+
+    DIR *dr1 = opendir(src);
+    if (dr1 == NULL)
+    {
+        printf("\nUnable to open src1 folder %s\n", src);
+        releaseBuffers();
+        return -1;
+    }
+    int imageIdx = 0;
+    // Bounded by images so a folder that gained entries since the count cannot overrun the ROI arrays
+    while (imageIdx < images && (de = readdir(dr1)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+
+        // Flag 0 makes imread return a single-channel image; a file that cannot be decoded gives 0 x 0
+        Mat image = imread(srcDir + de->d_name, 0);
+
+        roiTensorPtrSrc[imageIdx].xywhROI.xy.x = 0;
+        roiTensorPtrSrc[imageIdx].xywhROI.xy.y = 0;
+        roiTensorPtrSrc[imageIdx].xywhROI.roiWidth = image.cols;
+        roiTensorPtrSrc[imageIdx].xywhROI.roiHeight = image.rows;
+
+        roiTensorPtrDst[imageIdx].xywhROI.xy.x = 0;
+        roiTensorPtrDst[imageIdx].xywhROI.xy.y = 0;
+        roiTensorPtrDst[imageIdx].xywhROI.roiWidth = image.cols;
+        roiTensorPtrDst[imageIdx].xywhROI.roiHeight = image.rows;
+
+        maxHeight = RPPMAX2(maxHeight, roiTensorPtrSrc[imageIdx].xywhROI.roiHeight);
+        maxWidth = RPPMAX2(maxWidth, roiTensorPtrSrc[imageIdx].xywhROI.roiWidth);
+        maxDstHeight = RPPMAX2(maxDstHeight, roiTensorPtrDst[imageIdx].xywhROI.roiHeight);
+        maxDstWidth = RPPMAX2(maxDstWidth, roiTensorPtrDst[imageIdx].xywhROI.roiWidth);
+
+        imageIdx++;
+    }
+    closedir(dr1);
+
+    // Set numDims, offset, n/c/h/w values, n/c/h/w strides for src/dst
+
+    srcDescPtr->numDims = 4;
+    dstDescPtr->numDims = 4;
+
+    srcDescPtr->offset = 0;
+    dstDescPtr->offset = 0;
+
+    srcDescPtr->n = noOfImages;
+    srcDescPtr->c = ip_channel;
+    srcDescPtr->h = maxHeight;
+    srcDescPtr->w = maxWidth;
+
+    dstDescPtr->n = noOfImages;
+    dstDescPtr->c = ip_channel;
+    dstDescPtr->h = maxDstHeight;
+    dstDescPtr->w = maxDstWidth;
+
+    // Optionally set w stride as a multiple of 8 for src/dst
+    // (rounds down to a multiple of 8 and adds 8, so the padded width always exceeds the widest image)
+
+    srcDescPtr->w = ((srcDescPtr->w / 8) * 8) + 8;
+    dstDescPtr->w = ((dstDescPtr->w / 8) * 8) + 8;
+
+    // Set n/c/h/w strides for src/dst
+
+    srcDescPtr->strides.nStride = ip_channel * srcDescPtr->w * srcDescPtr->h;
+    srcDescPtr->strides.cStride = srcDescPtr->w * srcDescPtr->h;
+    srcDescPtr->strides.hStride = srcDescPtr->w;
+    srcDescPtr->strides.wStride = 1;
+
+    if (dstDescPtr->layout == RpptLayout::NHWC)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = ip_channel * dstDescPtr->w;
+        dstDescPtr->strides.wStride = ip_channel;
+        dstDescPtr->strides.cStride = 1;
+    }
+    else if (dstDescPtr->layout == RpptLayout::NCHW)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.cStride = dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = dstDescPtr->w;
+        dstDescPtr->strides.wStride = 1;
+    }
+
+    // Set buffer sizes for src/dst (element counts, not bytes)
+
+    ioBufferSize = (unsigned long long)srcDescPtr->h * (unsigned long long)srcDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+    oBufferSize = (unsigned long long)dstDescPtr->h * (unsigned long long)dstDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+
+    // Initialize host buffers for src/dst
+
+    input = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    input_second = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    output = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u));
+
+    // Set 8u host buffers for src/dst
+    // The same file name is read from both folders; image k starts at k * nStride and its rows are
+    // spaced by the padded row length, leaving the calloc zeros as padding.
+
+    DIR *dr2 = opendir(src);
+    if (dr2 == NULL)
+    {
+        printf("\nUnable to open src1 folder %s\n", src);
+        releaseBuffers();
+        return -1;
+    }
+
+    const Rpp32u elementsInRowMax = srcDescPtr->w * ip_channel;
+    int loadFailed = 0;
+    imageIdx = 0;
+
+    while (imageIdx < images && (de = readdir(dr2)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+
+        const int roiWidth = roiTensorPtrSrc[imageIdx].xywhROI.roiWidth;
+        const int roiHeight = roiTensorPtrSrc[imageIdx].xywhROI.roiHeight;
+
+        // Entries recorded with an empty ROI have nothing to copy
+        if (roiWidth > 0 && roiHeight > 0)
+        {
+            Mat image = imread(srcDir + de->d_name, 0);
+            Mat image_second = imread(srcDirSecond + de->d_name, 0);
+
+            // An unreadable image reports 0 x 0, so this also rejects files missing from either folder
+            if (image.cols < roiWidth || image.rows < roiHeight || image_second.cols < roiWidth || image_second.rows < roiHeight)
+            {
+                printf("\nUnable to read %s from both source folders at the size recorded for its ROI\n", de->d_name);
+                loadFailed = 1;
+                break;
+            }
+
+            Rpp8u *input_temp = input + ((size_t)imageIdx * srcDescPtr->strides.nStride);
+            Rpp8u *input_second_temp = input_second + ((size_t)imageIdx * srcDescPtr->strides.nStride);
+            const size_t rowBytes = (size_t)roiWidth * ip_channel * sizeof(Rpp8u);
+
+            for (int row = 0; row < roiHeight; row++)
+            {
+                // Mat::ptr(row) honours each image's own row step
+                memcpy(input_temp, image.ptr(row), rowBytes);
+                memcpy(input_second_temp, image_second.ptr(row), rowBytes);
+                input_temp += elementsInRowMax;
+                input_second_temp += elementsInRowMax;
+            }
+        }
+        imageIdx++;
+    }
+    closedir(dr2);
+
+    if (loadFailed == 1)
+    {
+        releaseBuffers();
+        return -1;
+    }
+
+    // Convert inputs to test various other bit depths and copy to hip buffers
+    // (device pointers are declared int* but are only used as opaque addresses, with sizes given in bytes)
+
+    if (ip_bitDepth == 0)
+    {
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_output, oBufferSize * sizeof(Rpp8u));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_output, output, oBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 1)
+    {
+        inputf16 = (half *)calloc(ioBufferSize, sizeof(half));
+        inputf16_second = (half *)calloc(ioBufferSize, sizeof(half));
+        outputf16 = (half *)calloc(oBufferSize, sizeof(half));
+
+        // u8 -> f16: scale by 1/255
+        for (unsigned long long k = 0; k < ioBufferSize; k++)
+        {
+            inputf16[k] = (half)(((float)input[k]) / 255.0);
+            inputf16_second[k] = (half)(((float)input_second[k]) / 255.0);
+        }
+
+        hipMalloc(&d_inputf16, ioBufferSize * sizeof(half));
+        hipMalloc(&d_inputf16_second, ioBufferSize * sizeof(half));
+        hipMalloc(&d_outputf16, oBufferSize * sizeof(half));
+        hipMemcpy(d_inputf16, inputf16, ioBufferSize * sizeof(half), hipMemcpyHostToDevice);
+        hipMemcpy(d_inputf16_second, inputf16_second, ioBufferSize * sizeof(half), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf16, outputf16, oBufferSize * sizeof(half), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 2)
+    {
+        inputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+        inputf32_second = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+        outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f));
+
+        // u8 -> f32: scale by 1/255
+        for (unsigned long long k = 0; k < ioBufferSize; k++)
+        {
+            inputf32[k] = ((Rpp32f)input[k]) / 255.0;
+            inputf32_second[k] = ((Rpp32f)input_second[k]) / 255.0;
+        }
+
+        hipMalloc(&d_inputf32, ioBufferSize * sizeof(Rpp32f));
+        hipMalloc(&d_inputf32_second, ioBufferSize * sizeof(Rpp32f));
+        hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f));
+        hipMemcpy(d_inputf32, inputf32, ioBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+        hipMemcpy(d_inputf32_second, inputf32_second, ioBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf32, outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 3)
+    {
+        outputf16 = (half *)calloc(oBufferSize, sizeof(half));
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_outputf16, oBufferSize * sizeof(half));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf16, outputf16, oBufferSize * sizeof(half), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 4)
+    {
+        outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f));
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf32, outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 5)
+    {
+        inputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+        inputi8_second = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+        outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s));
+
+        // u8 -> i8: shift down by 128
+        for (unsigned long long k = 0; k < ioBufferSize; k++)
+        {
+            inputi8[k] = (Rpp8s) (((Rpp32s) input[k]) - 128);
+            inputi8_second[k] = (Rpp8s) (((Rpp32s) input_second[k]) - 128);
+        }
+
+        hipMalloc(&d_inputi8, ioBufferSize * sizeof(Rpp8s));
+        hipMalloc(&d_inputi8_second, ioBufferSize * sizeof(Rpp8s));
+        hipMalloc(&d_outputi8, oBufferSize * sizeof(Rpp8s));
+        hipMemcpy(d_inputi8, inputi8, ioBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+        hipMemcpy(d_inputi8_second, inputi8_second, ioBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputi8, outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 6)
+    {
+        outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s));
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_outputi8, oBufferSize * sizeof(Rpp8s));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputi8, outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+    }
+
+    // Run case-wise RPP API and measure time
+
+    rppHandle_t handle;
+    hipStream_t stream;
+    hipStreamCreate(&stream);
+    rppCreateWithStreamAndBatchSize(&handle, stream, noOfImages);
+
+    const int numRuns = 100;
+    clock_t start = 0, end = 0;
+    double max_time_used = 0, min_time_used = 0, avg_time_used = 0;
+
+    printf("\nRunning %s %d times (each time with a batch size of %d images) and computing mean statistics...", func, numRuns, noOfImages);
+
+    for (int perfRunCount = 0; perfRunCount < numRuns; perfRunCount++)
+    {
+        double gpu_time_used;
+        switch (test_case)
+        {
+        case 0:
+        {
+            Rpp32f alpha[images];
+            Rpp32f beta[images];
+            for (int i = 0; i < images; i++)
+            {
+                alpha[i] = 1.75;
+                beta[i] = 50;
+
+                // xywhROI override sample
+                // roiTensorPtrSrc[i].xywhROI.xy.x = 0;
+                // roiTensorPtrSrc[i].xywhROI.xy.y = 0;
+                // roiTensorPtrSrc[i].xywhROI.roiWidth = 100;
+                // roiTensorPtrSrc[i].xywhROI.roiHeight = 180;
+
+                // ltrbROI override sample
+                // roiTensorPtrSrc[i].ltrbROI.lt.x = 50;
+                // roiTensorPtrSrc[i].ltrbROI.lt.y = 30;
+                // roiTensorPtrSrc[i].ltrbROI.rb.x = 210;
+                // roiTensorPtrSrc[i].ltrbROI.rb.y = 210;
+            }
+
+            // Change RpptRoiType for ltrbROI override sample
+            // roiTypeSrc = RpptRoiType::LTRB;
+            // roiTypeDst = RpptRoiType::LTRB;
+
+            hipMemcpy(d_roiTensorPtrSrc, roiTensorPtrSrc, images * sizeof(RpptROI), hipMemcpyHostToDevice);
+
+            start = clock();
+
+            // Only same-type variants (u8, f16, f32, i8) are dispatched; the u8->other selectors are reported as missing
+            if (ip_bitDepth == 0)
+                rppt_brightness_gpu(d_input, srcDescPtr, d_output, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 1)
+                rppt_brightness_gpu(d_inputf16, srcDescPtr, d_outputf16, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 2)
+                rppt_brightness_gpu(d_inputf32, srcDescPtr, d_outputf32, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 5)
+                rppt_brightness_gpu(d_inputi8, srcDescPtr, d_outputi8, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+            else
+                missingFuncFlag = 1;
+
+            // Wait for outstanding device work so the interval is not limited to the launch,
+            // in case the call above returns before the GPU has finished
+            hipDeviceSynchronize();
+
+            end = clock();
+
+            break;
+        }
+        default:
+            missingFuncFlag = 1;
+            break;
+        }
+
+        if (missingFuncFlag == 1)
+        {
+            printf("\nThe functionality %s doesn't yet exist in RPP\n", func);
+            // Leave the loop instead of returning so the cleanup below still runs
+            break;
+        }
+
+        // Accumulate measured times (clock() reports processor time used by this process)
+
+        gpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
+        if (gpu_time_used > max_time_used)
+            max_time_used = gpu_time_used;
+        // The first run seeds the minimum, so no sentinel value is needed
+        if (perfRunCount == 0 || gpu_time_used < min_time_used)
+            min_time_used = gpu_time_used;
+        avg_time_used += gpu_time_used;
+    }
+
+    // Display measured times
+
+    if (missingFuncFlag == 0)
+    {
+        avg_time_used /= numRuns;
+        cout << fixed << "\nmax,min,avg = " << max_time_used << "," << min_time_used << "," << avg_time_used << endl;
+    }
+
+    rppDestroyGPU(handle);
+
+    // Free memory
+
+    releaseBuffers();
+
+    return (missingFuncFlag == 1) ? -1 : 0;
+}
diff --git a/utilities/rpp-performancetests/HIP_NEW/Tensor_hip_pln3.cpp b/utilities/rpp-performancetests/HIP_NEW/Tensor_hip_pln3.cpp
new file mode 100644
index 000000000..73082fcef
--- /dev/null
+++ b/utilities/rpp-performancetests/HIP_NEW/Tensor_hip_pln3.cpp
@@ -0,0 +1,710 @@
+#include <stdio.h>
+#include <dirent.h>
+#include <string.h>
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/opencv.hpp>
+#include <iostream>
+#include "/opt/rocm/rpp/include/rpp.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <time.h>
+#include <omp.h>
+#include <hip/hip_fp16.h>
+#include <fstream>
+
+using namespace cv;
+using namespace std;
+
+#define RPPPIXELCHECK(pixel) (((pixel) < (Rpp32f)0) ? ((Rpp32f)0) : (((pixel) < (Rpp32f)255) ? (pixel) : ((Rpp32f)255))) // clamp to [0, 255]; fully parenthesized
+#define RPPMAX2(a,b) (((a) > (b)) ? (a) : (b)) // larger of a and b; arguments may be evaluated twice, so avoid side effects
+#define RPPMIN2(a,b) (((a) < (b)) ? (a) : (b)) // smaller of a and b; arguments may be evaluated twice, so avoid side effects
+
+int main(int argc, char **argv)
+{
+    // Performance test: times the selected RPP tensor HIP call over 100 runs on a batch of 3-channel planar (PLN3) images.
+    // Handle inputs: argv[1..6] = src1 folder, src2 folder, bit depth, outputFormatToggle, case number, verbosity
+    const int MIN_ARG_COUNT = 7;
+
+    if (argc < MIN_ARG_COUNT)
+    {
+        printf("\nImproper Usage! Needs all arguments!\n");
+        printf("\nUsage: ./Tensor_hip_pln3 <src1 folder> <src2 folder (place same as src1 folder for single image functionalities)> <u8 = 0 / f16 = 1 / f32 = 2 / u8->f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> <outputFormatToggle (pln->pln = 0 / pln->pkd = 1)> <case number = 0:81> <verbosity = 0/1>\n");
+        return -1;
+    }
+
+    if (atoi(argv[6]) == 1)
+    {
+        printf("\nInputs for this test case are:");
+        printf("\nsrc1 = %s", argv[1]);
+        printf("\nsrc2 = %s", argv[2]);
+        printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[3]);
+        printf("\noutputFormatToggle (pln->pln = 0 / pln->pkd = 1) = %s", argv[4]);
+        printf("\ncase number (0:81) = %s", argv[5]);
+    }
+
+    char *src = argv[1];
+    char *src_second = argv[2];
+    int ip_bitDepth = atoi(argv[3]);
+    unsigned int outputFormatToggle = atoi(argv[4]);
+    int test_case = atoi(argv[5]);
+
+    int ip_channel = 3;
+
+    // Set case names
+
+    char funcType[1000] = {"Tensor_HIP_PLN3"};
+
+    char funcName[1000] = "";
+    switch (test_case)
+    {
+    // Cases not listed here keep the empty name (so later strcat calls stay defined) and are rejected in the timing loop
+    case 0:
+        strcpy(funcName, "brightness");
+        break;
+    }
+
+    // Initialize tensor descriptors
+
+    RpptDesc srcDesc, dstDesc;
+    RpptDescPtr srcDescPtr, dstDescPtr;
+    srcDescPtr = &srcDesc;
+    dstDescPtr = &dstDesc;
+
+    // Set src/dst layouts in tensor descriptors (any non-zero toggle selects PKD3 output so layouts are never left unset)
+
+    if (outputFormatToggle == 0)
+    {
+        strcat(funcType, "_toPLN3");
+        srcDescPtr->layout = RpptLayout::NCHW;
+        dstDescPtr->layout = RpptLayout::NCHW;
+    }
+    else
+    {
+        strcat(funcType, "_toPKD3");
+        srcDescPtr->layout = RpptLayout::NCHW;
+        dstDescPtr->layout = RpptLayout::NHWC;
+    }
+
+    // Set src/dst data types in tensor descriptors
+
+    if (ip_bitDepth == 0)
+    {
+        strcat(funcName, "_u8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::U8;
+    }
+    else if (ip_bitDepth == 1)
+    {
+        strcat(funcName, "_f16_");
+        srcDescPtr->dataType = RpptDataType::F16;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 2)
+    {
+        strcat(funcName, "_f32_");
+        srcDescPtr->dataType = RpptDataType::F32;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 3)
+    {
+        strcat(funcName, "_u8_f16_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 4)
+    {
+        strcat(funcName, "_u8_f32_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 5)
+    {
+        strcat(funcName, "_i8_");
+        srcDescPtr->dataType = RpptDataType::I8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+    else if (ip_bitDepth == 6)
+    {
+        strcat(funcName, "_u8_i8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+
+    // Other initializations
+
+    int missingFuncFlag = 0;
+    int i = 0, j = 0;
+    int maxHeight = 0, maxWidth = 0;
+    int maxDstHeight = 0, maxDstWidth = 0;
+    unsigned long long count = 0;
+    unsigned long long ioBufferSize = 0;
+    unsigned long long oBufferSize = 0;
+    static int noOfImages = 0;
+    Mat image, image_second;
+
+    // String ops on function name and folder paths (snprintf bounds every write to its destination buffer)
+
+    char func[1000];
+    snprintf(func, sizeof(func), "%s%s", funcName, funcType);
+
+    char src1[1000];
+    snprintf(src1, sizeof(src1), "%s/", src);
+
+    char src1_second[1000];
+    snprintf(src1_second, sizeof(src1_second), "%s/", src_second);
+
+    // Get number of images
+
+    struct dirent *de;
+    DIR *dr = opendir(src);
+    if (dr == NULL)
+    {
+        printf("\nUnable to open source folder %s\n", src);
+        return -1;
+    }
+    while ((de = readdir(dr)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        noOfImages += 1;
+    }
+    closedir(dr);
+    if (noOfImages == 0)
+    {
+        printf("\nNo images found in source folder %s\n", src);
+        return -1;
+    }
+
+    // Initialize ROI tensors for src/dst
+
+    RpptROI *roiTensorPtrSrc = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+    RpptROI *roiTensorPtrDst = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+
+    RpptROI *d_roiTensorPtrSrc, *d_roiTensorPtrDst;
+    hipMalloc(&d_roiTensorPtrSrc, noOfImages * sizeof(RpptROI));
+    hipMalloc(&d_roiTensorPtrDst, noOfImages * sizeof(RpptROI));
+
+    // Set ROI tensors types for src/dst
+
+    RpptRoiType roiTypeSrc, roiTypeDst;
+    roiTypeSrc = RpptRoiType::XYWH;
+    roiTypeDst = RpptRoiType::XYWH;
+
+    // Set maxHeight, maxWidth and ROIs for src/dst (each ROI covers the full image as decoded by imread)
+
+    const int images = noOfImages;
+    char imageNames[images][1000];
+
+    DIR *dr1 = opendir(src);
+    while ((de = readdir(dr1)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        strcpy(imageNames[count], de->d_name);
+        char temp[1000];
+        snprintf(temp, sizeof(temp), "%s%s", src1, imageNames[count]);
+
+        image = imread(temp, 1);
+
+        roiTensorPtrSrc[count].xywhROI.xy.x = 0;
+        roiTensorPtrSrc[count].xywhROI.xy.y = 0;
+        roiTensorPtrSrc[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrSrc[count].xywhROI.roiHeight = image.rows;
+
+        roiTensorPtrDst[count].xywhROI.xy.x = 0;
+        roiTensorPtrDst[count].xywhROI.xy.y = 0;
+        roiTensorPtrDst[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrDst[count].xywhROI.roiHeight = image.rows;
+
+        maxHeight = RPPMAX2(maxHeight, roiTensorPtrSrc[count].xywhROI.roiHeight);
+        maxWidth = RPPMAX2(maxWidth, roiTensorPtrSrc[count].xywhROI.roiWidth);
+        maxDstHeight = RPPMAX2(maxDstHeight, roiTensorPtrDst[count].xywhROI.roiHeight);
+        maxDstWidth = RPPMAX2(maxDstWidth, roiTensorPtrDst[count].xywhROI.roiWidth);
+
+        count++;
+    }
+    closedir(dr1);
+
+    // Set numDims, offset, n/c/h/w values, n/c/h/w strides for src/dst
+
+    srcDescPtr->numDims = 4;
+    dstDescPtr->numDims = 4;
+
+    srcDescPtr->offset = 0;
+    dstDescPtr->offset = 0;
+
+    srcDescPtr->n = noOfImages;
+    srcDescPtr->c = ip_channel;
+    srcDescPtr->h = maxHeight;
+    srcDescPtr->w = maxWidth;
+
+    dstDescPtr->n = noOfImages;
+    dstDescPtr->c = ip_channel;
+    dstDescPtr->h = maxDstHeight;
+    dstDescPtr->w = maxDstWidth;
+
+    // Pad w to a multiple of 8 for src/dst (this form always adds padding, even when w is already a multiple of 8)
+
+    srcDescPtr->w = ((srcDescPtr->w / 8) * 8) + 8;
+    dstDescPtr->w = ((dstDescPtr->w / 8) * 8) + 8;
+
+    // Set n/c/h/w strides for src/dst
+
+    srcDescPtr->strides.nStride = ip_channel * srcDescPtr->w * srcDescPtr->h;
+    srcDescPtr->strides.cStride = srcDescPtr->w * srcDescPtr->h;
+    srcDescPtr->strides.hStride = srcDescPtr->w;
+    srcDescPtr->strides.wStride = 1;
+
+    if (dstDescPtr->layout == RpptLayout::NHWC)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = ip_channel * dstDescPtr->w;
+        dstDescPtr->strides.wStride = ip_channel;
+        dstDescPtr->strides.cStride = 1;
+    }
+    else if (dstDescPtr->layout == RpptLayout::NCHW)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.cStride = dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = dstDescPtr->w;
+        dstDescPtr->strides.wStride = 1;
+    }
+
+    // Set buffer sizes for src/dst
+
+    ioBufferSize = (unsigned long long)srcDescPtr->h * (unsigned long long)srcDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+    oBufferSize = (unsigned long long)dstDescPtr->h * (unsigned long long)dstDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+
+    // Initialize host buffers for src/dst
+
+    Rpp8u *input = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *input_second = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *output = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u));
+
+    // Set 8u host buffers for src/dst (rows are copied packed, then converted to planar further below)
+
+    DIR *dr2 = opendir(src);
+    count = 0;
+    i = 0;
+
+    Rpp32u elementsInRowMax = srcDescPtr->w * ip_channel;
+
+    while ((de = readdir(dr2)) != NULL)
+    {
+        Rpp8u *input_temp, *input_second_temp;
+        input_temp = input + (i * srcDescPtr->strides.nStride);
+        input_second_temp = input_second + (i * srcDescPtr->strides.nStride);
+
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+
+        char temp[1000];
+        snprintf(temp, sizeof(temp), "%s%s", src1, de->d_name);
+
+        char temp_second[1000];
+        snprintf(temp_second, sizeof(temp_second), "%s%s", src1_second, de->d_name);
+
+        image = imread(temp, 1);
+        image_second = imread(temp_second, 1);
+        // NOTE(review): src2 is assumed to hold a same-named image at least as large as src1's; this is not validated here
+        Rpp8u *ip_image = image.data;
+        Rpp8u *ip_image_second = image_second.data;
+
+        Rpp32u elementsInRow = roiTensorPtrSrc[i].xywhROI.roiWidth * ip_channel;
+
+        for (j = 0; j < roiTensorPtrSrc[i].xywhROI.roiHeight; j++)
+        {
+            memcpy(input_temp, ip_image, elementsInRow * sizeof (Rpp8u));
+            memcpy(input_second_temp, ip_image_second, elementsInRow * sizeof (Rpp8u));
+            ip_image += elementsInRow;
+            ip_image_second += elementsInRow;
+            input_temp += elementsInRowMax;
+            input_second_temp += elementsInRowMax;
+        }
+        i++;
+        count += srcDescPtr->strides.nStride;
+    }
+    closedir(dr2);
+
+    // Convert default OpenCV PKD3 to PLN3 for first input batch
+
+    Rpp8u *inputCopy = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    memcpy(inputCopy, input, ioBufferSize * sizeof(Rpp8u));
+
+    Rpp8u *inputTemp, *inputCopyTemp;
+    inputTemp = input;
+    inputCopyTemp = inputCopy;
+
+    for (int count = 0; count < noOfImages; count++)
+    {
+        Rpp8u *inputTempR, *inputTempG, *inputTempB;
+        inputTempR = inputTemp;
+        inputTempG = inputTempR + srcDescPtr->strides.cStride;
+        inputTempB = inputTempG + srcDescPtr->strides.cStride;
+
+        for (int i = 0; i < srcDescPtr->h; i++)
+        {
+            for (int j = 0; j < srcDescPtr->w; j++)
+            {
+                *inputTempR = *inputCopyTemp;
+                inputCopyTemp++;
+                inputTempR++;
+                *inputTempG = *inputCopyTemp;
+                inputCopyTemp++;
+                inputTempG++;
+                *inputTempB = *inputCopyTemp;
+                inputCopyTemp++;
+                inputTempB++;
+            }
+        }
+
+        inputTemp += srcDescPtr->strides.nStride;
+    }
+
+    free(inputCopy);
+
+    // Convert default OpenCV PKD3 to PLN3 for second input batch
+
+    Rpp8u *inputSecondCopy = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    memcpy(inputSecondCopy, input_second, ioBufferSize * sizeof(Rpp8u));
+
+    Rpp8u *inputSecondTemp, *inputSecondCopyTemp;
+    inputSecondTemp = input_second;
+    inputSecondCopyTemp = inputSecondCopy;
+
+    for (int count = 0; count < noOfImages; count++)
+    {
+        Rpp8u *inputSecondTempR, *inputSecondTempG, *inputSecondTempB;
+        inputSecondTempR = inputSecondTemp;
+        inputSecondTempG = inputSecondTempR + srcDescPtr->strides.cStride;
+        inputSecondTempB = inputSecondTempG + srcDescPtr->strides.cStride;
+
+        for (int i = 0; i < srcDescPtr->h; i++)
+        {
+            for (int j = 0; j < srcDescPtr->w; j++)
+            {
+                *inputSecondTempR = *inputSecondCopyTemp;
+                inputSecondCopyTemp++;
+                inputSecondTempR++;
+                *inputSecondTempG = *inputSecondCopyTemp;
+                inputSecondCopyTemp++;
+                inputSecondTempG++;
+                *inputSecondTempB = *inputSecondCopyTemp;
+                inputSecondCopyTemp++;
+                inputSecondTempB++;
+            }
+        }
+
+        inputSecondTemp += srcDescPtr->strides.nStride;
+    }
+
+    free(inputSecondCopy);
+
+    // Convert inputs to test various other bit depths and copy to hip buffers
+
+    half *inputf16, *inputf16_second, *outputf16;
+    Rpp32f *inputf32, *inputf32_second, *outputf32;
+    Rpp8s *inputi8, *inputi8_second, *outputi8;
+    int *d_input, *d_input_second, *d_inputf16, *d_inputf16_second, *d_inputf32, *d_inputf32_second, *d_inputi8, *d_inputi8_second;
+    int *d_output, *d_outputf16, *d_outputf32, *d_outputi8;
+
+    if (ip_bitDepth == 0)
+    {
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_output, oBufferSize * sizeof(Rpp8u));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_output, output, oBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 1)
+    {
+        inputf16 = (half *)calloc(ioBufferSize, sizeof(half));
+        inputf16_second = (half *)calloc(ioBufferSize, sizeof(half));
+        outputf16 = (half *)calloc(oBufferSize, sizeof(half));
+
+        Rpp8u *inputTemp, *input_secondTemp;
+        half *inputf16Temp, *inputf16_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputf16Temp = inputf16;
+        inputf16_secondTemp = inputf16_second;
+
+        for (unsigned long long i = 0; i < ioBufferSize; i++)
+        {
+            *inputf16Temp = (half)(((float)*inputTemp) / 255.0);
+            *inputf16_secondTemp = (half)(((float)*input_secondTemp) / 255.0);
+            inputTemp++;
+            inputf16Temp++;
+            input_secondTemp++;
+            inputf16_secondTemp++;
+        }
+
+        hipMalloc(&d_inputf16, ioBufferSize * sizeof(half));
+        hipMalloc(&d_inputf16_second, ioBufferSize * sizeof(half));
+        hipMalloc(&d_outputf16, oBufferSize * sizeof(half));
+        hipMemcpy(d_inputf16, inputf16, ioBufferSize * sizeof(half), hipMemcpyHostToDevice);
+        hipMemcpy(d_inputf16_second, inputf16_second, ioBufferSize * sizeof(half), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf16, outputf16, oBufferSize * sizeof(half), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 2)
+    {
+        inputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+        inputf32_second = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+        outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f));
+
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp32f *inputf32Temp, *inputf32_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputf32Temp = inputf32;
+        inputf32_secondTemp = inputf32_second;
+
+        for (unsigned long long i = 0; i < ioBufferSize; i++)
+        {
+            *inputf32Temp = ((Rpp32f)*inputTemp) / 255.0;
+            *inputf32_secondTemp = ((Rpp32f)*input_secondTemp) / 255.0;
+            inputTemp++;
+            inputf32Temp++;
+            input_secondTemp++;
+            inputf32_secondTemp++;
+        }
+
+        hipMalloc(&d_inputf32, ioBufferSize * sizeof(Rpp32f));
+        hipMalloc(&d_inputf32_second, ioBufferSize * sizeof(Rpp32f));
+        hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f));
+        hipMemcpy(d_inputf32, inputf32, ioBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+        hipMemcpy(d_inputf32_second, inputf32_second, ioBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf32, outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 3)
+    {
+        outputf16 = (half *)calloc(oBufferSize, sizeof(half));
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_outputf16, oBufferSize * sizeof(half));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf16, outputf16, oBufferSize * sizeof(half), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 4)
+    {
+        outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f));
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf32, outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 5)
+    {
+        inputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+        inputi8_second = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+        outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s));
+
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp8s *inputi8Temp, *inputi8_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputi8Temp = inputi8;
+        inputi8_secondTemp = inputi8_second;
+
+        for (unsigned long long i = 0; i < ioBufferSize; i++)
+        {
+            *inputi8Temp = (Rpp8s) (((Rpp32s) *inputTemp) - 128);
+            *inputi8_secondTemp = (Rpp8s) (((Rpp32s) *input_secondTemp) - 128);
+            inputTemp++;
+            inputi8Temp++;
+            input_secondTemp++;
+            inputi8_secondTemp++;
+        }
+
+        hipMalloc(&d_inputi8, ioBufferSize * sizeof(Rpp8s));
+        hipMalloc(&d_inputi8_second, ioBufferSize * sizeof(Rpp8s));
+        hipMalloc(&d_outputi8, oBufferSize * sizeof(Rpp8s));
+        hipMemcpy(d_inputi8, inputi8, ioBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+        hipMemcpy(d_inputi8_second, inputi8_second, ioBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputi8, outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 6)
+    {
+        outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s));
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_outputi8, oBufferSize * sizeof(Rpp8s));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputi8, outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+    }
+
+    // Run case-wise RPP API and measure time
+
+    rppHandle_t handle;
+    hipStream_t stream;
+    hipStreamCreate(&stream);
+    rppCreateWithStreamAndBatchSize(&handle, stream, noOfImages);
+
+    struct timespec start, end; // monotonic wall-clock timestamps taken around each call
+    double max_time_used = 0, min_time_used = 500, avg_time_used = 0;
+
+    printf("\nRunning %s 100 times (each time with a batch size of %d images) and computing mean statistics...", func, noOfImages);
+
+    for (int perfRunCount = 0; perfRunCount < 100; perfRunCount++)
+    {
+        double gpu_time_used;
+        switch (test_case)
+        {
+        case 0:
+        {
+            Rpp32f alpha[images];
+            Rpp32f beta[images];
+            for (i = 0; i < images; i++)
+            {
+                alpha[i] = 1.75;
+                beta[i] = 50;
+
+                // xywhROI override sample
+                // roiTensorPtrSrc[i].xywhROI.xy.x = 0;
+                // roiTensorPtrSrc[i].xywhROI.xy.y = 0;
+                // roiTensorPtrSrc[i].xywhROI.roiWidth = 100;
+                // roiTensorPtrSrc[i].xywhROI.roiHeight = 180;
+
+                // ltrbROI override sample
+                // roiTensorPtrSrc[i].ltrbROI.lt.x = 50;
+                // roiTensorPtrSrc[i].ltrbROI.lt.y = 30;
+                // roiTensorPtrSrc[i].ltrbROI.rb.x = 210;
+                // roiTensorPtrSrc[i].ltrbROI.rb.y = 210;
+            }
+
+            // Change RpptRoiType for ltrbROI override sample
+            // roiTypeSrc = RpptRoiType::LTRB;
+            // roiTypeDst = RpptRoiType::LTRB;
+
+            hipMemcpy(d_roiTensorPtrSrc, roiTensorPtrSrc, images * sizeof(RpptROI), hipMemcpyHostToDevice);
+
+            clock_gettime(CLOCK_MONOTONIC, &start);
+
+            if (ip_bitDepth == 0)
+                rppt_brightness_gpu(d_input, srcDescPtr, d_output, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 1)
+                rppt_brightness_gpu(d_inputf16, srcDescPtr, d_outputf16, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 2)
+                rppt_brightness_gpu(d_inputf32, srcDescPtr, d_outputf32, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 3)
+                missingFuncFlag = 1;
+            else if (ip_bitDepth == 4)
+                missingFuncFlag = 1;
+            else if (ip_bitDepth == 5)
+                rppt_brightness_gpu(d_inputi8, srcDescPtr, d_outputi8, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 6)
+                missingFuncFlag = 1;
+            else
+                missingFuncFlag = 1;
+
+            hipDeviceSynchronize(); // wait for any still-running device work so the interval covers the whole call
+            clock_gettime(CLOCK_MONOTONIC, &end);
+
+            break;
+        }
+        default:
+            missingFuncFlag = 1;
+            break;
+        }
+
+        if (missingFuncFlag == 1)
+        {
+            printf("\nThe functionality %s doesn't yet exist in RPP\n", func);
+            return -1;
+        }
+
+        // Accumulate max/min/sum of the measured wall-clock time (in seconds) for this run
+
+        gpu_time_used = (double)(end.tv_sec - start.tv_sec) + ((double)(end.tv_nsec - start.tv_nsec)) * 1e-9;
+        if (gpu_time_used > max_time_used)
+            max_time_used = gpu_time_used;
+        if (gpu_time_used < min_time_used)
+            min_time_used = gpu_time_used;
+        avg_time_used += gpu_time_used;
+    }
+
+    avg_time_used /= 100;
+    cout << fixed << "\nmax,min,avg = " << max_time_used << "," << min_time_used << "," << avg_time_used << endl;
+
+    rppDestroyGPU(handle);
+    hipStreamDestroy(stream);
+
+    // Free memory
+    free(roiTensorPtrSrc);
+    free(roiTensorPtrDst);
+    hipFree(d_roiTensorPtrSrc);
+    hipFree(d_roiTensorPtrDst);
+    free(input);
+    free(input_second);
+    free(output);
+
+    if (ip_bitDepth == 0)
+    {
+        hipFree(d_input);
+        hipFree(d_input_second);
+        hipFree(d_output);
+    }
+    else if (ip_bitDepth == 1)
+    {
+        free(inputf16);
+        free(inputf16_second);
+        free(outputf16);
+        hipFree(d_inputf16);
+        hipFree(d_inputf16_second);
+        hipFree(d_outputf16);
+    }
+    else if (ip_bitDepth == 2)
+    {
+        free(inputf32);
+        free(inputf32_second);
+        free(outputf32);
+        hipFree(d_inputf32);
+        hipFree(d_inputf32_second);
+        hipFree(d_outputf32);
+    }
+    else if (ip_bitDepth == 3)
+    {
+        free(outputf16);
+        hipFree(d_input);
+        hipFree(d_input_second);
+        hipFree(d_outputf16);
+    }
+    else if (ip_bitDepth == 4)
+    {
+        free(outputf32);
+        hipFree(d_input);
+        hipFree(d_input_second);
+        hipFree(d_outputf32);
+    }
+    else if (ip_bitDepth == 5)
+    {
+        free(inputi8);
+        free(inputi8_second);
+        free(outputi8);
+        hipFree(d_inputi8);
+        hipFree(d_inputi8_second);
+        hipFree(d_outputi8);
+    }
+    else if (ip_bitDepth == 6)
+    {
+        free(outputi8);
+        hipFree(d_input);
+        hipFree(d_input_second);
+        hipFree(d_outputi8);
+    }
+
+    return 0;
+}
diff --git a/utilities/rpp-performancetests/HIP_NEW/generatePerformanceLogs.py b/utilities/rpp-performancetests/HIP_NEW/generatePerformanceLogs.py
index 698de4166..4026a0e7c 100644
--- a/utilities/rpp-performancetests/HIP_NEW/generatePerformanceLogs.py
+++ b/utilities/rpp-performancetests/HIP_NEW/generatePerformanceLogs.py
@@ -31,7 +31,10 @@
     log_file_list = [
         "../OUTPUT_PERFORMANCE_LOGS_HIP_NEW/BatchPD_hip_pkd3_hip_raw_performance_log.txt",
         "../OUTPUT_PERFORMANCE_LOGS_HIP_NEW/BatchPD_hip_pln3_hip_raw_performance_log.txt",
-        "../OUTPUT_PERFORMANCE_LOGS_HIP_NEW/BatchPD_hip_pln1_hip_raw_performance_log.txt"
+        "../OUTPUT_PERFORMANCE_LOGS_HIP_NEW/BatchPD_hip_pln1_hip_raw_performance_log.txt",
+        "../OUTPUT_PERFORMANCE_LOGS_HIP_NEW/Tensor_hip_pkd3_hip_raw_performance_log.txt",
+        "../OUTPUT_PERFORMANCE_LOGS_HIP_NEW/Tensor_hip_pln3_hip_raw_performance_log.txt",
+        "../OUTPUT_PERFORMANCE_LOGS_HIP_NEW/Tensor_hip_pln1_hip_raw_performance_log.txt"
         ]
 
     functionality_group_list = [
@@ -137,15 +140,20 @@ def func_group_finder(case_number):
 
     RESULTS_DIR = "../OUTPUT_PERFORMANCE_LOGS_HIP_NEW"
     print("RESULTS_DIR = " + RESULTS_DIR)
-    CONSOLIDATED_FILE_PKD3 = RESULTS_DIR + "/consolidated_results_pkd3.stats.csv"
-    CONSOLIDATED_FILE_PLN1 = RESULTS_DIR + "/consolidated_results_pln1.stats.csv"
-    CONSOLIDATED_FILE_PLN3 = RESULTS_DIR + "/consolidated_results_pln3.stats.csv"
-
-    TYPE_LIST = ["PKD3", "PLN1", "PLN3"]
+    CONSOLIDATED_FILE_BATCHPD_PKD3 = RESULTS_DIR + "/consolidated_results_BatchPD_PKD3.stats.csv"
+    CONSOLIDATED_FILE_BATCHPD_PLN1 = RESULTS_DIR + "/consolidated_results_BatchPD_PLN1.stats.csv"
+    CONSOLIDATED_FILE_BATCHPD_PLN3 = RESULTS_DIR + "/consolidated_results_BatchPD_PLN3.stats.csv"
+    CONSOLIDATED_FILE_TENSOR_PKD3 = RESULTS_DIR + "/consolidated_results_Tensor_PKD3.stats.csv"
+    CONSOLIDATED_FILE_TENSOR_PLN1 = RESULTS_DIR + "/consolidated_results_Tensor_PLN1.stats.csv"
+    CONSOLIDATED_FILE_TENSOR_PLN3 = RESULTS_DIR + "/consolidated_results_Tensor_PLN3.stats.csv"
+
+    TYPE_LIST = ["BatchPD_PKD3", "BatchPD_PLN1", "BatchPD_PLN3", "Tensor_PKD3", "Tensor_PLN1", "Tensor_PLN3"]
+    BATCHPD_TYPE_LIST = ["BatchPD_PKD3", "BatchPD_PLN1", "BatchPD_PLN3"]
+    TENSOR_TYPE_LIST = ["Tensor_PKD3", "Tensor_PLN1", "Tensor_PLN3"]
     CASE_NUM_LIST = range(int(caseStart), int(caseEnd) + 1, 1)
     BIT_DEPTH_LIST = range(0, 7, 1)
     OFT_LIST = range(0, 2, 1)
-    d_counter = {"PKD3":0, "PLN1":0, "PLN3":0}
+    d_counter = {"BatchPD_PKD3":0, "BatchPD_PLN1":0, "BatchPD_PLN3":0, "Tensor_PKD3":0, "Tensor_PLN1":0, "Tensor_PLN3":0}
 
     for TYPE in TYPE_LIST:
 
@@ -161,9 +169,9 @@ def func_group_finder(case_number):
             # Add functionality group header
             if CASE_NUM in NEW_FUNC_GROUP_LIST:
                 FUNC_GROUP = func_group_finder(CASE_NUM)
-                new_file.write(" ,0,0,0,0\n")
+                new_file.write("0,0,0,0,0\n")
                 new_file.write(FUNC_GROUP + ",0,0,0,0\n")
-                new_file.write(" ,0,0,0,0\n")
+                new_file.write("0,0,0,0,0\n")
 
             # Set results directory
             CASE_RESULTS_DIR = RESULTS_DIR + "/" + TYPE + "/case_" + str(CASE_NUM)
@@ -183,10 +191,14 @@ def func_group_finder(case_number):
                         for line in case_file:
                             print(line)
                             if not(line.startswith('"Name"')):
-                                if prev != line.split(",")[0]:
+                                if TYPE in TENSOR_TYPE_LIST:
                                     new_file.write(line)
-                                    prev = line.split(",")[0]
                                     d_counter[TYPE] = d_counter[TYPE] + 1
+                                elif TYPE in BATCHPD_TYPE_LIST:
+                                    if prev != line.split(",")[0]:
+                                        new_file.write(line)
+                                        prev = line.split(",")[0]
+                                        d_counter[TYPE] = d_counter[TYPE] + 1
                         case_file.close()
                     except IOError:
                         print("Unable to open case results")
@@ -212,7 +224,13 @@ def func_group_finder(case_number):
             print(dfPrint_noIndices)
 
     except ImportError:
-        print("\nPandas not available! Results of GPU profiling experiment are available in the following files:\n" + CONSOLIDATED_FILE_PKD3 + "\n" + CONSOLIDATED_FILE_PLN1 + "\n" + CONSOLIDATED_FILE_PLN3 + "\n")
+        print("\nPandas not available! Results of GPU profiling experiment are available in the following files:\n" + \
+            CONSOLIDATED_FILE_BATCHPD_PKD3 + "\n" + \
+                CONSOLIDATED_FILE_BATCHPD_PLN1 + "\n" + \
+                    CONSOLIDATED_FILE_BATCHPD_PLN3 + "\n" + \
+                        CONSOLIDATED_FILE_TENSOR_PKD3 + "\n" + \
+                            CONSOLIDATED_FILE_TENSOR_PLN1 + "\n" + \
+                                CONSOLIDATED_FILE_TENSOR_PLN3 + "\n")
 
     except IOError:
         print("Unable to open results in " + RESULTS_DIR + "/consolidated_results_" + TYPE + ".stats.csv")
diff --git a/utilities/rpp-performancetests/HIP_NEW/rawLogsGenScript.sh b/utilities/rpp-performancetests/HIP_NEW/rawLogsGenScript.sh
index bcfe0e0e6..2b0f9f412 100755
--- a/utilities/rpp-performancetests/HIP_NEW/rawLogsGenScript.sh
+++ b/utilities/rpp-performancetests/HIP_NEW/rawLogsGenScript.sh
@@ -134,9 +134,12 @@ make -j16
 
 if [[ "$PROFILING_OPTION" -eq 1 ]]
 then
-    mkdir "$DST_FOLDER/PKD3"
-    mkdir "$DST_FOLDER/PLN1"
-    mkdir "$DST_FOLDER/PLN3"
+    mkdir "$DST_FOLDER/BatchPD_PKD3"
+    mkdir "$DST_FOLDER/BatchPD_PLN1"
+    mkdir "$DST_FOLDER/BatchPD_PLN3"
+    mkdir "$DST_FOLDER/Tensor_PKD3"
+    mkdir "$DST_FOLDER/Tensor_PLN1"
+    mkdir "$DST_FOLDER/Tensor_PLN3"
 fi
 
 printf "\n\n\n\n\n"
@@ -170,10 +173,22 @@ do
                 ./BatchPD_hip_pkd3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/BatchPD_hip_pkd3_hip_raw_performance_log.txt"
             elif [[ "$PROFILING_OPTION" -eq 1 ]]
             then
-                mkdir "$DST_FOLDER/PKD3/case_$case"
-                printf "\nrocprof --basenames on --timestamp on --stats -o $DST_FOLDER/PKD3/case_$case/output_case$case" "_bitDepth$bitDepth" "_oft$outputFormatToggle.csv" "./BatchPD_hip_pkd3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0"
-                rocprof --basenames on --timestamp on --stats -o "$DST_FOLDER/PKD3/case_$case""/output_case""$case""_bitDepth""$bitDepth""_oft""$outputFormatToggle"".csv" ./BatchPD_hip_pkd3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/BatchPD_hip_pkd3_hip_raw_performance_log.txt"
+                mkdir -p "$DST_FOLDER/BatchPD_PKD3/case_$case"  # path depends only on $case; -p tolerates an existing directory
+                printf "\n%s" "rocprof --basenames on --timestamp on --stats -o $DST_FOLDER/BatchPD_PKD3/case_$case/output_case${case}_bitDepth${bitDepth}_oft${outputFormatToggle}.csv ./BatchPD_hip_pkd3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0"  # single %s argument so the whole command line is echoed
+                rocprof --basenames on --timestamp on --stats -o "$DST_FOLDER/BatchPD_PKD3/case_$case""/output_case""$case""_bitDepth""$bitDepth""_oft""$outputFormatToggle"".csv" ./BatchPD_hip_pkd3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/BatchPD_hip_pkd3_hip_raw_performance_log.txt"
             fi
+
+            if [[ "$PROFILING_OPTION" -eq 0 ]]  # 0: run the test directly, 1: run it under rocprof
+            then
+                printf "\n%s" "./Tensor_hip_pkd3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0"
+                ./Tensor_hip_pkd3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/Tensor_hip_pkd3_hip_raw_performance_log.txt"
+            elif [[ "$PROFILING_OPTION" -eq 1 ]]
+            then
+                mkdir -p "$DST_FOLDER/Tensor_PKD3/case_$case"  # path depends only on $case; -p tolerates an existing directory
+                printf "\n%s" "rocprof --basenames on --timestamp on --stats -o $DST_FOLDER/Tensor_PKD3/case_$case/output_case${case}_bitDepth${bitDepth}_oft${outputFormatToggle}.csv ./Tensor_hip_pkd3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0"  # single %s argument so the whole command line is echoed
+                rocprof --basenames on --timestamp on --stats -o "$DST_FOLDER/Tensor_PKD3/case_$case""/output_case""$case""_bitDepth""$bitDepth""_oft""$outputFormatToggle"".csv" ./Tensor_hip_pkd3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/Tensor_hip_pkd3_hip_raw_performance_log.txt"
+            fi
+
             echo "------------------------------------------------------------------------------------------"
         done
     done
@@ -214,10 +229,22 @@ do
                 ./BatchPD_hip_pln1 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/BatchPD_hip_pln1_hip_raw_performance_log.txt"
             elif [[ "$PROFILING_OPTION" -eq 1 ]]
             then
-                mkdir "$DST_FOLDER/PLN1/case_$case"
-                printf "\nrocprof --basenames on --timestamp on --stats -o $DST_FOLDER/PLN1/case_$case/output_case$case" "_bitDepth$bitDepth" "_oft$outputFormatToggle.csv" "./BatchPD_hip_pln1 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0"
-                rocprof --basenames on --timestamp on --stats -o "$DST_FOLDER/PLN1/case_$case""/output_case""$case""_bitDepth""$bitDepth""_oft""$outputFormatToggle"".csv" ./BatchPD_hip_pln1 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/BatchPD_hip_pln1_hip_raw_performance_log.txt"
+                mkdir -p "$DST_FOLDER/BatchPD_PLN1/case_$case"  # path depends only on $case; -p tolerates an existing directory
+                printf "\n%s" "rocprof --basenames on --timestamp on --stats -o $DST_FOLDER/BatchPD_PLN1/case_$case/output_case${case}_bitDepth${bitDepth}_oft${outputFormatToggle}.csv ./BatchPD_hip_pln1 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0"  # single %s argument so the whole command line is echoed
+                rocprof --basenames on --timestamp on --stats -o "$DST_FOLDER/BatchPD_PLN1/case_$case""/output_case""$case""_bitDepth""$bitDepth""_oft""$outputFormatToggle"".csv" ./BatchPD_hip_pln1 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/BatchPD_hip_pln1_hip_raw_performance_log.txt"
             fi
+
+            if [[ "$PROFILING_OPTION" -eq 0 ]]  # 0: run the test directly, 1: run it under rocprof
+            then
+                printf "\n%s" "./Tensor_hip_pln1 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0"
+                ./Tensor_hip_pln1 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/Tensor_hip_pln1_hip_raw_performance_log.txt"
+            elif [[ "$PROFILING_OPTION" -eq 1 ]]
+            then
+                mkdir -p "$DST_FOLDER/Tensor_PLN1/case_$case"  # path depends only on $case; -p tolerates an existing directory
+                printf "\n%s" "rocprof --basenames on --timestamp on --stats -o $DST_FOLDER/Tensor_PLN1/case_$case/output_case${case}_bitDepth${bitDepth}_oft${outputFormatToggle}.csv ./Tensor_hip_pln1 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0"  # single %s argument so the whole command line is echoed
+                rocprof --basenames on --timestamp on --stats -o "$DST_FOLDER/Tensor_PLN1/case_$case""/output_case""$case""_bitDepth""$bitDepth""_oft""$outputFormatToggle"".csv" ./Tensor_hip_pln1 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/Tensor_hip_pln1_hip_raw_performance_log.txt"
+            fi
+
             echo "------------------------------------------------------------------------------------------"
         done
     done
@@ -258,10 +285,22 @@ do
                 ./BatchPD_hip_pln3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/BatchPD_hip_pln3_hip_raw_performance_log.txt"
             elif [[ "$PROFILING_OPTION" -eq 1 ]]
             then
-                mkdir "$DST_FOLDER/PLN3/case_$case"
-                printf "\nrocprof --basenames on --timestamp on --stats -o $DST_FOLDER/PLN3/case_$case/output_case$case" "_bitDepth$bitDepth" "_oft$outputFormatToggle.csv" "./BatchPD_hip_pln3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0"
-                rocprof --basenames on --timestamp on --stats -o "$DST_FOLDER/PLN3/case_$case""/output_case""$case""_bitDepth""$bitDepth""_oft""$outputFormatToggle"".csv" ./BatchPD_hip_pln3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/BatchPD_hip_pln3_hip_raw_performance_log.txt"
+                mkdir -p "$DST_FOLDER/BatchPD_PLN3/case_$case"  # path depends only on $case; -p tolerates an existing directory
+                printf "\n%s" "rocprof --basenames on --timestamp on --stats -o $DST_FOLDER/BatchPD_PLN3/case_$case/output_case${case}_bitDepth${bitDepth}_oft${outputFormatToggle}.csv ./BatchPD_hip_pln3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0"  # single %s argument so the whole command line is echoed
+                rocprof --basenames on --timestamp on --stats -o "$DST_FOLDER/BatchPD_PLN3/case_$case""/output_case""$case""_bitDepth""$bitDepth""_oft""$outputFormatToggle"".csv" ./BatchPD_hip_pln3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/BatchPD_hip_pln3_hip_raw_performance_log.txt"
+            fi
+
+            if [[ "$PROFILING_OPTION" -eq 0 ]]  # 0: run the test directly, 1: run it under rocprof
+            then
+                printf "\n%s" "./Tensor_hip_pln3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0"
+                ./Tensor_hip_pln3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/Tensor_hip_pln3_hip_raw_performance_log.txt"
+            elif [[ "$PROFILING_OPTION" -eq 1 ]]
+            then
+                mkdir -p "$DST_FOLDER/Tensor_PLN3/case_$case"  # path depends only on $case; -p tolerates an existing directory
+                printf "\n%s" "rocprof --basenames on --timestamp on --stats -o $DST_FOLDER/Tensor_PLN3/case_$case/output_case${case}_bitDepth${bitDepth}_oft${outputFormatToggle}.csv ./Tensor_hip_pln3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0"  # single %s argument so the whole command line is echoed
+                rocprof --basenames on --timestamp on --stats -o "$DST_FOLDER/Tensor_PLN3/case_$case""/output_case""$case""_bitDepth""$bitDepth""_oft""$outputFormatToggle"".csv" ./Tensor_hip_pln3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/Tensor_hip_pln3_hip_raw_performance_log.txt"
             fi
+
             echo "------------------------------------------------------------------------------------------"
         done
     done
diff --git a/utilities/rpp-performancetests/HOST_NEW/CMakeLists.txt b/utilities/rpp-performancetests/HOST_NEW/CMakeLists.txt
index 15d7f51ca..94b2656c7 100644
--- a/utilities/rpp-performancetests/HOST_NEW/CMakeLists.txt
+++ b/utilities/rpp-performancetests/HOST_NEW/CMakeLists.txt
@@ -7,7 +7,7 @@ find_package(OpenCV REQUIRED)
 find_package(AMDRPP QUIET)
 
 if(NOT OpenCL_FOUND)
-	message("-- ${Yellow}Rpp_test requires OpenCL, Found ${OpenCL_INCLUDE_DIRS} ${OpenCL_LIBRARIES} ${ColourReset}")
+    message("-- ${Yellow}Rpp_test requires OpenCL, Found ${OpenCL_INCLUDE_DIRS} ${OpenCL_LIBRARIES} ${ColourReset}")
 endif()
 
 if (OpenCL_FOUND)
@@ -19,12 +19,16 @@ if (OpenCL_FOUND)
     add_executable(BatchPD_host_pkd3 BatchPD_host_pkd3.cpp)
     add_executable(BatchPD_host_pln1 BatchPD_host_pln1.cpp)
     add_executable(BatchPD_host_pln3 BatchPD_host_pln3.cpp)
+    add_executable(Tensor_host_pkd3 Tensor_host_pkd3.cpp)
+    add_executable(Tensor_host_pln1 Tensor_host_pln1.cpp)
+    add_executable(Tensor_host_pln3 Tensor_host_pln3.cpp)
     # add_executable(Single_host Single_host.cpp)
-    add_executable(uniqueFunctionalities_host uniqueFunctionalities_host.cpp)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -DOCL_COMPILE=1 -DRPP_BACKEND_OPENCL=1 -std=c++11")
     target_link_libraries(BatchPD_host_pkd3 ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system)
     target_link_libraries(BatchPD_host_pln1 ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system)
     target_link_libraries(BatchPD_host_pln3 ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system)
+    target_link_libraries(Tensor_host_pkd3 ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system)
+    target_link_libraries(Tensor_host_pln1 ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system)
+    target_link_libraries(Tensor_host_pln3 ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system)
     # target_link_libraries(Single_host ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system )
-    target_link_libraries(uniqueFunctionalities_host ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system)
 endif()
\ No newline at end of file
diff --git a/utilities/rpp-performancetests/HOST_NEW/Tensor_host_pkd3.cpp b/utilities/rpp-performancetests/HOST_NEW/Tensor_host_pkd3.cpp
new file mode 100644
index 000000000..29b627755
--- /dev/null
+++ b/utilities/rpp-performancetests/HOST_NEW/Tensor_host_pkd3.cpp
@@ -0,0 +1,519 @@
+#include <stdio.h>
+#include <dirent.h>
+#include <string.h>
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/opencv.hpp>
+#include <iostream>
+#include "/opt/rocm/rpp/include/rpp.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <time.h>
+#include <omp.h>
+#include <half.hpp>
+#include <fstream>
+
+using namespace cv;
+using namespace std;
+using half_float::half;
+
+typedef half Rpp16f;
+
+#define RPPPIXELCHECK(pixel) (((pixel) < (Rpp32f)0) ? ((Rpp32f)0) : (((pixel) < (Rpp32f)255) ? (pixel) : ((Rpp32f)255)))   // clamps to [0, 255]
+#define RPPMAX2(a,b) (((a) > (b)) ? (a) : (b))   // arguments are evaluated more than once: pass side-effect-free expressions
+#define RPPMIN2(a,b) (((a) < (b)) ? (a) : (b))   // arguments are evaluated more than once: pass side-effect-free expressions
+
+int main(int argc, char **argv)
+{
+    // Handle inputs
+
+    const int MIN_ARG_COUNT = 7;
+
+    if (argc < MIN_ARG_COUNT)
+    {
+        printf("\nImproper Usage! Needs all arguments!\n");
+        printf("\nUsage: ./Tensor_host_pkd3 <src1 folder> <src2 folder (place same as src1 folder for single image functionalities)> <u8 = 0 / f16 = 1 / f32 = 2 / u8->f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> <outputFormatToggle (pkd->pkd = 0 / pkd->pln = 1)> <case number = 0:81> <verbosity = 0/1>\n");
+        return -1;
+    }
+
+    if (atoi(argv[6]) == 1)
+    {
+        printf("\nInputs for this test case are:");
+        printf("\nsrc1 = %s", argv[1]);
+        printf("\nsrc2 = %s", argv[2]);
+        printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[3]);
+        printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[4]);
+        printf("\ncase number (1:7) = %s", argv[5]);
+    }
+
+    char *src = argv[1];
+    char *src_second = argv[2];
+    int ip_bitDepth = atoi(argv[3]);
+    unsigned int outputFormatToggle = atoi(argv[4]);
+    int test_case = atoi(argv[5]);
+
+    int ip_channel = 3;
+
+    // Set case names
+
+    char funcType[1000] = {"Tensor_HOST_PKD3"};
+
+    char funcName[1000] = "";   // stays empty for unrecognized cases, so the strcat calls below never read an uninitialized buffer
+    switch (test_case)
+    {
+    case 0:
+        strcpy(funcName, "brightness");
+        break;
+    }
+
+    // Initialize tensor descriptors
+
+    RpptDesc srcDesc, dstDesc;
+    RpptDescPtr srcDescPtr, dstDescPtr;
+    srcDescPtr = &srcDesc;
+    dstDescPtr = &dstDesc;
+
+    // Set src/dst layouts in tensor descriptors (the source is NHWC for both toggles)
+
+    if (outputFormatToggle > 1)
+    {
+        // Any other value would leave both layouts unset for the stride setup further down
+        printf("\nInvalid outputFormatToggle! Use pkd->pkd = 0 / pkd->pln = 1\n");
+        return -1;
+    }
+
+    const bool convertToPln = (outputFormatToggle == 1);
+
+    strcat(funcType, convertToPln ? "_toPLN3" : "_toPKD3");
+    srcDescPtr->layout = RpptLayout::NHWC;
+    dstDescPtr->layout = convertToPln ? RpptLayout::NCHW : RpptLayout::NHWC;
+
+    // Set src/dst data types in tensor descriptors
+
+    if (ip_bitDepth == 0)
+    {
+        strcat(funcName, "_u8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::U8;
+    }
+    else if (ip_bitDepth == 1)
+    {
+        strcat(funcName, "_f16_");
+        srcDescPtr->dataType = RpptDataType::F16;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 2)
+    {
+        strcat(funcName, "_f32_");
+        srcDescPtr->dataType = RpptDataType::F32;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 3)
+    {
+        strcat(funcName, "_u8_f16_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 4)
+    {
+        strcat(funcName, "_u8_f32_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 5)
+    {
+        strcat(funcName, "_i8_");
+        srcDescPtr->dataType = RpptDataType::I8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+    else if (ip_bitDepth == 6)
+    {
+        strcat(funcName, "_u8_i8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+
+    // Other initializations
+
+    int missingFuncFlag = 0;
+    int i = 0, j = 0;
+    int maxHeight = 0, maxWidth = 0;
+    int maxDstHeight = 0, maxDstWidth = 0;
+    unsigned long long count = 0;
+    unsigned long long ioBufferSize = 0;
+    unsigned long long oBufferSize = 0;
+    static int noOfImages = 0;
+    Mat image, image_second;
+
+    // String ops on function name
+
+    char func[1000];
+    strcpy(func, funcName);
+    strcat(func, funcType);
+
+    char src1[1000];
+    strcpy(src1, src);
+    strcat(src1, "/");
+
+    char src1_second[1000];
+    strcpy(src1_second, src_second);
+    strcat(src1_second, "/");
+
+    strcat(funcName, funcType);
+
+    // Get number of images (every directory entry other than "." and ".." is counted)
+    struct dirent *de;
+    DIR *dr = opendir(src);
+    if (dr == NULL)
+    {
+        // readdir() on a NULL stream is undefined, so stop here when the folder cannot be opened
+        printf("\nUnable to open src1 folder %s\n", src);
+        return -1;
+    }
+    while ((de = readdir(dr)) != NULL)
+        noOfImages += (strcmp(de->d_name, ".") != 0 && strcmp(de->d_name, "..") != 0) ? 1 : 0;
+    closedir(dr);
+
+    // Initialize ROI tensors for src/dst
+
+    RpptROI *roiTensorPtrSrc = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+    RpptROI *roiTensorPtrDst = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+
+    // Set ROI tensors types for src/dst
+
+    RpptRoiType roiTypeSrc, roiTypeDst;
+    roiTypeSrc = RpptRoiType::XYWH;
+    roiTypeDst = RpptRoiType::XYWH;
+
+    // Set maxHeight, maxWidth and ROIs for src/dst
+
+    const int images = noOfImages;
+    char imageNames[images][1000];
+
+    DIR *dr1 = opendir(src);
+    while ((de = readdir(dr1)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        strcpy(imageNames[count], de->d_name);
+        char temp[1000];
+        strcpy(temp, src1);
+        strcat(temp, imageNames[count]);
+
+        image = imread(temp, 1);
+
+        roiTensorPtrSrc[count].xywhROI.xy.x = 0;
+        roiTensorPtrSrc[count].xywhROI.xy.y = 0;
+        roiTensorPtrSrc[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrSrc[count].xywhROI.roiHeight = image.rows;
+
+        roiTensorPtrDst[count].xywhROI.xy.x = 0;
+        roiTensorPtrDst[count].xywhROI.xy.y = 0;
+        roiTensorPtrDst[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrDst[count].xywhROI.roiHeight = image.rows;
+
+        maxHeight = RPPMAX2(maxHeight, roiTensorPtrSrc[count].xywhROI.roiHeight);
+        maxWidth = RPPMAX2(maxWidth, roiTensorPtrSrc[count].xywhROI.roiWidth);
+        maxDstHeight = RPPMAX2(maxDstHeight, roiTensorPtrDst[count].xywhROI.roiHeight);
+        maxDstWidth = RPPMAX2(maxDstWidth, roiTensorPtrDst[count].xywhROI.roiWidth);
+
+        count++;
+    }
+    closedir(dr1);
+
+    // Set numDims, offset, n/c/h/w values for src/dst
+
+    srcDescPtr->numDims = 4;
+    dstDescPtr->numDims = 4;
+
+    srcDescPtr->offset = 0;
+    dstDescPtr->offset = 0;
+
+    srcDescPtr->n = noOfImages;
+    srcDescPtr->h = maxHeight;
+    srcDescPtr->w = maxWidth;
+    srcDescPtr->c = ip_channel;
+
+    dstDescPtr->n = noOfImages;
+    dstDescPtr->h = maxDstHeight;
+    dstDescPtr->w = maxDstWidth;
+    dstDescPtr->c = ip_channel;
+
+    // Optionally set w stride as a multiple of 8 for src/dst
+
+    srcDescPtr->w = ((srcDescPtr->w / 8) * 8) + 8;
+    dstDescPtr->w = ((dstDescPtr->w / 8) * 8) + 8;
+
+    // Set n/c/h/w strides for src/dst
+
+    srcDescPtr->strides.nStride = ip_channel * srcDescPtr->w * srcDescPtr->h;
+    srcDescPtr->strides.hStride = ip_channel * srcDescPtr->w;
+    srcDescPtr->strides.wStride = ip_channel;
+    srcDescPtr->strides.cStride = 1;
+
+    if (dstDescPtr->layout == RpptLayout::NHWC)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = ip_channel * dstDescPtr->w;
+        dstDescPtr->strides.wStride = ip_channel;
+        dstDescPtr->strides.cStride = 1;
+    }
+    else if (dstDescPtr->layout == RpptLayout::NCHW)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.cStride = dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = dstDescPtr->w;
+        dstDescPtr->strides.wStride = 1;
+    }
+
+    // Set buffer sizes for src/dst
+
+    ioBufferSize = (unsigned long long)srcDescPtr->h * (unsigned long long)srcDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+    oBufferSize = (unsigned long long)dstDescPtr->h * (unsigned long long)dstDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+
+    // Initialize host buffers for src/dst (inputs hold ioBufferSize elements, outputs hold oBufferSize elements)
+
+    Rpp8u *input = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *input_second = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *output = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u));
+
+    Rpp16f *inputf16 = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f));
+    Rpp16f *inputf16_second = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f));
+    Rpp16f *outputf16 = (Rpp16f *)calloc(oBufferSize, sizeof(Rpp16f));
+
+    Rpp32f *inputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+    Rpp32f *inputf32_second = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+    Rpp32f *outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f));
+
+    Rpp8s *inputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+    Rpp8s *inputi8_second = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+    Rpp8s *outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s));
+
+    // Set 8u host buffers for src/dst (src2 files are looked up by the names found in src1)
+
+    DIR *dr2 = opendir(src);
+    i = 0;
+
+    while ((de = readdir(dr2)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+
+        char temp[1000];
+        strcpy(temp, src1);
+        strcat(temp, de->d_name);
+
+        char temp_second[1000];
+        strcpy(temp_second, src1_second);
+        strcat(temp_second, de->d_name);
+
+        image = imread(temp, 1);
+        image_second = imread(temp_second, 1);
+
+        // Both copies below read the src1 ROI extent, so a missing or differently sized src2 image would be read out of bounds
+        if (image_second.rows != image.rows || image_second.cols != image.cols)
+        {
+            printf("\nImage %s is missing from src2 or differs in size from its src1 counterpart\n", de->d_name);
+            return -1;
+        }
+
+        Rpp8u *ip_image = image.data;
+        Rpp8u *ip_image_second = image_second.data;
+        Rpp8u *input_temp = input + ((unsigned long long)i * srcDescPtr->strides.nStride);   // widened so the image offset cannot wrap in 32 bits
+        Rpp8u *input_second_temp = input_second + ((unsigned long long)i * srcDescPtr->strides.nStride);
+        Rpp32u elementsInRow = roiTensorPtrSrc[i].xywhROI.roiWidth * ip_channel;
+        for (j = 0; j < roiTensorPtrSrc[i].xywhROI.roiHeight; j++)
+        {
+            memcpy(input_temp, ip_image, elementsInRow * sizeof (Rpp8u));
+            memcpy(input_second_temp, ip_image_second, elementsInRow * sizeof (Rpp8u));
+            ip_image += elementsInRow;
+            ip_image_second += elementsInRow;
+            input_temp += srcDescPtr->strides.hStride;
+            input_second_temp += srcDescPtr->strides.hStride;
+        }
+        i++;
+    }
+    closedir(dr2);
+
+    // Convert the u8 inputs into the f16 / f32 / i8 input buffers (loop counters match the width of ioBufferSize)
+
+    if (ip_bitDepth == 1)
+    {
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp16f *inputf16Temp, *inputf16_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputf16Temp = inputf16;
+        inputf16_secondTemp = inputf16_second;
+
+        for (unsigned long long elem = 0; elem < ioBufferSize; elem++)
+        {
+            *inputf16Temp = ((Rpp16f)*inputTemp) / 255.0;
+            *inputf16_secondTemp = ((Rpp16f)*input_secondTemp) / 255.0;
+            inputTemp++;
+            inputf16Temp++;
+            input_secondTemp++;
+            inputf16_secondTemp++;
+        }
+    }
+    else if (ip_bitDepth == 2)
+    {
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp32f *inputf32Temp, *inputf32_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputf32Temp = inputf32;
+        inputf32_secondTemp = inputf32_second;
+
+        for (unsigned long long elem = 0; elem < ioBufferSize; elem++)
+        {
+            *inputf32Temp = ((Rpp32f)*inputTemp) / 255.0;
+            *inputf32_secondTemp = ((Rpp32f)*input_secondTemp) / 255.0;
+            inputTemp++;
+            inputf32Temp++;
+            input_secondTemp++;
+            inputf32_secondTemp++;
+        }
+    }
+    else if (ip_bitDepth == 5)
+    {
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp8s *inputi8Temp, *inputi8_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputi8Temp = inputi8;
+        inputi8_secondTemp = inputi8_second;
+
+        for (unsigned long long elem = 0; elem < ioBufferSize; elem++)
+        {
+            *inputi8Temp = (Rpp8s) (((Rpp32s) *inputTemp) - 128);
+            *inputi8_secondTemp = (Rpp8s) (((Rpp32s) *input_secondTemp) - 128);
+            inputTemp++;
+            inputi8Temp++;
+            input_secondTemp++;
+            inputi8_secondTemp++;
+        }
+    }
+
+    // Run case-wise RPP API and measure time
+
+    rppHandle_t handle;
+    rppCreateWithBatchSize(&handle, noOfImages);
+
+    double max_time_used = 0, min_time_used = 500, avg_time_used = 0;
+
+    string test_case_name;
+
+    printf("\nRunning %s 100 times (each time with a batch size of %d images) and computing mean statistics...", func, noOfImages);
+
+    for (int perfRunCount = 0; perfRunCount < 100; perfRunCount++)
+    {
+        clock_t start, end;
+        double start_omp, end_omp;
+        double cpu_time_used, omp_time_used;
+        switch (test_case)
+        {
+        case 0:
+        {
+            test_case_name = "brightness";
+
+            Rpp32f alpha[images];
+            Rpp32f beta[images];
+            for (i = 0; i < images; i++)
+            {
+                alpha[i] = 1.75;
+                beta[i] = 50;
+
+                // xywhROI override sample
+                // roiTensorPtrSrc[i].xywhROI.xy.x = 0;
+                // roiTensorPtrSrc[i].xywhROI.xy.y = 0;
+                // roiTensorPtrSrc[i].xywhROI.roiWidth = 100;
+                // roiTensorPtrSrc[i].xywhROI.roiHeight = 180;
+
+                // ltrbROI override sample
+                // roiTensorPtrSrc[i].ltrbROI.lt.x = 50;
+                // roiTensorPtrSrc[i].ltrbROI.lt.y = 50;
+                // roiTensorPtrSrc[i].ltrbROI.rb.x = 199;
+                // roiTensorPtrSrc[i].ltrbROI.rb.y = 149;
+            }
+
+            // Change RpptRoiType for ltrbROI override sample
+            // roiTypeSrc = RpptRoiType::LTRB;
+            // roiTypeDst = RpptRoiType::LTRB;
+
+            start_omp = omp_get_wtime();
+            start = clock();
+            if (ip_bitDepth == 0)
+                rppt_brightness_host(input, srcDescPtr, output, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 1)
+                rppt_brightness_host(inputf16, srcDescPtr, outputf16, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 2)
+                rppt_brightness_host(inputf32, srcDescPtr, outputf32, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 3)
+                missingFuncFlag = 1;
+            else if (ip_bitDepth == 4)
+                missingFuncFlag = 1;
+            else if (ip_bitDepth == 5)
+                rppt_brightness_host(inputi8, srcDescPtr, outputi8, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 6)
+                missingFuncFlag = 1;
+            else
+                missingFuncFlag = 1;
+            end = clock();
+            end_omp = omp_get_wtime();
+
+            break;
+        }
+        default:
+            missingFuncFlag = 1;
+            break;
+        }
+
+        if (missingFuncFlag == 1)
+        {
+            printf("\nThe functionality %s doesn't yet exist in RPP\n", func);
+            return -1;
+        }
+
+        cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;   // processor time measured with clock()
+        omp_time_used = end_omp - start_omp;   // NOTE(review): wall-clock time from omp_get_wtime() is computed but never reported; the statistics below use clock() time - confirm this is intended
+        if (cpu_time_used > max_time_used)
+            max_time_used = cpu_time_used;
+        if (cpu_time_used < min_time_used)
+            min_time_used = cpu_time_used;
+        avg_time_used += cpu_time_used;   // running sum; divided by the run count after the loop
+    }
+
+    avg_time_used /= 100;
+
+    // Display measured times
+
+    cout << fixed << "\nmax,min,avg = " << max_time_used << "," << min_time_used << "," << avg_time_used << endl;
+
+    rppDestroyHost(handle);
+
+    // Free memory
+
+    free(roiTensorPtrSrc);
+    free(roiTensorPtrDst);
+    free(input);
+    free(input_second);
+    free(output);
+    free(inputf16);
+    free(inputf16_second);
+    free(outputf16);
+    free(inputf32);
+    free(inputf32_second);
+    free(outputf32);
+    free(inputi8);
+    free(inputi8_second);
+    free(outputi8);
+
+    return 0;
+}
diff --git a/utilities/rpp-performancetests/HOST_NEW/Tensor_host_pln1.cpp b/utilities/rpp-performancetests/HOST_NEW/Tensor_host_pln1.cpp
new file mode 100644
index 000000000..cb5c9801d
--- /dev/null
+++ b/utilities/rpp-performancetests/HOST_NEW/Tensor_host_pln1.cpp
@@ -0,0 +1,517 @@
+#include <stdio.h>
+#include <dirent.h>
+#include <string.h>
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/opencv.hpp>
+#include <iostream>
+#include "/opt/rocm/rpp/include/rpp.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <time.h>
+#include <omp.h>
+#include <half.hpp>
+#include <fstream>
+#include "helpers/testSuite_helper.hpp"
+
+using namespace cv;
+using namespace std;
+using half_float::half;
+
+typedef half Rpp16f;
+
+#define RPPPIXELCHECK(pixel) (((pixel) < (Rpp32f)0) ? ((Rpp32f)0) : (((pixel) < (Rpp32f)255) ? (pixel) : ((Rpp32f)255))) // clamp to [0, 255]; the argument is evaluated more than once
+#define RPPMAX2(a,b) (((a) > (b)) ? (a) : (b)) // larger of a and b; an argument may be evaluated twice, so avoid side effects
+#define RPPMIN2(a,b) (((a) < (b)) ? (a) : (b)) // smaller of a and b; an argument may be evaluated twice, so avoid side effects
+
+int main(int argc, char **argv)
+{
+    // Handle inputs: argv[1]/argv[2] = source folders, argv[3] = bit depth, argv[4] = outputFormatToggle, argv[5] = case number, argv[6] = verbosity
+
+    const int MIN_ARG_COUNT = 7; // program name plus the six arguments listed in the usage text
+
+    if (argc < MIN_ARG_COUNT)
+    {
+        printf("\nImproper Usage! Needs all arguments!\n");
+        printf("\nUsage: ./Tensor_host_pln1 <src1 folder> <src2 folder (place same as src1 folder for single image functionalities)> <u8 = 0 / f16 = 1 / f32 = 2 / u8->f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> <outputFormatToggle (pkd->pkd = 0 / pkd->pln = 1)> <case number = 0:81> <verbosity = 0/1>\n");
+        return -1;
+    }
+    if (atoi(argv[4]) != 0) // this PLN1 program rejects any outputFormatToggle other than 0
+    {
+        printf("\nPLN1 cases don't have outputFormatToggle! Please input outputFormatToggle = 0\n");
+        return -1;
+    }
+
+    if (atoi(argv[6]) == 1) // verbosity = 1: echo the received arguments
+    {
+        printf("\nInputs for this test case are:");
+        printf("\nsrc1 = %s", argv[1]);
+        printf("\nsrc2 = %s", argv[2]);
+        printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[3]);
+        printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[4]);
+        printf("\ncase number (1:7) = %s", argv[5]); // NOTE(review): the usage text says 0:81 and only case 0 is handled in this file
+    }
+
+    char *src = argv[1];
+    char *src_second = argv[2];
+    int ip_bitDepth = atoi(argv[3]); // atoi yields 0 for non-numeric text, which then selects u8
+    unsigned int outputFormatToggle = atoi(argv[4]); // already checked to be 0 above
+    int test_case = atoi(argv[5]);
+
+    int ip_channel = 1; // single channel: this is the PLN1 variant
+
+    // Set case names: funcType is the fixed suffix, funcName the per-case prefix that later receives the bit-depth tag
+
+    char funcType[1000] = {"Tensor_HOST_PLN1_toPLN1"};
+
+    char funcName[1000] = {0}; // zero-filled so the strcat/strcpy calls that follow stay defined when test_case matches no case below
+    switch (test_case)
+    {
+    case 0:
+        strcpy(funcName, "brightness");
+        break;
+    }
+
+    // Initialize tensor descriptors (two local descriptor objects, accessed through srcDescPtr/dstDescPtr from here on)
+
+    RpptDesc srcDesc, dstDesc;
+    RpptDescPtr srcDescPtr, dstDescPtr;
+    srcDescPtr = &srcDesc;
+    dstDescPtr = &dstDesc;
+
+    // Set src/dst layouts in tensor descriptors (both NCHW; this program has no layout toggle)
+
+    srcDescPtr->layout = RpptLayout::NCHW;
+    dstDescPtr->layout = RpptLayout::NCHW;
+
+    // Set src/dst data types in tensor descriptors and append the bit-depth tag to funcName; values outside 0..6 change neither
+
+    if (ip_bitDepth == 0) // u8 in, u8 out
+    {
+        strcat(funcName, "_u8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::U8;
+    }
+    else if (ip_bitDepth == 1) // f16 in, f16 out
+    {
+        strcat(funcName, "_f16_");
+        srcDescPtr->dataType = RpptDataType::F16;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 2) // f32 in, f32 out
+    {
+        strcat(funcName, "_f32_");
+        srcDescPtr->dataType = RpptDataType::F32;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 3) // u8 in, f16 out
+    {
+        strcat(funcName, "_u8_f16_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 4) // u8 in, f32 out
+    {
+        strcat(funcName, "_u8_f32_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 5) // i8 in, i8 out
+    {
+        strcat(funcName, "_i8_");
+        srcDescPtr->dataType = RpptDataType::I8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+    else if (ip_bitDepth == 6) // u8 in, i8 out
+    {
+        strcat(funcName, "_u8_i8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+
+    // Other initializations
+
+    int missingFuncFlag = 0; // set to 1 by the timing loop when the requested case / bit depth has no RPP call
+    int i = 0, j = 0;
+    int maxHeight = 0, maxWidth = 0;
+    int maxDstHeight = 0, maxDstWidth = 0;
+    unsigned long long count = 0;
+    unsigned long long ioBufferSize = 0; // element count of the source batch buffers
+    unsigned long long oBufferSize = 0; // element count of the destination batch buffer
+    static int noOfImages = 0;
+    Mat image, image_second;
+
+    // String ops on function name: func = funcName + funcType is the label printed in the messages further down
+
+    char func[1000];
+    strcpy(func, funcName);
+    strcat(func, funcType);
+
+    char src1[1000]; // src followed by '/', used as the path prefix for files of the first batch
+    strcpy(src1, src); // NOTE(review): unbounded copy - a folder argument of 999+ characters overflows this array
+    strcat(src1, "/");
+
+    char src1_second[1000]; // src_second followed by '/', path prefix for files of the second batch
+    strcpy(src1_second, src_second); // NOTE(review): same unbounded copy as for src1
+    strcat(src1_second, "/");
+
+    strcat(funcName, funcType);
+
+    // Get number of images: every directory entry except "." and ".." is counted (entries are not checked to be images)
+
+    struct dirent *de;
+    DIR *dr = opendir(src); // NOTE(review): the result is passed to readdir without a NULL check
+    while ((de = readdir(dr)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        noOfImages += 1;
+    }
+    closedir(dr);
+
+    // Initialize ROI tensors for src/dst (one zero-initialised RpptROI per counted image)
+
+    RpptROI *roiTensorPtrSrc = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+    RpptROI *roiTensorPtrDst = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+
+    // Set ROI tensors types for src/dst
+
+    RpptRoiType roiTypeSrc, roiTypeDst;
+    roiTypeSrc = RpptRoiType::XYWH;
+    roiTypeDst = RpptRoiType::XYWH;
+
+    // Set maxHeight, maxWidth and ROIs for src/dst: second pass over src, reading each file to get its dimensions
+
+    const int images = noOfImages;
+    char imageNames[images][1000]; // runtime-sized array: one 1000-byte name slot per counted image
+
+    DIR *dr1 = opendir(src);
+    while ((de = readdir(dr1)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        strcpy(imageNames[count], de->d_name);
+        char temp[1000];
+        strcpy(temp, src1);
+        strcat(temp, imageNames[count]);
+
+        image = imread(temp, 0); // flag 0: OpenCV loads the file as a single-channel grayscale image
+
+        roiTensorPtrSrc[count].xywhROI.xy.x = 0; // source ROI covers the whole image
+        roiTensorPtrSrc[count].xywhROI.xy.y = 0;
+        roiTensorPtrSrc[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrSrc[count].xywhROI.roiHeight = image.rows;
+
+        roiTensorPtrDst[count].xywhROI.xy.x = 0; // destination ROI is set to the same full-image rectangle
+        roiTensorPtrDst[count].xywhROI.xy.y = 0;
+        roiTensorPtrDst[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrDst[count].xywhROI.roiHeight = image.rows;
+
+        maxHeight = RPPMAX2(maxHeight, roiTensorPtrSrc[count].xywhROI.roiHeight); // running maxima over the batch
+        maxWidth = RPPMAX2(maxWidth, roiTensorPtrSrc[count].xywhROI.roiWidth);
+        maxDstHeight = RPPMAX2(maxDstHeight, roiTensorPtrDst[count].xywhROI.roiHeight);
+        maxDstWidth = RPPMAX2(maxDstWidth, roiTensorPtrDst[count].xywhROI.roiWidth);
+
+        count++;
+    }
+    closedir(dr1);
+
+    // Set numDims, offset, n/c/h/w values for src/dst (h and w are the batch maxima, so every image fits in one slot)
+
+    srcDescPtr->numDims = 4;
+    dstDescPtr->numDims = 4;
+
+    srcDescPtr->offset = 0;
+    dstDescPtr->offset = 0;
+
+    srcDescPtr->n = noOfImages;
+    srcDescPtr->c = ip_channel;
+    srcDescPtr->h = maxHeight;
+    srcDescPtr->w = maxWidth;
+
+    dstDescPtr->n = noOfImages;
+    dstDescPtr->c = ip_channel;
+    dstDescPtr->h = maxDstHeight;
+    dstDescPtr->w = maxDstWidth;
+
+    // Optionally set w stride as a multiple of 8 for src/dst (a width that is already a multiple of 8 still grows by 8)
+
+    srcDescPtr->w = ((srcDescPtr->w / 8) * 8) + 8;
+    dstDescPtr->w = ((dstDescPtr->w / 8) * 8) + 8;
+
+    // Set n/c/h/w strides for src/dst, in elements, from the padded width
+
+    srcDescPtr->strides.nStride = ip_channel * srcDescPtr->w * srcDescPtr->h;
+    srcDescPtr->strides.cStride = srcDescPtr->w * srcDescPtr->h;
+    srcDescPtr->strides.hStride = srcDescPtr->w;
+    srcDescPtr->strides.wStride = 1;
+
+    if (dstDescPtr->layout == RpptLayout::NHWC) // not taken in this program: the dst layout was set to NCHW earlier
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = ip_channel * dstDescPtr->w;
+        dstDescPtr->strides.wStride = ip_channel;
+        dstDescPtr->strides.cStride = 1;
+    }
+    else if (dstDescPtr->layout == RpptLayout::NCHW)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.cStride = dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = dstDescPtr->w;
+        dstDescPtr->strides.wStride = 1;
+    }
+
+    // Set buffer sizes for src/dst as element counts (h * padded w * channels * images), computed in 64-bit
+
+    ioBufferSize = (unsigned long long)srcDescPtr->h * (unsigned long long)srcDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+    oBufferSize = (unsigned long long)dstDescPtr->h * (unsigned long long)dstDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+
+    // Initialize host buffers for src/dst: zero-filled; inputs hold ioBufferSize elements, outputs hold oBufferSize elements
+
+    Rpp8u *input = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *input_second = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *output = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u));
+
+    Rpp16f *inputf16 = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f));
+    Rpp16f *inputf16_second = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f));
+    Rpp16f *outputf16 = (Rpp16f *)calloc(oBufferSize, sizeof(Rpp16f)); // sized from the dst descriptor, like the u8 output
+
+    Rpp32f *inputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+    Rpp32f *inputf32_second = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+    Rpp32f *outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f)); // sized from the dst descriptor, like the u8 output
+
+    Rpp8s *inputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+    Rpp8s *inputi8_second = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+    Rpp8s *outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s)); // sized from the dst descriptor, like the u8 output
+
+    // Set 8u host buffers for src/dst: copy every image row by row into its padded slot of the two input batches
+
+    DIR *dr2 = opendir(src);
+    // No directory stream is opened for src_second: its files are read by name below, so a stream would only be leaked
+    count = 0;
+    i = 0;
+
+    Rpp32u elementsInRowMax = srcDescPtr->w * ip_channel; // padded row length inside the batch buffers
+
+    while ((de = readdir(dr2)) != NULL)
+    {
+        Rpp8u *input_temp, *input_second_temp;
+        input_temp = input + (i * srcDescPtr->strides.nStride); // start of slot i in each batch
+        input_second_temp = input_second + (i * srcDescPtr->strides.nStride);
+
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+
+        char temp[1000];
+        strcpy(temp, src1);
+        strcat(temp, de->d_name);
+
+        char temp_second[1000]; // the second batch uses the same file name under the second folder
+        strcpy(temp_second, src1_second);
+        strcat(temp_second, de->d_name);
+
+        image = imread(temp, 0); // NOTE(review): neither imread result is checked for an empty Mat before its data is copied
+        image_second = imread(temp_second, 0);
+
+        Rpp8u *ip_image = image.data;
+        Rpp8u *ip_image_second = image_second.data;
+
+        Rpp32u elementsInRow = roiTensorPtrSrc[i].xywhROI.roiWidth * ip_channel; // unpadded row length of this image
+
+        for (j = 0; j < roiTensorPtrSrc[i].xywhROI.roiHeight; j++)
+        {
+            memcpy(input_temp, ip_image, elementsInRow * sizeof (Rpp8u));
+            memcpy(input_second_temp, ip_image_second, elementsInRow * sizeof (Rpp8u)); // uses the first image's size for the second image too
+            ip_image += elementsInRow;
+            ip_image_second += elementsInRow;
+            input_temp += elementsInRowMax; // advance by the padded stride in the destination
+            input_second_temp += elementsInRowMax;
+        }
+        i++;
+        count += srcDescPtr->strides.nStride;
+    }
+    closedir(dr2);
+
+    // Convert inputs to test various other bit depths: the u8 batches are the source, the typed batch of the chosen depth is filled
+
+    if (ip_bitDepth == 1) // f16: scale u8 values by 1/255
+    {
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp16f *inputf16Temp, *inputf16_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputf16Temp = inputf16;
+        inputf16_secondTemp = inputf16_second;
+
+        for (unsigned long long i = 0; i < ioBufferSize; i++) // index has the same 64-bit unsigned type as ioBufferSize
+        {
+            *inputf16Temp = ((Rpp16f)*inputTemp) / 255.0;
+            *inputf16_secondTemp = ((Rpp16f)*input_secondTemp) / 255.0;
+            inputTemp++;
+            inputf16Temp++;
+            input_secondTemp++;
+            inputf16_secondTemp++;
+        }
+    }
+    else if (ip_bitDepth == 2) // f32: scale u8 values by 1/255
+    {
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp32f *inputf32Temp, *inputf32_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputf32Temp = inputf32;
+        inputf32_secondTemp = inputf32_second;
+
+        for (unsigned long long i = 0; i < ioBufferSize; i++) // index has the same 64-bit unsigned type as ioBufferSize
+        {
+            *inputf32Temp = ((Rpp32f)*inputTemp) / 255.0;
+            *inputf32_secondTemp = ((Rpp32f)*input_secondTemp) / 255.0;
+            inputTemp++;
+            inputf32Temp++;
+            input_secondTemp++;
+            inputf32_secondTemp++;
+        }
+    }
+    else if (ip_bitDepth == 5) // i8: shift u8 values down by 128
+    {
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp8s *inputi8Temp, *inputi8_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputi8Temp = inputi8;
+        inputi8_secondTemp = inputi8_second;
+
+        for (unsigned long long i = 0; i < ioBufferSize; i++) // index has the same 64-bit unsigned type as ioBufferSize
+        {
+            *inputi8Temp = (Rpp8s) (((Rpp32s) *inputTemp) - 128);
+            *inputi8_secondTemp = (Rpp8s) (((Rpp32s) *input_secondTemp) - 128);
+            inputTemp++;
+            inputi8Temp++;
+            input_secondTemp++;
+            inputi8_secondTemp++;
+        }
+    }
+
+    // Run case-wise RPP API and measure time: 100 timed calls, keeping max / min / sum of the clock()-based duration
+
+    rppHandle_t handle;
+    rppCreateWithBatchSize(&handle, noOfImages);
+
+    double max_time_used = 0, min_time_used = 500, avg_time_used = 0; // NOTE(review): min starts at 500, so a run longer than that cannot lower it
+
+    string test_case_name;
+
+    printf("\nRunning %s 100 times (each time with a batch size of %d images) and computing mean statistics...", func, noOfImages);
+
+    for (int perfRunCount = 0; perfRunCount < 100; perfRunCount++)
+    {
+        clock_t start, end;
+        double start_omp, end_omp;
+        double cpu_time_used, omp_time_used;
+        switch (test_case)
+        {
+        case 0:
+        {
+            test_case_name = "brightness";
+
+            Rpp32f alpha[images]; // per-image parameters, refilled with the same constants on every run
+            Rpp32f beta[images];
+            for (i = 0; i < images; i++)
+            {
+                alpha[i] = 1.75;
+                beta[i] = 50;
+
+                // xywhROI override sample
+                // roiTensorPtrSrc[i].xywhROI.xy.x = 0;
+                // roiTensorPtrSrc[i].xywhROI.xy.y = 0;
+                // roiTensorPtrSrc[i].xywhROI.roiWidth = 100;
+                // roiTensorPtrSrc[i].xywhROI.roiHeight = 180;
+
+                // ltrbROI override sample
+                // roiTensorPtrSrc[i].ltrbROI.lt.x = 50;
+                // roiTensorPtrSrc[i].ltrbROI.lt.y = 50;
+                // roiTensorPtrSrc[i].ltrbROI.rb.x = 199;
+                // roiTensorPtrSrc[i].ltrbROI.rb.y = 149;
+            }
+
+            // Change RpptRoiType for ltrbROI override sample
+            // roiTypeSrc = RpptRoiType::LTRB;
+            // roiTypeDst = RpptRoiType::LTRB;
+
+            start_omp = omp_get_wtime();
+            start = clock();
+            if (ip_bitDepth == 0) // pick the buffers that match the requested bit depth
+                rppt_brightness_host(input, srcDescPtr, output, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 1)
+                rppt_brightness_host(inputf16, srcDescPtr, outputf16, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 2)
+                rppt_brightness_host(inputf32, srcDescPtr, outputf32, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 3) // mixed-type variants (3, 4, 6) have no call here and are reported as missing
+                missingFuncFlag = 1;
+            else if (ip_bitDepth == 4)
+                missingFuncFlag = 1;
+            else if (ip_bitDepth == 5)
+                rppt_brightness_host(inputi8, srcDescPtr, outputi8, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 6)
+                missingFuncFlag = 1;
+            else
+                missingFuncFlag = 1;
+            end = clock();
+            end_omp = omp_get_wtime();
+
+            break;
+        }
+        default:
+            missingFuncFlag = 1;
+            break;
+        }
+
+        if (missingFuncFlag == 1)
+        {
+            printf("\nThe functionality %s doesn't yet exist in RPP\n", func);
+            return -1; // NOTE(review): exits without destroying the handle or freeing the host buffers
+        }
+
+        cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC; // clock() counts processor time, not wall-clock time
+        omp_time_used = end_omp - start_omp; // wall-clock duration; computed but not used in the statistics below
+        if (cpu_time_used > max_time_used)
+            max_time_used = cpu_time_used;
+        if (cpu_time_used < min_time_used)
+            min_time_used = cpu_time_used;
+        avg_time_used += cpu_time_used; // accumulated as a sum here, divided by the run count after the loop
+    }
+
+    avg_time_used /= 100; // 100 = iteration count of the timing loop
+
+    // Display measured times (values in seconds, derived from clock() / CLOCKS_PER_SEC)
+
+    cout << fixed << "\nmax,min,avg = " << max_time_used << "," << min_time_used << "," << avg_time_used << endl;
+
+    rppDestroyHost(handle);
+
+    // Free memory: every calloc'ed ROI tensor and host buffer of this function
+
+    free(roiTensorPtrSrc);
+    free(roiTensorPtrDst);
+    free(input);
+    free(input_second);
+    free(output);
+    free(inputf16);
+    free(inputf16_second);
+    free(outputf16);
+    free(inputf32);
+    free(inputf32_second);
+    free(outputf32);
+    free(inputi8);
+    free(inputi8_second);
+    free(outputi8);
+
+    return 0;
+}
diff --git a/utilities/rpp-performancetests/HOST_NEW/Tensor_host_pln3.cpp b/utilities/rpp-performancetests/HOST_NEW/Tensor_host_pln3.cpp
new file mode 100644
index 000000000..3bdd5fdd0
--- /dev/null
+++ b/utilities/rpp-performancetests/HOST_NEW/Tensor_host_pln3.cpp
@@ -0,0 +1,595 @@
+#include <stdio.h>
+#include <dirent.h>
+#include <string.h>
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/opencv.hpp>
+#include <iostream>
+#include "/opt/rocm/rpp/include/rpp.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <time.h>
+#include <omp.h>
+#include <half.hpp>
+#include <fstream>
+
+using namespace cv;
+using namespace std;
+using half_float::half;
+
+typedef half Rpp16f;
+
+#define RPPPIXELCHECK(pixel) (((pixel) < (Rpp32f)0) ? ((Rpp32f)0) : (((pixel) < (Rpp32f)255) ? (pixel) : ((Rpp32f)255))) // clamp to [0, 255]; the argument is evaluated more than once
+#define RPPMAX2(a,b) (((a) > (b)) ? (a) : (b)) // larger of a and b; an argument may be evaluated twice, so avoid side effects
+#define RPPMIN2(a,b) (((a) < (b)) ? (a) : (b)) // smaller of a and b; an argument may be evaluated twice, so avoid side effects
+
+int main(int argc, char **argv)
+{
+    // Handle inputs
+
+    const int MIN_ARG_COUNT = 7;
+
+    if (argc < MIN_ARG_COUNT)
+    {
+        printf("\nImproper Usage! Needs all arguments!\n");
+        printf("\nUsage: ./Tensor_host_pln3 <src1 folder> <src2 folder (place same as src1 folder for single image functionalities)> <u8 = 0 / f16 = 1 / f32 = 2 / u8->f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> <outputFormatToggle (pkd->pkd = 0 / pkd->pln = 1)> <case number = 0:81> <verbosity = 0/1>\n");
+        return -1;
+    }
+
+    if (atoi(argv[6]) == 1)
+    {
+        printf("\nInputs for this test case are:");
+        printf("\nsrc1 = %s", argv[1]);
+        printf("\nsrc2 = %s", argv[2]);
+        printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[3]);
+        printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[4]);
+        printf("\ncase number (1:7) = %s", argv[5]);
+    }
+
+    char *src = argv[1];
+    char *src_second = argv[2];
+    int ip_bitDepth = atoi(argv[3]);
+    unsigned int outputFormatToggle = atoi(argv[4]);
+    int test_case = atoi(argv[5]);
+
+    int ip_channel = 3;
+
+    // Set case names
+
+    char funcType[1000] = {"Tensor_HOST_PLN3"};
+
+    char funcName[1000];
+    switch (test_case)
+    {
+    case 0:
+        strcpy(funcName, "brightness");
+        break;
+    }
+
+    // Initialize tensor descriptors
+
+    RpptDesc srcDesc, dstDesc;
+    RpptDescPtr srcDescPtr, dstDescPtr;
+    srcDescPtr = &srcDesc;
+    dstDescPtr = &dstDesc;
+
+    // Set src/dst layouts in tensor descriptors
+
+    if (outputFormatToggle == 0)
+    {
+        strcat(funcType, "_toPLN3");
+        srcDescPtr->layout = RpptLayout::NCHW;
+        dstDescPtr->layout = RpptLayout::NCHW;
+    }
+    else if (outputFormatToggle == 1)
+    {
+        strcat(funcType, "_toPKD3");
+        srcDescPtr->layout = RpptLayout::NCHW;
+        dstDescPtr->layout = RpptLayout::NHWC;
+    }
+
+    // Set src/dst data types in tensor descriptors
+
+    if (ip_bitDepth == 0)
+    {
+        strcat(funcName, "_u8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::U8;
+    }
+    else if (ip_bitDepth == 1)
+    {
+        strcat(funcName, "_f16_");
+        srcDescPtr->dataType = RpptDataType::F16;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 2)
+    {
+        strcat(funcName, "_f32_");
+        srcDescPtr->dataType = RpptDataType::F32;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 3)
+    {
+        strcat(funcName, "_u8_f16_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 4)
+    {
+        strcat(funcName, "_u8_f32_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 5)
+    {
+        strcat(funcName, "_i8_");
+        srcDescPtr->dataType = RpptDataType::I8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+    else if (ip_bitDepth == 6)
+    {
+        strcat(funcName, "_u8_i8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+
+    // Other initializations
+
+    int missingFuncFlag = 0;
+    int i = 0, j = 0;
+    int maxHeight = 0, maxWidth = 0;
+    int maxDstHeight = 0, maxDstWidth = 0;
+    unsigned long long count = 0;
+    unsigned long long ioBufferSize = 0;
+    unsigned long long oBufferSize = 0;
+    static int noOfImages = 0;
+    Mat image, image_second;
+
+    // String ops on function name
+
+    char func[1000];
+    strcpy(func, funcName);
+    strcat(func, funcType);
+
+    char src1[1000];
+    strcpy(src1, src);
+    strcat(src1, "/");
+
+    char src1_second[1000];
+    strcpy(src1_second, src_second);
+    strcat(src1_second, "/");
+
+    strcat(funcName, funcType);
+
+    // Get number of images
+
+    struct dirent *de;
+    DIR *dr = opendir(src);
+    while ((de = readdir(dr)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        noOfImages += 1;
+    }
+    closedir(dr);
+
+    // Initialize ROI tensors for src/dst
+
+    RpptROI *roiTensorPtrSrc = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+    RpptROI *roiTensorPtrDst = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+
+    // Set ROI tensors types for src/dst
+
+    RpptRoiType roiTypeSrc, roiTypeDst;
+    roiTypeSrc = RpptRoiType::XYWH;
+    roiTypeDst = RpptRoiType::XYWH;
+
+    // Set maxHeight, maxWidth and ROIs for src/dst
+
+    const int images = noOfImages;
+    char imageNames[images][1000];
+
+    DIR *dr1 = opendir(src);
+    while ((de = readdir(dr1)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        strcpy(imageNames[count], de->d_name);
+        char temp[1000];
+        strcpy(temp, src1);
+        strcat(temp, imageNames[count]);
+
+        image = imread(temp, 1);
+
+        roiTensorPtrSrc[count].xywhROI.xy.x = 0;
+        roiTensorPtrSrc[count].xywhROI.xy.y = 0;
+        roiTensorPtrSrc[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrSrc[count].xywhROI.roiHeight = image.rows;
+
+        roiTensorPtrDst[count].xywhROI.xy.x = 0;
+        roiTensorPtrDst[count].xywhROI.xy.y = 0;
+        roiTensorPtrDst[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrDst[count].xywhROI.roiHeight = image.rows;
+
+        maxHeight = RPPMAX2(maxHeight, roiTensorPtrSrc[count].xywhROI.roiHeight);
+        maxWidth = RPPMAX2(maxWidth, roiTensorPtrSrc[count].xywhROI.roiWidth);
+        maxDstHeight = RPPMAX2(maxDstHeight, roiTensorPtrDst[count].xywhROI.roiHeight);
+        maxDstWidth = RPPMAX2(maxDstWidth, roiTensorPtrDst[count].xywhROI.roiWidth);
+
+        count++;
+    }
+    closedir(dr1);
+
+    // Set numDims, offset, n/c/h/w values for src/dst
+
+    srcDescPtr->numDims = 4;
+    dstDescPtr->numDims = 4;
+
+    srcDescPtr->offset = 0;
+    dstDescPtr->offset = 0;
+
+    srcDescPtr->n = noOfImages;
+    srcDescPtr->c = ip_channel;
+    srcDescPtr->h = maxHeight;
+    srcDescPtr->w = maxWidth;
+
+    dstDescPtr->n = noOfImages;
+    dstDescPtr->c = ip_channel;
+    dstDescPtr->h = maxDstHeight;
+    dstDescPtr->w = maxDstWidth;
+
+    // Optionally set w stride as a multiple of 8 for src/dst
+
+    srcDescPtr->w = ((srcDescPtr->w / 8) * 8) + 8;
+    dstDescPtr->w = ((dstDescPtr->w / 8) * 8) + 8;
+
+    // Set n/c/h/w strides for src/dst
+
+    srcDescPtr->strides.nStride = ip_channel * srcDescPtr->w * srcDescPtr->h;
+    srcDescPtr->strides.cStride = srcDescPtr->w * srcDescPtr->h;
+    srcDescPtr->strides.hStride = srcDescPtr->w;
+    srcDescPtr->strides.wStride = 1;
+
+    if (dstDescPtr->layout == RpptLayout::NHWC)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = ip_channel * dstDescPtr->w;
+        dstDescPtr->strides.wStride = ip_channel;
+        dstDescPtr->strides.cStride = 1;
+    }
+    else if (dstDescPtr->layout == RpptLayout::NCHW)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.cStride = dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = dstDescPtr->w;
+        dstDescPtr->strides.wStride = 1;
+    }
+
+    // Set buffer sizes for src/dst
+
+    ioBufferSize = (unsigned long long)srcDescPtr->h * (unsigned long long)srcDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+    oBufferSize = (unsigned long long)dstDescPtr->h * (unsigned long long)dstDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+
+    // Initialize host buffers for src/dst
+
+    Rpp8u *input = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *input_second = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *output = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u));
+
+    Rpp16f *inputf16 = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f));
+    Rpp16f *inputf16_second = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f));
+    Rpp16f *outputf16 = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f));
+
+    Rpp32f *inputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+    Rpp32f *inputf32_second = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+    Rpp32f *outputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+
+    Rpp8s *inputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+    Rpp8s *inputi8_second = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+    Rpp8s *outputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+
+    // Set 8u host buffers for src/dst
+
+    DIR *dr2 = opendir(src);
+    DIR *dr2_second = opendir(src_second);
+    count = 0;
+    i = 0;
+
+    Rpp32u elementsInRowMax = srcDescPtr->w * ip_channel;
+
+    while ((de = readdir(dr2)) != NULL)
+    {
+        Rpp8u *input_temp, *input_second_temp;
+        input_temp = input + (i * srcDescPtr->strides.nStride);
+        input_second_temp = input_second + (i * srcDescPtr->strides.nStride);
+
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+
+        char temp[1000];
+        strcpy(temp, src1);
+        strcat(temp, de->d_name);
+
+        char temp_second[1000];
+        strcpy(temp_second, src1_second);
+        strcat(temp_second, de->d_name);
+
+        image = imread(temp, 1);
+        image_second = imread(temp_second, 1);
+
+        Rpp8u *ip_image = image.data;
+        Rpp8u *ip_image_second = image_second.data;
+
+        Rpp32u elementsInRow = roiTensorPtrSrc[i].xywhROI.roiWidth * ip_channel;
+
+        for (j = 0; j < roiTensorPtrSrc[i].xywhROI.roiHeight; j++)
+        {
+            memcpy(input_temp, ip_image, elementsInRow * sizeof (Rpp8u));
+            memcpy(input_second_temp, ip_image_second, elementsInRow * sizeof (Rpp8u));
+            ip_image += elementsInRow;
+            ip_image_second += elementsInRow;
+            input_temp += elementsInRowMax;
+            input_second_temp += elementsInRowMax;
+        }
+        i++;
+        count += srcDescPtr->strides.nStride;
+    }
+    closedir(dr2);
+
+    // Convert default OpenCV PKD3 to PLN3 for first input batch
+
+    Rpp8u *inputCopy = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    memcpy(inputCopy, input, ioBufferSize * sizeof(Rpp8u));
+
+    Rpp8u *inputTemp, *inputCopyTemp;
+    inputTemp = input;
+    inputCopyTemp = inputCopy;
+
+    for (int count = 0; count < noOfImages; count++)
+    {
+        Rpp8u *inputTempR, *inputTempG, *inputTempB;
+        inputTempR = inputTemp;
+        inputTempG = inputTempR + srcDescPtr->strides.cStride;
+        inputTempB = inputTempG + srcDescPtr->strides.cStride;
+
+        for (int i = 0; i < srcDescPtr->h; i++)
+        {
+            for (int j = 0; j < srcDescPtr->w; j++)
+            {
+                *inputTempR = *inputCopyTemp;
+                inputCopyTemp++;
+                inputTempR++;
+                *inputTempG = *inputCopyTemp;
+                inputCopyTemp++;
+                inputTempG++;
+                *inputTempB = *inputCopyTemp;
+                inputCopyTemp++;
+                inputTempB++;
+            }
+        }
+
+        inputTemp += srcDescPtr->strides.nStride;
+    }
+
+    free(inputCopy);
+
+    // Convert default OpenCV PKD3 to PLN3 for second input batch
+
+    Rpp8u *inputSecondCopy = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    memcpy(inputSecondCopy, input_second, ioBufferSize * sizeof(Rpp8u));
+
+    Rpp8u *inputSecondTemp, *inputSecondCopyTemp;
+    inputSecondTemp = input_second;
+    inputSecondCopyTemp = inputSecondCopy;
+
+    for (int count = 0; count < noOfImages; count++)
+    {
+        Rpp8u *inputSecondTempR, *inputSecondTempG, *inputSecondTempB;
+        inputSecondTempR = inputSecondTemp;
+        inputSecondTempG = inputSecondTempR + srcDescPtr->strides.cStride;
+        inputSecondTempB = inputSecondTempG + srcDescPtr->strides.cStride;
+
+        for (int i = 0; i < srcDescPtr->h; i++)
+        {
+            for (int j = 0; j < srcDescPtr->w; j++)
+            {
+                *inputSecondTempR = *inputSecondCopyTemp;
+                inputSecondCopyTemp++;
+                inputSecondTempR++;
+                *inputSecondTempG = *inputSecondCopyTemp;
+                inputSecondCopyTemp++;
+                inputSecondTempG++;
+                *inputSecondTempB = *inputSecondCopyTemp;
+                inputSecondCopyTemp++;
+                inputSecondTempB++;
+            }
+        }
+
+        inputSecondTemp += srcDescPtr->strides.nStride;
+    }
+
+    free(inputSecondCopy);
+
+    // Convert inputs to test various other bit depths
+
+    if (ip_bitDepth == 1)
+    {
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp16f *inputf16Temp, *inputf16_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputf16Temp = inputf16;
+        inputf16_secondTemp = inputf16_second;
+
+        for (int i = 0; i < ioBufferSize; i++)
+        {
+            *inputf16Temp = ((Rpp16f)*inputTemp) / 255.0;
+            *inputf16_secondTemp = ((Rpp16f)*input_secondTemp) / 255.0;
+            inputTemp++;
+            inputf16Temp++;
+            input_secondTemp++;
+            inputf16_secondTemp++;
+        }
+    }
+    else if (ip_bitDepth == 2)
+    {
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp32f *inputf32Temp, *inputf32_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputf32Temp = inputf32;
+        inputf32_secondTemp = inputf32_second;
+
+        for (int i = 0; i < ioBufferSize; i++)
+        {
+            *inputf32Temp = ((Rpp32f)*inputTemp) / 255.0;
+            *inputf32_secondTemp = ((Rpp32f)*input_secondTemp) / 255.0;
+            inputTemp++;
+            inputf32Temp++;
+            input_secondTemp++;
+            inputf32_secondTemp++;
+        }
+    }
+    else if (ip_bitDepth == 5)
+    {
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp8s *inputi8Temp, *inputi8_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputi8Temp = inputi8;
+        inputi8_secondTemp = inputi8_second;
+
+        for (int i = 0; i < ioBufferSize; i++)
+        {
+            *inputi8Temp = (Rpp8s) (((Rpp32s) *inputTemp) - 128);
+            *inputi8_secondTemp = (Rpp8s) (((Rpp32s) *input_secondTemp) - 128);
+            inputTemp++;
+            inputi8Temp++;
+            input_secondTemp++;
+            inputi8_secondTemp++;
+        }
+    }
+
+    // Run case-wise RPP API and measure time
+
+    rppHandle_t handle;
+    rppCreateWithBatchSize(&handle, noOfImages);
+
+    double max_time_used = 0, min_time_used = 500, avg_time_used = 0;
+
+    string test_case_name;
+
+    printf("\nRunning %s 100 times (each time with a batch size of %d images) and computing mean statistics...", func, noOfImages);
+
+    for (int perfRunCount = 0; perfRunCount < 100; perfRunCount++)
+    {
+        clock_t start, end;
+        double start_omp, end_omp;
+        double cpu_time_used, omp_time_used;
+        switch (test_case)
+        {
+        case 0:
+        {
+            test_case_name = "brightness";
+
+            Rpp32f alpha[images];
+            Rpp32f beta[images];
+            for (i = 0; i < images; i++)
+            {
+                alpha[i] = 1.75;
+                beta[i] = 50;
+
+                // xywhROI override sample
+                // roiTensorPtrSrc[i].xywhROI.xy.x = 0;
+                // roiTensorPtrSrc[i].xywhROI.xy.y = 0;
+                // roiTensorPtrSrc[i].xywhROI.roiWidth = 100;
+                // roiTensorPtrSrc[i].xywhROI.roiHeight = 180;
+
+                // ltrbROI override sample
+                // roiTensorPtrSrc[i].ltrbROI.lt.x = 50;
+                // roiTensorPtrSrc[i].ltrbROI.lt.y = 50;
+                // roiTensorPtrSrc[i].ltrbROI.rb.x = 199;
+                // roiTensorPtrSrc[i].ltrbROI.rb.y = 149;
+            }
+
+            // Change RpptRoiType for ltrbROI override sample
+            // roiTypeSrc = RpptRoiType::LTRB;
+            // roiTypeDst = RpptRoiType::LTRB;
+
+            start_omp = omp_get_wtime();
+            start = clock();
+            if (ip_bitDepth == 0)
+                rppt_brightness_host(input, srcDescPtr, output, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 1)
+                rppt_brightness_host(inputf16, srcDescPtr, outputf16, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 2)
+                rppt_brightness_host(inputf32, srcDescPtr, outputf32, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 3)
+                missingFuncFlag = 1;
+            else if (ip_bitDepth == 4)
+                missingFuncFlag = 1;
+            else if (ip_bitDepth == 5)
+                rppt_brightness_host(inputi8, srcDescPtr, outputi8, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+            else if (ip_bitDepth == 6)
+                missingFuncFlag = 1;
+            else
+                missingFuncFlag = 1;
+            end = clock();
+            end_omp = omp_get_wtime();
+
+            break;
+        }
+        default:
+            missingFuncFlag = 1;
+            break;
+        }
+
+        if (missingFuncFlag == 1)
+        {
+            printf("\nThe functionality %s doesn't yet exist in RPP\n", func);
+            return -1;
+        }
+
+        cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
+        omp_time_used = end_omp - start_omp;
+        if (cpu_time_used > max_time_used)
+            max_time_used = cpu_time_used;
+        if (cpu_time_used < min_time_used)
+            min_time_used = cpu_time_used;
+        avg_time_used += cpu_time_used;
+    }
+
+    avg_time_used /= 100;
+
+    // Display measured times
+
+    cout << fixed << "\nmax,min,avg = " << max_time_used << "," << min_time_used << "," << avg_time_used << endl;
+
+    rppDestroyHost(handle);
+
+    // Free memory
+
+    free(roiTensorPtrSrc);
+    free(roiTensorPtrDst);
+    free(input);
+    free(input_second);
+    free(output);
+    free(inputf16);
+    free(inputf16_second);
+    free(outputf16);
+    free(inputf32);
+    free(inputf32_second);
+    free(outputf32);
+    free(inputi8);
+    free(inputi8_second);
+    free(outputi8);
+
+    return 0;
+}
diff --git a/utilities/rpp-performancetests/HOST_NEW/generatePerformanceLogs.py b/utilities/rpp-performancetests/HOST_NEW/generatePerformanceLogs.py
index 423b13320..2e47cc255 100644
--- a/utilities/rpp-performancetests/HOST_NEW/generatePerformanceLogs.py
+++ b/utilities/rpp-performancetests/HOST_NEW/generatePerformanceLogs.py
@@ -27,7 +27,10 @@
 log_file_list = [
     "../OUTPUT_PERFORMANCE_LOGS_HOST_NEW/BatchPD_host_pkd3_host_raw_performance_log.txt",
     "../OUTPUT_PERFORMANCE_LOGS_HOST_NEW/BatchPD_host_pln3_host_raw_performance_log.txt",
-    "../OUTPUT_PERFORMANCE_LOGS_HOST_NEW/BatchPD_host_pln1_host_raw_performance_log.txt"
+    "../OUTPUT_PERFORMANCE_LOGS_HOST_NEW/BatchPD_host_pln1_host_raw_performance_log.txt",
+    "../OUTPUT_PERFORMANCE_LOGS_HOST_NEW/Tensor_host_pkd3_host_raw_performance_log.txt",
+    "../OUTPUT_PERFORMANCE_LOGS_HOST_NEW/Tensor_host_pln3_host_raw_performance_log.txt",
+    "../OUTPUT_PERFORMANCE_LOGS_HOST_NEW/Tensor_host_pln1_host_raw_performance_log.txt"
     ]
 
 functionality_group_list = [
diff --git a/utilities/rpp-performancetests/HOST_NEW/rawLogsGenScript.sh b/utilities/rpp-performancetests/HOST_NEW/rawLogsGenScript.sh
index f46cfe52b..1763873ae 100755
--- a/utilities/rpp-performancetests/HOST_NEW/rawLogsGenScript.sh
+++ b/utilities/rpp-performancetests/HOST_NEW/rawLogsGenScript.sh
@@ -152,6 +152,10 @@ do
 
             printf "\n./BatchPD_host_pkd3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0"
             ./BatchPD_host_pkd3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/BatchPD_host_pkd3_host_raw_performance_log.txt"
+
+            printf "\n./Tensor_host_pkd3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0"
+            ./Tensor_host_pkd3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/Tensor_host_pkd3_host_raw_performance_log.txt"
+
             echo "------------------------------------------------------------------------------------------"
         done
     done
@@ -188,6 +192,10 @@ do
 
             printf "\n./BatchPD_host_pln1 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0"
             ./BatchPD_host_pln1 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/BatchPD_host_pln1_host_raw_performance_log.txt"
+
+            printf "\n./Tensor_host_pln1 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0"
+            ./Tensor_host_pln1 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/Tensor_host_pln1_host_raw_performance_log.txt"
+
             echo "------------------------------------------------------------------------------------------"
         done
     done
@@ -224,6 +232,10 @@ do
 
             printf "\n./BatchPD_host_pln3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0"
             ./BatchPD_host_pln3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/BatchPD_host_pln3_host_raw_performance_log.txt"
+
+            printf "\n./Tensor_host_pln3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $bitDepth $outputFormatToggle $case 0"
+            ./Tensor_host_pln3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0" | tee -a "$DST_FOLDER/Tensor_host_pln3_host_raw_performance_log.txt"
+
             echo "------------------------------------------------------------------------------------------"
         done
     done
diff --git a/utilities/rpp-unittests/HIP_NEW/CMakeLists.txt b/utilities/rpp-unittests/HIP_NEW/CMakeLists.txt
index bd5254b85..1683839b5 100644
--- a/utilities/rpp-unittests/HIP_NEW/CMakeLists.txt
+++ b/utilities/rpp-unittests/HIP_NEW/CMakeLists.txt
@@ -23,12 +23,18 @@ if (hip_FOUND)
     add_executable(BatchPD_hip_pkd3 BatchPD_hip_pkd3.cpp)
     add_executable(BatchPD_hip_pln1 BatchPD_hip_pln1.cpp)
     add_executable(BatchPD_hip_pln3 BatchPD_hip_pln3.cpp)
+    add_executable(Tensor_hip_pkd3 Tensor_hip_pkd3.cpp)
+    add_executable(Tensor_hip_pln3 Tensor_hip_pln3.cpp)
+    add_executable(Tensor_hip_pln1 Tensor_hip_pln1.cpp)
     # add_executable(Single_hip Single_hip.cpp)
     add_executable(uniqueFunctionalities_hip uniqueFunctionalities_hip.cpp)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -DHIP_COMPILE=1 -DRPP_BACKEND_HIP=1 -std=c++11")
     target_link_libraries(BatchPD_hip_pkd3 ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host)
     target_link_libraries(BatchPD_hip_pln1 ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host)
     target_link_libraries(BatchPD_hip_pln3 ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host)
+    target_link_libraries(Tensor_hip_pkd3 ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host)
+    target_link_libraries(Tensor_hip_pln1 ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host)
+    target_link_libraries(Tensor_hip_pln3 ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host)
     # target_link_libraries(Single_hip ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host)
     target_link_libraries(uniqueFunctionalities_hip ${OpenCV_LIBS} -lamd_rpp ${HIP_LIBRARIES} pthread boost_filesystem boost_system hip::host)
 endif()
\ No newline at end of file
diff --git a/utilities/rpp-unittests/HIP_NEW/Tensor_hip_pkd3.cpp b/utilities/rpp-unittests/HIP_NEW/Tensor_hip_pkd3.cpp
new file mode 100644
index 000000000..0bd00b636
--- /dev/null
+++ b/utilities/rpp-unittests/HIP_NEW/Tensor_hip_pkd3.cpp
@@ -0,0 +1,831 @@
+#include <stdio.h>
+#include <dirent.h>
+#include <string.h>
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/opencv.hpp>
+#include <iostream>
+#include "/opt/rocm/rpp/include/rpp.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <time.h>
+#include <omp.h>
+#include <hip/hip_fp16.h>
+#include <fstream>
+
+using namespace cv;
+using namespace std;
+
+#define RPPPIXELCHECK(pixel) (((pixel) < (Rpp32f)0) ? ((Rpp32f)0) : (((pixel) < (Rpp32f)255) ? (pixel) : ((Rpp32f)255))) // clamp pixel to [0, 255]; fully parenthesized so a leading cast covers the whole result (argument may be evaluated more than once)
+#define RPPMAX2(a,b) (((a) > (b)) ? (a) : (b)) // larger of a and b; arguments are parenthesized but may be evaluated twice, so pass side-effect-free expressions
+#define RPPMIN2(a,b) (((a) < (b)) ? (a) : (b)) // smaller of a and b; arguments are parenthesized but may be evaluated twice, so pass side-effect-free expressions
+
+int main(int argc, char **argv)
+{
+    // Handle inputs
+
+    const int MIN_ARG_COUNT = 8;
+
+    if (argc < MIN_ARG_COUNT)
+    {
+        printf("\nImproper Usage! Needs all arguments!\n");
+        printf("\nUsage: ./Tensor_hip_pkd3 <src1 folder> <src2 folder (place same as src1 folder for single image functionalities)> <dst folder> <u8 = 0 / f16 = 1 / f32 = 2 / u8->f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> <outputFormatToggle (pkd->pkd = 0 / pkd->pln = 1)> <case number = 0:81> <verbosity = 0/1>\n");
+        return -1;
+    }
+
+    if (atoi(argv[7]) == 1)
+    {
+        printf("\nInputs for this test case are:");
+        printf("\nsrc1 = %s", argv[1]);
+        printf("\nsrc2 = %s", argv[2]);
+        printf("\ndst = %s", argv[3]);
+        printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[4]);
+        printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[5]);
+        printf("\ncase number (1:7) = %s", argv[6]);
+    }
+
+    char *src = argv[1];
+    char *src_second = argv[2];
+    char *dst = argv[3];
+    int ip_bitDepth = atoi(argv[4]);
+    unsigned int outputFormatToggle = atoi(argv[5]);
+    int test_case = atoi(argv[6]);
+
+    int ip_channel = 3;
+
+    // Set case names
+
+    char funcType[1000] = {"Tensor_HIP_PKD3"};
+
+    char funcName[1000];
+    switch (test_case)
+    {
+    case 0:
+        strcpy(funcName, "brightness");
+        // outputFormatToggle = 0;
+        break;
+    }
+
+    // Initialize tensor descriptors
+
+    RpptDesc srcDesc, dstDesc;
+    RpptDescPtr srcDescPtr, dstDescPtr;
+    srcDescPtr = &srcDesc;
+    dstDescPtr = &dstDesc;
+
+    // Set src/dst layouts in tensor descriptors
+
+    if (outputFormatToggle == 0)
+    {
+        strcat(funcType, "_toPKD3");
+        srcDescPtr->layout = RpptLayout::NHWC;
+        dstDescPtr->layout = RpptLayout::NHWC;
+    }
+    else if (outputFormatToggle == 1)
+    {
+        strcat(funcType, "_toPLN3");
+        srcDescPtr->layout = RpptLayout::NHWC;
+        dstDescPtr->layout = RpptLayout::NCHW;
+    }
+
+    // Set src/dst data types in tensor descriptors
+
+    if (ip_bitDepth == 0)
+    {
+        strcat(funcName, "_u8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::U8;
+    }
+    else if (ip_bitDepth == 1)
+    {
+        strcat(funcName, "_f16_");
+        srcDescPtr->dataType = RpptDataType::F16;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 2)
+    {
+        strcat(funcName, "_f32_");
+        srcDescPtr->dataType = RpptDataType::F32;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 3)
+    {
+        strcat(funcName, "_u8_f16_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 4)
+    {
+        strcat(funcName, "_u8_f32_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 5)
+    {
+        strcat(funcName, "_i8_");
+        srcDescPtr->dataType = RpptDataType::I8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+    else if (ip_bitDepth == 6)
+    {
+        strcat(funcName, "_u8_i8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+
+    // Other initializations
+
+    int missingFuncFlag = 0;
+    int i = 0, j = 0;
+    int maxHeight = 0, maxWidth = 0;
+    int maxDstHeight = 0, maxDstWidth = 0;
+    unsigned long long count = 0;
+    unsigned long long ioBufferSize = 0;
+    unsigned long long oBufferSize = 0;
+    static int noOfImages = 0;
+    Mat image, image_second;
+
+    // String ops on function name
+
+    char func[1000];
+    strcpy(func, funcName);
+    strcat(func, funcType);
+    printf("\nRunning %s...", func);
+
+    char src1[1000];
+    strcpy(src1, src);
+    strcat(src1, "/");
+
+    char src1_second[1000];
+    strcpy(src1_second, src_second);
+    strcat(src1_second, "/");
+
+    strcat(funcName, funcType);
+    strcat(dst, "/");
+    strcat(dst, funcName);
+
+    // Get number of images
+
+    struct dirent *de;
+    DIR *dr = opendir(src);
+    while ((de = readdir(dr)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        noOfImages += 1;
+    }
+    closedir(dr);
+
+    // Initialize ROI tensors for src/dst
+
+    RpptROI *roiTensorPtrSrc = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+    RpptROI *roiTensorPtrDst = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+
+    RpptROI *d_roiTensorPtrSrc, *d_roiTensorPtrDst;
+    hipMalloc(&d_roiTensorPtrSrc, noOfImages * sizeof(RpptROI));
+    hipMalloc(&d_roiTensorPtrDst, noOfImages * sizeof(RpptROI));
+
+    // Set ROI tensors types for src/dst
+
+    RpptRoiType roiTypeSrc, roiTypeDst;
+    roiTypeSrc = RpptRoiType::XYWH;
+    roiTypeDst = RpptRoiType::XYWH;
+
+    // Set maxHeight, maxWidth and ROIs for src/dst
+
+    const int images = noOfImages;
+    char imageNames[images][1000];
+
+    DIR *dr1 = opendir(src);
+    while ((de = readdir(dr1)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        strcpy(imageNames[count], de->d_name);
+        char temp[1000];
+        strcpy(temp, src1);
+        strcat(temp, imageNames[count]);
+
+        image = imread(temp, 1);
+
+        roiTensorPtrSrc[count].xywhROI.xy.x = 0;
+        roiTensorPtrSrc[count].xywhROI.xy.y = 0;
+        roiTensorPtrSrc[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrSrc[count].xywhROI.roiHeight = image.rows;
+
+        roiTensorPtrDst[count].xywhROI.xy.x = 0;
+        roiTensorPtrDst[count].xywhROI.xy.y = 0;
+        roiTensorPtrDst[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrDst[count].xywhROI.roiHeight = image.rows;
+
+        maxHeight = RPPMAX2(maxHeight, roiTensorPtrSrc[count].xywhROI.roiHeight);
+        maxWidth = RPPMAX2(maxWidth, roiTensorPtrSrc[count].xywhROI.roiWidth);
+        maxDstHeight = RPPMAX2(maxDstHeight, roiTensorPtrDst[count].xywhROI.roiHeight);
+        maxDstWidth = RPPMAX2(maxDstWidth, roiTensorPtrDst[count].xywhROI.roiWidth);
+
+        count++;
+    }
+    closedir(dr1);
+
+    // Set numDims, offset, n/c/h/w values for src/dst
+
+    srcDescPtr->numDims = 4;
+    dstDescPtr->numDims = 4;
+
+    srcDescPtr->offset = 0;
+    dstDescPtr->offset = 0;
+
+    srcDescPtr->n = noOfImages;
+    srcDescPtr->h = maxHeight;
+    srcDescPtr->w = maxWidth;
+    srcDescPtr->c = ip_channel;
+
+    dstDescPtr->n = noOfImages;
+    dstDescPtr->h = maxDstHeight;
+    dstDescPtr->w = maxDstWidth;
+    dstDescPtr->c = ip_channel;
+
+    // Optionally set w stride as a multiple of 8 for src/dst
+
+    srcDescPtr->w = ((srcDescPtr->w / 8) * 8) + 8;
+    dstDescPtr->w = ((dstDescPtr->w / 8) * 8) + 8;
+
+    // Set n/c/h/w strides for src/dst
+
+    srcDescPtr->strides.nStride = ip_channel * srcDescPtr->w * srcDescPtr->h;
+    srcDescPtr->strides.hStride = ip_channel * srcDescPtr->w;
+    srcDescPtr->strides.wStride = ip_channel;
+    srcDescPtr->strides.cStride = 1;
+
+    if (dstDescPtr->layout == RpptLayout::NHWC)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = ip_channel * dstDescPtr->w;
+        dstDescPtr->strides.wStride = ip_channel;
+        dstDescPtr->strides.cStride = 1;
+    }
+    else if (dstDescPtr->layout == RpptLayout::NCHW)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.cStride = dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = dstDescPtr->w;
+        dstDescPtr->strides.wStride = 1;
+    }
+
+    // Set buffer sizes for src/dst
+
+    ioBufferSize = (unsigned long long)srcDescPtr->h * (unsigned long long)srcDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+    oBufferSize = (unsigned long long)dstDescPtr->h * (unsigned long long)dstDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+
+    // Initialize 8u host buffers for src/dst
+
+    Rpp8u *input = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *input_second = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *output = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u));
+
+    // Set 8u host buffers for src/dst
+
+    DIR *dr2 = opendir(src);
+    DIR *dr2_second = opendir(src_second);
+    count = 0;
+    i = 0;
+
+    while ((de = readdir(dr2)) != NULL)
+    {
+        Rpp8u *input_temp, *input_second_temp;
+        input_temp = input + (i * srcDescPtr->strides.nStride);
+        input_second_temp = input_second + (i * srcDescPtr->strides.nStride);
+
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+
+        char temp[1000];
+        strcpy(temp, src1);
+        strcat(temp, de->d_name);
+
+        char temp_second[1000];
+        strcpy(temp_second, src1_second);
+        strcat(temp_second, de->d_name);
+
+        image = imread(temp, 1);
+        image_second = imread(temp_second, 1);
+
+        Rpp8u *ip_image = image.data;
+        Rpp8u *ip_image_second = image_second.data;
+
+        Rpp32u elementsInRow = roiTensorPtrSrc[i].xywhROI.roiWidth * ip_channel;
+
+        for (j = 0; j < roiTensorPtrSrc[i].xywhROI.roiHeight; j++)
+        {
+            memcpy(input_temp, ip_image, elementsInRow * sizeof (Rpp8u));
+            memcpy(input_second_temp, ip_image_second, elementsInRow * sizeof (Rpp8u));
+            ip_image += elementsInRow;
+            ip_image_second += elementsInRow;
+            input_temp += srcDescPtr->strides.hStride;
+            input_second_temp += srcDescPtr->strides.hStride;
+        }
+        i++;
+        count += srcDescPtr->strides.nStride;
+    }
+    closedir(dr2);
+
+    // Convert inputs to test various other bit depths and copy to hip buffers
+
+    half *inputf16, *inputf16_second, *outputf16;
+    Rpp32f *inputf32, *inputf32_second, *outputf32;
+    Rpp8s *inputi8, *inputi8_second, *outputi8;
+    int *d_input, *d_input_second, *d_inputf16, *d_inputf16_second, *d_inputf32, *d_inputf32_second, *d_inputi8, *d_inputi8_second;
+    int *d_output, *d_outputf16, *d_outputf32, *d_outputi8;
+
+    if (ip_bitDepth == 0)
+    {
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_output, oBufferSize * sizeof(Rpp8u));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_output, output, oBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 1)
+    {
+        inputf16 = (half *)calloc(ioBufferSize, sizeof(half));
+        inputf16_second = (half *)calloc(ioBufferSize, sizeof(half));
+        outputf16 = (half *)calloc(oBufferSize, sizeof(half));
+
+        Rpp8u *inputTemp, *input_secondTemp;
+        half *inputf16Temp, *inputf16_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputf16Temp = inputf16;
+        inputf16_secondTemp = inputf16_second;
+
+        for (int i = 0; i < ioBufferSize; i++)
+        {
+            *inputf16Temp = (half)(((float)*inputTemp) / 255.0);
+            *inputf16_secondTemp = (half)(((float)*input_secondTemp) / 255.0);
+            inputTemp++;
+            inputf16Temp++;
+            input_secondTemp++;
+            inputf16_secondTemp++;
+        }
+
+        hipMalloc(&d_inputf16, ioBufferSize * sizeof(half));
+        hipMalloc(&d_inputf16_second, ioBufferSize * sizeof(half));
+        hipMalloc(&d_outputf16, oBufferSize * sizeof(half));
+        hipMemcpy(d_inputf16, inputf16, ioBufferSize * sizeof(half), hipMemcpyHostToDevice);
+        hipMemcpy(d_inputf16_second, inputf16_second, ioBufferSize * sizeof(half), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf16, outputf16, oBufferSize * sizeof(half), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 2)
+    {
+        inputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+        inputf32_second = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+        outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f));
+
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp32f *inputf32Temp, *inputf32_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputf32Temp = inputf32;
+        inputf32_secondTemp = inputf32_second;
+
+        for (int i = 0; i < ioBufferSize; i++)
+        {
+            *inputf32Temp = ((Rpp32f)*inputTemp) / 255.0;
+            *inputf32_secondTemp = ((Rpp32f)*input_secondTemp) / 255.0;
+            inputTemp++;
+            inputf32Temp++;
+            input_secondTemp++;
+            inputf32_secondTemp++;
+        }
+
+        hipMalloc(&d_inputf32, ioBufferSize * sizeof(Rpp32f));
+        hipMalloc(&d_inputf32_second, ioBufferSize * sizeof(Rpp32f));
+        hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f));
+        hipMemcpy(d_inputf32, inputf32, ioBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+        hipMemcpy(d_inputf32_second, inputf32_second, ioBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf32, outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 3)
+    {
+        outputf16 = (half *)calloc(oBufferSize, sizeof(half));
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_outputf16, oBufferSize * sizeof(half));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf16, outputf16, oBufferSize * sizeof(half), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 4)
+    {
+        outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f));
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf32, outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 5)
+    {
+        inputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+        inputi8_second = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+        outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s));
+
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp8s *inputi8Temp, *inputi8_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputi8Temp = inputi8;
+        inputi8_secondTemp = inputi8_second;
+
+        for (int i = 0; i < ioBufferSize; i++)
+        {
+            *inputi8Temp = (Rpp8s) (((Rpp32s) *inputTemp) - 128);
+            *inputi8_secondTemp = (Rpp8s) (((Rpp32s) *input_secondTemp) - 128);
+            inputTemp++;
+            inputi8Temp++;
+            input_secondTemp++;
+            inputi8_secondTemp++;
+        }
+
+        hipMalloc(&d_inputi8, ioBufferSize * sizeof(Rpp8s));
+        hipMalloc(&d_inputi8_second, ioBufferSize * sizeof(Rpp8s));
+        hipMalloc(&d_outputi8, oBufferSize * sizeof(Rpp8s));
+        hipMemcpy(d_inputi8, inputi8, ioBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+        hipMemcpy(d_inputi8_second, inputi8_second, ioBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputi8, outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 6)
+    {
+        outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s));
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_outputi8, oBufferSize * sizeof(Rpp8s));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputi8, outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+    }
+
+    // Run case-wise RPP API and measure time
+
+    rppHandle_t handle;
+    hipStream_t stream;
+    hipStreamCreate(&stream);
+    rppCreateWithStreamAndBatchSize(&handle, stream, noOfImages);
+
+    clock_t start, end;
+    double gpu_time_used;
+
+    string test_case_name;
+
+    switch (test_case)
+    {
+    case 0:
+    {
+        test_case_name = "brightness";
+
+        Rpp32f alpha[images];
+        Rpp32f beta[images];
+        for (i = 0; i < images; i++)
+        {
+            alpha[i] = 1.75;
+            beta[i] = 50;
+
+            // xywhROI override sample
+            // roiTensorPtrSrc[i].xywhROI.xy.x = 0;
+            // roiTensorPtrSrc[i].xywhROI.xy.y = 0;
+            // roiTensorPtrSrc[i].xywhROI.roiWidth = 100;
+            // roiTensorPtrSrc[i].xywhROI.roiHeight = 180;
+
+            // ltrbROI override sample
+            // roiTensorPtrSrc[i].ltrbROI.lt.x = 50;
+            // roiTensorPtrSrc[i].ltrbROI.lt.y = 30;
+            // roiTensorPtrSrc[i].ltrbROI.rb.x = 210;
+            // roiTensorPtrSrc[i].ltrbROI.rb.y = 210;
+        }
+
+        // Change RpptRoiType for ltrbROI override sample
+        // roiTypeSrc = RpptRoiType::LTRB;
+        // roiTypeDst = RpptRoiType::LTRB;
+
+
+        hipMemcpy(d_roiTensorPtrSrc, roiTensorPtrSrc, images * sizeof(RpptROI), hipMemcpyHostToDevice);
+
+        start = clock();
+
+        if (ip_bitDepth == 0)
+            rppt_brightness_gpu(d_input, srcDescPtr, d_output, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+        else if (ip_bitDepth == 1)
+            rppt_brightness_gpu(d_inputf16, srcDescPtr, d_outputf16, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+        else if (ip_bitDepth == 2)
+            rppt_brightness_gpu(d_inputf32, srcDescPtr, d_outputf32, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+        else if (ip_bitDepth == 3)
+            missingFuncFlag = 1;
+        else if (ip_bitDepth == 4)
+            missingFuncFlag = 1;
+        else if (ip_bitDepth == 5)
+            rppt_brightness_gpu(d_inputi8, srcDescPtr, d_outputi8, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+        else if (ip_bitDepth == 6)
+            missingFuncFlag = 1;
+        else
+            missingFuncFlag = 1;
+
+        end = clock();
+
+        break;
+    }
+    default:
+        missingFuncFlag = 1;
+        break;
+    }
+
+    if (missingFuncFlag == 1)
+    {
+        printf("\nThe functionality %s doesn't yet exist in RPP\n", func);
+        return -1;
+    }
+
+    // Display measured times
+
+    gpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
+    cout << "\nGPU Time - BatchPD : " << gpu_time_used;
+    printf("\n");
+
+    // Reconvert other bit depths to 8u for output display purposes
+
+    string fileName = std::to_string(ip_bitDepth);
+    ofstream outputFile (fileName + ".csv");
+
+    if (ip_bitDepth == 0)
+    {
+        hipMemcpy(output, d_output, oBufferSize * sizeof(Rpp8u), hipMemcpyDeviceToHost);
+        Rpp8u *outputTemp;
+        outputTemp = output;
+
+        if (outputFile.is_open())
+        {
+            for (int i = 0; i < oBufferSize; i++)
+            {
+                outputFile << (Rpp32u) *outputTemp << ",";
+                outputTemp++;
+            }
+            outputFile.close();
+        }
+        else
+            cout << "Unable to open file!";
+    }
+    else if ((ip_bitDepth == 1) || (ip_bitDepth == 3))
+    {
+        hipMemcpy(outputf16, d_outputf16, oBufferSize * sizeof(half), hipMemcpyDeviceToHost);
+        Rpp8u *outputTemp;
+        outputTemp = output;
+        half *outputf16Temp;
+        outputf16Temp = outputf16;
+
+        if (outputFile.is_open())
+        {
+            for (int i = 0; i < oBufferSize; i++)
+            {
+                outputFile << (char) *outputf16Temp << ",";
+                *outputTemp = (Rpp8u)RPPPIXELCHECK((float)*outputf16Temp * 255.0);
+                outputf16Temp++;
+                outputTemp++;
+            }
+            outputFile.close();
+        }
+        else
+            cout << "Unable to open file!";
+    }
+    else if ((ip_bitDepth == 2) || (ip_bitDepth == 4))
+    {
+        hipMemcpy(outputf32, d_outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyDeviceToHost);
+        Rpp8u *outputTemp;
+        outputTemp = output;
+        Rpp32f *outputf32Temp;
+        outputf32Temp = outputf32;
+
+        if (outputFile.is_open())
+        {
+            for (int i = 0; i < oBufferSize; i++)
+            {
+                outputFile << *outputf32Temp << ",";
+                *outputTemp = (Rpp8u)RPPPIXELCHECK(*outputf32Temp * 255.0);
+                outputf32Temp++;
+                outputTemp++;
+            }
+            outputFile.close();
+        }
+        else
+            cout << "Unable to open file!";
+    }
+    else if ((ip_bitDepth == 5) || (ip_bitDepth == 6))
+    {
+        hipMemcpy(outputi8, d_outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyDeviceToHost);
+        Rpp8u *outputTemp;
+        outputTemp = output;
+        Rpp8s *outputi8Temp;
+        outputi8Temp = outputi8;
+
+        if (outputFile.is_open())
+        {
+            for (int i = 0; i < oBufferSize; i++)
+            {
+                outputFile << (Rpp32s) *outputi8Temp << ",";
+                *outputTemp = (Rpp8u) RPPPIXELCHECK(((Rpp32s) *outputi8Temp) + 128);
+                outputi8Temp++;
+                outputTemp++;
+            }
+            outputFile.close();
+        }
+        else
+            cout << "Unable to open file!";
+    }
+
+    // Calculate exact dstROI in XYWH format for OpenCV dump
+
+    if (roiTypeSrc == RpptRoiType::LTRB)
+    {
+        for (int i = 0; i < dstDescPtr->n; i++)
+        {
+            int ltX = roiTensorPtrSrc[i].ltrbROI.lt.x;
+            int ltY = roiTensorPtrSrc[i].ltrbROI.lt.y;
+            int rbX = roiTensorPtrSrc[i].ltrbROI.rb.x;
+            int rbY = roiTensorPtrSrc[i].ltrbROI.rb.y;
+
+            roiTensorPtrSrc[i].xywhROI.xy.x = ltX;
+            roiTensorPtrSrc[i].xywhROI.xy.y = ltY;
+            roiTensorPtrSrc[i].xywhROI.roiWidth = rbX - ltX + 1;
+            roiTensorPtrSrc[i].xywhROI.roiHeight = rbY - ltY + 1;
+        }
+    }
+
+    RpptROI roiDefault;
+    RpptROIPtr roiPtrDefault;
+    roiPtrDefault = &roiDefault;
+    roiPtrDefault->xywhROI.xy.x = 0;
+    roiPtrDefault->xywhROI.xy.y = 0;
+    roiPtrDefault->xywhROI.roiWidth = dstDescPtr->w;
+    roiPtrDefault->xywhROI.roiHeight = dstDescPtr->h;
+
+    for (int i = 0; i < dstDescPtr->n; i++)
+    {
+        roiTensorPtrSrc[i].xywhROI.roiWidth = RPPMIN2(roiPtrDefault->xywhROI.roiWidth - roiTensorPtrSrc[i].xywhROI.xy.x, roiTensorPtrSrc[i].xywhROI.roiWidth);
+        roiTensorPtrSrc[i].xywhROI.roiHeight = RPPMIN2(roiPtrDefault->xywhROI.roiHeight - roiTensorPtrSrc[i].xywhROI.xy.y, roiTensorPtrSrc[i].xywhROI.roiHeight);
+        roiTensorPtrSrc[i].xywhROI.xy.x = RPPMAX2(roiPtrDefault->xywhROI.xy.x, roiTensorPtrSrc[i].xywhROI.xy.x);
+        roiTensorPtrSrc[i].xywhROI.xy.y = RPPMAX2(roiPtrDefault->xywhROI.xy.y, roiTensorPtrSrc[i].xywhROI.xy.y);
+    }
+
+    // Convert any PLN3 outputs to the corresponding PKD3 version for OpenCV dump
+
+    if (dstDescPtr->layout == RpptLayout::NCHW)
+    {
+        Rpp8u *outputCopy = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u));
+        memcpy(outputCopy, output, oBufferSize * sizeof(Rpp8u));
+
+        Rpp8u *outputTemp, *outputCopyTemp;
+        outputTemp = output;
+        outputCopyTemp = outputCopy;
+
+        for (int count = 0; count < dstDescPtr->n; count++)
+        {
+            Rpp8u *outputCopyTempR, *outputCopyTempG, *outputCopyTempB;
+            outputCopyTempR = outputCopyTemp;
+            outputCopyTempG = outputCopyTempR + dstDescPtr->strides.cStride;
+            outputCopyTempB = outputCopyTempG + dstDescPtr->strides.cStride;
+
+            for (int i = 0; i < dstDescPtr->h; i++)
+            {
+                for (int j = 0; j < dstDescPtr->w; j++)
+                {
+                    *outputTemp = *outputCopyTempR;
+                    outputTemp++;
+                    outputCopyTempR++;
+                    *outputTemp = *outputCopyTempG;
+                    outputTemp++;
+                    outputCopyTempG++;
+                    *outputTemp = *outputCopyTempB;
+                    outputTemp++;
+                    outputCopyTempB++;
+                }
+            }
+
+            outputCopyTemp += dstDescPtr->strides.nStride;
+        }
+
+        free(outputCopy);
+    }
+
+    rppDestroyGPU(handle);
+
+    // OpenCV dump
+
+    mkdir(dst, 0700);
+    strcat(dst, "/");
+    count = 0;
+
+    for (j = 0; j < dstDescPtr->n; j++)
+    {
+        int height = roiTensorPtrSrc[j].xywhROI.roiHeight;
+        int width = roiTensorPtrSrc[j].xywhROI.roiWidth;
+
+        int op_size = height * width * ip_channel;
+        Rpp8u *temp_output = (Rpp8u *)calloc(op_size, sizeof(Rpp8u));
+        Rpp8u *temp_output_row;
+        temp_output_row = temp_output;
+        Rpp32u elementsInRow = width * ip_channel;
+        Rpp8u *output_row = output + count;
+
+        for (int k = 0; k < height; k++)
+        {
+            memcpy(temp_output_row, (output_row), elementsInRow * sizeof (Rpp8u));
+            temp_output_row += elementsInRow;
+            output_row += srcDescPtr->strides.hStride;
+        }
+        count += dstDescPtr->strides.nStride;
+
+        char temp[1000];
+        strcpy(temp, dst);
+        strcat(temp, imageNames[j]);
+
+        Mat mat_op_image;
+        mat_op_image = Mat(height, width, CV_8UC3, temp_output);
+        imwrite(temp, mat_op_image);
+
+        free(temp_output);
+    }
+
+    // Free memory
+
+    free(roiTensorPtrSrc);
+    free(roiTensorPtrDst);
+    hipFree(d_roiTensorPtrSrc);
+    hipFree(d_roiTensorPtrDst);
+    free(input);
+    free(input_second);
+    free(output);
+
+    if (ip_bitDepth == 0)
+    {
+        hipFree(d_input);
+        hipFree(d_input_second);
+        hipFree(d_output);
+    }
+    else if (ip_bitDepth == 1)
+    {
+        free(inputf16);
+        free(inputf16_second);
+        free(outputf16);
+        hipFree(d_inputf16);
+        hipFree(d_inputf16_second);
+        hipFree(d_outputf16);
+    }
+    else if (ip_bitDepth == 2)
+    {
+        free(inputf32);
+        free(inputf32_second);
+        free(outputf32);
+        hipFree(d_inputf32);
+        hipFree(d_inputf32_second);
+        hipFree(d_outputf32);
+    }
+    else if (ip_bitDepth == 3)
+    {
+        free(outputf16);
+        hipFree(d_input);
+        hipFree(d_input_second);
+        hipFree(d_outputf16);
+    }
+    else if (ip_bitDepth == 4)
+    {
+        free(outputf32);
+        hipFree(d_input);
+        hipFree(d_input_second);
+        hipFree(d_outputf32);
+    }
+    else if (ip_bitDepth == 5)
+    {
+        free(inputi8);
+        free(inputi8_second);
+        free(outputi8);
+        hipFree(d_inputi8);
+        hipFree(d_inputi8_second);
+        hipFree(d_outputi8);
+    }
+    else if (ip_bitDepth == 6)
+    {
+        free(outputi8);
+        hipFree(d_input);
+        hipFree(d_input_second);
+        hipFree(d_outputi8);
+    }
+
+    return 0;
+}
diff --git a/utilities/rpp-unittests/HIP_NEW/Tensor_hip_pln1.cpp b/utilities/rpp-unittests/HIP_NEW/Tensor_hip_pln1.cpp
new file mode 100644
index 000000000..b763a62bf
--- /dev/null
+++ b/utilities/rpp-unittests/HIP_NEW/Tensor_hip_pln1.cpp
@@ -0,0 +1,789 @@
+#include <stdio.h>
+#include <dirent.h>
+#include <string.h>
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/opencv.hpp>
+#include <iostream>
+#include "/opt/rocm/rpp/include/rpp.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <time.h>
+#include <omp.h>
+#include <hip/hip_fp16.h>
+#include <fstream>
+#include "helpers/testSuite_helper.hpp"
+
+using namespace cv;
+using namespace std;
+
+#define RPPPIXELCHECK(pixel) (((pixel) < (Rpp32f)0) ? ((Rpp32f)0) : (((pixel) < (Rpp32f)255) ? (pixel) : ((Rpp32f)255))) // clamp pixel to [0, 255]; fully parenthesized so casts/operators apply to the result
+#define RPPMAX2(a,b) (((a) > (b)) ? (a) : (b)) // larger of a and b (each argument may be evaluated twice)
+#define RPPMIN2(a,b) (((a) < (b)) ? (a) : (b)) // smaller of a and b (each argument may be evaluated twice)
+
+int main(int argc, char **argv)
+{
+    // Test driver for RPP HIP tensor ops on single-channel (PLN1) images: loads every image in the src
+    // folder into one padded batch, runs the selected case, then dumps a CSV and per-image outputs.
+    // Handle inputs: argv[1..7] = src1, src2, dst, bit depth, outputFormatToggle, case number, verbosity
+
+    const int MIN_ARG_COUNT = 8;
+
+    if (argc < MIN_ARG_COUNT)
+    {
+        printf("\nImproper Usage! Needs all arguments!\n");
+        printf("\nUsage: ./Tensor_hip_pln1 <src1 folder> <src2 folder (place same as src1 folder for single image functionalities)> <dst folder> <u8 = 0 / f16 = 1 / f32 = 2 / u8->f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> <outputFormatToggle (pkd->pkd = 0 / pkd->pln = 1)> <case number = 0:81> <verbosity = 0/1>\n");
+        return -1;
+    }
+    if (atoi(argv[5]) != 0)
+    {
+        printf("\nPLN1 cases don't have outputFormatToggle! Please input outputFormatToggle = 0\n");
+        return -1;
+    }
+
+    if (atoi(argv[7]) == 1)
+    {
+        printf("\nInputs for this test case are:");
+        printf("\nsrc1 = %s", argv[1]);
+        printf("\nsrc2 = %s", argv[2]);
+        printf("\ndst = %s", argv[3]);
+        printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[4]);
+        printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[5]);
+        printf("\ncase number (1:7) = %s", argv[6]);
+    }
+
+    char *src = argv[1];
+    char *src_second = argv[2];
+    char dst[1000]; // local copy: dst is extended with strcat() below, which must not write into argv storage
+    snprintf(dst, sizeof(dst), "%s", argv[3]);
+    int ip_bitDepth = atoi(argv[4]);
+    unsigned int outputFormatToggle = atoi(argv[5]);
+    int test_case = atoi(argv[6]);
+
+    int ip_channel = 1;
+
+    // Set case names (funcName becomes "<case><bit depth suffix><funcType>" and names the output folder)
+
+    char funcType[1000] = {"Tensor_HIP_PLN1_toPLN1"};
+
+    char funcName[1000] = ""; // start empty so the strcat() calls below are defined even for an unknown case number
+    switch (test_case)
+    {
+    case 0:
+        strcpy(funcName, "brightness");
+        outputFormatToggle = 0;
+        break;
+    }
+
+    // Initialize tensor descriptors
+
+    RpptDesc srcDesc, dstDesc;
+    RpptDescPtr srcDescPtr, dstDescPtr;
+    srcDescPtr = &srcDesc;
+    dstDescPtr = &dstDesc;
+
+    // Set src/dst layouts in tensor descriptors
+
+    srcDescPtr->layout = RpptLayout::NCHW;
+    dstDescPtr->layout = RpptLayout::NCHW;
+
+    // Set src/dst data types in tensor descriptors (and append the matching suffix to funcName)
+
+    if (ip_bitDepth == 0)
+    {
+        strcat(funcName, "_u8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::U8;
+    }
+    else if (ip_bitDepth == 1)
+    {
+        strcat(funcName, "_f16_");
+        srcDescPtr->dataType = RpptDataType::F16;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 2)
+    {
+        strcat(funcName, "_f32_");
+        srcDescPtr->dataType = RpptDataType::F32;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 3)
+    {
+        strcat(funcName, "_u8_f16_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 4)
+    {
+        strcat(funcName, "_u8_f32_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 5)
+    {
+        strcat(funcName, "_i8_");
+        srcDescPtr->dataType = RpptDataType::I8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+    else if (ip_bitDepth == 6)
+    {
+        strcat(funcName, "_u8_i8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+
+    // Other initializations
+
+    int missingFuncFlag = 0;
+    int i = 0, j = 0;
+    int maxHeight = 0, maxWidth = 0;
+    int maxDstHeight = 0, maxDstWidth = 0;
+    unsigned long long count = 0;
+    unsigned long long ioBufferSize = 0;
+    unsigned long long oBufferSize = 0;
+    static int noOfImages = 0;
+    Mat image, image_second;
+
+    // String ops on function name (build the display name, the src folder prefixes and the dst folder path)
+
+    char func[1000];
+    strcpy(func, funcName);
+    strcat(func, funcType);
+    printf("\nRunning %s...", func);
+
+    char src1[1000];
+    strcpy(src1, src);
+    strcat(src1, "/");
+
+    char src1_second[1000];
+    strcpy(src1_second, src_second);
+    strcat(src1_second, "/");
+
+    strcat(funcName, funcType);
+    strcat(dst, "/");
+    strcat(dst, funcName);
+
+    // Get number of images (every directory entry except "." and ".." is counted)
+
+    struct dirent *de;
+    DIR *dr = opendir(src);
+    if (dr == NULL)
+    {
+        printf("\nUnable to open src1 folder %s\n", src);
+        return -1;
+    }
+    while ((de = readdir(dr)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        noOfImages += 1;
+    }
+    closedir(dr);
+
+    // Initialize ROI tensors for src/dst (one host copy and one device copy each)
+
+    RpptROI *roiTensorPtrSrc = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+    RpptROI *roiTensorPtrDst = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+
+    RpptROI *d_roiTensorPtrSrc, *d_roiTensorPtrDst;
+    hipMalloc(&d_roiTensorPtrSrc, noOfImages * sizeof(RpptROI));
+    hipMalloc(&d_roiTensorPtrDst, noOfImages * sizeof(RpptROI));
+
+    // Set ROI tensors types for src/dst
+
+    RpptRoiType roiTypeSrc, roiTypeDst;
+    roiTypeSrc = RpptRoiType::XYWH;
+    roiTypeDst = RpptRoiType::XYWH;
+
+    // Set maxHeight, maxWidth and ROIs for src/dst (each ROI defaults to the full image read from src)
+
+    const int images = noOfImages;
+    char imageNames[images][1000];
+
+    DIR *dr1 = opendir(src);
+    while ((de = readdir(dr1)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        strcpy(imageNames[count], de->d_name);
+        char temp[1000];
+        strcpy(temp, src1);
+        strcat(temp, imageNames[count]);
+
+        image = imread(temp, 0);
+
+        roiTensorPtrSrc[count].xywhROI.xy.x = 0;
+        roiTensorPtrSrc[count].xywhROI.xy.y = 0;
+        roiTensorPtrSrc[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrSrc[count].xywhROI.roiHeight = image.rows;
+
+        roiTensorPtrDst[count].xywhROI.xy.x = 0;
+        roiTensorPtrDst[count].xywhROI.xy.y = 0;
+        roiTensorPtrDst[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrDst[count].xywhROI.roiHeight = image.rows;
+
+        maxHeight = RPPMAX2(maxHeight, roiTensorPtrSrc[count].xywhROI.roiHeight);
+        maxWidth = RPPMAX2(maxWidth, roiTensorPtrSrc[count].xywhROI.roiWidth);
+        maxDstHeight = RPPMAX2(maxDstHeight, roiTensorPtrDst[count].xywhROI.roiHeight);
+        maxDstWidth = RPPMAX2(maxDstWidth, roiTensorPtrDst[count].xywhROI.roiWidth);
+
+        count++;
+    }
+    closedir(dr1);
+
+    // Set numDims, offset, n/c/h/w values, n/c/h/w strides for src/dst
+
+    srcDescPtr->numDims = 4;
+    dstDescPtr->numDims = 4;
+
+    srcDescPtr->offset = 0;
+    dstDescPtr->offset = 0;
+
+    srcDescPtr->n = noOfImages;
+    srcDescPtr->c = ip_channel;
+    srcDescPtr->h = maxHeight;
+    srcDescPtr->w = maxWidth;
+
+    dstDescPtr->n = noOfImages;
+    dstDescPtr->c = ip_channel;
+    dstDescPtr->h = maxDstHeight;
+    dstDescPtr->w = maxDstWidth;
+
+    // Pad w up to the next multiple of 8 for src/dst (this always adds between 1 and 8 columns)
+
+    srcDescPtr->w = ((srcDescPtr->w / 8) * 8) + 8;
+    dstDescPtr->w = ((dstDescPtr->w / 8) * 8) + 8;
+
+    // Set n/c/h/w strides for src/dst (in elements, derived from the padded w)
+
+    srcDescPtr->strides.nStride = ip_channel * srcDescPtr->w * srcDescPtr->h;
+    srcDescPtr->strides.cStride = srcDescPtr->w * srcDescPtr->h;
+    srcDescPtr->strides.hStride = srcDescPtr->w;
+    srcDescPtr->strides.wStride = 1;
+
+    if (dstDescPtr->layout == RpptLayout::NHWC)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = ip_channel * dstDescPtr->w;
+        dstDescPtr->strides.wStride = ip_channel;
+        dstDescPtr->strides.cStride = 1;
+    }
+    else if (dstDescPtr->layout == RpptLayout::NCHW)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.cStride = dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = dstDescPtr->w;
+        dstDescPtr->strides.wStride = 1;
+    }
+
+    // Set buffer sizes for src/dst (element counts for the whole batch, computed in 64 bits)
+
+    ioBufferSize = (unsigned long long)srcDescPtr->h * (unsigned long long)srcDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+    oBufferSize = (unsigned long long)dstDescPtr->h * (unsigned long long)dstDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+
+    // Initialize host buffers for src/dst
+
+    Rpp8u *input = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *input_second = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *output = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u));
+
+    // Set 8u host buffers for src/dst (copy each image row by row into its padded slot of the batch)
+
+    DIR *dr2 = opendir(src);
+    count = 0;
+    i = 0;
+
+    Rpp32u elementsInRowMax = srcDescPtr->w * ip_channel;
+
+    while ((de = readdir(dr2)) != NULL)
+    {
+        Rpp8u *input_temp, *input_second_temp;
+        input_temp = input + (i * srcDescPtr->strides.nStride);
+        input_second_temp = input_second + (i * srcDescPtr->strides.nStride);
+
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+
+        char temp[1000];
+        strcpy(temp, src1);
+        strcat(temp, de->d_name);
+
+        char temp_second[1000];
+        strcpy(temp_second, src1_second);
+        strcat(temp_second, de->d_name);
+
+        image = imread(temp, 0);
+        image_second = imread(temp_second, 0);
+
+        Rpp8u *ip_image = image.data;
+        Rpp8u *ip_image_second = image_second.data;
+
+        Rpp32u elementsInRow = roiTensorPtrSrc[i].xywhROI.roiWidth * ip_channel;
+
+        for (j = 0; j < roiTensorPtrSrc[i].xywhROI.roiHeight; j++)
+        {
+            memcpy(input_temp, ip_image, elementsInRow * sizeof (Rpp8u));
+            memcpy(input_second_temp, ip_image_second, elementsInRow * sizeof (Rpp8u));
+            ip_image += elementsInRow;
+            ip_image_second += elementsInRow;
+            input_temp += elementsInRowMax;
+            input_second_temp += elementsInRowMax;
+        }
+        i++;
+        count += srcDescPtr->strides.nStride;
+    }
+    closedir(dr2);
+
+    // Convert inputs to test various other bit depths and copy to hip buffers
+
+    half *inputf16, *inputf16_second, *outputf16;
+    Rpp32f *inputf32, *inputf32_second, *outputf32;
+    Rpp8s *inputi8, *inputi8_second, *outputi8;
+    int *d_input, *d_input_second, *d_inputf16, *d_inputf16_second, *d_inputf32, *d_inputf32_second, *d_inputi8, *d_inputi8_second;
+    int *d_output, *d_outputf16, *d_outputf32, *d_outputi8;
+
+    if (ip_bitDepth == 0)
+    {
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_output, oBufferSize * sizeof(Rpp8u));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_output, output, oBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 1)
+    {
+        inputf16 = (half *)calloc(ioBufferSize, sizeof(half));
+        inputf16_second = (half *)calloc(ioBufferSize, sizeof(half));
+        outputf16 = (half *)calloc(oBufferSize, sizeof(half));
+
+        Rpp8u *inputTemp, *input_secondTemp;
+        half *inputf16Temp, *inputf16_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputf16Temp = inputf16;
+        inputf16_secondTemp = inputf16_second;
+
+        for (unsigned long long i = 0; i < ioBufferSize; i++) // 64-bit counter to match ioBufferSize
+        {
+            *inputf16Temp = (half)(((float)*inputTemp) / 255.0);
+            *inputf16_secondTemp = (half)(((float)*input_secondTemp) / 255.0);
+            inputTemp++;
+            inputf16Temp++;
+            input_secondTemp++;
+            inputf16_secondTemp++;
+        }
+
+        hipMalloc(&d_inputf16, ioBufferSize * sizeof(half));
+        hipMalloc(&d_inputf16_second, ioBufferSize * sizeof(half));
+        hipMalloc(&d_outputf16, oBufferSize * sizeof(half));
+        hipMemcpy(d_inputf16, inputf16, ioBufferSize * sizeof(half), hipMemcpyHostToDevice);
+        hipMemcpy(d_inputf16_second, inputf16_second, ioBufferSize * sizeof(half), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf16, outputf16, oBufferSize * sizeof(half), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 2)
+    {
+        inputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+        inputf32_second = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+        outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f));
+
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp32f *inputf32Temp, *inputf32_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputf32Temp = inputf32;
+        inputf32_secondTemp = inputf32_second;
+
+        for (unsigned long long i = 0; i < ioBufferSize; i++) // 64-bit counter to match ioBufferSize
+        {
+            *inputf32Temp = ((Rpp32f)*inputTemp) / 255.0;
+            *inputf32_secondTemp = ((Rpp32f)*input_secondTemp) / 255.0;
+            inputTemp++;
+            inputf32Temp++;
+            input_secondTemp++;
+            inputf32_secondTemp++;
+        }
+
+        hipMalloc(&d_inputf32, ioBufferSize * sizeof(Rpp32f));
+        hipMalloc(&d_inputf32_second, ioBufferSize * sizeof(Rpp32f));
+        hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f));
+        hipMemcpy(d_inputf32, inputf32, ioBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+        hipMemcpy(d_inputf32_second, inputf32_second, ioBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf32, outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 3)
+    {
+        outputf16 = (half *)calloc(oBufferSize, sizeof(half));
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_outputf16, oBufferSize * sizeof(half));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf16, outputf16, oBufferSize * sizeof(half), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 4)
+    {
+        outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f));
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf32, outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 5)
+    {
+        inputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+        inputi8_second = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+        outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s));
+
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp8s *inputi8Temp, *inputi8_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputi8Temp = inputi8;
+        inputi8_secondTemp = inputi8_second;
+
+        for (unsigned long long i = 0; i < ioBufferSize; i++) // 64-bit counter to match ioBufferSize
+        {
+            *inputi8Temp = (Rpp8s) (((Rpp32s) *inputTemp) - 128);
+            *inputi8_secondTemp = (Rpp8s) (((Rpp32s) *input_secondTemp) - 128);
+            inputTemp++;
+            inputi8Temp++;
+            input_secondTemp++;
+            inputi8_secondTemp++;
+        }
+
+        hipMalloc(&d_inputi8, ioBufferSize * sizeof(Rpp8s));
+        hipMalloc(&d_inputi8_second, ioBufferSize * sizeof(Rpp8s));
+        hipMalloc(&d_outputi8, oBufferSize * sizeof(Rpp8s));
+        hipMemcpy(d_inputi8, inputi8, ioBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+        hipMemcpy(d_inputi8_second, inputi8_second, ioBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputi8, outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 6)
+    {
+        outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s));
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_outputi8, oBufferSize * sizeof(Rpp8s));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputi8, outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+    }
+
+    // Run case-wise RPP API and measure time (clock() is sampled immediately around the API call)
+
+    rppHandle_t handle;
+    hipStream_t stream;
+    hipStreamCreate(&stream);
+    rppCreateWithStreamAndBatchSize(&handle, stream, noOfImages);
+
+    clock_t start, end;
+    double gpu_time_used;
+
+    string test_case_name;
+
+    switch (test_case)
+    {
+    case 0:
+    {
+        test_case_name = "brightness";
+
+        Rpp32f alpha[images];
+        Rpp32f beta[images];
+        for (i = 0; i < images; i++)
+        {
+            alpha[i] = 1.75;
+            beta[i] = 50;
+
+            // xywhROI override sample
+            // roiTensorPtrSrc[i].xywhROI.xy.x = 0;
+            // roiTensorPtrSrc[i].xywhROI.xy.y = 0;
+            // roiTensorPtrSrc[i].xywhROI.roiWidth = 100;
+            // roiTensorPtrSrc[i].xywhROI.roiHeight = 180;
+
+            // ltrbROI override sample
+            // roiTensorPtrSrc[i].ltrbROI.lt.x = 50;
+            // roiTensorPtrSrc[i].ltrbROI.lt.y = 30;
+            // roiTensorPtrSrc[i].ltrbROI.rb.x = 210;
+            // roiTensorPtrSrc[i].ltrbROI.rb.y = 210;
+        }
+
+        // Change RpptRoiType for ltrbROI override sample
+        // roiTypeSrc = RpptRoiType::LTRB;
+        // roiTypeDst = RpptRoiType::LTRB;
+
+        hipMemcpy(d_roiTensorPtrSrc, roiTensorPtrSrc, images * sizeof(RpptROI), hipMemcpyHostToDevice);
+
+        start = clock();
+
+        if (ip_bitDepth == 0)
+            rppt_brightness_gpu(d_input, srcDescPtr, d_output, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+        else if (ip_bitDepth == 1)
+            rppt_brightness_gpu(d_inputf16, srcDescPtr, d_outputf16, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+        else if (ip_bitDepth == 2)
+            rppt_brightness_gpu(d_inputf32, srcDescPtr, d_outputf32, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+        else if (ip_bitDepth == 3)
+            missingFuncFlag = 1;
+        else if (ip_bitDepth == 4)
+            missingFuncFlag = 1;
+        else if (ip_bitDepth == 5)
+            rppt_brightness_gpu(d_inputi8, srcDescPtr, d_outputi8, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+        else if (ip_bitDepth == 6)
+            missingFuncFlag = 1;
+        else
+            missingFuncFlag = 1;
+
+        end = clock();
+
+        break;
+    }
+    default:
+        missingFuncFlag = 1;
+        break;
+    }
+
+    if (missingFuncFlag == 1)
+    {
+        printf("\nThe functionality %s doesn't yet exist in RPP\n", func);
+        return -1;
+    }
+
+    // Display measured times
+
+    gpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
+    cout << "\nGPU Time - BatchPD : " << gpu_time_used;
+    printf("\n");
+
+    // Reconvert other bit depths to 8u for output display purposes, and dump raw outputs to "<ip_bitDepth>.csv"
+
+    string fileName = std::to_string(ip_bitDepth);
+    ofstream outputFile (fileName + ".csv");
+
+    if (ip_bitDepth == 0)
+    {
+        hipMemcpy(output, d_output, oBufferSize * sizeof(Rpp8u), hipMemcpyDeviceToHost);
+        Rpp8u *outputTemp = output;
+
+        if (outputFile.is_open())
+        {
+            for (unsigned long long i = 0; i < oBufferSize; i++) // 64-bit counter to match oBufferSize
+            {
+                outputFile << (Rpp32u) *outputTemp << ",";
+                outputTemp++;
+            }
+            outputFile.close();
+        }
+        else
+            cout << "Unable to open file!";
+    }
+    else if ((ip_bitDepth == 1) || (ip_bitDepth == 3))
+    {
+        hipMemcpy(outputf16, d_outputf16, oBufferSize * sizeof(half), hipMemcpyDeviceToHost);
+        Rpp8u *outputTemp = output;
+        half *outputf16Temp = outputf16;
+
+        if (outputFile.is_open())
+        {
+            for (unsigned long long i = 0; i < oBufferSize; i++) // 64-bit counter to match oBufferSize
+            {
+                outputFile << (float) *outputf16Temp << ","; // numeric value, like the other bit-depth branches
+                *outputTemp = (Rpp8u)RPPPIXELCHECK((float)*outputf16Temp * 255.0);
+                outputf16Temp++;
+                outputTemp++;
+            }
+            outputFile.close();
+        }
+        else
+            cout << "Unable to open file!";
+    }
+    else if ((ip_bitDepth == 2) || (ip_bitDepth == 4))
+    {
+        hipMemcpy(outputf32, d_outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyDeviceToHost);
+        Rpp8u *outputTemp = output;
+        Rpp32f *outputf32Temp = outputf32;
+
+        if (outputFile.is_open())
+        {
+            for (unsigned long long i = 0; i < oBufferSize; i++) // 64-bit counter to match oBufferSize
+            {
+                outputFile << *outputf32Temp << ",";
+                *outputTemp = (Rpp8u)RPPPIXELCHECK(*outputf32Temp * 255.0);
+                outputf32Temp++;
+                outputTemp++;
+            }
+            outputFile.close();
+        }
+        else
+            cout << "Unable to open file!";
+    }
+    else if ((ip_bitDepth == 5) || (ip_bitDepth == 6))
+    {
+        hipMemcpy(outputi8, d_outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyDeviceToHost);
+        Rpp8u *outputTemp = output;
+        Rpp8s *outputi8Temp = outputi8;
+
+        if (outputFile.is_open())
+        {
+            for (unsigned long long i = 0; i < oBufferSize; i++) // 64-bit counter to match oBufferSize
+            {
+                outputFile << (Rpp32s) *outputi8Temp << ",";
+                *outputTemp = (Rpp8u) RPPPIXELCHECK(((Rpp32s) *outputi8Temp) + 128);
+                outputi8Temp++;
+                outputTemp++;
+            }
+            outputFile.close();
+        }
+        else
+            cout << "Unable to open file!";
+    }
+
+    // Calculate exact dstROI in XYWH format for OpenCV dump (LTRB corners are inclusive, hence the +1)
+
+    if (roiTypeSrc == RpptRoiType::LTRB)
+    {
+        for (int i = 0; i < dstDescPtr->n; i++)
+        {
+            int ltX = roiTensorPtrSrc[i].ltrbROI.lt.x;
+            int ltY = roiTensorPtrSrc[i].ltrbROI.lt.y;
+            int rbX = roiTensorPtrSrc[i].ltrbROI.rb.x;
+            int rbY = roiTensorPtrSrc[i].ltrbROI.rb.y;
+
+            roiTensorPtrSrc[i].xywhROI.xy.x = ltX;
+            roiTensorPtrSrc[i].xywhROI.xy.y = ltY;
+            roiTensorPtrSrc[i].xywhROI.roiWidth = rbX - ltX + 1;
+            roiTensorPtrSrc[i].xywhROI.roiHeight = rbY - ltY + 1;
+        }
+    }
+
+    RpptROI roiDefault; // full dst tensor extent, used below to clip every per-image ROI
+    RpptROIPtr roiPtrDefault;
+    roiPtrDefault = &roiDefault;
+    roiPtrDefault->xywhROI.xy.x = 0;
+    roiPtrDefault->xywhROI.xy.y = 0;
+    roiPtrDefault->xywhROI.roiWidth = dstDescPtr->w;
+    roiPtrDefault->xywhROI.roiHeight = dstDescPtr->h;
+
+    for (int i = 0; i < dstDescPtr->n; i++)
+    {
+        roiTensorPtrSrc[i].xywhROI.roiWidth = RPPMIN2(roiPtrDefault->xywhROI.roiWidth - roiTensorPtrSrc[i].xywhROI.xy.x, roiTensorPtrSrc[i].xywhROI.roiWidth);
+        roiTensorPtrSrc[i].xywhROI.roiHeight = RPPMIN2(roiPtrDefault->xywhROI.roiHeight - roiTensorPtrSrc[i].xywhROI.xy.y, roiTensorPtrSrc[i].xywhROI.roiHeight);
+        roiTensorPtrSrc[i].xywhROI.xy.x = RPPMAX2(roiPtrDefault->xywhROI.xy.x, roiTensorPtrSrc[i].xywhROI.xy.x);
+        roiTensorPtrSrc[i].xywhROI.xy.y = RPPMAX2(roiPtrDefault->xywhROI.xy.y, roiTensorPtrSrc[i].xywhROI.xy.y);
+    }
+
+    rppDestroyGPU(handle);
+
+    // OpenCV dump (copy roiHeight rows of roiWidth pixels per image out of the padded batch into dst/<image name>)
+
+    mkdir(dst, 0700);
+    strcat(dst, "/");
+    count = 0;
+    elementsInRowMax = dstDescPtr->w * ip_channel;
+
+    for (j = 0; j < dstDescPtr->n; j++)
+    {
+        int height = roiTensorPtrSrc[j].xywhROI.roiHeight;
+        int width = roiTensorPtrSrc[j].xywhROI.roiWidth;
+
+        int op_size = height * width * ip_channel;
+        Rpp8u *temp_output = (Rpp8u *)calloc(op_size, sizeof(Rpp8u));
+        Rpp8u *temp_output_row;
+        temp_output_row = temp_output;
+        Rpp32u elementsInRow = width * ip_channel;
+        Rpp8u *output_row = output + count;
+
+        for (int k = 0; k < height; k++)
+        {
+            memcpy(temp_output_row, (output_row), elementsInRow * sizeof (Rpp8u));
+            temp_output_row += elementsInRow;
+            output_row += elementsInRowMax;
+        }
+        count += dstDescPtr->strides.nStride;
+
+        char temp[1000];
+        strcpy(temp, dst);
+        strcat(temp, imageNames[j]);
+
+        Mat mat_op_image;
+        mat_op_image = Mat(height, width, CV_8UC1, temp_output);
+        imwrite(temp, mat_op_image);
+
+        free(temp_output);
+    }
+
+    // Free memory (only the buffers allocated for the selected bit depth are released)
+
+    free(roiTensorPtrSrc);
+    free(roiTensorPtrDst);
+    hipFree(d_roiTensorPtrSrc);
+    hipFree(d_roiTensorPtrDst);
+    free(input);
+    free(input_second);
+    free(output);
+
+    if (ip_bitDepth == 0)
+    {
+        hipFree(d_input);
+        hipFree(d_input_second);
+        hipFree(d_output);
+    }
+    else if (ip_bitDepth == 1)
+    {
+        free(inputf16);
+        free(inputf16_second);
+        free(outputf16);
+        hipFree(d_inputf16);
+        hipFree(d_inputf16_second);
+        hipFree(d_outputf16);
+    }
+    else if (ip_bitDepth == 2)
+    {
+        free(inputf32);
+        free(inputf32_second);
+        free(outputf32);
+        hipFree(d_inputf32);
+        hipFree(d_inputf32_second);
+        hipFree(d_outputf32);
+    }
+    else if (ip_bitDepth == 3)
+    {
+        free(outputf16);
+        hipFree(d_input);
+        hipFree(d_input_second);
+        hipFree(d_outputf16);
+    }
+    else if (ip_bitDepth == 4)
+    {
+        free(outputf32);
+        hipFree(d_input);
+        hipFree(d_input_second);
+        hipFree(d_outputf32);
+    }
+    else if (ip_bitDepth == 5)
+    {
+        free(inputi8);
+        free(inputi8_second);
+        free(outputi8);
+        hipFree(d_inputi8);
+        hipFree(d_inputi8_second);
+        hipFree(d_outputi8);
+    }
+    else if (ip_bitDepth == 6)
+    {
+        free(outputi8);
+        hipFree(d_input);
+        hipFree(d_input_second);
+        hipFree(d_outputi8);
+    }
+
+    return 0;
+}
diff --git a/utilities/rpp-unittests/HIP_NEW/Tensor_hip_pln3.cpp b/utilities/rpp-unittests/HIP_NEW/Tensor_hip_pln3.cpp
new file mode 100644
index 000000000..e7aaed7c1
--- /dev/null
+++ b/utilities/rpp-unittests/HIP_NEW/Tensor_hip_pln3.cpp
@@ -0,0 +1,907 @@
+#include <stdio.h>
+#include <dirent.h>
+#include <string.h>
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/opencv.hpp>
+#include <iostream>
+#include "/opt/rocm/rpp/include/rpp.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <time.h>
+#include <omp.h>
+#include <hip/hip_fp16.h>
+#include <fstream>
+
+using namespace cv;
+using namespace std;
+
+#define RPPPIXELCHECK(pixel) (((pixel) < (Rpp32f)0) ? ((Rpp32f)0) : (((pixel) < (Rpp32f)255) ? (pixel) : ((Rpp32f)255))) // clamp to [0, 255]; argument is evaluated more than once
+#define RPPMAX2(a,b) (((a) > (b)) ? (a) : (b)) // larger of a and b; arguments are evaluated more than once
+#define RPPMIN2(a,b) (((a) < (b)) ? (a) : (b)) // smaller of a and b; arguments are evaluated more than once
+
+// Test driver for the RPP tensor API on HIP with 3-channel planar (PLN3 / NCHW) inputs.
+// Reads every image in <src1 folder> (and the same-named image in <src2 folder>), packs the batch
+// into a width-padded NCHW buffer, converts it to the requested bit depth, runs the selected RPP
+// tensor case on the GPU, writes the raw output values to "<bitDepth>.csv" and dumps the result
+// images under <dst folder>/<function name>. Returns 0 on success and -1 on bad usage, an invalid
+// output format toggle, an unreadable source folder or an unimplemented case / bit depth.
+int main(int argc, char **argv)
+{
+    // Handle inputs
+
+    const int MIN_ARG_COUNT = 8;
+
+    if (argc < MIN_ARG_COUNT)
+    {
+        printf("\nImproper Usage! Needs all arguments!\n");
+        printf("\nUsage: ./Tensor_hip_pln3 <src1 folder> <src2 folder (place same as src1 folder for single image functionalities)> <dst folder> <u8 = 0 / f16 = 1 / f32 = 2 / u8->f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> <outputFormatToggle (pln->pln = 0 / pln->pkd = 1)> <case number = 0:81> <verbosity = 0/1>\n");
+        return -1;
+    }
+
+    if (atoi(argv[7]) == 1)
+    {
+        printf("\nInputs for this test case are:");
+        printf("\nsrc1 = %s", argv[1]);
+        printf("\nsrc2 = %s", argv[2]);
+        printf("\ndst = %s", argv[3]);
+        printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[4]);
+        printf("\noutputFormatToggle (pln->pln = 0 / pln->pkd = 1) = %s", argv[5]);
+        printf("\ncase number (1:7) = %s", argv[6]);
+    }
+
+    char *src = argv[1];
+    char *src_second = argv[2];
+    char dst[1000]; // local copy: the dump path is appended to it below, which must not be done inside argv storage
+    snprintf(dst, sizeof(dst), "%s", argv[3]);
+    int ip_bitDepth = atoi(argv[4]);
+    unsigned int outputFormatToggle = atoi(argv[5]);
+    int test_case = atoi(argv[6]);
+    int ip_channel = 3;
+
+    // Set case names
+
+    char funcType[1000] = {"Tensor_HIP_PLN3"};
+
+    char funcName[1000] = ""; // stays a valid empty string when test_case matches no case below
+    switch (test_case)
+    {
+    case 0:
+        strcpy(funcName, "brightness");
+        break;
+    }
+
+    // Initialize tensor descriptors
+
+    RpptDesc srcDesc, dstDesc;
+    RpptDescPtr srcDescPtr = &srcDesc, dstDescPtr = &dstDesc;
+
+    // Set src/dst layouts in tensor descriptors
+
+    if (outputFormatToggle == 0)
+    {
+        strcat(funcType, "_toPLN3");
+        srcDescPtr->layout = RpptLayout::NCHW;
+        dstDescPtr->layout = RpptLayout::NCHW;
+    }
+    else if (outputFormatToggle == 1)
+    {
+        strcat(funcType, "_toPKD3");
+        srcDescPtr->layout = RpptLayout::NCHW;
+        dstDescPtr->layout = RpptLayout::NHWC;
+    }
+    else
+    {
+        printf("\nInvalid outputFormatToggle %u! Use 0 (pln->pln) or 1 (pln->pkd).\n", outputFormatToggle);
+        return -1;
+    }
+
+    // Set src/dst data types in tensor descriptors
+
+    if (ip_bitDepth == 0)
+    {
+        strcat(funcName, "_u8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::U8;
+    }
+    else if (ip_bitDepth == 1)
+    {
+        strcat(funcName, "_f16_");
+        srcDescPtr->dataType = RpptDataType::F16;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 2)
+    {
+        strcat(funcName, "_f32_");
+        srcDescPtr->dataType = RpptDataType::F32;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 3)
+    {
+        strcat(funcName, "_u8_f16_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 4)
+    {
+        strcat(funcName, "_u8_f32_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 5)
+    {
+        strcat(funcName, "_i8_");
+        srcDescPtr->dataType = RpptDataType::I8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+    else if (ip_bitDepth == 6)
+    {
+        strcat(funcName, "_u8_i8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+
+    // Other initializations
+
+    int missingFuncFlag = 0;
+    int i = 0, j = 0;
+    int maxHeight = 0, maxWidth = 0;
+    int maxDstHeight = 0, maxDstWidth = 0;
+    unsigned long long count = 0;
+    unsigned long long ioBufferSize = 0;
+    unsigned long long oBufferSize = 0;
+    static int noOfImages = 0;
+    Mat image, image_second;
+
+    // String ops on function name
+
+    char func[1000];
+    strcpy(func, funcName);
+    strcat(func, funcType);
+    printf("\nRunning %s...", func);
+
+    char src1[1000];
+    strcpy(src1, src);
+    strcat(src1, "/");
+
+    char src1_second[1000];
+    strcpy(src1_second, src_second);
+    strcat(src1_second, "/");
+
+    strcat(funcName, funcType);
+    strcat(dst, "/");
+    strcat(dst, funcName);
+
+    // Get number of images
+
+    struct dirent *de;
+    DIR *dr = opendir(src);
+    if (dr == NULL)
+    {
+        printf("\nUnable to open source folder %s\n", src);
+        return -1;
+    }
+    while ((de = readdir(dr)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        noOfImages += 1;
+    }
+
+    // Initialize ROI tensors for src/dst
+
+    RpptROI *roiTensorPtrSrc = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+    RpptROI *roiTensorPtrDst = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+
+    RpptROI *d_roiTensorPtrSrc, *d_roiTensorPtrDst;
+    hipMalloc(&d_roiTensorPtrSrc, noOfImages * sizeof(RpptROI));
+    hipMalloc(&d_roiTensorPtrDst, noOfImages * sizeof(RpptROI));
+
+    // Set ROI tensors types for src/dst
+
+    RpptRoiType roiTypeSrc = RpptRoiType::XYWH;
+    RpptRoiType roiTypeDst = RpptRoiType::XYWH;
+
+    // Set maxHeight, maxWidth and ROIs for src/dst
+
+    const int images = noOfImages;
+    char imageNames[images][1000];
+
+    rewinddir(dr); // second pass over the same, already validated directory stream
+    while ((de = readdir(dr)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        strcpy(imageNames[count], de->d_name);
+        char temp[1000];
+        strcpy(temp, src1);
+        strcat(temp, imageNames[count]);
+
+        image = imread(temp, 1);
+
+        roiTensorPtrSrc[count].xywhROI.xy.x = 0;
+        roiTensorPtrSrc[count].xywhROI.xy.y = 0;
+        roiTensorPtrSrc[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrSrc[count].xywhROI.roiHeight = image.rows;
+
+        roiTensorPtrDst[count].xywhROI.xy.x = 0;
+        roiTensorPtrDst[count].xywhROI.xy.y = 0;
+        roiTensorPtrDst[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrDst[count].xywhROI.roiHeight = image.rows;
+
+        maxHeight = RPPMAX2(maxHeight, roiTensorPtrSrc[count].xywhROI.roiHeight);
+        maxWidth = RPPMAX2(maxWidth, roiTensorPtrSrc[count].xywhROI.roiWidth);
+        maxDstHeight = RPPMAX2(maxDstHeight, roiTensorPtrDst[count].xywhROI.roiHeight);
+        maxDstWidth = RPPMAX2(maxDstWidth, roiTensorPtrDst[count].xywhROI.roiWidth);
+
+        count++;
+    }
+
+    // Set numDims, offset, n/c/h/w values, n/c/h/w strides for src/dst
+
+    srcDescPtr->numDims = 4;
+    dstDescPtr->numDims = 4;
+
+    srcDescPtr->offset = 0;
+    dstDescPtr->offset = 0;
+
+    srcDescPtr->n = noOfImages;
+    srcDescPtr->c = ip_channel;
+    srcDescPtr->h = maxHeight;
+    srcDescPtr->w = maxWidth;
+
+    dstDescPtr->n = noOfImages;
+    dstDescPtr->c = ip_channel;
+    dstDescPtr->h = maxDstHeight;
+    dstDescPtr->w = maxDstWidth;
+
+    // Optionally set w stride as a multiple of 8 for src/dst
+    // (always steps up to the next multiple of 8, so an already aligned width still gains 8 columns)
+    srcDescPtr->w = ((srcDescPtr->w / 8) * 8) + 8;
+    dstDescPtr->w = ((dstDescPtr->w / 8) * 8) + 8;
+
+    // Set n/c/h/w strides for src/dst
+
+    srcDescPtr->strides.nStride = ip_channel * srcDescPtr->w * srcDescPtr->h;
+    srcDescPtr->strides.cStride = srcDescPtr->w * srcDescPtr->h;
+    srcDescPtr->strides.hStride = srcDescPtr->w;
+    srcDescPtr->strides.wStride = 1;
+
+    if (dstDescPtr->layout == RpptLayout::NHWC)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = ip_channel * dstDescPtr->w;
+        dstDescPtr->strides.wStride = ip_channel;
+        dstDescPtr->strides.cStride = 1;
+    }
+    else if (dstDescPtr->layout == RpptLayout::NCHW)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.cStride = dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = dstDescPtr->w;
+        dstDescPtr->strides.wStride = 1;
+    }
+
+    // Set buffer sizes for src/dst
+
+    ioBufferSize = (unsigned long long)srcDescPtr->h * (unsigned long long)srcDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+    oBufferSize = (unsigned long long)dstDescPtr->h * (unsigned long long)dstDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+
+    // Initialize host buffers for src/dst
+
+    Rpp8u *input = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *input_second = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *output = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u));
+
+    // Set 8u host buffers for src/dst
+
+    rewinddir(dr); // third pass; the second-source images are located by file name, not by a second stream
+    count = 0;
+    i = 0;
+
+    Rpp32u elementsInRowMax = srcDescPtr->w * ip_channel;
+
+    while ((de = readdir(dr)) != NULL)
+    {
+        Rpp8u *input_temp, *input_second_temp;
+        input_temp = input + (i * srcDescPtr->strides.nStride);
+        input_second_temp = input_second + (i * srcDescPtr->strides.nStride);
+
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+
+        char temp[1000];
+        strcpy(temp, src1);
+        strcat(temp, de->d_name);
+
+        char temp_second[1000];
+        strcpy(temp_second, src1_second);
+        strcat(temp_second, de->d_name);
+
+        image = imread(temp, 1);
+        image_second = imread(temp_second, 1);
+
+        Rpp8u *ip_image = image.data;
+        Rpp8u *ip_image_second = image_second.data;
+
+        Rpp32u elementsInRow = roiTensorPtrSrc[i].xywhROI.roiWidth * ip_channel;
+
+        for (j = 0; j < roiTensorPtrSrc[i].xywhROI.roiHeight; j++)
+        {
+            memcpy(input_temp, ip_image, elementsInRow * sizeof (Rpp8u));
+            memcpy(input_second_temp, ip_image_second, elementsInRow * sizeof (Rpp8u));
+            ip_image += elementsInRow;
+            ip_image_second += elementsInRow;
+            input_temp += elementsInRowMax;
+            input_second_temp += elementsInRowMax;
+        }
+        i++;
+        count += srcDescPtr->strides.nStride;
+    }
+    closedir(dr);
+
+    // Convert default OpenCV PKD3 to PLN3 for first input batch
+
+    Rpp8u *inputCopy = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    memcpy(inputCopy, input, ioBufferSize * sizeof(Rpp8u));
+
+    Rpp8u *inputTemp = input, *inputCopyTemp = inputCopy;
+
+    for (int count = 0; count < noOfImages; count++)
+    {
+        Rpp8u *inputTempR, *inputTempG, *inputTempB;
+        inputTempR = inputTemp;
+        inputTempG = inputTempR + srcDescPtr->strides.cStride;
+        inputTempB = inputTempG + srcDescPtr->strides.cStride;
+
+        for (int i = 0; i < srcDescPtr->h; i++)
+        {
+            for (int j = 0; j < srcDescPtr->w; j++)
+            {
+                *inputTempR = *inputCopyTemp;
+                inputCopyTemp++;
+                inputTempR++;
+                *inputTempG = *inputCopyTemp;
+                inputCopyTemp++;
+                inputTempG++;
+                *inputTempB = *inputCopyTemp;
+                inputCopyTemp++;
+                inputTempB++;
+            }
+        }
+
+        inputTemp += srcDescPtr->strides.nStride;
+    }
+
+    free(inputCopy);
+
+    // Convert default OpenCV PKD3 to PLN3 for second input batch
+
+    Rpp8u *inputSecondCopy = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    memcpy(inputSecondCopy, input_second, ioBufferSize * sizeof(Rpp8u));
+
+    Rpp8u *inputSecondTemp = input_second, *inputSecondCopyTemp = inputSecondCopy;
+
+    for (int count = 0; count < noOfImages; count++)
+    {
+        Rpp8u *inputSecondTempR, *inputSecondTempG, *inputSecondTempB;
+        inputSecondTempR = inputSecondTemp;
+        inputSecondTempG = inputSecondTempR + srcDescPtr->strides.cStride;
+        inputSecondTempB = inputSecondTempG + srcDescPtr->strides.cStride;
+
+        for (int i = 0; i < srcDescPtr->h; i++)
+        {
+            for (int j = 0; j < srcDescPtr->w; j++)
+            {
+                *inputSecondTempR = *inputSecondCopyTemp;
+                inputSecondCopyTemp++;
+                inputSecondTempR++;
+                *inputSecondTempG = *inputSecondCopyTemp;
+                inputSecondCopyTemp++;
+                inputSecondTempG++;
+                *inputSecondTempB = *inputSecondCopyTemp;
+                inputSecondCopyTemp++;
+                inputSecondTempB++;
+            }
+        }
+
+        inputSecondTemp += srcDescPtr->strides.nStride;
+    }
+
+    free(inputSecondCopy);
+
+    // Convert inputs to test various other bit depths and copy to hip buffers
+
+    half *inputf16, *inputf16_second, *outputf16;
+    Rpp32f *inputf32, *inputf32_second, *outputf32;
+    Rpp8s *inputi8, *inputi8_second, *outputi8;
+    int *d_input, *d_input_second, *d_inputf16, *d_inputf16_second, *d_inputf32, *d_inputf32_second, *d_inputi8, *d_inputi8_second;
+    int *d_output, *d_outputf16, *d_outputf32, *d_outputi8;
+
+    if (ip_bitDepth == 0)
+    {
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_output, oBufferSize * sizeof(Rpp8u));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_output, output, oBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 1)
+    {
+        inputf16 = (half *)calloc(ioBufferSize, sizeof(half));
+        inputf16_second = (half *)calloc(ioBufferSize, sizeof(half));
+        outputf16 = (half *)calloc(oBufferSize, sizeof(half));
+
+        Rpp8u *inputTemp, *input_secondTemp;
+        half *inputf16Temp, *inputf16_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputf16Temp = inputf16;
+        inputf16_secondTemp = inputf16_second;
+
+        for (unsigned long long i = 0; i < ioBufferSize; i++)
+        {
+            *inputf16Temp = (half)(((float)*inputTemp) / 255.0);
+            *inputf16_secondTemp = (half)(((float)*input_secondTemp) / 255.0);
+            inputTemp++;
+            inputf16Temp++;
+            input_secondTemp++;
+            inputf16_secondTemp++;
+        }
+
+        hipMalloc(&d_inputf16, ioBufferSize * sizeof(half));
+        hipMalloc(&d_inputf16_second, ioBufferSize * sizeof(half));
+        hipMalloc(&d_outputf16, oBufferSize * sizeof(half));
+        hipMemcpy(d_inputf16, inputf16, ioBufferSize * sizeof(half), hipMemcpyHostToDevice);
+        hipMemcpy(d_inputf16_second, inputf16_second, ioBufferSize * sizeof(half), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf16, outputf16, oBufferSize * sizeof(half), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 2)
+    {
+        inputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+        inputf32_second = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+        outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f));
+
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp32f *inputf32Temp, *inputf32_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputf32Temp = inputf32;
+        inputf32_secondTemp = inputf32_second;
+
+        for (unsigned long long i = 0; i < ioBufferSize; i++)
+        {
+            *inputf32Temp = ((Rpp32f)*inputTemp) / 255.0;
+            *inputf32_secondTemp = ((Rpp32f)*input_secondTemp) / 255.0;
+            inputTemp++;
+            inputf32Temp++;
+            input_secondTemp++;
+            inputf32_secondTemp++;
+        }
+
+        hipMalloc(&d_inputf32, ioBufferSize * sizeof(Rpp32f));
+        hipMalloc(&d_inputf32_second, ioBufferSize * sizeof(Rpp32f));
+        hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f));
+        hipMemcpy(d_inputf32, inputf32, ioBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+        hipMemcpy(d_inputf32_second, inputf32_second, ioBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf32, outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 3)
+    {
+        outputf16 = (half *)calloc(oBufferSize, sizeof(half));
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_outputf16, oBufferSize * sizeof(half));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf16, outputf16, oBufferSize * sizeof(half), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 4)
+    {
+        outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f));
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_outputf32, oBufferSize * sizeof(Rpp32f));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputf32, outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 5)
+    {
+        inputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+        inputi8_second = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+        outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s));
+
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp8s *inputi8Temp, *inputi8_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputi8Temp = inputi8;
+        inputi8_secondTemp = inputi8_second;
+
+        for (unsigned long long i = 0; i < ioBufferSize; i++)
+        {
+            *inputi8Temp = (Rpp8s) (((Rpp32s) *inputTemp) - 128);
+            *inputi8_secondTemp = (Rpp8s) (((Rpp32s) *input_secondTemp) - 128);
+            inputTemp++;
+            inputi8Temp++;
+            input_secondTemp++;
+            inputi8_secondTemp++;
+        }
+
+        hipMalloc(&d_inputi8, ioBufferSize * sizeof(Rpp8s));
+        hipMalloc(&d_inputi8_second, ioBufferSize * sizeof(Rpp8s));
+        hipMalloc(&d_outputi8, oBufferSize * sizeof(Rpp8s));
+        hipMemcpy(d_inputi8, inputi8, ioBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+        hipMemcpy(d_inputi8_second, inputi8_second, ioBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputi8, outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+    }
+    else if (ip_bitDepth == 6)
+    {
+        outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s));
+        hipMalloc(&d_input, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_input_second, ioBufferSize * sizeof(Rpp8u));
+        hipMalloc(&d_outputi8, oBufferSize * sizeof(Rpp8s));
+        hipMemcpy(d_input, input, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_input_second, input_second, ioBufferSize * sizeof(Rpp8u), hipMemcpyHostToDevice);
+        hipMemcpy(d_outputi8, outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyHostToDevice);
+    }
+
+    // Run case-wise RPP API and measure time
+
+    rppHandle_t handle;
+    hipStream_t stream;
+    hipStreamCreate(&stream);
+    rppCreateWithStreamAndBatchSize(&handle, stream, noOfImages);
+
+    clock_t start, end;
+    double gpu_time_used;
+
+    string test_case_name;
+
+    switch (test_case)
+    {
+    case 0:
+    {
+        test_case_name = "brightness";
+
+        Rpp32f alpha[images];
+        Rpp32f beta[images];
+        for (i = 0; i < images; i++)
+        {
+            alpha[i] = 1.75;
+            beta[i] = 50;
+
+            // xywhROI override sample
+            // roiTensorPtrSrc[i].xywhROI.xy.x = 0;
+            // roiTensorPtrSrc[i].xywhROI.xy.y = 0;
+            // roiTensorPtrSrc[i].xywhROI.roiWidth = 100;
+            // roiTensorPtrSrc[i].xywhROI.roiHeight = 180;
+
+            // ltrbROI override sample
+            // roiTensorPtrSrc[i].ltrbROI.lt.x = 50;
+            // roiTensorPtrSrc[i].ltrbROI.lt.y = 30;
+            // roiTensorPtrSrc[i].ltrbROI.rb.x = 210;
+            // roiTensorPtrSrc[i].ltrbROI.rb.y = 210;
+        }
+
+        // Change RpptRoiType for ltrbROI override sample
+        // roiTypeSrc = RpptRoiType::LTRB;
+        // roiTypeDst = RpptRoiType::LTRB;
+
+        hipMemcpy(d_roiTensorPtrSrc, roiTensorPtrSrc, images * sizeof(RpptROI), hipMemcpyHostToDevice);
+
+        start = clock();
+
+        if (ip_bitDepth == 0)
+            rppt_brightness_gpu(d_input, srcDescPtr, d_output, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+        else if (ip_bitDepth == 1)
+            rppt_brightness_gpu(d_inputf16, srcDescPtr, d_outputf16, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+        else if (ip_bitDepth == 2)
+            rppt_brightness_gpu(d_inputf32, srcDescPtr, d_outputf32, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+        else if (ip_bitDepth == 3)
+            missingFuncFlag = 1;
+        else if (ip_bitDepth == 4)
+            missingFuncFlag = 1;
+        else if (ip_bitDepth == 5)
+            rppt_brightness_gpu(d_inputi8, srcDescPtr, d_outputi8, dstDescPtr, alpha, beta, d_roiTensorPtrSrc, roiTypeSrc, handle);
+        else if (ip_bitDepth == 6)
+            missingFuncFlag = 1;
+        else
+            missingFuncFlag = 1;
+
+        // NOTE(review): clock() measures host processor time; confirm the call above has finished on the GPU before trusting it
+        end = clock();
+
+        break;
+    }
+    default:
+        missingFuncFlag = 1;
+        break;
+    }
+
+    if (missingFuncFlag == 1)
+    {
+        printf("\nThe functionality %s doesn't yet exist in RPP\n", func);
+        return -1;
+    }
+
+    // Display measured times
+
+    gpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
+    cout << "\nGPU Time - BatchPD : " << gpu_time_used;
+    printf("\n");
+
+    // Reconvert other bit depths to 8u for output display purposes
+    // (each branch also writes the raw output values, comma separated, to "<bitDepth>.csv")
+    string fileName = std::to_string(ip_bitDepth);
+    ofstream outputFile (fileName + ".csv");
+
+    if (ip_bitDepth == 0)
+    {
+        hipMemcpy(output, d_output, oBufferSize * sizeof(Rpp8u), hipMemcpyDeviceToHost);
+        Rpp8u *outputTemp = output;
+
+        if (outputFile.is_open())
+        {
+            for (unsigned long long i = 0; i < oBufferSize; i++)
+            {
+                outputFile << (Rpp32u) *outputTemp << ",";
+                outputTemp++;
+            }
+            outputFile.close();
+        }
+        else
+            cout << "Unable to open file!";
+    }
+    else if ((ip_bitDepth == 1) || (ip_bitDepth == 3))
+    {
+        hipMemcpy(outputf16, d_outputf16, oBufferSize * sizeof(half), hipMemcpyDeviceToHost);
+        Rpp8u *outputTemp = output;
+        half *outputf16Temp = outputf16;
+
+        if (outputFile.is_open())
+        {
+            for (unsigned long long i = 0; i < oBufferSize; i++)
+            {
+                outputFile << (float) *outputf16Temp << ","; // numeric value, as in the f32 dump (a char cast wrote raw bytes)
+                *outputTemp = (Rpp8u)RPPPIXELCHECK((float)*outputf16Temp * 255.0);
+                outputf16Temp++;
+                outputTemp++;
+            }
+            outputFile.close();
+        }
+        else
+            cout << "Unable to open file!";
+    }
+    else if ((ip_bitDepth == 2) || (ip_bitDepth == 4))
+    {
+        hipMemcpy(outputf32, d_outputf32, oBufferSize * sizeof(Rpp32f), hipMemcpyDeviceToHost);
+        Rpp8u *outputTemp = output;
+        Rpp32f *outputf32Temp = outputf32;
+
+        if (outputFile.is_open())
+        {
+            for (unsigned long long i = 0; i < oBufferSize; i++)
+            {
+                outputFile << *outputf32Temp << ",";
+                *outputTemp = (Rpp8u)RPPPIXELCHECK(*outputf32Temp * 255.0);
+                outputf32Temp++;
+                outputTemp++;
+            }
+            outputFile.close();
+        }
+        else
+            cout << "Unable to open file!";
+    }
+    else if ((ip_bitDepth == 5) || (ip_bitDepth == 6))
+    {
+        hipMemcpy(outputi8, d_outputi8, oBufferSize * sizeof(Rpp8s), hipMemcpyDeviceToHost);
+        Rpp8u *outputTemp = output;
+        Rpp8s *outputi8Temp = outputi8;
+
+        if (outputFile.is_open())
+        {
+            for (unsigned long long i = 0; i < oBufferSize; i++)
+            {
+                outputFile << (Rpp32s) *outputi8Temp << ",";
+                *outputTemp = (Rpp8u) RPPPIXELCHECK(((Rpp32s) *outputi8Temp) + 128);
+                outputi8Temp++;
+                outputTemp++;
+            }
+            outputFile.close();
+        }
+        else
+            cout << "Unable to open file!";
+    }
+
+    // Calculate exact dstROI in XYWH format for OpenCV dump
+    // (the src ROI entries are reused for the dump and are clamped to the padded dst extent below)
+    if (roiTypeSrc == RpptRoiType::LTRB)
+    {
+        for (int i = 0; i < dstDescPtr->n; i++)
+        {
+            int ltX = roiTensorPtrSrc[i].ltrbROI.lt.x;
+            int ltY = roiTensorPtrSrc[i].ltrbROI.lt.y;
+            int rbX = roiTensorPtrSrc[i].ltrbROI.rb.x;
+            int rbY = roiTensorPtrSrc[i].ltrbROI.rb.y;
+
+            roiTensorPtrSrc[i].xywhROI.xy.x = ltX;
+            roiTensorPtrSrc[i].xywhROI.xy.y = ltY;
+            roiTensorPtrSrc[i].xywhROI.roiWidth = rbX - ltX + 1;
+            roiTensorPtrSrc[i].xywhROI.roiHeight = rbY - ltY + 1;
+        }
+    }
+
+    RpptROI roiDefault;
+    RpptROIPtr roiPtrDefault;
+    roiPtrDefault = &roiDefault;
+    roiPtrDefault->xywhROI.xy.x = 0;
+    roiPtrDefault->xywhROI.xy.y = 0;
+    roiPtrDefault->xywhROI.roiWidth = dstDescPtr->w;
+    roiPtrDefault->xywhROI.roiHeight = dstDescPtr->h;
+
+    for (int i = 0; i < dstDescPtr->n; i++)
+    {
+        roiTensorPtrSrc[i].xywhROI.roiWidth = RPPMIN2(roiPtrDefault->xywhROI.roiWidth - roiTensorPtrSrc[i].xywhROI.xy.x, roiTensorPtrSrc[i].xywhROI.roiWidth);
+        roiTensorPtrSrc[i].xywhROI.roiHeight = RPPMIN2(roiPtrDefault->xywhROI.roiHeight - roiTensorPtrSrc[i].xywhROI.xy.y, roiTensorPtrSrc[i].xywhROI.roiHeight);
+        roiTensorPtrSrc[i].xywhROI.xy.x = RPPMAX2(roiPtrDefault->xywhROI.xy.x, roiTensorPtrSrc[i].xywhROI.xy.x);
+        roiTensorPtrSrc[i].xywhROI.xy.y = RPPMAX2(roiPtrDefault->xywhROI.xy.y, roiTensorPtrSrc[i].xywhROI.xy.y);
+    }
+
+    // Convert any PLN3 outputs to the corresponding PKD3 version for OpenCV dump
+
+    if (dstDescPtr->layout == RpptLayout::NCHW)
+    {
+        Rpp8u *outputCopy = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u));
+        memcpy(outputCopy, output, oBufferSize * sizeof(Rpp8u));
+
+        Rpp8u *outputTemp, *outputCopyTemp;
+        outputTemp = output;
+        outputCopyTemp = outputCopy;
+
+        for (int count = 0; count < dstDescPtr->n; count++)
+        {
+            Rpp8u *outputCopyTempR, *outputCopyTempG, *outputCopyTempB;
+            outputCopyTempR = outputCopyTemp;
+            outputCopyTempG = outputCopyTempR + dstDescPtr->strides.cStride;
+            outputCopyTempB = outputCopyTempG + dstDescPtr->strides.cStride;
+
+            for (int i = 0; i < dstDescPtr->h; i++)
+            {
+                for (int j = 0; j < dstDescPtr->w; j++)
+                {
+                    *outputTemp = *outputCopyTempR;
+                    outputTemp++;
+                    outputCopyTempR++;
+                    *outputTemp = *outputCopyTempG;
+                    outputTemp++;
+                    outputCopyTempG++;
+                    *outputTemp = *outputCopyTempB;
+                    outputTemp++;
+                    outputCopyTempB++;
+                }
+            }
+
+            outputCopyTemp += dstDescPtr->strides.nStride;
+        }
+
+        free(outputCopy);
+    }
+
+    rppDestroyGPU(handle);
+    hipStreamDestroy(stream);
+
+    // OpenCV dump
+
+    mkdir(dst, 0700);
+    strcat(dst, "/");
+    count = 0;
+    elementsInRowMax = dstDescPtr->w * ip_channel;
+
+    for (j = 0; j < dstDescPtr->n; j++)
+    {
+        int height = roiTensorPtrSrc[j].xywhROI.roiHeight;
+        int width = roiTensorPtrSrc[j].xywhROI.roiWidth;
+
+        int op_size = height * width * ip_channel;
+        Rpp8u *temp_output = (Rpp8u *)calloc(op_size, sizeof(Rpp8u));
+        Rpp8u *temp_output_row;
+        temp_output_row = temp_output;
+        Rpp32u elementsInRow = width * ip_channel;
+        Rpp8u *output_row = output + count;
+
+        for (int k = 0; k < height; k++)
+        {
+            memcpy(temp_output_row, (output_row), elementsInRow * sizeof (Rpp8u));
+            temp_output_row += elementsInRow;
+            output_row += elementsInRowMax;
+        }
+        count += dstDescPtr->strides.nStride;
+
+        char temp[1000];
+        strcpy(temp, dst);
+        strcat(temp, imageNames[j]);
+
+        Mat mat_op_image;
+        mat_op_image = Mat(height, width, CV_8UC3, temp_output);
+        imwrite(temp, mat_op_image);
+
+        free(temp_output);
+    }
+
+    // Free memory
+
+    free(roiTensorPtrSrc);
+    free(roiTensorPtrDst);
+    hipFree(d_roiTensorPtrSrc);
+    hipFree(d_roiTensorPtrDst);
+    free(input);
+    free(input_second);
+    free(output);
+
+    if (ip_bitDepth == 0)
+    {
+        hipFree(d_input);
+        hipFree(d_input_second);
+        hipFree(d_output);
+    }
+    else if (ip_bitDepth == 1)
+    {
+        free(inputf16);
+        free(inputf16_second);
+        free(outputf16);
+        hipFree(d_inputf16);
+        hipFree(d_inputf16_second);
+        hipFree(d_outputf16);
+    }
+    else if (ip_bitDepth == 2)
+    {
+        free(inputf32);
+        free(inputf32_second);
+        free(outputf32);
+        hipFree(d_inputf32);
+        hipFree(d_inputf32_second);
+        hipFree(d_outputf32);
+    }
+    else if (ip_bitDepth == 3)
+    {
+        free(outputf16);
+        hipFree(d_input);
+        hipFree(d_input_second);
+        hipFree(d_outputf16);
+    }
+    else if (ip_bitDepth == 4)
+    {
+        free(outputf32);
+        hipFree(d_input);
+        hipFree(d_input_second);
+        hipFree(d_outputf32);
+    }
+    else if (ip_bitDepth == 5)
+    {
+        free(inputi8);
+        free(inputi8_second);
+        free(outputi8);
+        hipFree(d_inputi8);
+        hipFree(d_inputi8_second);
+        hipFree(d_outputi8);
+    }
+    else if (ip_bitDepth == 6)
+    {
+        free(outputi8);
+        hipFree(d_input);
+        hipFree(d_input_second);
+        hipFree(d_outputi8);
+    }
+
+    return 0;
+}
diff --git a/utilities/rpp-unittests/HIP_NEW/testAllScript.sh b/utilities/rpp-unittests/HIP_NEW/testAllScript.sh
index f39ed8bfb..0faaa34c7 100755
--- a/utilities/rpp-unittests/HIP_NEW/testAllScript.sh
+++ b/utilities/rpp-unittests/HIP_NEW/testAllScript.sh
@@ -196,6 +196,10 @@ do
 
             printf "\n./BatchPD_hip_pkd3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $DST_FOLDER_TEMP $bitDepth $outputFormatToggle $case 0"
             ./BatchPD_hip_pkd3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$DST_FOLDER_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0"
+
+            printf "\n./Tensor_hip_pkd3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $DST_FOLDER_TEMP $bitDepth $outputFormatToggle $case 0"
+            ./Tensor_hip_pkd3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$DST_FOLDER_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0"
+
             echo "------------------------------------------------------------------------------------------"
         done
     done
@@ -252,6 +256,10 @@ do
 
             printf "\n./BatchPD_hip_pln1 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $DST_FOLDER_TEMP $bitDepth $outputFormatToggle $case 0"
             ./BatchPD_hip_pln1 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$DST_FOLDER_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0"
+
+            printf "\n./Tensor_hip_pln1 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $DST_FOLDER_TEMP $bitDepth $outputFormatToggle $case 0"
+            ./Tensor_hip_pln1 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$DST_FOLDER_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0"
+
             echo "------------------------------------------------------------------------------------------"
         done
     done
@@ -308,6 +316,10 @@ do
 
             printf "\n./BatchPD_hip_pln3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $DST_FOLDER_TEMP $bitDepth $outputFormatToggle $case 0"
             ./BatchPD_hip_pln3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$DST_FOLDER_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0"
+
+            printf "\n./Tensor_hip_pln3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $DST_FOLDER_TEMP $bitDepth $outputFormatToggle $case 0"
+            ./Tensor_hip_pln3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$DST_FOLDER_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0"
+
             echo "------------------------------------------------------------------------------------------"
         done
     done
diff --git a/utilities/rpp-unittests/HOST_NEW/CMakeLists.txt b/utilities/rpp-unittests/HOST_NEW/CMakeLists.txt
index 15d7f51ca..0454e3c13 100644
--- a/utilities/rpp-unittests/HOST_NEW/CMakeLists.txt
+++ b/utilities/rpp-unittests/HOST_NEW/CMakeLists.txt
@@ -7,7 +7,7 @@ find_package(OpenCV REQUIRED)
 find_package(AMDRPP QUIET)
 
 if(NOT OpenCL_FOUND)
-	message("-- ${Yellow}Rpp_test requires OpenCL, Found ${OpenCL_INCLUDE_DIRS} ${OpenCL_LIBRARIES} ${ColourReset}")
+    message("-- ${Yellow}Rpp_test requires OpenCL, Found ${OpenCL_INCLUDE_DIRS} ${OpenCL_LIBRARIES} ${ColourReset}")
 endif()
 
 if (OpenCL_FOUND)
@@ -19,12 +19,18 @@ if (OpenCL_FOUND)
     add_executable(BatchPD_host_pkd3 BatchPD_host_pkd3.cpp)
     add_executable(BatchPD_host_pln1 BatchPD_host_pln1.cpp)
     add_executable(BatchPD_host_pln3 BatchPD_host_pln3.cpp)
+    add_executable(Tensor_host_pkd3 Tensor_host_pkd3.cpp)
+    add_executable(Tensor_host_pln3 Tensor_host_pln3.cpp)
+    add_executable(Tensor_host_pln1 Tensor_host_pln1.cpp)
     # add_executable(Single_host Single_host.cpp)
     add_executable(uniqueFunctionalities_host uniqueFunctionalities_host.cpp)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -DOCL_COMPILE=1 -DRPP_BACKEND_OPENCL=1 -std=c++11")
     target_link_libraries(BatchPD_host_pkd3 ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system)
     target_link_libraries(BatchPD_host_pln1 ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system)
     target_link_libraries(BatchPD_host_pln3 ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system)
+    target_link_libraries(Tensor_host_pkd3 ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system)
+    target_link_libraries(Tensor_host_pln3 ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system)
+    target_link_libraries(Tensor_host_pln1 ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system)
     # target_link_libraries(Single_host ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system )
     target_link_libraries(uniqueFunctionalities_host ${OpenCV_LIBS} -lamd_rpp ${OpenCL_LIBRARIES} pthread boost_filesystem boost_system)
 endif()
\ No newline at end of file
diff --git a/utilities/rpp-unittests/HOST_NEW/Tensor_host_pkd3.cpp b/utilities/rpp-unittests/HOST_NEW/Tensor_host_pkd3.cpp
new file mode 100644
index 000000000..2ef7860c2
--- /dev/null
+++ b/utilities/rpp-unittests/HOST_NEW/Tensor_host_pkd3.cpp
@@ -0,0 +1,710 @@
+#include <stdio.h>
+#include <dirent.h>
+#include <string.h>
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/opencv.hpp>
+#include <iostream>
+#include "/opt/rocm/rpp/include/rpp.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <time.h>
+#include <omp.h>
+#include <half.hpp>
+#include <fstream>
+
+using namespace cv;
+using namespace std;
+using half_float::half;
+
+typedef half Rpp16f;
+
+#define RPPPIXELCHECK(pixel) (((pixel) < (Rpp32f)0) ? ((Rpp32f)0) : (((pixel) < (Rpp32f)255) ? (pixel) : ((Rpp32f)255))) // clamp to [0, 255]; the argument is evaluated more than once
+#define RPPMAX2(a,b) (((a) > (b)) ? (a) : (b)) // larger of a and b; arguments are evaluated more than once
+#define RPPMIN2(a,b) (((a) < (b)) ? (a) : (b)) // smaller of a and b; arguments are evaluated more than once
+
+int main(int argc, char **argv)
+{
+    // Handle inputs: argv[1]..argv[7] are all mandatory, so argc must be at least 8
+
+    const int MIN_ARG_COUNT = 8;
+
+    if (argc < MIN_ARG_COUNT)
+    {
+        printf("\nImproper Usage! Needs all arguments!\n");
+        printf("\nUsage: ./Tensor_host_pkd3 <src1 folder> <src2 folder (place same as src1 folder for single image functionalities)> <dst folder> <u8 = 0 / f16 = 1 / f32 = 2 / u8->f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> <outputFormatToggle (pkd->pkd = 0 / pkd->pln = 1)> <case number = 0:81> <verbosity = 0/1>\n");
+        return -1;
+    }
+
+    if (atoi(argv[7]) == 1)
+    {
+        printf("\nInputs for this test case are:");
+        printf("\nsrc1 = %s", argv[1]);
+        printf("\nsrc2 = %s", argv[2]);
+        printf("\ndst = %s", argv[3]);
+        printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[4]);
+        printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[5]);
+        printf("\ncase number (1:7) = %s", argv[6]);
+    }
+
+    // dst has "/<funcName><funcType>" appended further down, so it is copied into a local
+    // buffer: appending to argv[3] in place would write past the end of the argument string
+    char *src = argv[1];
+    char *src_second = argv[2];
+    char dst[1000];
+    snprintf(dst, sizeof(dst), "%s", argv[3]);
+    int ip_bitDepth = atoi(argv[4]);
+    unsigned int outputFormatToggle = atoi(argv[5]);
+    int test_case = atoi(argv[6]);
+    int ip_channel = 3;
+
+    // Set case names; funcName starts out empty so the strcat calls below stay defined for an unknown case number
+    char funcType[1000] = {"Tensor_HOST_PKD3"};
+    char funcName[1000] = "";
+    switch (test_case)
+    {
+    case 0:
+        strcpy(funcName, "brightness");
+        break;
+    }
+
+    // Initialize tensor descriptors
+
+    RpptDesc srcDesc, dstDesc;
+    RpptDescPtr srcDescPtr, dstDescPtr;
+    srcDescPtr = &srcDesc;
+    dstDescPtr = &dstDesc;
+
+    // Set src/dst layouts in tensor descriptors (src is always packed NHWC; only dst follows outputFormatToggle)
+
+    srcDescPtr->layout = RpptLayout::NHWC;
+    if (outputFormatToggle == 0)
+    {
+        strcat(funcType, "_toPKD3");
+        dstDescPtr->layout = RpptLayout::NHWC;
+    }
+    else if (outputFormatToggle == 1)
+    {
+        strcat(funcType, "_toPLN3");
+        dstDescPtr->layout = RpptLayout::NCHW;
+    }
+    else // any other value would leave the dst layout, and later the dst strides, unset
+    {
+        printf("\nInvalid outputFormatToggle! Please input outputFormatToggle = 0 / 1\n");
+        return -1;
+    }
+
+    // Set src/dst data types in tensor descriptors and append the matching suffix to funcName
+
+    switch (ip_bitDepth) // values outside 0..6 leave funcName and both data types untouched
+    {
+    case 0:
+        strcat(funcName, "_u8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::U8;
+        break;
+    case 1:
+        strcat(funcName, "_f16_");
+        srcDescPtr->dataType = RpptDataType::F16;
+        dstDescPtr->dataType = RpptDataType::F16;
+        break;
+    case 2:
+        strcat(funcName, "_f32_");
+        srcDescPtr->dataType = RpptDataType::F32;
+        dstDescPtr->dataType = RpptDataType::F32;
+        break;
+    case 3:
+        strcat(funcName, "_u8_f16_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F16;
+        break;
+    case 4:
+        strcat(funcName, "_u8_f32_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F32;
+        break;
+    case 5:
+        strcat(funcName, "_i8_");
+        srcDescPtr->dataType = RpptDataType::I8;
+        dstDescPtr->dataType = RpptDataType::I8;
+        break;
+    case 6:
+        strcat(funcName, "_u8_i8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::I8;
+        break;
+    }
+
+    // Other initializations
+
+    int missingFuncFlag = 0; // set to 1 when the requested case / bit depth has no RPP call below
+    int i = 0, j = 0;
+    int maxHeight = 0, maxWidth = 0;
+    int maxDstHeight = 0, maxDstWidth = 0;
+    unsigned long long count = 0;
+    unsigned long long ioBufferSize = 0;
+    unsigned long long oBufferSize = 0;
+    static int noOfImages = 0;
+    Mat image, image_second;
+
+    // String ops on function name
+
+    char func[1000]; // "<funcName><funcType>", used in the log messages
+    snprintf(func, sizeof(func), "%s%s", funcName, funcType);
+    printf("\nRunning %s...", func);
+
+    // Source folder paths with a trailing "/" so that file names can be appended directly;
+    // snprintf bounds the copies because the folder names come straight from the command line
+    char src1[1000];
+    snprintf(src1, sizeof(src1), "%s/", src);
+
+    char src1_second[1000];
+    snprintf(src1_second, sizeof(src1_second), "%s/", src_second);
+
+    // dst becomes "<dst>/<funcName><funcType>"
+    strcat(funcName, funcType);
+    strcat(dst, "/");
+    strcat(dst, funcName);
+
+    // Get number of images (every directory entry other than "." and ".." counts as one image)
+    struct dirent *de;
+    DIR *dr = opendir(src);
+    if (dr == NULL) // readdir(NULL) below would crash on a missing or unreadable folder
+    {
+        printf("\nUnable to open src1 folder %s\n", src);
+        return -1;
+    }
+    while ((de = readdir(dr)) != NULL)
+        noOfImages += (strcmp(de->d_name, ".") != 0 && strcmp(de->d_name, "..") != 0) ? 1 : 0;
+    closedir(dr);
+
+    // Initialize ROI tensors for src/dst (one zero-filled RpptROI per image)
+
+    RpptROI *roiTensorPtrSrc = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+    RpptROI *roiTensorPtrDst = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+
+    // Set ROI tensors types for src/dst (both start out as XYWH)
+
+    RpptRoiType roiTypeSrc, roiTypeDst;
+    roiTypeSrc = RpptRoiType::XYWH;
+    roiTypeDst = RpptRoiType::XYWH;
+
+    // Set maxHeight, maxWidth and ROIs for src/dst: a second pass over the src folder records each
+    // file name, reads the image, and uses its full extent as both the src and the dst ROI
+    const int images = noOfImages;
+    char imageNames[images][1000];
+
+    DIR *dr1 = opendir(src);
+    while ((de = readdir(dr1)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        strcpy(imageNames[count], de->d_name); // count is the index of the image being recorded
+        char temp[1000];
+        strcpy(temp, src1);
+        strcat(temp, imageNames[count]);
+
+        image = imread(temp, 1);
+
+        roiTensorPtrSrc[count].xywhROI.xy.x = 0;
+        roiTensorPtrSrc[count].xywhROI.xy.y = 0;
+        roiTensorPtrSrc[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrSrc[count].xywhROI.roiHeight = image.rows;
+
+        roiTensorPtrDst[count].xywhROI.xy.x = 0;
+        roiTensorPtrDst[count].xywhROI.xy.y = 0;
+        roiTensorPtrDst[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrDst[count].xywhROI.roiHeight = image.rows;
+
+        maxHeight = RPPMAX2(maxHeight, roiTensorPtrSrc[count].xywhROI.roiHeight);
+        maxWidth = RPPMAX2(maxWidth, roiTensorPtrSrc[count].xywhROI.roiWidth);
+        maxDstHeight = RPPMAX2(maxDstHeight, roiTensorPtrDst[count].xywhROI.roiHeight);
+        maxDstWidth = RPPMAX2(maxDstWidth, roiTensorPtrDst[count].xywhROI.roiWidth);
+
+        count++;
+    }
+    closedir(dr1);
+
+    // Set numDims, offset, n/c/h/w values for src/dst (h/w are the largest image extents found above)
+
+    srcDescPtr->numDims = 4;
+    dstDescPtr->numDims = 4;
+
+    srcDescPtr->offset = 0;
+    dstDescPtr->offset = 0;
+
+    srcDescPtr->n = noOfImages;
+    srcDescPtr->h = maxHeight;
+    srcDescPtr->w = maxWidth;
+    srcDescPtr->c = ip_channel;
+
+    dstDescPtr->n = noOfImages;
+    dstDescPtr->h = maxDstHeight;
+    dstDescPtr->w = maxDstWidth;
+    dstDescPtr->c = ip_channel;
+
+    // Optionally set w stride as a multiple of 8 for src/dst
+    // (rounds down to a multiple of 8, then adds 8, so w grows even when it already is a multiple of 8)
+    srcDescPtr->w = ((srcDescPtr->w / 8) * 8) + 8;
+    dstDescPtr->w = ((dstDescPtr->w / 8) * 8) + 8;
+
+    // Set n/c/h/w strides for src/dst (src is packed: channel values of one pixel are adjacent)
+
+    srcDescPtr->strides.nStride = ip_channel * srcDescPtr->w * srcDescPtr->h;
+    srcDescPtr->strides.hStride = ip_channel * srcDescPtr->w;
+    srcDescPtr->strides.wStride = ip_channel;
+    srcDescPtr->strides.cStride = 1;
+
+    if (dstDescPtr->layout == RpptLayout::NHWC)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = ip_channel * dstDescPtr->w;
+        dstDescPtr->strides.wStride = ip_channel;
+        dstDescPtr->strides.cStride = 1;
+    }
+    else if (dstDescPtr->layout == RpptLayout::NCHW)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.cStride = dstDescPtr->w * dstDescPtr->h; // one full w x h plane per channel
+        dstDescPtr->strides.hStride = dstDescPtr->w;
+        dstDescPtr->strides.wStride = 1;
+    }
+
+    // Set buffer sizes for src/dst, in elements (h * padded w * channels * images), computed in 64 bits
+
+    ioBufferSize = (unsigned long long)srcDescPtr->h * (unsigned long long)srcDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+    oBufferSize = (unsigned long long)dstDescPtr->h * (unsigned long long)dstDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+
+    // Initialize host buffers for src/dst (zero-filled; inputs hold ioBufferSize elements, outputs oBufferSize)
+
+    Rpp8u *input = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *input_second = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *output = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u));
+
+    Rpp16f *inputf16 = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f));
+    Rpp16f *inputf16_second = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f));
+    Rpp16f *outputf16 = (Rpp16f *)calloc(oBufferSize, sizeof(Rpp16f)); // read back over oBufferSize elements
+
+    Rpp32f *inputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+    Rpp32f *inputf32_second = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+    Rpp32f *outputf32 = (Rpp32f *)calloc(oBufferSize, sizeof(Rpp32f)); // read back over oBufferSize elements
+
+    Rpp8s *inputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+    Rpp8s *inputi8_second = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+    Rpp8s *outputi8 = (Rpp8s *)calloc(oBufferSize, sizeof(Rpp8s)); // read back over oBufferSize elements
+
+    // Set 8u host buffers for src/dst: every image is copied row by row into its padded slot
+    // of the batch buffer (slot i starts at i * nStride, consecutive rows are hStride apart)
+    DIR *dr2 = opendir(src);
+    i = 0;
+
+    while ((de = readdir(dr2)) != NULL)
+    {
+        Rpp8u *input_temp, *input_second_temp;
+        input_temp = input + (i * srcDescPtr->strides.nStride);
+        input_second_temp = input_second + (i * srcDescPtr->strides.nStride);
+
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+
+        char temp[1000];
+        strcpy(temp, src1);
+        strcat(temp, de->d_name);
+
+        char temp_second[1000];
+        strcpy(temp_second, src1_second);
+        strcat(temp_second, de->d_name);
+
+        image = imread(temp, 1);
+        image_second = imread(temp_second, 1);
+        // The row copies below read roiHeight x roiWidth pixels from both images, so the second input must exist and match
+        if (!image.empty() && (image_second.empty() || image_second.rows != image.rows || image_second.cols != image.cols))
+        {
+            printf("\nUnable to read a second input of matching size: %s\n", temp_second);
+            return -1;
+        }
+        Rpp8u *ip_image = image.data;
+        Rpp8u *ip_image_second = image_second.data;
+        Rpp32u elementsInRow = roiTensorPtrSrc[i].xywhROI.roiWidth * ip_channel;
+        for (j = 0; j < roiTensorPtrSrc[i].xywhROI.roiHeight; j++)
+        {
+            memcpy(input_temp, ip_image, elementsInRow * sizeof (Rpp8u));
+            memcpy(input_second_temp, ip_image_second, elementsInRow * sizeof (Rpp8u));
+            ip_image += elementsInRow;
+            ip_image_second += elementsInRow;
+            input_temp += srcDescPtr->strides.hStride;
+            input_second_temp += srcDescPtr->strides.hStride;
+        }
+        i++;
+    }
+    closedir(dr2);
+
+    // Convert inputs to test various other bit depths (f16/f32: u8 / 255, i8: u8 - 128)
+
+    if (ip_bitDepth == 1)
+    {
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp16f *inputf16Temp, *inputf16_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputf16Temp = inputf16;
+        inputf16_secondTemp = inputf16_second;
+
+        for (unsigned long long i = 0; i < ioBufferSize; i++) // counter matches the 64-bit buffer size
+        {
+            *inputf16Temp = ((Rpp16f)*inputTemp) / 255.0;
+            *inputf16_secondTemp = ((Rpp16f)*input_secondTemp) / 255.0;
+            inputTemp++;
+            inputf16Temp++;
+            input_secondTemp++;
+            inputf16_secondTemp++;
+        }
+    }
+    else if (ip_bitDepth == 2)
+    {
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp32f *inputf32Temp, *inputf32_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputf32Temp = inputf32;
+        inputf32_secondTemp = inputf32_second;
+
+        for (unsigned long long i = 0; i < ioBufferSize; i++) // counter matches the 64-bit buffer size
+        {
+            *inputf32Temp = ((Rpp32f)*inputTemp) / 255.0;
+            *inputf32_secondTemp = ((Rpp32f)*input_secondTemp) / 255.0;
+            inputTemp++;
+            inputf32Temp++;
+            input_secondTemp++;
+            inputf32_secondTemp++;
+        }
+    }
+    else if (ip_bitDepth == 5)
+    {
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp8s *inputi8Temp, *inputi8_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputi8Temp = inputi8;
+        inputi8_secondTemp = inputi8_second;
+
+        for (unsigned long long i = 0; i < ioBufferSize; i++) // counter matches the 64-bit buffer size
+        {
+            *inputi8Temp = (Rpp8s) (((Rpp32s) *inputTemp) - 128);
+            *inputi8_secondTemp = (Rpp8s) (((Rpp32s) *input_secondTemp) - 128);
+            inputTemp++;
+            inputi8Temp++;
+            input_secondTemp++;
+            inputi8_secondTemp++;
+        }
+    }
+
+    // Run case-wise RPP API and measure time (clock() and omp_get_wtime() bracket only the RPP call)
+
+    rppHandle_t handle;
+    rppCreateWithBatchSize(&handle, noOfImages);
+    clock_t start, end;
+    double start_omp, end_omp;
+    double cpu_time_used, omp_time_used;
+
+    string test_case_name;
+
+    switch (test_case)
+    {
+    case 0:
+    {
+        test_case_name = "brightness";
+
+        Rpp32f alpha[images];
+        Rpp32f beta[images];
+        for (i = 0; i < images; i++)
+        {
+            alpha[i] = 1.75;
+            beta[i] = 50;
+
+            // xywhROI override sample
+            // roiTensorPtrSrc[i].xywhROI.xy.x = 0;
+            // roiTensorPtrSrc[i].xywhROI.xy.y = 0;
+            // roiTensorPtrSrc[i].xywhROI.roiWidth = 100;
+            // roiTensorPtrSrc[i].xywhROI.roiHeight = 180;
+
+            // ltrbROI override sample
+            // roiTensorPtrSrc[i].ltrbROI.lt.x = 50;
+            // roiTensorPtrSrc[i].ltrbROI.lt.y = 50;
+            // roiTensorPtrSrc[i].ltrbROI.rb.x = 199;
+            // roiTensorPtrSrc[i].ltrbROI.rb.y = 149;
+        }
+
+        // Change RpptRoiType for ltrbROI override sample
+        // roiTypeSrc = RpptRoiType::LTRB;
+        // roiTypeDst = RpptRoiType::LTRB;
+
+        start_omp = omp_get_wtime();
+        start = clock();
+        if (ip_bitDepth == 0)
+            rppt_brightness_host(input, srcDescPtr, output, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+        else if (ip_bitDepth == 1)
+            rppt_brightness_host(inputf16, srcDescPtr, outputf16, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+        else if (ip_bitDepth == 2)
+            rppt_brightness_host(inputf32, srcDescPtr, outputf32, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+        else if (ip_bitDepth == 3) // bit depths 3, 4 and 6 (u8 -> f16 / f32 / i8) have no brightness call here
+            missingFuncFlag = 1;
+        else if (ip_bitDepth == 4)
+            missingFuncFlag = 1;
+        else if (ip_bitDepth == 5)
+            rppt_brightness_host(inputi8, srcDescPtr, outputi8, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+        else if (ip_bitDepth == 6)
+            missingFuncFlag = 1;
+        else
+            missingFuncFlag = 1;
+        end = clock();
+        end_omp = omp_get_wtime();
+
+        break;
+    }
+    default: // any case number other than 0 is reported as missing
+        missingFuncFlag = 1;
+        break;
+    }
+
+    if (missingFuncFlag == 1)
+    {
+        printf("\nThe functionality %s doesn't yet exist in RPP\n", func);
+        return -1;
+    }
+
+    // Display measured times (labelled "Tensor" so they are not mistaken for the BatchPD test output)
+
+    cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
+    omp_time_used = end_omp - start_omp;
+    cout << "\nCPU Time - Tensor : " << cpu_time_used;
+    cout << "\nOMP Time - Tensor : " << omp_time_used;
+    printf("\n");
+
+    // Reconvert other bit depths to 8u for output display purposes; every raw output element is also written to "<ip_bitDepth>.csv"
+
+    string fileName = std::to_string(ip_bitDepth);
+    ofstream outputFile (fileName + ".csv");
+
+    if (ip_bitDepth == 0)
+    {
+        Rpp8u *outputTemp;
+        outputTemp = output;
+
+        if (outputFile.is_open())
+        {
+            for (unsigned long long i = 0; i < oBufferSize; i++) // counter matches the 64-bit buffer size
+            {
+                outputFile << (Rpp32u) *outputTemp << ",";
+                outputTemp++;
+            }
+            outputFile.close();
+        }
+        else
+            cout << "Unable to open file!";
+
+    }
+    else if ((ip_bitDepth == 1) || (ip_bitDepth == 3))
+    {
+        Rpp8u *outputTemp;
+        outputTemp = output;
+        Rpp16f *outputf16Temp;
+        outputf16Temp = outputf16;
+
+        if (outputFile.is_open())
+        {
+            for (unsigned long long i = 0; i < oBufferSize; i++) // counter matches the 64-bit buffer size
+            {
+                outputFile << *outputf16Temp << ",";
+                *outputTemp = (Rpp8u)RPPPIXELCHECK(*outputf16Temp * 255.0);
+                outputf16Temp++;
+                outputTemp++;
+            }
+            outputFile.close();
+        }
+        else
+            cout << "Unable to open file!";
+
+    }
+    else if ((ip_bitDepth == 2) || (ip_bitDepth == 4))
+    {
+        Rpp8u *outputTemp;
+        outputTemp = output;
+        Rpp32f *outputf32Temp;
+        outputf32Temp = outputf32;
+
+        if (outputFile.is_open())
+        {
+            for (unsigned long long i = 0; i < oBufferSize; i++) // counter matches the 64-bit buffer size
+            {
+                outputFile << *outputf32Temp << ",";
+                *outputTemp = (Rpp8u)RPPPIXELCHECK(*outputf32Temp * 255.0);
+                outputf32Temp++;
+                outputTemp++;
+            }
+            outputFile.close();
+        }
+        else
+            cout << "Unable to open file!";
+    }
+    else if ((ip_bitDepth == 5) || (ip_bitDepth == 6))
+    {
+        Rpp8u *outputTemp;
+        outputTemp = output;
+        Rpp8s *outputi8Temp;
+        outputi8Temp = outputi8;
+
+        if (outputFile.is_open())
+        {
+            for (unsigned long long i = 0; i < oBufferSize; i++) // counter matches the 64-bit buffer size
+            {
+                outputFile << (Rpp32s) *outputi8Temp << ",";
+                *outputTemp = (Rpp8u) RPPPIXELCHECK(((Rpp32s) *outputi8Temp) + 128);
+                outputi8Temp++;
+                outputTemp++;
+            }
+            outputFile.close();
+        }
+        else
+            cout << "Unable to open file!";
+    }
+
+    // Calculate exact dstROI in XYWH format for OpenCV dump (the result is stored back into roiTensorPtrSrc)
+
+    if (roiTypeSrc == RpptRoiType::LTRB)
+    {
+        for (int i = 0; i < dstDescPtr->n; i++)
+        {
+            int ltX = roiTensorPtrSrc[i].ltrbROI.lt.x;
+            int ltY = roiTensorPtrSrc[i].ltrbROI.lt.y;
+            int rbX = roiTensorPtrSrc[i].ltrbROI.rb.x;
+            int rbY = roiTensorPtrSrc[i].ltrbROI.rb.y;
+
+            roiTensorPtrSrc[i].xywhROI.xy.x = ltX;
+            roiTensorPtrSrc[i].xywhROI.xy.y = ltY;
+            roiTensorPtrSrc[i].xywhROI.roiWidth = rbX - ltX + 1; // + 1: the rb corner is counted as part of the ROI
+            roiTensorPtrSrc[i].xywhROI.roiHeight = rbY - ltY + 1;
+        }
+    }
+
+    RpptROI roiDefault; // ROI spanning the whole padded dst image, used as the clamp limit below
+    RpptROIPtr roiPtrDefault;
+    roiPtrDefault = &roiDefault;
+    roiPtrDefault->xywhROI.xy.x = 0;
+    roiPtrDefault->xywhROI.xy.y = 0;
+    roiPtrDefault->xywhROI.roiWidth = dstDescPtr->w;
+    roiPtrDefault->xywhROI.roiHeight = dstDescPtr->h;
+
+    for (int i = 0; i < dstDescPtr->n; i++) // width/height are limited using the not-yet-clamped x/y, then x/y are raised to at least 0
+    {
+        roiTensorPtrSrc[i].xywhROI.roiWidth = RPPMIN2(roiPtrDefault->xywhROI.roiWidth - roiTensorPtrSrc[i].xywhROI.xy.x, roiTensorPtrSrc[i].xywhROI.roiWidth);
+        roiTensorPtrSrc[i].xywhROI.roiHeight = RPPMIN2(roiPtrDefault->xywhROI.roiHeight - roiTensorPtrSrc[i].xywhROI.xy.y, roiTensorPtrSrc[i].xywhROI.roiHeight);
+        roiTensorPtrSrc[i].xywhROI.xy.x = RPPMAX2(roiPtrDefault->xywhROI.xy.x, roiTensorPtrSrc[i].xywhROI.xy.x);
+        roiTensorPtrSrc[i].xywhROI.xy.y = RPPMAX2(roiPtrDefault->xywhROI.xy.y, roiTensorPtrSrc[i].xywhROI.xy.y);
+    }
+
+    // Convert any PLN3 outputs to the corresponding PKD3 version for OpenCV dump.
+    // Only the u8 output buffer is converted; it is the buffer the dump below reads.
+
+    if (dstDescPtr->layout == RpptLayout::NCHW)
+    {
+        // Work from a snapshot of the planar result so that output can be rewritten in place
+        Rpp8u *planarSnapshot = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u));
+        memcpy(planarSnapshot, output, oBufferSize * sizeof(Rpp8u));
+
+        Rpp8u *packedCursor = output;        // next packed element to write
+        Rpp8u *planarImage = planarSnapshot; // first plane of the image being converted
+
+        for (int imageIdx = 0; imageIdx < dstDescPtr->n; imageIdx++)
+        {
+            // The three channel planes of one image lie cStride elements apart
+            Rpp8u *plane0 = planarImage;
+            Rpp8u *plane1 = plane0 + dstDescPtr->strides.cStride;
+            Rpp8u *plane2 = plane1 + dstDescPtr->strides.cStride;
+
+            // Visit all h x w positions of the padded image
+            for (int row = 0; row < dstDescPtr->h; row++)
+            {
+                for (int col = 0; col < dstDescPtr->w; col++)
+                {
+                    // One packed pixel: the current element of each plane, in plane order
+                    *packedCursor++ = *plane0++;
+                    *packedCursor++ = *plane1++;
+                    *packedCursor++ = *plane2++;
+                }
+            }
+
+            // Step to the next image in the planar snapshot
+            planarImage += dstDescPtr->strides.nStride;
+        }
+
+        // Every image has been interleaved; the snapshot is no longer needed
+        free(planarSnapshot);
+    }
+
+    // The RPP call has completed; release the host handle
+    rppDestroyHost(handle);
+
+    // OpenCV dump: each image's ROI is copied out of the padded, packed output buffer into a tight buffer and written to dst
+
+    mkdir(dst, 0700);
+    strcat(dst, "/");
+    count = 0; // element offset of the current image inside output
+
+    for (j = 0; j < dstDescPtr->n; j++)
+    {
+        int height = roiTensorPtrSrc[j].xywhROI.roiHeight;
+        int width = roiTensorPtrSrc[j].xywhROI.roiWidth;
+
+        int op_size = height * width * ip_channel;
+        Rpp8u *temp_output = (Rpp8u *)calloc(op_size, sizeof(Rpp8u));
+        Rpp8u *temp_output_row;
+        temp_output_row = temp_output;
+        Rpp32u elementsInRow = width * ip_channel;
+        Rpp8u *output_row = output + count;
+
+        for (int k = 0; k < height; k++)
+        {
+            memcpy(temp_output_row, (output_row), elementsInRow * sizeof (Rpp8u));
+            temp_output_row += elementsInRow;
+            output_row += ip_channel * dstDescPtr->w; // output is packed by now, so a row is c * padded dst width (not the src hStride)
+        }
+        count += dstDescPtr->strides.nStride;
+
+        char temp[1000];
+        strcpy(temp, dst);
+        strcat(temp, imageNames[j]);
+
+        Mat mat_op_image;
+        mat_op_image = Mat(height, width, CV_8UC3, temp_output);
+        imwrite(temp, mat_op_image);
+
+        free(temp_output);
+    }
+
+    // Free memory: the two ROI tensors first, then the u8 / f16 / f32 / i8 host buffers,
+    // released in the order they are listed here
+
+    void *hostAllocations[] = {
+        roiTensorPtrSrc, roiTensorPtrDst,
+        input, input_second, output,
+        inputf16, inputf16_second, outputf16,
+        inputf32, inputf32_second, outputf32,
+        inputi8, inputi8_second, outputi8
+    };
+    const size_t hostAllocationCount = sizeof(hostAllocations) / sizeof(hostAllocations[0]);
+
+    for (size_t allocationIdx = 0; allocationIdx < hostAllocationCount; allocationIdx++)
+    {
+        free(hostAllocations[allocationIdx]);
+    }
+
+    return 0;
+}
diff --git a/utilities/rpp-unittests/HOST_NEW/Tensor_host_pln1.cpp b/utilities/rpp-unittests/HOST_NEW/Tensor_host_pln1.cpp
new file mode 100644
index 000000000..66019ecf4
--- /dev/null
+++ b/utilities/rpp-unittests/HOST_NEW/Tensor_host_pln1.cpp
@@ -0,0 +1,669 @@
+#include <stdio.h>
+#include <dirent.h>
+#include <string.h>
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/opencv.hpp>
+#include <iostream>
+#include "/opt/rocm/rpp/include/rpp.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <time.h>
+#include <omp.h>
+#include <half.hpp>
+#include <fstream>
+#include "helpers/testSuite_helper.hpp"
+
+using namespace cv;
+using namespace std;
+using half_float::half;
+
+typedef half Rpp16f;
+
+#define RPPPIXELCHECK(pixel) (((pixel) < (Rpp32f)0) ? ((Rpp32f)0) : (((pixel) < (Rpp32f)255) ? (pixel) : ((Rpp32f)255))) // clamp to [0, 255]; the argument is evaluated more than once
+#define RPPMAX2(a,b) (((a) > (b)) ? (a) : (b)) // larger of a and b; arguments are evaluated more than once
+#define RPPMIN2(a,b) (((a) < (b)) ? (a) : (b)) // smaller of a and b; arguments are evaluated more than once
+
+int main(int argc, char **argv)
+{
+    // Handle inputs
+
+    const int MIN_ARG_COUNT = 8;
+
+    if (argc < MIN_ARG_COUNT)
+    {
+        printf("\nImproper Usage! Needs all arguments!\n");
+        printf("\nUsage: ./Tensor_host_pln1 <src1 folder> <src2 folder (place same as src1 folder for single image functionalities)> <dst folder> <u8 = 0 / f16 = 1 / f32 = 2 / u8->f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> <outputFormatToggle (pkd->pkd = 0 / pkd->pln = 1)> <case number = 0:81> <verbosity = 0/1>\n");
+        return -1;
+    }
+    if (atoi(argv[5]) != 0)
+    {
+        printf("\nPLN1 cases don't have outputFormatToggle! Please input outputFormatToggle = 0\n");
+        return -1;
+    }
+
+    if (atoi(argv[7]) == 1)
+    {
+        printf("\nInputs for this test case are:");
+        printf("\nsrc1 = %s", argv[1]);
+        printf("\nsrc2 = %s", argv[2]);
+        printf("\ndst = %s", argv[3]);
+        printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[4]);
+        printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[5]);
+        printf("\ncase number (1:7) = %s", argv[6]);
+    }
+
+    char *src = argv[1];
+    char *src_second = argv[2];
+    char *dst = argv[3];
+    int ip_bitDepth = atoi(argv[4]);
+    unsigned int outputFormatToggle = atoi(argv[5]);
+    int test_case = atoi(argv[6]);
+
+    int ip_channel = 1;
+
+    // Set case names
+
+    char funcType[1000] = {"Tensor_HOST_PLN1_toPLN1"};
+
+    char funcName[1000];
+    switch (test_case)
+    {
+    case 0:
+        strcpy(funcName, "brightness");
+        break;
+    }
+
+    // Initialize tensor descriptors
+
+    RpptDesc srcDesc, dstDesc;
+    RpptDescPtr srcDescPtr, dstDescPtr;
+    srcDescPtr = &srcDesc;
+    dstDescPtr = &dstDesc;
+
+    // Set src/dst layouts in tensor descriptors
+
+    srcDescPtr->layout = RpptLayout::NCHW;
+    dstDescPtr->layout = RpptLayout::NCHW;
+
+    // Set src/dst data types in tensor descriptors
+
+    if (ip_bitDepth == 0)
+    {
+        strcat(funcName, "_u8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::U8;
+    }
+    else if (ip_bitDepth == 1)
+    {
+        strcat(funcName, "_f16_");
+        srcDescPtr->dataType = RpptDataType::F16;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 2)
+    {
+        strcat(funcName, "_f32_");
+        srcDescPtr->dataType = RpptDataType::F32;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 3)
+    {
+        strcat(funcName, "_u8_f16_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 4)
+    {
+        strcat(funcName, "_u8_f32_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 5)
+    {
+        strcat(funcName, "_i8_");
+        srcDescPtr->dataType = RpptDataType::I8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+    else if (ip_bitDepth == 6)
+    {
+        strcat(funcName, "_u8_i8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+
+    // Other initializations
+
+    int missingFuncFlag = 0;
+    int i = 0, j = 0;
+    int maxHeight = 0, maxWidth = 0;
+    int maxDstHeight = 0, maxDstWidth = 0;
+    unsigned long long count = 0;
+    unsigned long long ioBufferSize = 0;
+    unsigned long long oBufferSize = 0;
+    static int noOfImages = 0;
+    Mat image, image_second;
+
+    // String ops on function name
+
+    char func[1000];
+    strcpy(func, funcName);
+    strcat(func, funcType);
+    printf("\nRunning %s...", func);
+
+    char src1[1000];
+    strcpy(src1, src);
+    strcat(src1, "/");
+
+    char src1_second[1000];
+    strcpy(src1_second, src_second);
+    strcat(src1_second, "/");
+
+    strcat(funcName, funcType);
+    strcat(dst, "/");
+    strcat(dst, funcName);
+
+    // Get number of images
+
+    struct dirent *de;
+    DIR *dr = opendir(src);
+    while ((de = readdir(dr)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        noOfImages += 1;
+    }
+    closedir(dr);
+
+    // Initialize ROI tensors for src/dst
+
+    RpptROI *roiTensorPtrSrc = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+    RpptROI *roiTensorPtrDst = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+
+    // Set ROI tensors types for src/dst
+
+    RpptRoiType roiTypeSrc, roiTypeDst;
+    roiTypeSrc = RpptRoiType::XYWH;
+    roiTypeDst = RpptRoiType::XYWH;
+
+    // Set maxHeight, maxWidth and ROIs for src/dst
+
+    const int images = noOfImages;
+    char imageNames[images][1000];
+
+    DIR *dr1 = opendir(src);
+    while ((de = readdir(dr1)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        strcpy(imageNames[count], de->d_name);
+        char temp[1000];
+        strcpy(temp, src1);
+        strcat(temp, imageNames[count]);
+
+        image = imread(temp, 0);
+
+        roiTensorPtrSrc[count].xywhROI.xy.x = 0;
+        roiTensorPtrSrc[count].xywhROI.xy.y = 0;
+        roiTensorPtrSrc[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrSrc[count].xywhROI.roiHeight = image.rows;
+
+        roiTensorPtrDst[count].xywhROI.xy.x = 0;
+        roiTensorPtrDst[count].xywhROI.xy.y = 0;
+        roiTensorPtrDst[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrDst[count].xywhROI.roiHeight = image.rows;
+
+        maxHeight = RPPMAX2(maxHeight, roiTensorPtrSrc[count].xywhROI.roiHeight);
+        maxWidth = RPPMAX2(maxWidth, roiTensorPtrSrc[count].xywhROI.roiWidth);
+        maxDstHeight = RPPMAX2(maxDstHeight, roiTensorPtrDst[count].xywhROI.roiHeight);
+        maxDstWidth = RPPMAX2(maxDstWidth, roiTensorPtrDst[count].xywhROI.roiWidth);
+
+        count++;
+    }
+    closedir(dr1);
+
+    // Set numDims, offset, n/c/h/w values for src/dst
+
+    srcDescPtr->numDims = 4;
+    dstDescPtr->numDims = 4;
+
+    srcDescPtr->offset = 0;
+    dstDescPtr->offset = 0;
+
+    srcDescPtr->n = noOfImages;
+    srcDescPtr->c = ip_channel;
+    srcDescPtr->h = maxHeight;
+    srcDescPtr->w = maxWidth;
+
+    dstDescPtr->n = noOfImages;
+    dstDescPtr->c = ip_channel;
+    dstDescPtr->h = maxDstHeight;
+    dstDescPtr->w = maxDstWidth;
+
+    // Optionally set w stride as a multiple of 8 for src/dst
+
+    srcDescPtr->w = ((srcDescPtr->w / 8) * 8) + 8;
+    dstDescPtr->w = ((dstDescPtr->w / 8) * 8) + 8;
+
+    // Set n/c/h/w strides for src/dst
+
+    srcDescPtr->strides.nStride = ip_channel * srcDescPtr->w * srcDescPtr->h;
+    srcDescPtr->strides.cStride = srcDescPtr->w * srcDescPtr->h;
+    srcDescPtr->strides.hStride = srcDescPtr->w;
+    srcDescPtr->strides.wStride = 1;
+
+    if (dstDescPtr->layout == RpptLayout::NHWC)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = ip_channel * dstDescPtr->w;
+        dstDescPtr->strides.wStride = ip_channel;
+        dstDescPtr->strides.cStride = 1;
+    }
+    else if (dstDescPtr->layout == RpptLayout::NCHW)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.cStride = dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = dstDescPtr->w;
+        dstDescPtr->strides.wStride = 1;
+    }
+
+    // Set buffer sizes for src/dst
+
+    ioBufferSize = (unsigned long long)srcDescPtr->h * (unsigned long long)srcDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+    oBufferSize = (unsigned long long)dstDescPtr->h * (unsigned long long)dstDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+
+    // Initialize host buffers for src/dst
+
+    Rpp8u *input = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *input_second = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *output = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u));
+
+    Rpp16f *inputf16 = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f));
+    Rpp16f *inputf16_second = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f));
+    Rpp16f *outputf16 = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f));
+
+    Rpp32f *inputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+    Rpp32f *inputf32_second = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+    Rpp32f *outputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+
+    Rpp8s *inputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+    Rpp8s *inputi8_second = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+    Rpp8s *outputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+
+    // Set 8u host buffers for src/dst
+
+    DIR *dr2 = opendir(src);
+    DIR *dr2_second = opendir(src_second);
+    count = 0;
+    i = 0;
+
+    Rpp32u elementsInRowMax = srcDescPtr->w * ip_channel;
+
+    while ((de = readdir(dr2)) != NULL)
+    {
+        Rpp8u *input_temp, *input_second_temp;
+        input_temp = input + (i * srcDescPtr->strides.nStride);
+        input_second_temp = input_second + (i * srcDescPtr->strides.nStride);
+
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+
+        char temp[1000];
+        strcpy(temp, src1);
+        strcat(temp, de->d_name);
+
+        char temp_second[1000];
+        strcpy(temp_second, src1_second);
+        strcat(temp_second, de->d_name);
+
+        image = imread(temp, 0);
+        image_second = imread(temp_second, 0);
+
+        Rpp8u *ip_image = image.data;
+        Rpp8u *ip_image_second = image_second.data;
+
+        Rpp32u elementsInRow = roiTensorPtrSrc[i].xywhROI.roiWidth * ip_channel;
+
+        for (j = 0; j < roiTensorPtrSrc[i].xywhROI.roiHeight; j++)
+        {
+            memcpy(input_temp, ip_image, elementsInRow * sizeof (Rpp8u));
+            memcpy(input_second_temp, ip_image_second, elementsInRow * sizeof (Rpp8u));
+            ip_image += elementsInRow;
+            ip_image_second += elementsInRow;
+            input_temp += elementsInRowMax;
+            input_second_temp += elementsInRowMax;
+        }
+        i++;
+        count += srcDescPtr->strides.nStride;
+    }
+    closedir(dr2);
+
+    // Convert inputs to test various other bit depths
+
+    if (ip_bitDepth == 1)
+    {
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp16f *inputf16Temp, *inputf16_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputf16Temp = inputf16;
+        inputf16_secondTemp = inputf16_second;
+
+        for (int i = 0; i < ioBufferSize; i++)
+        {
+            *inputf16Temp = ((Rpp16f)*inputTemp) / 255.0;
+            *inputf16_secondTemp = ((Rpp16f)*input_secondTemp) / 255.0;
+            inputTemp++;
+            inputf16Temp++;
+            input_secondTemp++;
+            inputf16_secondTemp++;
+        }
+    }
+    else if (ip_bitDepth == 2)
+    {
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp32f *inputf32Temp, *inputf32_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputf32Temp = inputf32;
+        inputf32_secondTemp = inputf32_second;
+
+        for (int i = 0; i < ioBufferSize; i++)
+        {
+            *inputf32Temp = ((Rpp32f)*inputTemp) / 255.0;
+            *inputf32_secondTemp = ((Rpp32f)*input_secondTemp) / 255.0;
+            inputTemp++;
+            inputf32Temp++;
+            input_secondTemp++;
+            inputf32_secondTemp++;
+        }
+    }
+    else if (ip_bitDepth == 5)
+    {
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp8s *inputi8Temp, *inputi8_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputi8Temp = inputi8;
+        inputi8_secondTemp = inputi8_second;
+
+        for (int i = 0; i < ioBufferSize; i++)
+        {
+            *inputi8Temp = (Rpp8s) (((Rpp32s) *inputTemp) - 128);
+            *inputi8_secondTemp = (Rpp8s) (((Rpp32s) *input_secondTemp) - 128);
+            inputTemp++;
+            inputi8Temp++;
+            input_secondTemp++;
+            inputi8_secondTemp++;
+        }
+    }
+
+    // Run case-wise RPP API and measure time
+
+    rppHandle_t handle;
+    rppCreateWithBatchSize(&handle, noOfImages);
+    clock_t start, end;
+    double start_omp, end_omp;
+    double cpu_time_used, omp_time_used;
+
+    string test_case_name;
+
+    switch (test_case)
+    {
+    case 0:
+    {
+        test_case_name = "brightness";
+
+        Rpp32f alpha[images];
+        Rpp32f beta[images];
+        for (i = 0; i < images; i++)
+        {
+            alpha[i] = 1.75;
+            beta[i] = 50;
+
+            // xywhROI override sample
+            // roiTensorPtrSrc[i].xywhROI.xy.x = 0;
+            // roiTensorPtrSrc[i].xywhROI.xy.y = 0;
+            // roiTensorPtrSrc[i].xywhROI.roiWidth = 100;
+            // roiTensorPtrSrc[i].xywhROI.roiHeight = 180;
+
+            // ltrbROI override sample
+            // roiTensorPtrSrc[i].ltrbROI.lt.x = 50;
+            // roiTensorPtrSrc[i].ltrbROI.lt.y = 50;
+            // roiTensorPtrSrc[i].ltrbROI.rb.x = 199;
+            // roiTensorPtrSrc[i].ltrbROI.rb.y = 149;
+        }
+
+        // Change RpptRoiType for ltrbROI override sample
+        // roiTypeSrc = RpptRoiType::LTRB;
+        // roiTypeDst = RpptRoiType::LTRB;
+
+        start_omp = omp_get_wtime();
+        start = clock();
+        if (ip_bitDepth == 0)
+            rppt_brightness_host(input, srcDescPtr, output, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+        else if (ip_bitDepth == 1)
+            rppt_brightness_host(inputf16, srcDescPtr, outputf16, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+        else if (ip_bitDepth == 2)
+            rppt_brightness_host(inputf32, srcDescPtr, outputf32, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+        else if (ip_bitDepth == 3)
+            missingFuncFlag = 1;
+        else if (ip_bitDepth == 4)
+            missingFuncFlag = 1;
+        else if (ip_bitDepth == 5)
+            rppt_brightness_host(inputi8, srcDescPtr, outputi8, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+        else if (ip_bitDepth == 6)
+            missingFuncFlag = 1;
+        else
+            missingFuncFlag = 1;
+        end = clock();
+        end_omp = omp_get_wtime();
+
+        break;
+    }
+    default:
+        missingFuncFlag = 1;
+        break;
+    }
+
+    if (missingFuncFlag == 1)
+    {
+        printf("\nThe functionality %s doesn't yet exist in RPP\n", func);
+        return -1;
+    }
+
+    // Display measured times
+
+    cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
+    omp_time_used = end_omp - start_omp;
+    cout << "\nCPU Time - BatchPD : " << cpu_time_used;
+    cout << "\nOMP Time - BatchPD : " << omp_time_used;
+    printf("\n");
+
+    // Reconvert other bit depths to 8u for output display purposes
+
+    string fileName = std::to_string(ip_bitDepth);
+    ofstream outputFile (fileName + ".csv");
+
+    if (ip_bitDepth == 0)
+    {
+        Rpp8u *outputTemp;
+        outputTemp = output;
+
+        if (outputFile.is_open())
+        {
+            for (int i = 0; i < oBufferSize; i++)
+            {
+                outputFile << (Rpp32u) *outputTemp << ",";
+                outputTemp++;
+            }
+            outputFile.close();
+        }
+        else
+            cout << "Unable to open file!";
+
+    }
+    else if ((ip_bitDepth == 1) || (ip_bitDepth == 3))
+    {
+        Rpp8u *outputTemp;
+        outputTemp = output;
+        Rpp16f *outputf16Temp;
+        outputf16Temp = outputf16;
+
+        if (outputFile.is_open())
+        {
+            for (int i = 0; i < oBufferSize; i++)
+            {
+                outputFile << *outputf16Temp << ",";
+                *outputTemp = (Rpp8u)RPPPIXELCHECK(*outputf16Temp * 255.0);
+                outputf16Temp++;
+                outputTemp++;
+            }
+            outputFile.close();
+        }
+        else
+            cout << "Unable to open file!";
+
+    }
+    else if ((ip_bitDepth == 2) || (ip_bitDepth == 4))
+    {
+        Rpp8u *outputTemp;
+        outputTemp = output;
+        Rpp32f *outputf32Temp;
+        outputf32Temp = outputf32;
+
+        if (outputFile.is_open())
+        {
+            for (int i = 0; i < oBufferSize; i++)
+            {
+                outputFile << *outputf32Temp << ",";
+                *outputTemp = (Rpp8u)RPPPIXELCHECK(*outputf32Temp * 255.0);
+                outputf32Temp++;
+                outputTemp++;
+            }
+            outputFile.close();
+        }
+        else
+            cout << "Unable to open file!";
+    }
+    else if ((ip_bitDepth == 5) || (ip_bitDepth == 6))
+    {
+        Rpp8u *outputTemp;
+        outputTemp = output;
+        Rpp8s *outputi8Temp;
+        outputi8Temp = outputi8;
+
+        if (outputFile.is_open())
+        {
+            for (int i = 0; i < oBufferSize; i++)
+            {
+                outputFile << (Rpp32s) *outputi8Temp << ",";
+                *outputTemp = (Rpp8u) RPPPIXELCHECK(((Rpp32s) *outputi8Temp) + 128);
+                outputi8Temp++;
+                outputTemp++;
+            }
+            outputFile.close();
+        }
+        else
+            cout << "Unable to open file!";
+    }
+
+    // Calculate exact dstROI in XYWH format for OpenCV dump
+
+    if (roiTypeSrc == RpptRoiType::LTRB)
+    {
+        for (int i = 0; i < dstDescPtr->n; i++)
+        {
+            int ltX = roiTensorPtrSrc[i].ltrbROI.lt.x;
+            int ltY = roiTensorPtrSrc[i].ltrbROI.lt.y;
+            int rbX = roiTensorPtrSrc[i].ltrbROI.rb.x;
+            int rbY = roiTensorPtrSrc[i].ltrbROI.rb.y;
+
+            roiTensorPtrSrc[i].xywhROI.xy.x = ltX;
+            roiTensorPtrSrc[i].xywhROI.xy.y = ltY;
+            roiTensorPtrSrc[i].xywhROI.roiWidth = rbX - ltX + 1;
+            roiTensorPtrSrc[i].xywhROI.roiHeight = rbY - ltY + 1;
+        }
+    }
+
+    RpptROI roiDefault;
+    RpptROIPtr roiPtrDefault;
+    roiPtrDefault = &roiDefault;
+    roiPtrDefault->xywhROI.xy.x = 0;
+    roiPtrDefault->xywhROI.xy.y = 0;
+    roiPtrDefault->xywhROI.roiWidth = dstDescPtr->w;
+    roiPtrDefault->xywhROI.roiHeight = dstDescPtr->h;
+
+    for (int i = 0; i < dstDescPtr->n; i++)
+    {
+        roiTensorPtrSrc[i].xywhROI.roiWidth = RPPMIN2(roiPtrDefault->xywhROI.roiWidth - roiTensorPtrSrc[i].xywhROI.xy.x, roiTensorPtrSrc[i].xywhROI.roiWidth);
+        roiTensorPtrSrc[i].xywhROI.roiHeight = RPPMIN2(roiPtrDefault->xywhROI.roiHeight - roiTensorPtrSrc[i].xywhROI.xy.y, roiTensorPtrSrc[i].xywhROI.roiHeight);
+        roiTensorPtrSrc[i].xywhROI.xy.x = RPPMAX2(roiPtrDefault->xywhROI.xy.x, roiTensorPtrSrc[i].xywhROI.xy.x);
+        roiTensorPtrSrc[i].xywhROI.xy.y = RPPMAX2(roiPtrDefault->xywhROI.xy.y, roiTensorPtrSrc[i].xywhROI.xy.y);
+    }
+
+    rppDestroyHost(handle);
+
+    // OpenCV dump
+
+    mkdir(dst, 0700);
+    strcat(dst, "/");
+    count = 0;
+    elementsInRowMax = dstDescPtr->w * ip_channel;
+
+    for (j = 0; j < dstDescPtr->n; j++)
+    {
+        int height = roiTensorPtrSrc[j].xywhROI.roiHeight;
+        int width = roiTensorPtrSrc[j].xywhROI.roiWidth;
+
+        int op_size = height * width * ip_channel;
+        Rpp8u *temp_output = (Rpp8u *)calloc(op_size, sizeof(Rpp8u));
+        Rpp8u *temp_output_row;
+        temp_output_row = temp_output;
+        Rpp32u elementsInRow = width * ip_channel;
+        Rpp8u *output_row = output + count;
+
+        for (int k = 0; k < height; k++)
+        {
+            memcpy(temp_output_row, (output_row), elementsInRow * sizeof (Rpp8u));
+            temp_output_row += elementsInRow;
+            output_row += elementsInRowMax;
+        }
+        count += dstDescPtr->strides.nStride;
+
+        char temp[1000];
+        strcpy(temp, dst);
+        strcat(temp, imageNames[j]);
+
+        Mat mat_op_image;
+        mat_op_image = Mat(height, width, CV_8UC1, temp_output);
+        imwrite(temp, mat_op_image);
+
+        free(temp_output);
+    }
+
+    // Free memory
+
+    free(roiTensorPtrSrc);
+    free(roiTensorPtrDst);
+    free(input);
+    free(input_second);
+    free(output);
+    free(inputf16);
+    free(inputf16_second);
+    free(outputf16);
+    free(inputf32);
+    free(inputf32_second);
+    free(outputf32);
+    free(inputi8);
+    free(inputi8_second);
+    free(outputi8);
+
+    return 0;
+}
diff --git a/utilities/rpp-unittests/HOST_NEW/Tensor_host_pln3.cpp b/utilities/rpp-unittests/HOST_NEW/Tensor_host_pln3.cpp
new file mode 100644
index 000000000..13435e7fa
--- /dev/null
+++ b/utilities/rpp-unittests/HOST_NEW/Tensor_host_pln3.cpp
@@ -0,0 +1,787 @@
+#include <stdio.h>
+#include <dirent.h>
+#include <string.h>
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/opencv.hpp>
+#include <iostream>
+#include "/opt/rocm/rpp/include/rpp.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <time.h>
+#include <omp.h>
+#include <half.hpp>
+#include <fstream>
+
+using namespace cv;
+using namespace std;
+using half_float::half;
+
+typedef half Rpp16f;
+
+#define RPPPIXELCHECK(pixel) (((pixel) < (Rpp32f)0) ? ((Rpp32f)0) : (((pixel) < (Rpp32f)255) ? (pixel) : ((Rpp32f)255)))    // Clamp pixel to [0, 255]; whole expansion is parenthesized so a cast applied to the macro binds to the clamped result, not to the first comparison
+#define RPPMAX2(a,b) (((a) > (b)) ? (a) : (b))    // Larger of a and b; arguments are parenthesized but may be evaluated twice, so pass side-effect-free expressions
+#define RPPMIN2(a,b) (((a) < (b)) ? (a) : (b))    // Smaller of a and b; arguments are parenthesized but may be evaluated twice, so pass side-effect-free expressions
+
+int main(int argc, char **argv)
+{
+    // Handle inputs
+
+    const int MIN_ARG_COUNT = 8;
+
+    if (argc < MIN_ARG_COUNT)
+    {
+        printf("\nImproper Usage! Needs all arguments!\n");
+        printf("\nUsage: ./Tensor_host_pln3 <src1 folder> <src2 folder (place same as src1 folder for single image functionalities)> <dst folder> <u8 = 0 / f16 = 1 / f32 = 2 / u8->f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> <outputFormatToggle (pkd->pkd = 0 / pkd->pln = 1)> <case number = 0:81> <verbosity = 0/1>\n");
+        return -1;
+    }
+
+    if (atoi(argv[7]) == 1)
+    {
+        printf("\nInputs for this test case are:");
+        printf("\nsrc1 = %s", argv[1]);
+        printf("\nsrc2 = %s", argv[2]);
+        printf("\ndst = %s", argv[3]);
+        printf("\nu8 / f16 / f32 / u8->f16 / u8->f32 / i8 / u8->i8 (0/1/2/3/4/5/6) = %s", argv[4]);
+        printf("\noutputFormatToggle (pkd->pkd = 0 / pkd->pln = 1) = %s", argv[5]);
+        printf("\ncase number (1:7) = %s", argv[6]);
+    }
+
+    char *src = argv[1];
+    char *src_second = argv[2];
+    char *dst = argv[3];
+    int ip_bitDepth = atoi(argv[4]);
+    unsigned int outputFormatToggle = atoi(argv[5]);
+    int test_case = atoi(argv[6]);
+
+    int ip_channel = 3;
+
+    // Set case names
+
+    char funcType[1000] = {"Tensor_HOST_PLN3"};
+
+    char funcName[1000];
+    switch (test_case)
+    {
+    case 0:
+        strcpy(funcName, "brightness");
+        break;
+    }
+
+    // Initialize tensor descriptors
+
+    RpptDesc srcDesc, dstDesc;
+    RpptDescPtr srcDescPtr, dstDescPtr;
+    srcDescPtr = &srcDesc;
+    dstDescPtr = &dstDesc;
+
+    // Set src/dst layouts in tensor descriptors
+
+    if (outputFormatToggle == 0)
+    {
+        strcat(funcType, "_toPLN3");
+        srcDescPtr->layout = RpptLayout::NCHW;
+        dstDescPtr->layout = RpptLayout::NCHW;
+    }
+    else if (outputFormatToggle == 1)
+    {
+        strcat(funcType, "_toPKD3");
+        srcDescPtr->layout = RpptLayout::NCHW;
+        dstDescPtr->layout = RpptLayout::NHWC;
+    }
+
+    // Set src/dst data types in tensor descriptors
+
+    if (ip_bitDepth == 0)
+    {
+        strcat(funcName, "_u8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::U8;
+    }
+    else if (ip_bitDepth == 1)
+    {
+        strcat(funcName, "_f16_");
+        srcDescPtr->dataType = RpptDataType::F16;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 2)
+    {
+        strcat(funcName, "_f32_");
+        srcDescPtr->dataType = RpptDataType::F32;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 3)
+    {
+        strcat(funcName, "_u8_f16_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F16;
+    }
+    else if (ip_bitDepth == 4)
+    {
+        strcat(funcName, "_u8_f32_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::F32;
+    }
+    else if (ip_bitDepth == 5)
+    {
+        strcat(funcName, "_i8_");
+        srcDescPtr->dataType = RpptDataType::I8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+    else if (ip_bitDepth == 6)
+    {
+        strcat(funcName, "_u8_i8_");
+        srcDescPtr->dataType = RpptDataType::U8;
+        dstDescPtr->dataType = RpptDataType::I8;
+    }
+
+    // Other initializations
+
+    int missingFuncFlag = 0;
+    int i = 0, j = 0;
+    int maxHeight = 0, maxWidth = 0;
+    int maxDstHeight = 0, maxDstWidth = 0;
+    unsigned long long count = 0;
+    unsigned long long ioBufferSize = 0;
+    unsigned long long oBufferSize = 0;
+    static int noOfImages = 0;
+    Mat image, image_second;
+
+    // String ops on function name
+
+    char func[1000];
+    strcpy(func, funcName);
+    strcat(func, funcType);
+    printf("\nRunning %s...", func);
+
+    char src1[1000];
+    strcpy(src1, src);
+    strcat(src1, "/");
+
+    char src1_second[1000];
+    strcpy(src1_second, src_second);
+    strcat(src1_second, "/");
+
+    strcat(funcName, funcType);
+    strcat(dst, "/");
+    strcat(dst, funcName);
+
+    // Get number of images
+
+    struct dirent *de;
+    DIR *dr = opendir(src);
+    while ((de = readdir(dr)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        noOfImages += 1;
+    }
+    closedir(dr);
+
+    // Initialize ROI tensors for src/dst
+
+    RpptROI *roiTensorPtrSrc = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+    RpptROI *roiTensorPtrDst = (RpptROI *) calloc(noOfImages, sizeof(RpptROI));
+
+    // Set ROI tensors types for src/dst
+
+    RpptRoiType roiTypeSrc, roiTypeDst;
+    roiTypeSrc = RpptRoiType::XYWH;
+    roiTypeDst = RpptRoiType::XYWH;
+
+    // Set maxHeight, maxWidth and ROIs for src/dst
+
+    const int images = noOfImages;
+    char imageNames[images][1000];
+
+    DIR *dr1 = opendir(src);
+    while ((de = readdir(dr1)) != NULL)
+    {
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+        strcpy(imageNames[count], de->d_name);
+        char temp[1000];
+        strcpy(temp, src1);
+        strcat(temp, imageNames[count]);
+
+        image = imread(temp, 1);
+
+        roiTensorPtrSrc[count].xywhROI.xy.x = 0;
+        roiTensorPtrSrc[count].xywhROI.xy.y = 0;
+        roiTensorPtrSrc[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrSrc[count].xywhROI.roiHeight = image.rows;
+
+        roiTensorPtrDst[count].xywhROI.xy.x = 0;
+        roiTensorPtrDst[count].xywhROI.xy.y = 0;
+        roiTensorPtrDst[count].xywhROI.roiWidth = image.cols;
+        roiTensorPtrDst[count].xywhROI.roiHeight = image.rows;
+
+        maxHeight = RPPMAX2(maxHeight, roiTensorPtrSrc[count].xywhROI.roiHeight);
+        maxWidth = RPPMAX2(maxWidth, roiTensorPtrSrc[count].xywhROI.roiWidth);
+        maxDstHeight = RPPMAX2(maxDstHeight, roiTensorPtrDst[count].xywhROI.roiHeight);
+        maxDstWidth = RPPMAX2(maxDstWidth, roiTensorPtrDst[count].xywhROI.roiWidth);
+
+        count++;
+    }
+    closedir(dr1);
+
+    // Set numDims, offset, n/c/h/w values for src/dst
+
+    srcDescPtr->numDims = 4;
+    dstDescPtr->numDims = 4;
+
+    srcDescPtr->offset = 0;
+    dstDescPtr->offset = 0;
+
+    srcDescPtr->n = noOfImages;
+    srcDescPtr->c = ip_channel;
+    srcDescPtr->h = maxHeight;
+    srcDescPtr->w = maxWidth;
+
+    dstDescPtr->n = noOfImages;
+    dstDescPtr->c = ip_channel;
+    dstDescPtr->h = maxDstHeight;
+    dstDescPtr->w = maxDstWidth;
+
+    // Optionally set w stride as a multiple of 8 for src/dst
+
+    srcDescPtr->w = ((srcDescPtr->w / 8) * 8) + 8;
+    dstDescPtr->w = ((dstDescPtr->w / 8) * 8) + 8;
+
+    // Set n/c/h/w strides for src/dst
+
+    srcDescPtr->strides.nStride = ip_channel * srcDescPtr->w * srcDescPtr->h;
+    srcDescPtr->strides.cStride = srcDescPtr->w * srcDescPtr->h;
+    srcDescPtr->strides.hStride = srcDescPtr->w;
+    srcDescPtr->strides.wStride = 1;
+
+    if (dstDescPtr->layout == RpptLayout::NHWC)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = ip_channel * dstDescPtr->w;
+        dstDescPtr->strides.wStride = ip_channel;
+        dstDescPtr->strides.cStride = 1;
+    }
+    else if (dstDescPtr->layout == RpptLayout::NCHW)
+    {
+        dstDescPtr->strides.nStride = ip_channel * dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.cStride = dstDescPtr->w * dstDescPtr->h;
+        dstDescPtr->strides.hStride = dstDescPtr->w;
+        dstDescPtr->strides.wStride = 1;
+    }
+
+    // Set buffer sizes for src/dst
+
+    ioBufferSize = (unsigned long long)srcDescPtr->h * (unsigned long long)srcDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+    oBufferSize = (unsigned long long)dstDescPtr->h * (unsigned long long)dstDescPtr->w * (unsigned long long)ip_channel * (unsigned long long)noOfImages;
+
+    // Initialize host buffers for src/dst
+
+    Rpp8u *input = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *input_second = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    Rpp8u *output = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u));
+
+    Rpp16f *inputf16 = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f));
+    Rpp16f *inputf16_second = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f));
+    Rpp16f *outputf16 = (Rpp16f *)calloc(ioBufferSize, sizeof(Rpp16f));
+
+    Rpp32f *inputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+    Rpp32f *inputf32_second = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+    Rpp32f *outputf32 = (Rpp32f *)calloc(ioBufferSize, sizeof(Rpp32f));
+
+    Rpp8s *inputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+    Rpp8s *inputi8_second = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+    Rpp8s *outputi8 = (Rpp8s *)calloc(ioBufferSize, sizeof(Rpp8s));
+
+    // Set 8u host buffers for src/dst
+
+    DIR *dr2 = opendir(src);
+    DIR *dr2_second = opendir(src_second);
+    count = 0;
+    i = 0;
+
+    Rpp32u elementsInRowMax = srcDescPtr->w * ip_channel;
+
+    while ((de = readdir(dr2)) != NULL)
+    {
+        Rpp8u *input_temp, *input_second_temp;
+        input_temp = input + (i * srcDescPtr->strides.nStride);
+        input_second_temp = input_second + (i * srcDescPtr->strides.nStride);
+
+        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+            continue;
+
+        char temp[1000];
+        strcpy(temp, src1);
+        strcat(temp, de->d_name);
+
+        char temp_second[1000];
+        strcpy(temp_second, src1_second);
+        strcat(temp_second, de->d_name);
+
+        image = imread(temp, 1);
+        image_second = imread(temp_second, 1);
+
+        Rpp8u *ip_image = image.data;
+        Rpp8u *ip_image_second = image_second.data;
+
+        Rpp32u elementsInRow = roiTensorPtrSrc[i].xywhROI.roiWidth * ip_channel;
+
+        for (j = 0; j < roiTensorPtrSrc[i].xywhROI.roiHeight; j++)
+        {
+            memcpy(input_temp, ip_image, elementsInRow * sizeof (Rpp8u));
+            memcpy(input_second_temp, ip_image_second, elementsInRow * sizeof (Rpp8u));
+            ip_image += elementsInRow;
+            ip_image_second += elementsInRow;
+            input_temp += elementsInRowMax;
+            input_second_temp += elementsInRowMax;
+        }
+        i++;
+        count += srcDescPtr->strides.nStride;
+    }
+    closedir(dr2);
+
+    // Convert default OpenCV PKD3 to PLN3 for first input batch
+
+    Rpp8u *inputCopy = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    memcpy(inputCopy, input, ioBufferSize * sizeof(Rpp8u));
+
+    Rpp8u *inputTemp, *inputCopyTemp;
+    inputTemp = input;
+    inputCopyTemp = inputCopy;
+
+    for (int count = 0; count < noOfImages; count++)
+    {
+        Rpp8u *inputTempR, *inputTempG, *inputTempB;
+        inputTempR = inputTemp;
+        inputTempG = inputTempR + srcDescPtr->strides.cStride;
+        inputTempB = inputTempG + srcDescPtr->strides.cStride;
+
+        for (int i = 0; i < srcDescPtr->h; i++)
+        {
+            for (int j = 0; j < srcDescPtr->w; j++)
+            {
+                *inputTempR = *inputCopyTemp;
+                inputCopyTemp++;
+                inputTempR++;
+                *inputTempG = *inputCopyTemp;
+                inputCopyTemp++;
+                inputTempG++;
+                *inputTempB = *inputCopyTemp;
+                inputCopyTemp++;
+                inputTempB++;
+            }
+        }
+
+        inputTemp += srcDescPtr->strides.nStride;
+    }
+
+    free(inputCopy);
+
+    // Convert default OpenCV PKD3 to PLN3 for second input batch
+
+    Rpp8u *inputSecondCopy = (Rpp8u *)calloc(ioBufferSize, sizeof(Rpp8u));
+    memcpy(inputSecondCopy, input_second, ioBufferSize * sizeof(Rpp8u));
+
+    Rpp8u *inputSecondTemp, *inputSecondCopyTemp;
+    inputSecondTemp = input_second;
+    inputSecondCopyTemp = inputSecondCopy;
+
+    for (int count = 0; count < noOfImages; count++)
+    {
+        Rpp8u *inputSecondTempR, *inputSecondTempG, *inputSecondTempB;
+        inputSecondTempR = inputSecondTemp;
+        inputSecondTempG = inputSecondTempR + srcDescPtr->strides.cStride;
+        inputSecondTempB = inputSecondTempG + srcDescPtr->strides.cStride;
+
+        for (int i = 0; i < srcDescPtr->h; i++)
+        {
+            for (int j = 0; j < srcDescPtr->w; j++)
+            {
+                *inputSecondTempR = *inputSecondCopyTemp;
+                inputSecondCopyTemp++;
+                inputSecondTempR++;
+                *inputSecondTempG = *inputSecondCopyTemp;
+                inputSecondCopyTemp++;
+                inputSecondTempG++;
+                *inputSecondTempB = *inputSecondCopyTemp;
+                inputSecondCopyTemp++;
+                inputSecondTempB++;
+            }
+        }
+
+        inputSecondTemp += srcDescPtr->strides.nStride;
+    }
+
+    free(inputSecondCopy);
+
+    // Convert inputs to test various other bit depths
+
+    if (ip_bitDepth == 1)
+    {
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp16f *inputf16Temp, *inputf16_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputf16Temp = inputf16;
+        inputf16_secondTemp = inputf16_second;
+
+        for (int i = 0; i < ioBufferSize; i++)
+        {
+            *inputf16Temp = ((Rpp16f)*inputTemp) / 255.0;
+            *inputf16_secondTemp = ((Rpp16f)*input_secondTemp) / 255.0;
+            inputTemp++;
+            inputf16Temp++;
+            input_secondTemp++;
+            inputf16_secondTemp++;
+        }
+    }
+    else if (ip_bitDepth == 2)
+    {
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp32f *inputf32Temp, *inputf32_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputf32Temp = inputf32;
+        inputf32_secondTemp = inputf32_second;
+
+        for (int i = 0; i < ioBufferSize; i++)
+        {
+            *inputf32Temp = ((Rpp32f)*inputTemp) / 255.0;
+            *inputf32_secondTemp = ((Rpp32f)*input_secondTemp) / 255.0;
+            inputTemp++;
+            inputf32Temp++;
+            input_secondTemp++;
+            inputf32_secondTemp++;
+        }
+    }
+    else if (ip_bitDepth == 5)
+    {
+        Rpp8u *inputTemp, *input_secondTemp;
+        Rpp8s *inputi8Temp, *inputi8_secondTemp;
+
+        inputTemp = input;
+        input_secondTemp = input_second;
+
+        inputi8Temp = inputi8;
+        inputi8_secondTemp = inputi8_second;
+
+        for (int i = 0; i < ioBufferSize; i++)
+        {
+            *inputi8Temp = (Rpp8s) (((Rpp32s) *inputTemp) - 128);
+            *inputi8_secondTemp = (Rpp8s) (((Rpp32s) *input_secondTemp) - 128);
+            inputTemp++;
+            inputi8Temp++;
+            input_secondTemp++;
+            inputi8_secondTemp++;
+        }
+    }
+
+    // Run case-wise RPP API and measure time
+
+    rppHandle_t handle;
+    rppCreateWithBatchSize(&handle, noOfImages);
+    clock_t start, end;
+    double start_omp, end_omp;
+    double cpu_time_used, omp_time_used;
+
+    string test_case_name;
+
+    switch (test_case)
+    {
+    case 0:
+    {
+        test_case_name = "brightness";
+
+        Rpp32f alpha[images];
+        Rpp32f beta[images];
+        for (i = 0; i < images; i++)
+        {
+            alpha[i] = 1.75;
+            beta[i] = 50;
+
+            // xywhROI override sample
+            // roiTensorPtrSrc[i].xywhROI.xy.x = 0;
+            // roiTensorPtrSrc[i].xywhROI.xy.y = 0;
+            // roiTensorPtrSrc[i].xywhROI.roiWidth = 100;
+            // roiTensorPtrSrc[i].xywhROI.roiHeight = 180;
+
+            // ltrbROI override sample
+            // roiTensorPtrSrc[i].ltrbROI.lt.x = 50;
+            // roiTensorPtrSrc[i].ltrbROI.lt.y = 50;
+            // roiTensorPtrSrc[i].ltrbROI.rb.x = 199;
+            // roiTensorPtrSrc[i].ltrbROI.rb.y = 149;
+        }
+
+        // Change RpptRoiType for ltrbROI override sample
+        // roiTypeSrc = RpptRoiType::LTRB;
+        // roiTypeDst = RpptRoiType::LTRB;
+
+        start_omp = omp_get_wtime();
+        start = clock();
+        if (ip_bitDepth == 0)
+            rppt_brightness_host(input, srcDescPtr, output, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+        else if (ip_bitDepth == 1)
+            rppt_brightness_host(inputf16, srcDescPtr, outputf16, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+        else if (ip_bitDepth == 2)
+            rppt_brightness_host(inputf32, srcDescPtr, outputf32, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+        else if (ip_bitDepth == 3)
+            missingFuncFlag = 1;
+        else if (ip_bitDepth == 4)
+            missingFuncFlag = 1;
+        else if (ip_bitDepth == 5)
+            rppt_brightness_host(inputi8, srcDescPtr, outputi8, dstDescPtr, alpha, beta, roiTensorPtrSrc, roiTypeSrc, handle);
+        else if (ip_bitDepth == 6)
+            missingFuncFlag = 1;
+        else
+            missingFuncFlag = 1;
+        end = clock();
+        end_omp = omp_get_wtime();
+
+        break;
+    }
+    default:
+        missingFuncFlag = 1;
+        break;
+    }
+
+    if (missingFuncFlag == 1)
+    {
+        printf("\nThe functionality %s doesn't yet exist in RPP\n", func);
+        return -1;
+    }
+
+    // Display measured times
+
+    cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
+    omp_time_used = end_omp - start_omp;
+    cout << "\nCPU Time - BatchPD : " << cpu_time_used;
+    cout << "\nOMP Time - BatchPD : " << omp_time_used;
+    printf("\n");
+
+    // Reconvert other bit depths to 8u for output display purposes
+
+    string fileName = std::to_string(ip_bitDepth);
+    ofstream outputFile (fileName + ".csv");
+
+    if (ip_bitDepth == 0)
+    {
+        Rpp8u *outputTemp;
+        outputTemp = output;
+
+        if (outputFile.is_open())
+        {
+            for (int i = 0; i < oBufferSize; i++)
+            {
+                outputFile << (Rpp32u) *outputTemp << ",";
+                outputTemp++;
+            }
+            outputFile.close();
+        }
+        else
+            cout << "Unable to open file!";
+
+    }
+    else if ((ip_bitDepth == 1) || (ip_bitDepth == 3))
+    {
+        Rpp8u *outputTemp;
+        outputTemp = output;
+        Rpp16f *outputf16Temp;
+        outputf16Temp = outputf16;
+
+        if (outputFile.is_open())
+        {
+            for (int i = 0; i < oBufferSize; i++)
+            {
+                outputFile << *outputf16Temp << ",";
+                *outputTemp = (Rpp8u)RPPPIXELCHECK(*outputf16Temp * 255.0);
+                outputf16Temp++;
+                outputTemp++;
+            }
+            outputFile.close();
+        }
+        else
+            cout << "Unable to open file!";
+
+    }
+    else if ((ip_bitDepth == 2) || (ip_bitDepth == 4))
+    {
+        Rpp8u *outputTemp;
+        outputTemp = output;
+        Rpp32f *outputf32Temp;
+        outputf32Temp = outputf32;
+
+        if (outputFile.is_open())
+        {
+            for (int i = 0; i < oBufferSize; i++)
+            {
+                outputFile << *outputf32Temp << ",";
+                *outputTemp = (Rpp8u)RPPPIXELCHECK(*outputf32Temp * 255.0);
+                outputf32Temp++;
+                outputTemp++;
+            }
+            outputFile.close();
+        }
+        else
+            cout << "Unable to open file!";
+    }
+    else if ((ip_bitDepth == 5) || (ip_bitDepth == 6))
+    {
+        Rpp8u *outputTemp;
+        outputTemp = output;
+        Rpp8s *outputi8Temp;
+        outputi8Temp = outputi8;
+
+        if (outputFile.is_open())
+        {
+            for (int i = 0; i < oBufferSize; i++)
+            {
+                outputFile << (Rpp32s) *outputi8Temp << ",";
+                *outputTemp = (Rpp8u) RPPPIXELCHECK(((Rpp32s) *outputi8Temp) + 128);
+                outputi8Temp++;
+                outputTemp++;
+            }
+            outputFile.close();
+        }
+        else
+            cout << "Unable to open file!";
+    }
+
+    // Calculate exact dstROI in XYWH format for OpenCV dump
+
+    if (roiTypeSrc == RpptRoiType::LTRB)
+    {
+        for (int i = 0; i < dstDescPtr->n; i++)
+        {
+            int ltX = roiTensorPtrSrc[i].ltrbROI.lt.x;
+            int ltY = roiTensorPtrSrc[i].ltrbROI.lt.y;
+            int rbX = roiTensorPtrSrc[i].ltrbROI.rb.x;
+            int rbY = roiTensorPtrSrc[i].ltrbROI.rb.y;
+
+            roiTensorPtrSrc[i].xywhROI.xy.x = ltX;
+            roiTensorPtrSrc[i].xywhROI.xy.y = ltY;
+            roiTensorPtrSrc[i].xywhROI.roiWidth = rbX - ltX + 1;
+            roiTensorPtrSrc[i].xywhROI.roiHeight = rbY - ltY + 1;
+        }
+    }
+
+    RpptROI roiDefault;
+    RpptROIPtr roiPtrDefault;
+    roiPtrDefault = &roiDefault;
+    roiPtrDefault->xywhROI.xy.x = 0;
+    roiPtrDefault->xywhROI.xy.y = 0;
+    roiPtrDefault->xywhROI.roiWidth = dstDescPtr->w;
+    roiPtrDefault->xywhROI.roiHeight = dstDescPtr->h;
+
+    for (int i = 0; i < dstDescPtr->n; i++)
+    {
+        roiTensorPtrSrc[i].xywhROI.roiWidth = RPPMIN2(roiPtrDefault->xywhROI.roiWidth - roiTensorPtrSrc[i].xywhROI.xy.x, roiTensorPtrSrc[i].xywhROI.roiWidth);
+        roiTensorPtrSrc[i].xywhROI.roiHeight = RPPMIN2(roiPtrDefault->xywhROI.roiHeight - roiTensorPtrSrc[i].xywhROI.xy.y, roiTensorPtrSrc[i].xywhROI.roiHeight);
+        roiTensorPtrSrc[i].xywhROI.xy.x = RPPMAX2(roiPtrDefault->xywhROI.xy.x, roiTensorPtrSrc[i].xywhROI.xy.x);
+        roiTensorPtrSrc[i].xywhROI.xy.y = RPPMAX2(roiPtrDefault->xywhROI.xy.y, roiTensorPtrSrc[i].xywhROI.xy.y);
+    }
+
+    // Convert any PLN3 outputs to the corresponding PKD3 version for OpenCV dump
+
+    if (dstDescPtr->layout == RpptLayout::NCHW)
+    {
+        Rpp8u *outputCopy = (Rpp8u *)calloc(oBufferSize, sizeof(Rpp8u));
+        memcpy(outputCopy, output, oBufferSize * sizeof(Rpp8u));
+
+        Rpp8u *outputTemp, *outputCopyTemp;
+        outputTemp = output;
+        outputCopyTemp = outputCopy;
+
+        for (int count = 0; count < dstDescPtr->n; count++)
+        {
+            Rpp8u *outputCopyTempR, *outputCopyTempG, *outputCopyTempB;
+            outputCopyTempR = outputCopyTemp;
+            outputCopyTempG = outputCopyTempR + dstDescPtr->strides.cStride;
+            outputCopyTempB = outputCopyTempG + dstDescPtr->strides.cStride;
+
+            for (int i = 0; i < dstDescPtr->h; i++)
+            {
+                for (int j = 0; j < dstDescPtr->w; j++)
+                {
+                    *outputTemp = *outputCopyTempR;
+                    outputTemp++;
+                    outputCopyTempR++;
+                    *outputTemp = *outputCopyTempG;
+                    outputTemp++;
+                    outputCopyTempG++;
+                    *outputTemp = *outputCopyTempB;
+                    outputTemp++;
+                    outputCopyTempB++;
+                }
+            }
+
+            outputCopyTemp += dstDescPtr->strides.nStride;
+        }
+
+        free(outputCopy);
+    }
+
+    rppDestroyHost(handle);
+
+    // OpenCV dump
+
+    mkdir(dst, 0700);
+    strcat(dst, "/");
+    count = 0;
+    elementsInRowMax = dstDescPtr->w * ip_channel;
+
+    for (j = 0; j < dstDescPtr->n; j++)
+    {
+        int height = roiTensorPtrSrc[j].xywhROI.roiHeight;
+        int width = roiTensorPtrSrc[j].xywhROI.roiWidth;
+
+        int op_size = height * width * ip_channel;
+        Rpp8u *temp_output = (Rpp8u *)calloc(op_size, sizeof(Rpp8u));
+        Rpp8u *temp_output_row;
+        temp_output_row = temp_output;
+        Rpp32u elementsInRow = width * ip_channel;
+        Rpp8u *output_row = output + count;
+
+        for (int k = 0; k < height; k++)
+        {
+            memcpy(temp_output_row, (output_row), elementsInRow * sizeof (Rpp8u));
+            temp_output_row += elementsInRow;
+            output_row += elementsInRowMax;
+        }
+        count += dstDescPtr->strides.nStride;
+
+        char temp[1000];
+        strcpy(temp, dst);
+        strcat(temp, imageNames[j]);
+
+        Mat mat_op_image;
+        mat_op_image = Mat(height, width, CV_8UC3, temp_output);
+        imwrite(temp, mat_op_image);
+
+        free(temp_output);
+    }
+
+    // Free memory
+
+    free(roiTensorPtrSrc);
+    free(roiTensorPtrDst);
+    free(input);
+    free(input_second);
+    free(output);
+    free(inputf16);
+    free(inputf16_second);
+    free(outputf16);
+    free(inputf32);
+    free(inputf32_second);
+    free(outputf32);
+    free(inputi8);
+    free(inputi8_second);
+    free(outputi8);
+
+    return 0;
+}
diff --git a/utilities/rpp-unittests/HOST_NEW/testAllScript.sh b/utilities/rpp-unittests/HOST_NEW/testAllScript.sh
index 302f0524f..10de9c954 100755
--- a/utilities/rpp-unittests/HOST_NEW/testAllScript.sh
+++ b/utilities/rpp-unittests/HOST_NEW/testAllScript.sh
@@ -196,6 +196,10 @@ do
 
             printf "\n./BatchPD_host_pkd3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $DST_FOLDER_TEMP $bitDepth $outputFormatToggle $case 0"
             ./BatchPD_host_pkd3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$DST_FOLDER_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0"
+
+            printf "\n./Tensor_host_pkd3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $DST_FOLDER_TEMP $bitDepth $outputFormatToggle $case 0"
+            ./Tensor_host_pkd3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$DST_FOLDER_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0"
+
             echo "------------------------------------------------------------------------------------------"
         done
     done
@@ -252,6 +256,10 @@ do
 
             printf "\n./BatchPD_host_pln1 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $DST_FOLDER_TEMP $bitDepth $outputFormatToggle $case 0"
             ./BatchPD_host_pln1 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$DST_FOLDER_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0"
+
+            printf "\n./Tensor_host_pln1 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $DST_FOLDER_TEMP $bitDepth $outputFormatToggle $case 0"
+            ./Tensor_host_pln1 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$DST_FOLDER_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0"
+
             echo "------------------------------------------------------------------------------------------"
         done
     done
@@ -308,6 +316,10 @@ do
 
             printf "\n./BatchPD_host_pln3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $DST_FOLDER_TEMP $bitDepth $outputFormatToggle $case 0"
             ./BatchPD_host_pln3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$DST_FOLDER_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0"
+
+            printf "\n./Tensor_host_pln3 $SRC_FOLDER_1_TEMP $SRC_FOLDER_2_TEMP $DST_FOLDER_TEMP $bitDepth $outputFormatToggle $case 0"
+            ./Tensor_host_pln3 "$SRC_FOLDER_1_TEMP" "$SRC_FOLDER_2_TEMP" "$DST_FOLDER_TEMP" "$bitDepth" "$outputFormatToggle" "$case" "0"
+
             echo "------------------------------------------------------------------------------------------"
         done
     done
@@ -328,7 +340,7 @@ then
 
     printf "\n\nUsage: ./uniqueFunctionalities_host <u8 = 0 / f16 = 1 / f32 = 2 / u8->f16 = 3 / u8->f32 = 4 / i8 = 5 / u8->i8 = 6> <case number = 0:12>"
 
-    for ((case=0;case<13;case++))
+    for ((case=0;case<12;case++))
     do
         printf "\n\n\n\n" | tee -a "$DST_FOLDER/uniqueFunctionalities_host_log.txt"
         echo "--------------------------------" | tee -a "$DST_FOLDER/uniqueFunctionalities_host_log.txt"
diff --git a/utilities/rpp-unittests/HOST_NEW/uniqueFunctionalities_host.cpp b/utilities/rpp-unittests/HOST_NEW/uniqueFunctionalities_host.cpp
index 2f1f4bff5..dd3574ba0 100644
--- a/utilities/rpp-unittests/HOST_NEW/uniqueFunctionalities_host.cpp
+++ b/utilities/rpp-unittests/HOST_NEW/uniqueFunctionalities_host.cpp
@@ -109,6 +109,9 @@ int main(int argc, char **argv)
 
     printf("\nip_bitDepth = %d\ntest_case = %d", ip_bitDepth, test_case);
 
+    rppHandle_t handle;
+    rppCreate(&handle);
+
     clock_t start, end;
     double start_omp, end_omp;
     double cpu_time_used, omp_time_used;
@@ -122,54 +125,6 @@ int main(int argc, char **argv)
     {
         test_case_name = "tensor_transpose";
 
-        // Test Case 1
-        Rpp32u totalNumberOfElements = 36;
-        Rpp32u tensorDimension = 3;
-        Rpp32u tensorDimensionValues[3] = {3, 3, 4};
-        Rpp32u dimension1 = 0, dimension2 = 1;
-        Rpp8u srcPtr[36] = {255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 130, 129, 128, 127, 126, 117, 113, 121, 127, 111, 100, 108, 65, 66, 67, 68, 69, 70, 71, 72, 13, 24, 15, 16};
-        Rpp8u dstPtr[36] = {0};
-
-        // Test Case 2
-        // Rpp32u totalNumberOfElements = 48;
-        // Rpp32u tensorDimension = 3;
-        // Rpp32u tensorDimensionValues[3] = {4, 4, 3};
-        // Rpp32u dimension1 = 0, dimension2 = 1;
-        // Rpp8u srcPtr[48] = {255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 130, 129, 128, 127, 126, 117, 113, 121, 127, 111, 100, 108, 91, 95, 92, 98, 65, 66, 67, 68, 69, 70, 71, 72, 49, 47, 55, 51, 41, 39, 38, 34, 13, 24, 15, 16};
-        // Rpp8u dstPtr[48] = {0};
-
-        start = clock();
-        start_omp = omp_get_wtime();
-        if (ip_bitDepth == 0)
-            rppi_tensor_transpose_u8_host(srcPtr, dstPtr, dimension1, dimension2, tensorDimension, tensorDimensionValues);
-        else if (ip_bitDepth == 1)
-            missingFuncFlag = 1;
-        else if (ip_bitDepth == 2)
-            missingFuncFlag = 1;
-        else if (ip_bitDepth == 3)
-            missingFuncFlag = 1;
-        else if (ip_bitDepth == 4)
-            missingFuncFlag = 1;
-        else if (ip_bitDepth == 5)
-            missingFuncFlag = 1;
-        else if (ip_bitDepth == 6)
-            missingFuncFlag = 1;
-        else
-            missingFuncFlag = 1;
-        end_omp = omp_get_wtime();
-        end = clock();
-
-        printf("\n\nInput:\n");
-        displayTensor(srcPtr, totalNumberOfElements);
-        printf("\n\nOutput of tensor_transpose:\n");
-        displayTensor(dstPtr, totalNumberOfElements);
-
-        break;
-    }
-    case 1:
-    {
-        test_case_name = "transpose";
-
         // Test Case 1
         // Rpp32u totalNumberOfElements = 24;
         // Rpp32u perm[4] = {0, 3, 1, 2};
@@ -185,9 +140,9 @@ int main(int argc, char **argv)
         Rpp32u perm[4] = {0, 3, 1, 2};
         Rpp32u shape[4] = {2, 4, 5, 3};
         Rpp8u srcPtr[120] = {
-            255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 130, 129, 128, 127, 126, 125, 124, 123, 122, 121, 120, 119, 5, 4, 3, 2, 1, 0, 
-            27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 115, 114, 113, 112, 111, 110, 
-            240, 239, 238, 237, 236, 235, 234, 233, 232, 231, 230, 229, 200, 199, 198, 197, 196, 195, 194, 193, 192, 191, 190, 189, 140, 139, 138, 137, 136, 135, 
+            255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 130, 129, 128, 127, 126, 125, 124, 123, 122, 121, 120, 119, 5, 4, 3, 2, 1, 0,
+            27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 115, 114, 113, 112, 111, 110,
+            240, 239, 238, 237, 236, 235, 234, 233, 232, 231, 230, 229, 200, 199, 198, 197, 196, 195, 194, 193, 192, 191, 190, 189, 140, 139, 138, 137, 136, 135,
             70, 69, 68, 67, 66, 65, 64, 63, 62, 61, 60, 59, 170, 169, 168, 167, 166, 165, 164, 163, 162, 161, 160, 159, 15, 14, 13, 12, 11, 10
         };
         Rpp8u dstPtr[120] = {0};
@@ -206,17 +161,17 @@ int main(int argc, char **argv)
         start = clock();
         start_omp = omp_get_wtime();
         if (ip_bitDepth == 0)
-            rppi_transpose_u8_host(srcPtr, dstPtr, perm, shape);
+            rppi_tensor_transpose_u8_host(srcPtr, dstPtr, shape, perm, handle);
         else if (ip_bitDepth == 1)
-            rppi_transpose_f16_host(srcPtr16f, dstPtr16f, perm, shape);
+            rppi_tensor_transpose_f16_host(srcPtr16f, dstPtr16f, shape, perm, handle);
         else if (ip_bitDepth == 2)
-            rppi_transpose_f32_host(srcPtr32f, dstPtr32f, perm, shape);
+            rppi_tensor_transpose_f32_host(srcPtr32f, dstPtr32f, shape, perm, handle);
         else if (ip_bitDepth == 3)
             missingFuncFlag = 1;
         else if (ip_bitDepth == 4)
             missingFuncFlag = 1;
         else if (ip_bitDepth == 5)
-            rppi_transpose_i8_host(srcPtr8s, dstPtr8s, perm, shape);
+            rppi_tensor_transpose_i8_host(srcPtr8s, dstPtr8s, shape, perm, handle);
         else if (ip_bitDepth == 6)
             missingFuncFlag = 1;
         else
@@ -260,15 +215,15 @@ int main(int argc, char **argv)
             missingFuncFlag = 1;
         else
             missingFuncFlag = 1;
-        
+
         break;
     }
-    case 2:
+    case 1:
     {
         test_case_name = "tensor_add";
 
         rppHandle_t handle;
-        
+
         Rpp8u srcPtr1[36] = {255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 130, 129, 128, 127, 126, 117, 113, 121, 127, 111, 100, 108, 65, 66, 67, 68, 69, 70, 71, 72, 13, 24, 15, 16};
         Rpp8u srcPtr2[36] = {16, 15, 24, 13, 72, 71, 70, 69, 68, 67, 66, 65, 108, 100, 111, 127, 121, 113, 117, 126, 127, 128, 129, 130, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255};
 
@@ -318,12 +273,12 @@ int main(int argc, char **argv)
 
         break;
     }
-    case 3:
+    case 2:
     {
         test_case_name = "tensor_subtract";
 
         rppHandle_t handle;
-        
+
         Rpp8u srcPtr1[36] = {255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 130, 129, 128, 127, 126, 117, 113, 121, 127, 111, 100, 108, 65, 66, 67, 68, 69, 70, 71, 72, 13, 24, 15, 16};
         Rpp8u srcPtr2[36] = {16, 15, 24, 13, 72, 71, 70, 69, 68, 67, 66, 65, 108, 100, 111, 127, 121, 113, 117, 126, 127, 128, 129, 130, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255};
 
@@ -373,12 +328,12 @@ int main(int argc, char **argv)
 
         break;
     }
-    case 4:
+    case 3:
     {
         test_case_name = "tensor_multiply";
 
         rppHandle_t handle;
-        
+
         Rpp8u srcPtr1[36] = {255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 130, 129, 128, 127, 126, 117, 113, 121, 127, 111, 100, 108, 65, 66, 67, 68, 69, 70, 71, 72, 13, 24, 15, 16};
         Rpp8u srcPtr2[36] = {16, 15, 24, 13, 72, 71, 70, 69, 68, 67, 66, 65, 108, 100, 111, 127, 121, 113, 117, 126, 127, 128, 129, 130, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255};
 
@@ -428,12 +383,12 @@ int main(int argc, char **argv)
 
         break;
     }
-    case 5:
+    case 4:
     {
         test_case_name = "tensor_matrix_multiply";
 
         rppHandle_t handle;
-        
+
         Rpp32u tensorDimensionValues1[2] = {3, 2};
         Rpp32u tensorDimensionValues2[2] = {2, 4};
 
@@ -484,12 +439,12 @@ int main(int argc, char **argv)
 
         break;
     }
-    case 6:
+    case 5:
     {
         test_case_name = "min_max_loc";
 
         rppHandle_t handle;
-        
+
         Rpp8u srcPtr[36] = {255, 130, 65, 254, 129, 66, 253, 128, 67, 252, 127, 68, 251, 126, 69, 250, 117, 70, 249, 113, 71, 248, 121, 72, 247, 127, 13, 246, 111, 24, 245, 100, 15, 244, 108, 16};
 
         RppiSize srcSize1Channel, srcSize3Channel;
@@ -544,12 +499,12 @@ int main(int argc, char **argv)
 
         break;
     }
-    case 7:
+    case 6:
     {
         test_case_name = "mean_stddev";
 
         rppHandle_t handle;
-        
+
         Rpp8u srcPtr[36] = {255, 130, 65, 254, 129, 66, 253, 128, 67, 252, 127, 68, 251, 126, 69, 250, 117, 70, 249, 113, 71, 248, 121, 72, 247, 127, 13, 246, 111, 24, 245, 100, 15, 244, 108, 16};
 
         RppiSize srcSize1Channel, srcSize3Channel;
@@ -603,17 +558,17 @@ int main(int argc, char **argv)
 
         break;
     }
-    case 8:
+    case 7:
     {
         test_case_name = "control_flow";
 
         rppHandle_t handle;
-        
+
         bool b1 = true, b2 = false;
         bool b3 =  true;
         Rpp8u u1 = 120, u2 = 100;
         Rpp8u u3 = 20;
-        
+
         start = clock();
         start_omp = omp_get_wtime();
         rpp_bool_control_flow(b1, b2, &b3, RPP_SCALAR_OP_AND, handle );
@@ -638,10 +593,10 @@ int main(int argc, char **argv)
 
         break;
     }
-    case 9:
+    case 8:
     {
         test_case_name = "histogram";
-        
+
         rppHandle_t handle;
         int count = 0;
 
@@ -650,7 +605,7 @@ int main(int argc, char **argv)
         RppiSize srcSize;
         Rpp32u *outputHistogram = (Rpp32u *) calloc (bins, sizeof(Rpp32u));
         Rpp32u *outputHistogramTemp;
-        
+
         memset(outputHistogram, 0, bins * sizeof(Rpp32u));
         srcSize.height = 6;
         srcSize.width = 6;
@@ -734,12 +689,12 @@ int main(int argc, char **argv)
 
         break;
     }
-    case 10:
+    case 9:
     {
         test_case_name = "convert_bit_depth";
 
         rppHandle_t handle;
-        
+
         Rpp8u srcPtr[36] = {255, 130, 65, 254, 129, 66, 253, 128, 67, 252, 127, 68, 251, 126, 69, 250, 117, 70, 249, 113, 71, 248, 121, 72, 247, 127, 13, 246, 111, 24, 245, 100, 15, 244, 108, 16};
         Rpp8s dstPtr8s[36];
         Rpp16u dstPtr16u[36];
@@ -842,12 +797,12 @@ int main(int argc, char **argv)
 
         break;
     }
-    case 11:
+    case 10:
     {
         test_case_name = "tensor_convert_bit_depth";
 
         rppHandle_t handle;
-        
+
         Rpp8u srcPtr[36] = {255, 130, 65, 254, 129, 66, 253, 128, 67, 252, 127, 68, 251, 126, 69, 250, 117, 70, 249, 113, 71, 248, 121, 72, 247, 127, 13, 246, 111, 24, 245, 100, 15, 244, 108, 16};
         Rpp8s dstPtr8s[36];
         Rpp16u dstPtr16u[36];
@@ -898,7 +853,7 @@ int main(int argc, char **argv)
 
         break;
     }
-    case 12:
+    case 11:
     {
         test_case_name = "tensor_look_up_table";