diff --git a/include/rppt_tensor_arithmetic_operations.h b/include/rppt_tensor_arithmetic_operations.h
index d34bdd1dd..4ffd24156 100644
--- a/include/rppt_tensor_arithmetic_operations.h
+++ b/include/rppt_tensor_arithmetic_operations.h
@@ -258,6 +258,40 @@ RppStatus rppt_magnitude_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr sr
RppStatus rppt_magnitude_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT
+/*! \brief Logarithm operation on HOST backend
+ * \details Computes the natural logarithm (log base e) of the input for a given ND tensor.
+ * Supports u8->f32, i8->f32, f16->f16 and f32->f32 datatypes.
+ * Uses the absolute value of the input for the log computation, and substitutes nextafter(0, 1) for a zero input to avoid an undefined result.
+ * \param [in] srcPtr source tensor in HOST memory
+ * \param [in] srcGenericDescPtr source tensor descriptor
+ * \param [out] dstPtr destination tensor in HOST memory
+ * \param [in] dstGenericDescPtr destination tensor descriptor
+ * \param [in] roiTensor ROI values for each tensor in the batch (nDim begin values followed by nDim length values per tensor)
+ * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_log_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u *roiTensor, rppHandle_t rppHandle);
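+
+// Illustrative call sequence (a minimal sketch; buffer allocation, descriptor setup and handle
+// creation follow the usual RPP conventions and are not shown, and the single 2D tensor below is assumed):
+//     Rpp32u roi[4] = {0, 0, height, width};    // per-tensor ROI: begin values, then length values
+//     rppt_log_host(srcU8Buffer, srcGenericDescPtr, dstF32Buffer, dstGenericDescPtr, roi, handle);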
+
+#ifdef GPU_SUPPORT
+/*! \brief Logarithm operation on HIP backend
+ * \details Computes the natural logarithm (log base e) of the input for a given ND tensor.
+ * Supports u8->f32, i8->f32, f16->f16 and f32->f32 datatypes.
+ * Uses the absolute value of the input for the log computation, and substitutes nextafter(0, 1) for a zero input to avoid an undefined result.
+ * \param [in] srcPtr source tensor in HIP memory
+ * \param [in] srcGenericDescPtr source tensor descriptor
+ * \param [out] dstPtr destination tensor in HIP memory
+ * \param [in] dstGenericDescPtr destination tensor descriptor
+ * \param [in] roiTensor ROI values for each tensor in the batch (nDim begin values followed by nDim length values per tensor)
+ * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize()
+ * \return A \ref RppStatus enumeration.
+ * \retval RPP_SUCCESS Successful completion.
+ * \retval RPP_ERROR* Unsuccessful completion.
+ */
+RppStatus rppt_log_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u *roiTensor, rppHandle_t rppHandle);
+#endif // GPU_SUPPORT
+
/*! @}
*/
diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp
index 44758b0b0..779f6f2d1 100644
--- a/src/include/cpu/rpp_cpu_common.hpp
+++ b/src/include/cpu/rpp_cpu_common.hpp
@@ -6516,6 +6516,12 @@ inline void compute_remap_src_loc(Rpp32f rowLoc, Rpp32f colLoc, Rpp32s &srcLoc,
srcLoc = (rowLoc * stride) + colLoc * channels;
}
+inline void compute_log_16_host(__m256 *p)
+{
+ p[0] = log_ps(p[0]); // log compute
+ p[1] = log_ps(p[1]); // log compute
+}
+
inline void compute_transpose4x8_avx(__m256 *pSrc, __m128 *pDst)
{
__m256 tmp0, tmp1, tmp2, tmp3;
diff --git a/src/include/hip/rpp_hip_common.hpp b/src/include/hip/rpp_hip_common.hpp
index 3f32dbc04..e6e2ab986 100644
--- a/src/include/hip/rpp_hip_common.hpp
+++ b/src/include/hip/rpp_hip_common.hpp
@@ -1869,6 +1869,21 @@ __device__ __forceinline__ float rpp_hip_math_sinc(float x)
return (fabsf(x) < 1e-5f) ? (1.0f - x * x * ONE_OVER_6) : sinf(x) / x;
}
+__device__ __forceinline__ void rpp_hip_math_log(d_float8 *src_f8, d_float8 *dst_f8)
+{
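+    // zero inputs are replaced with the smallest positive float (nextafter(0, 1)) and negative inputs with their absolute value before the log is taken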
+ for(int i = 0; i < 8; i++)
+ src_f8->f1[i] = (!src_f8->f1[i]) ? std::nextafter(0.0f, 1.0f) : fabsf(src_f8->f1[i]);
+
+ dst_f8->f1[0] = __logf(src_f8->f1[0]);
+ dst_f8->f1[1] = __logf(src_f8->f1[1]);
+ dst_f8->f1[2] = __logf(src_f8->f1[2]);
+ dst_f8->f1[3] = __logf(src_f8->f1[3]);
+ dst_f8->f1[4] = __logf(src_f8->f1[4]);
+ dst_f8->f1[5] = __logf(src_f8->f1[5]);
+ dst_f8->f1[6] = __logf(src_f8->f1[6]);
+ dst_f8->f1[7] = __logf(src_f8->f1[7]);
+}
+
// /******************** DEVICE RANDOMIZATION HELPER FUNCTIONS ********************/
template
diff --git a/src/modules/cpu/host_tensor_arithmetic_operations.hpp b/src/modules/cpu/host_tensor_arithmetic_operations.hpp
index b98145be0..466e51e09 100644
--- a/src/modules/cpu/host_tensor_arithmetic_operations.hpp
+++ b/src/modules/cpu/host_tensor_arithmetic_operations.hpp
@@ -30,5 +30,6 @@ SOFTWARE.
#include "kernel/subtract_scalar.hpp"
#include "kernel/multiply_scalar.hpp"
#include "kernel/magnitude.hpp"
+#include "kernel/log.hpp"
#endif // HOST_TENSOR_ARITHMETIC_OPERATIONS_HPP
diff --git a/src/modules/cpu/kernel/log.hpp b/src/modules/cpu/kernel/log.hpp
new file mode 100644
index 000000000..5ec79b21c
--- /dev/null
+++ b/src/modules/cpu/kernel/log.hpp
@@ -0,0 +1,563 @@
+/*
+MIT License
+
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "rppdefs.h"
+#include "rpp_cpu_common.hpp"
+
+// Single-pixel log helper functions
+// NOTE: log(0) is undefined, so nextafter(0, 1) is substituted for zero inputs to avoid this result
+// Negative inputs are made positive by taking their absolute value before the log computation
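+// i8 inputs are offset by +128 into the u8 range before the log computation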
+inline void compute_log(Rpp8u *src, Rpp32f *dst) { *dst = (!*src) ? std::log(std::nextafter(0.0f, 1.0f)) : std::log(*src); }
+inline void compute_log(Rpp8s *src, Rpp32f *dst) { *dst = (!*src) ? std::log(std::nextafter(0.0f, 1.0f)) : std::log(*src + 128); }
+inline void compute_log(Rpp16f *src, Rpp16f *dst) { *dst = (!*src) ? log(std::nextafter(0.0f, 1.0f)) : log(abs(*src)); }
+inline void compute_log(Rpp32f *src, Rpp32f *dst) { *dst = (!*src) ? std::log(std::nextafter(0.0f, 1.0f)) : std::log(abs(*src)); }
+
+// Computes ND log recursively
+template <typename T1, typename T2>
+void log_recursive(T1 *src, Rpp32u *srcStrides, T2 *dst, Rpp32u *dstStrides, Rpp32u *dstShape, Rpp32u nDim)
+{
+ if (!nDim)
+ compute_log(src, dst);
+ else
+ {
+ for (int i = 0; i < *dstShape; i++)
+ {
+ log_recursive(src, srcStrides + 1, dst, dstStrides + 1, dstShape + 1, nDim - 1);
+ dst += *dstStrides;
+ src += *srcStrides;
+ }
+ }
+}
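+
+// Illustrative trace (not part of the implementation): for a 1D tensor of 3 elements, log_recursive
+// enters with nDim = 1, iterates i = 0..2, and each iteration recurses with nDim = 0 to compute the
+// log of a single element before src and dst are advanced by the stride of the current dimension.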
+
+RppStatus log_generic_host_tensor(Rpp8u *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ Rpp32f *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u *roiTensor,
+ rpp::Handle& handle)
+{
+ Rpp32u numThreads = handle.GetNumThreads();
+ Rpp32u nDim = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension.
+ Rpp32u batchSize = dstGenericDescPtr->dims[0];
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < batchSize; batchCount++)
+ {
+ Rpp32u *roi = roiTensor + batchCount * nDim * 2;
+ Rpp32u *begin = roi;
+ Rpp32u *length = &roi[nDim];
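+        // roiTensor stores, for each tensor in the batch, nDim begin values followed by nDim length values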
+
+ Rpp8u *srcPtr1 = srcPtr + batchCount * srcGenericDescPtr->strides[0];
+ Rpp32f *dstPtr1 = dstPtr + batchCount * dstGenericDescPtr->strides[0];
+
+ for(int i = 0; i < nDim; i++)
+ srcPtr1 += begin[i] * srcGenericDescPtr->strides[i + 1];
+ Rpp32u alignedLength;
+ Rpp32u vectorIncrement = 16;
+ if (nDim == 1)
+ {
+ alignedLength = length[0] & ~15;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+
+ rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtr1, p); // simd loads
+ compute_log_16_host(p); // log compute
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtr1, p); // simd stores
+ srcPtr1 += vectorIncrement;
+ dstPtr1 += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < length[0]; vectorLoopCount++)
+ {
+ compute_log(srcPtr1, dstPtr1);
+ srcPtr1++;
+ dstPtr1++;
+ }
+ }
+ else if(nDim == 2)
+ {
+ alignedLength = length[1] & ~15;
+ for(int i = 0; i < length[0]; i++)
+ {
+ Rpp8u *srcPtrTemp = srcPtr1;
+ Rpp32f *dstPtrTemp = dstPtr1;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+
+ rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtrTemp, p); // simd loads
+ compute_log_16_host(p); // log compute
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < length[1]; vectorLoopCount++)
+ {
+ compute_log(srcPtrTemp, dstPtrTemp);
+ srcPtrTemp++;
+ dstPtrTemp++;
+ }
+ srcPtr1 += srcGenericDescPtr->strides[1];
+ dstPtr1 += dstGenericDescPtr->strides[1];
+ }
+ }
+ else if(nDim == 3)
+ {
+ alignedLength = length[2] & ~15;
+ for(int i = 0; i < length[0]; i++)
+ {
+ Rpp8u *srcPtrRow = srcPtr1;
+ Rpp32f *dstPtrRow = dstPtr1;
+
+ for(int j = 0; j < length[1]; j++)
+ {
+ Rpp8u *srcPtrTemp = srcPtrRow;
+ Rpp32f *dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+
+ rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtrTemp, p); // simd loads
+ compute_log_16_host(p); // log compute
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < length[2]; vectorLoopCount++)
+ {
+ compute_log(srcPtrTemp, dstPtrTemp);
+ srcPtrTemp++;
+ dstPtrTemp++;
+ }
+ srcPtrRow += srcGenericDescPtr->strides[2];
+ dstPtrRow += dstGenericDescPtr->strides[2];
+ }
+ srcPtr1 += srcGenericDescPtr->strides[1];
+ dstPtr1 += dstGenericDescPtr->strides[1];
+ }
+ }
+ else
+ log_recursive(srcPtr1, srcGenericDescPtr->strides, dstPtr1, dstGenericDescPtr->strides, length, nDim);
+ }
+
+ return RPP_SUCCESS;
+}
+
+RppStatus log_generic_host_tensor(Rpp8s *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ Rpp32f *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u *roiTensor,
+ rpp::Handle& handle)
+{
+ Rpp32u numThreads = handle.GetNumThreads();
+ Rpp32u nDim = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension.
+ Rpp32u batchSize = dstGenericDescPtr->dims[0];
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < batchSize; batchCount++)
+ {
+ Rpp32u *roi = roiTensor + batchCount * nDim * 2;
+ Rpp32u *begin = roi;
+ Rpp32u *length = &roi[nDim];
+
+ Rpp8s *srcPtr1 = srcPtr + batchCount * srcGenericDescPtr->strides[0];
+ Rpp32f *dstPtr1 = dstPtr + batchCount * dstGenericDescPtr->strides[0];
+
+ for(int i = 0; i < nDim; i++)
+ srcPtr1 += begin[i] * srcGenericDescPtr->strides[i + 1];
+ Rpp32u alignedLength;
+ Rpp32u vectorIncrement = 16;
+ if (nDim == 1)
+ {
+ alignedLength = length[0] & ~15;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+
+ rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtr1, p); // simd loads
+ compute_log_16_host(p); // log compute
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtr1, p); // simd stores
+ srcPtr1 += vectorIncrement;
+ dstPtr1 += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < length[0]; vectorLoopCount++)
+ {
+ compute_log(srcPtr1, dstPtr1);
+ srcPtr1++;
+ dstPtr1++;
+ }
+ }
+ else if(nDim == 2)
+ {
+ alignedLength = length[1] & ~15;
+ for(int i = 0; i < length[0]; i++)
+ {
+ Rpp8s *srcPtrTemp = srcPtr1;
+ Rpp32f *dstPtrTemp = dstPtr1;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+
+ rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtrTemp, p); // simd loads
+ compute_log_16_host(p); // log compute
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < length[1]; vectorLoopCount++)
+ {
+ compute_log(srcPtrTemp, dstPtrTemp);
+ srcPtrTemp++;
+ dstPtrTemp++;
+ }
+ srcPtr1 += srcGenericDescPtr->strides[1];
+ dstPtr1 += dstGenericDescPtr->strides[1];
+ }
+ }
+ else if(nDim == 3)
+ {
+ alignedLength = length[2] & ~15;
+ for(int i = 0; i < length[0]; i++)
+ {
+ Rpp8s *srcPtrRow = srcPtr1;
+ Rpp32f *dstPtrRow = dstPtr1;
+
+ for(int j = 0; j < length[1]; j++)
+ {
+ Rpp8s *srcPtrTemp = srcPtrRow;
+ Rpp32f *dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+
+ rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtrTemp, p); // simd loads
+ compute_log_16_host(p); // log compute
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < length[2]; vectorLoopCount++)
+ {
+ compute_log(srcPtrTemp, dstPtrTemp);
+ srcPtrTemp++;
+ dstPtrTemp++;
+ }
+ srcPtrRow += srcGenericDescPtr->strides[2];
+ dstPtrRow += dstGenericDescPtr->strides[2];
+ }
+ srcPtr1 += srcGenericDescPtr->strides[1];
+ dstPtr1 += dstGenericDescPtr->strides[1];
+ }
+ }
+ else
+ log_recursive(srcPtr1, srcGenericDescPtr->strides, dstPtr1, dstGenericDescPtr->strides, length, nDim);
+ }
+
+ return RPP_SUCCESS;
+}
+
+RppStatus log_generic_host_tensor(Rpp32f *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ Rpp32f *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u *roiTensor,
+ rpp::Handle& handle)
+{
+ Rpp32u numThreads = handle.GetNumThreads();
+ Rpp32u nDim = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension.
+ Rpp32u batchSize = dstGenericDescPtr->dims[0];
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < batchSize; batchCount++)
+ {
+ Rpp32u *roi = roiTensor + batchCount * nDim * 2;
+ Rpp32u *begin = roi;
+ Rpp32u *length = &roi[nDim];
+
+ Rpp32f *srcPtr1 = srcPtr + batchCount * srcGenericDescPtr->strides[0];
+ Rpp32f *dstPtr1 = dstPtr + batchCount * dstGenericDescPtr->strides[0];
+
+ for(int i = 0; i < nDim; i++)
+ srcPtr1 += begin[i] * srcGenericDescPtr->strides[i + 1];
+ Rpp32u alignedLength;
+ Rpp32u vectorIncrement = 16;
+ if (nDim == 1)
+ {
+ alignedLength = length[0] & ~15;
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+
+ rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtr1, p); // simd loads
+ compute_log_16_host(p); // log compute
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtr1, p); // simd stores
+ srcPtr1 += vectorIncrement;
+ dstPtr1 += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < length[0]; vectorLoopCount++)
+ {
+ compute_log(srcPtr1, dstPtr1);
+ srcPtr1++;
+ dstPtr1++;
+ }
+ }
+ else if(nDim == 2)
+ {
+ alignedLength = length[1] & ~15;
+ for(int i = 0; i < length[0]; i++)
+ {
+ Rpp32f *srcPtrTemp = srcPtr1;
+ Rpp32f *dstPtrTemp = dstPtr1;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+
+ rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads
+ compute_log_16_host(p); // log compute
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < length[1]; vectorLoopCount++)
+ {
+ compute_log(srcPtrTemp, dstPtrTemp);
+ srcPtrTemp++;
+ dstPtrTemp++;
+ }
+ srcPtr1 += srcGenericDescPtr->strides[1];
+ dstPtr1 += dstGenericDescPtr->strides[1];
+ }
+ }
+ else if(nDim == 3)
+ {
+ alignedLength = length[2] & ~15;
+ for(int i = 0; i < length[0]; i++)
+ {
+ Rpp32f *srcPtrRow = srcPtr1;
+ Rpp32f *dstPtrRow = dstPtr1;
+
+ for(int j = 0; j < length[1]; j++)
+ {
+ Rpp32f *srcPtrTemp = srcPtrRow;
+ Rpp32f *dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ __m256 p[2];
+
+ rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads
+ compute_log_16_host(p); // log compute
+ rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < length[2]; vectorLoopCount++)
+ {
+ compute_log(srcPtrTemp, dstPtrTemp);
+ srcPtrTemp++;
+ dstPtrTemp++;
+ }
+ srcPtrRow += srcGenericDescPtr->strides[2];
+ dstPtrRow += dstGenericDescPtr->strides[2];
+ }
+ srcPtr1 += srcGenericDescPtr->strides[1];
+ dstPtr1 += dstGenericDescPtr->strides[1];
+ }
+ }
+ else
+ log_recursive(srcPtr1, srcGenericDescPtr->strides, dstPtr1, dstGenericDescPtr->strides, length, nDim);
+ }
+
+ return RPP_SUCCESS;
+}
+
+RppStatus log_generic_host_tensor(Rpp16f *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ Rpp16f *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u *roiTensor,
+ rpp::Handle& handle)
+{
+ Rpp32u numThreads = handle.GetNumThreads();
+ Rpp32u nDim = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension.
+ Rpp32u batchSize = dstGenericDescPtr->dims[0];
+
+ omp_set_dynamic(0);
+#pragma omp parallel for num_threads(numThreads)
+ for(int batchCount = 0; batchCount < batchSize; batchCount++)
+ {
+ Rpp32u *roi = roiTensor + batchCount * nDim * 2;
+ Rpp32u *begin = roi;
+ Rpp32u *length = &roi[nDim];
+
+ Rpp16f *srcPtr1 = srcPtr + batchCount * srcGenericDescPtr->strides[0];
+ Rpp16f *dstPtr1 = dstPtr + batchCount * dstGenericDescPtr->strides[0];
+
+ for(int i = 0; i < nDim; i++)
+ srcPtr1 += begin[i] * srcGenericDescPtr->strides[i + 1];
+ Rpp32u alignedLength;
+ Rpp32u vectorIncrement = 16;
+ if (nDim == 1)
+ {
+            alignedLength = length[0] & ~15;
+            int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ Rpp32f srcPtrTemp_ps[16];
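+                // widen the f16 inputs to f32 so the 16-element AVX f32 load and log path can be reused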
+ for(int cnt = 0; cnt < vectorIncrement; cnt++)
+                    srcPtrTemp_ps[cnt] = static_cast<Rpp32f>(srcPtr1[cnt]);
+
+ __m256 p[2];
+ rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp_ps, p); // simd loads
+ compute_log_16_host(p); // log compute
+ rpp_simd_store(rpp_store16_f32_to_f16_avx, dstPtr1, p); // simd stores
+ srcPtr1 += vectorIncrement;
+ dstPtr1 += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < length[0]; vectorLoopCount++)
+ {
+ compute_log(srcPtr1, dstPtr1);
+ srcPtr1++;
+ dstPtr1++;
+ }
+ }
+ else if(nDim == 2)
+ {
+ alignedLength = length[1] & ~15;
+ for(int i = 0; i < length[0]; i++)
+ {
+ Rpp16f *srcPtrTemp = srcPtr1;
+ Rpp16f *dstPtrTemp = dstPtr1;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ Rpp32f srcPtrTemp_ps[16];
+ for(int cnt = 0; cnt < vectorIncrement; cnt++)
+                        srcPtrTemp_ps[cnt] = static_cast<Rpp32f>(srcPtrTemp[cnt]);
+
+ __m256 p[2];
+ rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp_ps, p); // simd loads
+ compute_log_16_host(p); // log compute
+ rpp_simd_store(rpp_store16_f32_to_f16_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < length[1]; vectorLoopCount++)
+ {
+ compute_log(srcPtrTemp, dstPtrTemp);
+ srcPtrTemp++;
+ dstPtrTemp++;
+ }
+ srcPtr1 += srcGenericDescPtr->strides[1];
+ dstPtr1 += dstGenericDescPtr->strides[1];
+ }
+ }
+ else if(nDim == 3)
+ {
+ alignedLength = length[2] & ~15;
+ for(int i = 0; i < length[0]; i++)
+ {
+ Rpp16f *srcPtrRow = srcPtr1;
+ Rpp16f *dstPtrRow = dstPtr1;
+
+ for(int j = 0; j < length[1]; j++)
+ {
+ Rpp16f *srcPtrTemp = srcPtrRow;
+ Rpp16f *dstPtrTemp = dstPtrRow;
+
+ int vectorLoopCount = 0;
+#if __AVX2__
+ for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
+ {
+ Rpp32f srcPtrTemp_ps[16];
+ for(int cnt = 0; cnt < vectorIncrement; cnt++)
+                            srcPtrTemp_ps[cnt] = static_cast<Rpp32f>(srcPtrTemp[cnt]);
+
+ __m256 p[2];
+ rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp_ps, p); // simd loads
+ compute_log_16_host(p); // log compute
+ rpp_simd_store(rpp_store16_f32_to_f16_avx, dstPtrTemp, p); // simd stores
+ srcPtrTemp += vectorIncrement;
+ dstPtrTemp += vectorIncrement;
+ }
+#endif
+ for (; vectorLoopCount < length[2]; vectorLoopCount++)
+ {
+ compute_log(srcPtrTemp, dstPtrTemp);
+ srcPtrTemp++;
+ dstPtrTemp++;
+ }
+ srcPtrRow += srcGenericDescPtr->strides[2];
+ dstPtrRow += dstGenericDescPtr->strides[2];
+ }
+ srcPtr1 += srcGenericDescPtr->strides[1];
+ dstPtr1 += dstGenericDescPtr->strides[1];
+ }
+ }
+ else
+ log_recursive(srcPtr1, srcGenericDescPtr->strides, dstPtr1, dstGenericDescPtr->strides, length, nDim);
+ }
+
+ return RPP_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/modules/hip/hip_tensor_arithmetic_operations.hpp b/src/modules/hip/hip_tensor_arithmetic_operations.hpp
index 37d2220b2..59e4ba3f9 100644
--- a/src/modules/hip/hip_tensor_arithmetic_operations.hpp
+++ b/src/modules/hip/hip_tensor_arithmetic_operations.hpp
@@ -30,5 +30,6 @@ SOFTWARE.
#include "kernel/subtract_scalar.hpp"
#include "kernel/multiply_scalar.hpp"
#include "kernel/magnitude.hpp"
+#include "kernel/log.hpp"
#endif // HIP_TENSOR_ARITHMETIC_OPERATIONS_HPP
diff --git a/src/modules/hip/kernel/log.hpp b/src/modules/hip/kernel/log.hpp
new file mode 100644
index 000000000..a481a1e07
--- /dev/null
+++ b/src/modules/hip/kernel/log.hpp
@@ -0,0 +1,232 @@
+#include <hip/hip_runtime.h>
+#include "rpp_hip_common.hpp"
+
+// -------------------- Set 1 - helper kernels --------------------
+template <typename T>
+__device__ void log_hip_compute(T *srcPtr, d_float8 *src_f8, d_float8 *dst_f8)
+{
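+    // i8 inputs are shifted by +128 into the unsigned range before the log, mirroring the HOST path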
+    if constexpr (std::is_same<T, Rpp8s>::value)
+ rpp_hip_math_add8_const(src_f8, src_f8, (float4)128);
+
+ rpp_hip_math_log(src_f8, dst_f8);
+}
+
+// -------------------- Set 2 - log kernels --------------------
+template <typename T, typename U>
+__global__ void log_1d_hip_tensor(T *srcPtr,
+ uint srcStrides,
+ U *dstPtr,
+ uint dstStrides,
+ uint *roiTensor)
+{
+ uint id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // width
+ uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // batchsize
+
+ uint *roi = &roiTensor[id_z * 2];
+ uint beginX = roi[0];
+ uint width = roi[1];
+
+ if (id_x >= width)
+ return;
+
+ uint srcIdx = (id_z * srcStrides) + id_x + beginX;
+ uint dstIdx = (id_z * dstStrides) + id_x;
+
+ d_float8 src_f8, dst_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8);
+ log_hip_compute(srcPtr, &src_f8, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+}
+
+template <typename T, typename U>
+__global__ void log_2d_hip_tensor(T *srcPtr,
+ uint2 srcStridesNH,
+ U *dstPtr,
+ uint2 dstStridesNH,
+ uint *roiTensor)
+{
+ uint id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // width
+ uint id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // height
+ uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // batchsize
+
+ uint *roi = &roiTensor[id_z * 4];
+ uint beginY = roi[0];
+ uint beginX = roi[1];
+ uint height = roi[2];
+ uint width = roi[3];
+
+ if (id_x >= width || id_y >= height)
+ return;
+
+ uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + beginY) * srcStridesNH.y) + id_x + beginX;
+ uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x;
+
+ d_float8 src_f8, dst_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8);
+ log_hip_compute(srcPtr, &src_f8, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+}
+
+template <typename T, typename U>
+__global__ void log_3d_hip_tensor(T *srcPtr,
+ uint2 srcStridesDH,
+ U *dstPtr,
+ uint2 dstStridesDH,
+ uint *roiTensor)
+{
+ uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; // lengthX
+ uint id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // lengthY
+ uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // lengthZ
+
+ uint *roi = roiTensor;
+ uint beginZ = roi[0];
+ uint beginY = roi[1];
+ uint beginX = roi[2];
+ uint lengthZ = roi[3];
+ uint lengthY = roi[4];
+ uint lengthX = roi[5];
+
+ if (id_x >= lengthX || id_y >= lengthY || id_z >= lengthZ)
+ return;
+
+ uint srcIdx = ((id_z + beginZ) * srcStridesDH.x) + ((id_y + beginY) * srcStridesDH.y) + id_x + beginX;
+ uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x;
+
+ d_float8 src_f8, dst_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8);
+ log_hip_compute(srcPtr, &src_f8, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+}
+
+template <typename T, typename U>
+__global__ void log_nd_hip_tensor(T *srcPtr,
+ uint *srcStrides,
+ uint *srcDims,
+ uint numDims,
+ U *dstPtr,
+ uint *dstStrides,
+ Rpp32u *roiTensor)
+{
+ int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
+ int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // batchsize
+
+ if(id_x >= srcStrides[0])
+ return;
+
+ uint *roi = roiTensor + id_z * numDims * 2;
+ uint *begin = roi;
+ uint *length = &roi[numDims];
+ uint dstIdx = (id_z * *dstStrides++);
+ uint srcIdx = (id_z * *srcStrides++);
+ uint coords[RPPT_MAX_DIMS];
+
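+    // id_x indexes the first of 8 contiguous elements handled by this thread; recover per-dimension
+    // coordinates by dividing by the per-dimension strides and drop threads outside the ROI lengths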
+ for (int i = 0; i < numDims; i++)
+ {
+ coords[i] = (id_x / srcStrides[i]) % srcDims[i];
+ if(coords[i] >= length[i])
+ return;
+ }
+
+ for (int i = 0; i < numDims; i++)
+ {
+ dstIdx += (coords[i] * dstStrides[i]);
+ srcIdx += (begin[i] + (coords[i] * srcStrides[i]));
+ }
+
+ d_float8 src_f8, dst_f8;
+ rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8);
+ log_hip_compute(srcPtr, &src_f8, &dst_f8);
+ rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
+}
+
+// -------------------- Set 3 - executor kernels --------------------
+template <typename T, typename U>
+RppStatus hip_exec_log_generic_tensor(T *srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ U *dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ uint *roiTensor,
+ rpp::Handle& handle)
+{
+ Rpp32u numDims = srcGenericDescPtr->numDims - 1; // exclude batchsize from input dims
+ // based on number of dimensions call the corresponding kernel
+ if (numDims == 1)
+ {
+ // NW
+ int globalThreads_x = dstGenericDescPtr->dims[1];
+ int globalThreads_y = 1;
+ int globalThreads_z = dstGenericDescPtr->dims[0];
+
+ hipLaunchKernelGGL(log_1d_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ srcGenericDescPtr->strides[0],
+ dstPtr,
+ dstGenericDescPtr->strides[0],
+ roiTensor);
+ }
+ else if (numDims == 2)
+ {
+ // NHW
+ int globalThreads_x = dstGenericDescPtr->dims[2];
+ int globalThreads_y = dstGenericDescPtr->dims[1];
+ int globalThreads_z = dstGenericDescPtr->dims[0];
+
+ hipLaunchKernelGGL(log_2d_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ make_uint2(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1]),
+ dstPtr,
+ make_uint2(dstGenericDescPtr->strides[0], dstGenericDescPtr->strides[1]),
+ roiTensor);
+ }
+ else if (numDims == 3)
+ {
+ // NDHW
+ int globalThreads_x = dstGenericDescPtr->dims[3];
+ int globalThreads_y = dstGenericDescPtr->dims[2];
+ int globalThreads_z = dstGenericDescPtr->dims[1];
+
+ for(int batchCount = 0; batchCount < dstGenericDescPtr->dims[0]; batchCount++)
+ {
+ hipLaunchKernelGGL(log_3d_hip_tensor,
+ dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)),
+ dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z),
+ 0,
+ handle.GetStream(),
+ srcPtr + (batchCount * srcGenericDescPtr->strides[0]),
+ make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]),
+ dstPtr + (batchCount * dstGenericDescPtr->strides[0]),
+ make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]),
+ &roiTensor[batchCount * 6]);
+ }
+ }
+ else
+ {
+ // interpret the input as 1D tensor
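+        // each thread handles 8 contiguous elements, so the launch covers strides[0] / 8 threads (rounded up)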
+ int globalThreads_x = (dstGenericDescPtr->strides[0] + 7) >> 3;
+ int globalThreads_y = 1;
+ int globalThreads_z = dstGenericDescPtr->dims[0];
+
+ hipLaunchKernelGGL(log_nd_hip_tensor,
+ dim3(ceil((float)globalThreads_x/1024), ceil((float)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((float)globalThreads_z/LOCAL_THREADS_Z_1DIM)),
+ dim3(1024, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM),
+ 0,
+ handle.GetStream(),
+ srcPtr,
+ srcGenericDescPtr->strides,
+ srcGenericDescPtr->dims + 1,
+ srcGenericDescPtr->numDims - 1,
+ dstPtr,
+ dstGenericDescPtr->strides,
+ roiTensor);
+ }
+
+ return RPP_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/modules/rppt_tensor_arithmetic_operations.cpp b/src/modules/rppt_tensor_arithmetic_operations.cpp
index 8f88ba90f..bac68a4a1 100644
--- a/src/modules/rppt_tensor_arithmetic_operations.cpp
+++ b/src/modules/rppt_tensor_arithmetic_operations.cpp
@@ -255,6 +255,57 @@ RppStatus rppt_magnitude_host(RppPtr_t srcPtr1,
return RPP_SUCCESS;
}
+/******************** log ********************/
+
+RppStatus rppt_log_host(RppPtr_t srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ RppPtr_t dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u *roiTensor,
+ rppHandle_t rppHandle)
+{
+ if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE;
+ else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8)) return RPP_ERROR_INVALID_DST_DATATYPE;
+ else if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::F32))
+ {
+        log_generic_host_tensor(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes,
+ srcGenericDescPtr,
+                                reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16))
+ {
+        log_generic_host_tensor(reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+ srcGenericDescPtr,
+                                reinterpret_cast<Rpp16f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32))
+ {
+        log_generic_host_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+ srcGenericDescPtr,
+                                reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::F32))
+ {
+        log_generic_host_tensor(static_cast<Rpp8s*>(srcPtr) + srcGenericDescPtr->offsetInBytes,
+ srcGenericDescPtr,
+                                reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+}
+
/********************************************************************************************************************/
/*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/
/********************************************************************************************************************/
@@ -454,4 +505,59 @@ RppStatus rppt_magnitude_gpu(RppPtr_t srcPtr1,
#endif // backend
}
+/******************** log ********************/
+
+RppStatus rppt_log_gpu(RppPtr_t srcPtr,
+ RpptGenericDescPtr srcGenericDescPtr,
+ RppPtr_t dstPtr,
+ RpptGenericDescPtr dstGenericDescPtr,
+ Rpp32u *roiTensor,
+ rppHandle_t rppHandle)
+{
+#ifdef HIP_COMPILE
+ if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE;
+ else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8)) return RPP_ERROR_INVALID_DST_DATATYPE;
+ else if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::F32))
+ {
+        hip_exec_log_generic_tensor(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes,
+ srcGenericDescPtr,
+                                    reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16))
+ {
+        hip_exec_log_generic_tensor(reinterpret_cast<half*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+ srcGenericDescPtr,
+                                    reinterpret_cast<half*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32))
+ {
+        hip_exec_log_generic_tensor(reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(srcPtr) + srcGenericDescPtr->offsetInBytes),
+ srcGenericDescPtr,
+                                    reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+ else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::F32))
+ {
+        hip_exec_log_generic_tensor(static_cast<Rpp8s*>(srcPtr) + srcGenericDescPtr->offsetInBytes,
+ srcGenericDescPtr,
+                                    reinterpret_cast<Rpp32f*>(static_cast<Rpp8u*>(dstPtr) + dstGenericDescPtr->offsetInBytes),
+ dstGenericDescPtr,
+ roiTensor,
+ rpp::deref(rppHandle));
+ }
+
+ return RPP_SUCCESS;
+#elif defined(OCL_COMPILE)
+ return RPP_ERROR_NOT_IMPLEMENTED;
+#endif // backend
+}
+
#endif // GPU_SUPPORT
diff --git a/utilities/test_suite/HIP/Tensor_misc_hip.cpp b/utilities/test_suite/HIP/Tensor_misc_hip.cpp
index 763dc91ae..cb0d53b34 100644
--- a/utilities/test_suite/HIP/Tensor_misc_hip.cpp
+++ b/utilities/test_suite/HIP/Tensor_misc_hip.cpp
@@ -31,7 +31,7 @@ int main(int argc, char **argv)
if (argc < MIN_ARG_COUNT)
{
printf("\nImproper Usage! Needs all arguments!\n");
- printf("\nUsage: ./Tensor_misc_hip