diff --git a/include/rppt_tensor_arithmetic_operations.h b/include/rppt_tensor_arithmetic_operations.h index d34bdd1dd..4ffd24156 100644 --- a/include/rppt_tensor_arithmetic_operations.h +++ b/include/rppt_tensor_arithmetic_operations.h @@ -258,6 +258,40 @@ RppStatus rppt_magnitude_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr sr RppStatus rppt_magnitude_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle); #endif // GPU_SUPPORT +/*! \brief Logarithm operation on HOST backend + * \details Computes the natural logarithm (log base e) of the input for a given ND tensor. + * Supports u8->f32, i8->f32, f16->f16 and f32->f32 datatypes. + * Uses the absolute value of the input for the log computation, and uses nextafter() when the input is 0 to avoid an undefined result. + * \param [in] srcPtr source tensor in HOST memory + * \param [in] srcGenericDescPtr source tensor descriptor + * \param [out] dstPtr destination tensor in HOST memory + * \param [in] dstGenericDescPtr destination tensor descriptor + * \param [in] roiTensor values specifying the begin and length of each dimension of the input tensor + * \param [in] rppHandle RPP HOST handle created with \ref rppCreateWithBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_log_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u *roiTensor, rppHandle_t rppHandle); + +#ifdef GPU_SUPPORT +/*! \brief Logarithm operation on HIP backend + * \details Computes the natural logarithm (log base e) of the input for a given ND tensor. + * Supports u8->f32, i8->f32, f16->f16 and f32->f32 datatypes. + * Uses the absolute value of the input for the log computation, and uses nextafter() when the input is 0 to avoid an undefined result. + * \param [in] srcPtr source tensor in HIP memory + * \param [in] srcGenericDescPtr source tensor descriptor + * \param [out] dstPtr destination tensor in HIP memory + * \param [in] dstGenericDescPtr destination tensor descriptor + * \param [in] roiTensor values specifying the begin and length of each dimension of the input tensor + * \param [in] rppHandle RPP HIP handle created with \ref rppCreateWithStreamAndBatchSize() + * \return A \ref RppStatus enumeration. + * \retval RPP_SUCCESS Successful completion. + * \retval RPP_ERROR* Unsuccessful completion. + */ +RppStatus rppt_log_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u *roiTensor, rppHandle_t rppHandle); +#endif // GPU_SUPPORT + /*! @} */ diff --git a/src/include/cpu/rpp_cpu_common.hpp b/src/include/cpu/rpp_cpu_common.hpp index 44758b0b0..779f6f2d1 100644 --- a/src/include/cpu/rpp_cpu_common.hpp +++ b/src/include/cpu/rpp_cpu_common.hpp @@ -6516,6 +6516,12 @@ inline void compute_remap_src_loc(Rpp32f rowLoc, Rpp32f colLoc, Rpp32s &srcLoc, srcLoc = (rowLoc * stride) + colLoc * channels; } +inline void compute_log_16_host(__m256 *p) +{ + p[0] = log_ps(p[0]); // log compute + p[1] = log_ps(p[1]); // log compute +} + inline void compute_transpose4x8_avx(__m256 *pSrc, __m128 *pDst) { __m256 tmp0, tmp1, tmp2, tmp3; diff --git a/src/include/hip/rpp_hip_common.hpp b/src/include/hip/rpp_hip_common.hpp index 3f32dbc04..e6e2ab986 100644 --- a/src/include/hip/rpp_hip_common.hpp +++ b/src/include/hip/rpp_hip_common.hpp @@ -1869,6 +1869,21 @@ __device__ __forceinline__ float rpp_hip_math_sinc(float x) return (fabsf(x) < 1e-5f) ?
(1.0f - x * x * ONE_OVER_6) : sinf(x) / x; } +__device__ __forceinline__ void rpp_hip_math_log(d_float8 *src_f8, d_float8 *dst_f8) +{ + for(int i = 0; i < 8; i++) + src_f8->f1[i] = (!src_f8->f1[i]) ? std::nextafter(0.0f, 1.0f) : fabsf(src_f8->f1[i]); + + dst_f8->f1[0] = __logf(src_f8->f1[0]); + dst_f8->f1[1] = __logf(src_f8->f1[1]); + dst_f8->f1[2] = __logf(src_f8->f1[2]); + dst_f8->f1[3] = __logf(src_f8->f1[3]); + dst_f8->f1[4] = __logf(src_f8->f1[4]); + dst_f8->f1[5] = __logf(src_f8->f1[5]); + dst_f8->f1[6] = __logf(src_f8->f1[6]); + dst_f8->f1[7] = __logf(src_f8->f1[7]); +} + // /******************** DEVICE RANDOMIZATION HELPER FUNCTIONS ********************/ template diff --git a/src/modules/cpu/host_tensor_arithmetic_operations.hpp b/src/modules/cpu/host_tensor_arithmetic_operations.hpp index b98145be0..466e51e09 100644 --- a/src/modules/cpu/host_tensor_arithmetic_operations.hpp +++ b/src/modules/cpu/host_tensor_arithmetic_operations.hpp @@ -30,5 +30,6 @@ SOFTWARE. #include "kernel/subtract_scalar.hpp" #include "kernel/multiply_scalar.hpp" #include "kernel/magnitude.hpp" +#include "kernel/log.hpp" #endif // HOST_TENSOR_ARITHMETIC_OPERATIONS_HPP diff --git a/src/modules/cpu/kernel/log.hpp b/src/modules/cpu/kernel/log.hpp new file mode 100644 index 000000000..5ec79b21c --- /dev/null +++ b/src/modules/cpu/kernel/log.hpp @@ -0,0 +1,563 @@ +/* +MIT License + +Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "rppdefs.h" +#include "rpp_cpu_common.hpp" + +// 1 pixel log helper functions +// NOTE: log(0) leads to undefined thus using nextafter() to avoid this result +// Also negative values are converted to positive by taking absolute of inputs +inline void compute_log(Rpp8u *src, Rpp32f *dst) { *dst = (!*src) ? std::log(std::nextafter(0.0f, 1.0f)) : std::log(*src); } +inline void compute_log(Rpp8s *src, Rpp32f *dst) { *dst = (!*src) ? std::log(std::nextafter(0.0f, 1.0f)) : std::log(*src + 128); } +inline void compute_log(Rpp16f *src, Rpp16f *dst) { *dst = (!*src) ? log(std::nextafter(0.0f, 1.0f)) : log(abs(*src)); } +inline void compute_log(Rpp32f *src, Rpp32f *dst) { *dst = (!*src) ? 
std::log(std::nextafter(0.0f, 1.0f)) : std::log(abs(*src)); } + +// Computes ND log recursively +template +void log_recursive(T1 *src, Rpp32u *srcStrides, T2 *dst, Rpp32u *dstStrides, Rpp32u *dstShape, Rpp32u nDim) +{ + if (!nDim) + compute_log(src, dst); + else + { + for (int i = 0; i < *dstShape; i++) + { + log_recursive(src, srcStrides + 1, dst, dstStrides + 1, dstShape + 1, nDim - 1); + dst += *dstStrides; + src += *srcStrides; + } + } +} + +RppStatus log_generic_host_tensor(Rpp8u *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *roiTensor, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u nDim = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension. + Rpp32u batchSize = dstGenericDescPtr->dims[0]; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + Rpp32u *roi = roiTensor + batchCount * nDim * 2; + Rpp32u *begin = roi; + Rpp32u *length = &roi[nDim]; + + Rpp8u *srcPtr1 = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + Rpp32f *dstPtr1 = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + + for(int i = 0; i < nDim; i++) + srcPtr1 += begin[i] * srcGenericDescPtr->strides[i + 1]; + Rpp32u alignedLength; + Rpp32u vectorIncrement = 16; + if (nDim == 1) + { + alignedLength = length[0] & ~15; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + + rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtr1, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtr1, p); // simd stores + srcPtr1 += vectorIncrement; + dstPtr1 += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[0]; vectorLoopCount++) + { + compute_log(srcPtr1, dstPtr1); + srcPtr1++; + dstPtr1++; + } + } + else if(nDim == 2) + { + alignedLength = length[1] & ~15; + for(int i = 0; i < length[0]; i++) + { + Rpp8u *srcPtrTemp = srcPtr1; + Rpp32f *dstPtrTemp = dstPtr1; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + + rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtrTemp, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[1]; vectorLoopCount++) + { + compute_log(srcPtrTemp, dstPtrTemp); + srcPtrTemp++; + dstPtrTemp++; + } + srcPtr1 += srcGenericDescPtr->strides[1]; + dstPtr1 += dstGenericDescPtr->strides[1]; + } + } + else if(nDim == 3) + { + alignedLength = length[2] & ~15; + for(int i = 0; i < length[0]; i++) + { + Rpp8u *srcPtrRow = srcPtr1; + Rpp32f *dstPtrRow = dstPtr1; + + for(int j = 0; j < length[1]; j++) + { + Rpp8u *srcPtrTemp = srcPtrRow; + Rpp32f *dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + + rpp_simd_load(rpp_load16_u8_to_f32_avx, srcPtrTemp, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[2]; vectorLoopCount++) + { + compute_log(srcPtrTemp, dstPtrTemp); + 
srcPtrTemp++; + dstPtrTemp++; + } + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; + } + srcPtr1 += srcGenericDescPtr->strides[1]; + dstPtr1 += dstGenericDescPtr->strides[1]; + } + } + else + log_recursive(srcPtr1, srcGenericDescPtr->strides, dstPtr1, dstGenericDescPtr->strides, length, nDim); + } + + return RPP_SUCCESS; +} + +RppStatus log_generic_host_tensor(Rpp8s *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *roiTensor, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u nDim = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension. + Rpp32u batchSize = dstGenericDescPtr->dims[0]; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + Rpp32u *roi = roiTensor + batchCount * nDim * 2; + Rpp32u *begin = roi; + Rpp32u *length = &roi[nDim]; + + Rpp8s *srcPtr1 = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + Rpp32f *dstPtr1 = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + + for(int i = 0; i < nDim; i++) + srcPtr1 += begin[i] * srcGenericDescPtr->strides[i + 1]; + Rpp32u alignedLength; + Rpp32u vectorIncrement = 16; + if (nDim == 1) + { + alignedLength = length[0] & ~15; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + + rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtr1, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtr1, p); // simd stores + srcPtr1 += vectorIncrement; + dstPtr1 += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[0]; vectorLoopCount++) + { + compute_log(srcPtr1, dstPtr1); + srcPtr1++; + dstPtr1++; + } + } + else if(nDim == 2) + { + alignedLength = length[1] & ~15; + for(int i = 0; i < length[0]; i++) + { + Rpp8s *srcPtrTemp = srcPtr1; + Rpp32f *dstPtrTemp = dstPtr1; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + + rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtrTemp, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[1]; vectorLoopCount++) + { + compute_log(srcPtrTemp, dstPtrTemp); + srcPtrTemp++; + dstPtrTemp++; + } + srcPtr1 += srcGenericDescPtr->strides[1]; + dstPtr1 += dstGenericDescPtr->strides[1]; + } + } + else if(nDim == 3) + { + alignedLength = length[2] & ~15; + for(int i = 0; i < length[0]; i++) + { + Rpp8s *srcPtrRow = srcPtr1; + Rpp32f *dstPtrRow = dstPtr1; + + for(int j = 0; j < length[1]; j++) + { + Rpp8s *srcPtrTemp = srcPtrRow; + Rpp32f *dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + + rpp_simd_load(rpp_load16_i8_to_f32_avx, srcPtrTemp, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[2]; vectorLoopCount++) + { + compute_log(srcPtrTemp, dstPtrTemp); + srcPtrTemp++; + dstPtrTemp++; + } + srcPtrRow += srcGenericDescPtr->strides[2]; + 
dstPtrRow += dstGenericDescPtr->strides[2]; + } + srcPtr1 += srcGenericDescPtr->strides[1]; + dstPtr1 += dstGenericDescPtr->strides[1]; + } + } + else + log_recursive(srcPtr1, srcGenericDescPtr->strides, dstPtr1, dstGenericDescPtr->strides, length, nDim); + } + + return RPP_SUCCESS; +} + +RppStatus log_generic_host_tensor(Rpp32f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp32f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *roiTensor, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u nDim = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension. + Rpp32u batchSize = dstGenericDescPtr->dims[0]; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + Rpp32u *roi = roiTensor + batchCount * nDim * 2; + Rpp32u *begin = roi; + Rpp32u *length = &roi[nDim]; + + Rpp32f *srcPtr1 = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + Rpp32f *dstPtr1 = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + + for(int i = 0; i < nDim; i++) + srcPtr1 += begin[i] * srcGenericDescPtr->strides[i + 1]; + Rpp32u alignedLength; + Rpp32u vectorIncrement = 16; + if (nDim == 1) + { + alignedLength = length[0] & ~15; + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtr1, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtr1, p); // simd stores + srcPtr1 += vectorIncrement; + dstPtr1 += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[0]; vectorLoopCount++) + { + compute_log(srcPtr1, dstPtr1); + srcPtr1++; + dstPtr1++; + } + } + else if(nDim == 2) + { + alignedLength = length[1] & ~15; + for(int i = 0; i < length[0]; i++) + { + Rpp32f *srcPtrTemp = srcPtr1; + Rpp32f *dstPtrTemp = dstPtr1; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[1]; vectorLoopCount++) + { + compute_log(srcPtrTemp, dstPtrTemp); + srcPtrTemp++; + dstPtrTemp++; + } + srcPtr1 += srcGenericDescPtr->strides[1]; + dstPtr1 += dstGenericDescPtr->strides[1]; + } + } + else if(nDim == 3) + { + alignedLength = length[2] & ~15; + for(int i = 0; i < length[0]; i++) + { + Rpp32f *srcPtrRow = srcPtr1; + Rpp32f *dstPtrRow = dstPtr1; + + for(int j = 0; j < length[1]; j++) + { + Rpp32f *srcPtrTemp = srcPtrRow; + Rpp32f *dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + __m256 p[2]; + + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[2]; vectorLoopCount++) + { + compute_log(srcPtrTemp, dstPtrTemp); + srcPtrTemp++; + dstPtrTemp++; + } + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; + } + srcPtr1 += 
srcGenericDescPtr->strides[1]; + dstPtr1 += dstGenericDescPtr->strides[1]; + } + } + else + log_recursive(srcPtr1, srcGenericDescPtr->strides, dstPtr1, dstGenericDescPtr->strides, length, nDim); + } + + return RPP_SUCCESS; +} + +RppStatus log_generic_host_tensor(Rpp16f *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + Rpp16f *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *roiTensor, + rpp::Handle& handle) +{ + Rpp32u numThreads = handle.GetNumThreads(); + Rpp32u nDim = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension. + Rpp32u batchSize = dstGenericDescPtr->dims[0]; + + omp_set_dynamic(0); +#pragma omp parallel for num_threads(numThreads) + for(int batchCount = 0; batchCount < batchSize; batchCount++) + { + Rpp32u *roi = roiTensor + batchCount * nDim * 2; + Rpp32u *begin = roi; + Rpp32u *length = &roi[nDim]; + + Rpp16f *srcPtr1 = srcPtr + batchCount * srcGenericDescPtr->strides[0]; + Rpp16f *dstPtr1 = dstPtr + batchCount * dstGenericDescPtr->strides[0]; + + for(int i = 0; i < nDim; i++) + srcPtr1 += begin[i] * srcGenericDescPtr->strides[i + 1]; + Rpp32u alignedLength; + Rpp32u vectorIncrement = 16; + if (nDim == 1) + { + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[16]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + srcPtrTemp_ps[cnt] = static_cast(srcPtr1[cnt]); + + __m256 p[2]; + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp_ps, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f16_avx, dstPtr1, p); // simd stores + srcPtr1 += vectorIncrement; + dstPtr1 += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[0]; vectorLoopCount++) + { + compute_log(srcPtr1, dstPtr1); + srcPtr1++; + dstPtr1++; + } + } + else if(nDim == 2) + { + alignedLength = length[1] & ~15; + for(int i = 0; i < length[0]; i++) + { + Rpp16f *srcPtrTemp = srcPtr1; + Rpp16f *dstPtrTemp = dstPtr1; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[16]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + srcPtrTemp_ps[cnt] = static_cast(srcPtrTemp[cnt]); + + __m256 p[2]; + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp_ps, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f16_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[1]; vectorLoopCount++) + { + compute_log(srcPtrTemp, dstPtrTemp); + srcPtrTemp++; + dstPtrTemp++; + } + srcPtr1 += srcGenericDescPtr->strides[1]; + dstPtr1 += dstGenericDescPtr->strides[1]; + } + } + else if(nDim == 3) + { + alignedLength = length[2] & ~15; + for(int i = 0; i < length[0]; i++) + { + Rpp16f *srcPtrRow = srcPtr1; + Rpp16f *dstPtrRow = dstPtr1; + + for(int j = 0; j < length[1]; j++) + { + Rpp16f *srcPtrTemp = srcPtrRow; + Rpp16f *dstPtrTemp = dstPtrRow; + + int vectorLoopCount = 0; +#if __AVX2__ + for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement) + { + Rpp32f srcPtrTemp_ps[16]; + for(int cnt = 0; cnt < vectorIncrement; cnt++) + srcPtrTemp_ps[cnt] = static_cast(srcPtrTemp[cnt]); + + __m256 p[2]; + rpp_simd_load(rpp_load16_f32_to_f32_avx, srcPtrTemp_ps, p); // simd loads + compute_log_16_host(p); // log compute + rpp_simd_store(rpp_store16_f32_to_f16_avx, dstPtrTemp, p); // simd stores + srcPtrTemp += vectorIncrement; + 
dstPtrTemp += vectorIncrement; + } +#endif + for (; vectorLoopCount < length[2]; vectorLoopCount++) + { + compute_log(srcPtrTemp, dstPtrTemp); + srcPtrTemp++; + dstPtrTemp++; + } + srcPtrRow += srcGenericDescPtr->strides[2]; + dstPtrRow += dstGenericDescPtr->strides[2]; + } + srcPtr1 += srcGenericDescPtr->strides[1]; + dstPtr1 += dstGenericDescPtr->strides[1]; + } + } + else + log_recursive(srcPtr1, srcGenericDescPtr->strides, dstPtr1, dstGenericDescPtr->strides, length, nDim); + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/hip/hip_tensor_arithmetic_operations.hpp b/src/modules/hip/hip_tensor_arithmetic_operations.hpp index 37d2220b2..59e4ba3f9 100644 --- a/src/modules/hip/hip_tensor_arithmetic_operations.hpp +++ b/src/modules/hip/hip_tensor_arithmetic_operations.hpp @@ -30,5 +30,6 @@ SOFTWARE. #include "kernel/subtract_scalar.hpp" #include "kernel/multiply_scalar.hpp" #include "kernel/magnitude.hpp" +#include "kernel/log.hpp" #endif // HIP_TENSOR_ARITHMETIC_OPERATIONS_HPP diff --git a/src/modules/hip/kernel/log.hpp b/src/modules/hip/kernel/log.hpp new file mode 100644 index 000000000..a481a1e07 --- /dev/null +++ b/src/modules/hip/kernel/log.hpp @@ -0,0 +1,232 @@ +#include +#include "rpp_hip_common.hpp" + +// -------------------- Set 1 - helper kernels -------------------- +template +__device__ void log_hip_compute(T *srcPtr, d_float8 *src_f8, d_float8 *dst_f8) +{ + if constexpr (std::is_same::value) + rpp_hip_math_add8_const(src_f8, src_f8, (float4)128); + + rpp_hip_math_log(src_f8, dst_f8); +} + +// -------------------- Set 2 - log kernels -------------------- +template +__global__ void log_1d_hip_tensor(T *srcPtr, + uint srcStrides, + U *dstPtr, + uint dstStrides, + uint *roiTensor) +{ + uint id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // width + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // batchsize + + uint *roi = &roiTensor[id_z * 2]; + uint beginX = roi[0]; + uint width = roi[1]; + + if (id_x >= width) + return; + + uint srcIdx = (id_z * srcStrides) + id_x + beginX; + uint dstIdx = (id_z * dstStrides) + id_x; + + d_float8 src_f8, dst_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + log_hip_compute(srcPtr, &src_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); +} + +template +__global__ void log_2d_hip_tensor(T *srcPtr, + uint2 srcStridesNH, + U *dstPtr, + uint2 dstStridesNH, + uint *roiTensor) +{ + uint id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; // width + uint id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // height + uint id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // batchsize + + uint *roi = &roiTensor[id_z * 4]; + uint beginY = roi[0]; + uint beginX = roi[1]; + uint height = roi[2]; + uint width = roi[3]; + + if (id_x >= width || id_y >= height) + return; + + uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + beginY) * srcStridesNH.y) + id_x + beginX; + uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x; + + d_float8 src_f8, dst_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + log_hip_compute(srcPtr, &src_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); +} + +template +__global__ void log_3d_hip_tensor(T *srcPtr, + uint2 srcStridesDH, + U *dstPtr, + uint2 dstStridesDH, + uint *roiTensor) +{ + uint id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; // lengthX + uint id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y; // lengthY + uint id_z = hipBlockIdx_z * 
hipBlockDim_z + hipThreadIdx_z; // lengthZ + + uint *roi = roiTensor; + uint beginZ = roi[0]; + uint beginY = roi[1]; + uint beginX = roi[2]; + uint lengthZ = roi[3]; + uint lengthY = roi[4]; + uint lengthX = roi[5]; + + if (id_x >= lengthX || id_y >= lengthY || id_z >= lengthZ) + return; + + uint srcIdx = ((id_z + beginZ) * srcStridesDH.x) + ((id_y + beginY) * srcStridesDH.y) + id_x + beginX; + uint dstIdx = (id_z * dstStridesDH.x) + (id_y * dstStridesDH.y) + id_x; + + d_float8 src_f8, dst_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + log_hip_compute(srcPtr, &src_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); +} + +template +__global__ void log_nd_hip_tensor(T *srcPtr, + uint *srcStrides, + uint *srcDims, + uint numDims, + U *dstPtr, + uint *dstStrides, + Rpp32u *roiTensor) +{ + int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8; + int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; // batchsize + + if(id_x >= srcStrides[0]) + return; + + uint *roi = roiTensor + id_z * numDims * 2; + uint *begin = roi; + uint *length = &roi[numDims]; + uint dstIdx = (id_z * *dstStrides++); + uint srcIdx = (id_z * *srcStrides++); + uint coords[RPPT_MAX_DIMS]; + + for (int i = 0; i < numDims; i++) + { + coords[i] = (id_x / srcStrides[i]) % srcDims[i]; + if(coords[i] >= length[i]) + return; + } + + for (int i = 0; i < numDims; i++) + { + dstIdx += (coords[i] * dstStrides[i]); + srcIdx += (begin[i] + (coords[i] * srcStrides[i])); + } + + d_float8 src_f8, dst_f8; + rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); + log_hip_compute(srcPtr, &src_f8, &dst_f8); + rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8); +} + +// -------------------- Set 3 - executor kernels -------------------- +template +RppStatus hip_exec_log_generic_tensor(T *srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + U *dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + uint *roiTensor, + rpp::Handle& handle) +{ + Rpp32u numDims = srcGenericDescPtr->numDims - 1; // exclude batchsize from input dims + // based on number of dimensions call the corresponding kernel + if (numDims == 1) + { + // NW + int globalThreads_x = dstGenericDescPtr->dims[1]; + int globalThreads_y = 1; + int globalThreads_z = dstGenericDescPtr->dims[0]; + + hipLaunchKernelGGL(log_1d_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + srcGenericDescPtr->strides[0], + dstPtr, + dstGenericDescPtr->strides[0], + roiTensor); + } + else if (numDims == 2) + { + // NHW + int globalThreads_x = dstGenericDescPtr->dims[2]; + int globalThreads_y = dstGenericDescPtr->dims[1]; + int globalThreads_z = dstGenericDescPtr->dims[0]; + + hipLaunchKernelGGL(log_2d_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr, + make_uint2(srcGenericDescPtr->strides[0], srcGenericDescPtr->strides[1]), + dstPtr, + make_uint2(dstGenericDescPtr->strides[0], dstGenericDescPtr->strides[1]), + roiTensor); + } + else if (numDims == 3) + { + // NDHW + int globalThreads_x = dstGenericDescPtr->dims[3]; + int globalThreads_y = dstGenericDescPtr->dims[2]; + int globalThreads_z = dstGenericDescPtr->dims[1]; + + for(int batchCount = 
0; batchCount < dstGenericDescPtr->dims[0]; batchCount++) + { + hipLaunchKernelGGL(log_3d_hip_tensor, + dim3(ceil((float)globalThreads_x/LOCAL_THREADS_X), ceil((float)globalThreads_y/LOCAL_THREADS_Y), ceil((float)globalThreads_z/LOCAL_THREADS_Z)), + dim3(LOCAL_THREADS_X, LOCAL_THREADS_Y, LOCAL_THREADS_Z), + 0, + handle.GetStream(), + srcPtr + (batchCount * srcGenericDescPtr->strides[0]), + make_uint2(srcGenericDescPtr->strides[1], srcGenericDescPtr->strides[2]), + dstPtr + (batchCount * dstGenericDescPtr->strides[0]), + make_uint2(dstGenericDescPtr->strides[1], dstGenericDescPtr->strides[2]), + &roiTensor[batchCount * 6]); + } + } + else + { + // interpret the input as 1D tensor + int globalThreads_x = (dstGenericDescPtr->strides[0] + 7) >> 3; + int globalThreads_y = 1; + int globalThreads_z = dstGenericDescPtr->dims[0]; + + hipLaunchKernelGGL(log_nd_hip_tensor, + dim3(ceil((float)globalThreads_x/1024), ceil((float)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((float)globalThreads_z/LOCAL_THREADS_Z_1DIM)), + dim3(1024, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), + 0, + handle.GetStream(), + srcPtr, + srcGenericDescPtr->strides, + srcGenericDescPtr->dims + 1, + srcGenericDescPtr->numDims - 1, + dstPtr, + dstGenericDescPtr->strides, + roiTensor); + } + + return RPP_SUCCESS; +} \ No newline at end of file diff --git a/src/modules/rppt_tensor_arithmetic_operations.cpp b/src/modules/rppt_tensor_arithmetic_operations.cpp index 8f88ba90f..bac68a4a1 100644 --- a/src/modules/rppt_tensor_arithmetic_operations.cpp +++ b/src/modules/rppt_tensor_arithmetic_operations.cpp @@ -255,6 +255,57 @@ RppStatus rppt_magnitude_host(RppPtr_t srcPtr1, return RPP_SUCCESS; } +/******************** log ********************/ + +RppStatus rppt_log_host(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *roiTensor, + rppHandle_t rppHandle) +{ + if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE; + else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8)) return RPP_ERROR_INVALID_DST_DATATYPE; + else if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + log_generic_host_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16)) + { + log_generic_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + log_generic_host_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + log_generic_host_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + 
srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiTensor, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +} + /********************************************************************************************************************/ /*********************************************** RPP_GPU_SUPPORT = ON ***********************************************/ /********************************************************************************************************************/ @@ -454,4 +505,59 @@ RppStatus rppt_magnitude_gpu(RppPtr_t srcPtr1, #endif // backend } +/******************** log ********************/ + +RppStatus rppt_log_gpu(RppPtr_t srcPtr, + RpptGenericDescPtr srcGenericDescPtr, + RppPtr_t dstPtr, + RpptGenericDescPtr dstGenericDescPtr, + Rpp32u *roiTensor, + rppHandle_t rppHandle) +{ +#ifdef HIP_COMPILE + if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::U8)) return RPP_ERROR_INVALID_DST_DATATYPE; + else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::I8)) return RPP_ERROR_INVALID_DST_DATATYPE; + else if ((srcGenericDescPtr->dataType == RpptDataType::U8) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_log_generic_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F16) && (dstGenericDescPtr->dataType == RpptDataType::F16)) + { + hip_exec_log_generic_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::F32) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_log_generic_tensor(reinterpret_cast(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes), + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiTensor, + rpp::deref(rppHandle)); + } + else if ((srcGenericDescPtr->dataType == RpptDataType::I8) && (dstGenericDescPtr->dataType == RpptDataType::F32)) + { + hip_exec_log_generic_tensor(static_cast(srcPtr) + srcGenericDescPtr->offsetInBytes, + srcGenericDescPtr, + reinterpret_cast(static_cast(dstPtr) + dstGenericDescPtr->offsetInBytes), + dstGenericDescPtr, + roiTensor, + rpp::deref(rppHandle)); + } + + return RPP_SUCCESS; +#elif defined(OCL_COMPILE) + return RPP_ERROR_NOT_IMPLEMENTED; +#endif // backend +} + #endif // GPU_SUPPORT diff --git a/utilities/test_suite/HIP/Tensor_misc_hip.cpp b/utilities/test_suite/HIP/Tensor_misc_hip.cpp index 763dc91ae..cb0d53b34 100644 --- a/utilities/test_suite/HIP/Tensor_misc_hip.cpp +++ b/utilities/test_suite/HIP/Tensor_misc_hip.cpp @@ -31,7 +31,7 @@ int main(int argc, char **argv) if (argc < MIN_ARG_COUNT) { printf("\nImproper Usage! Needs all arguments!\n"); - printf("\nUsage: ./Tensor_misc_hip
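Appended note (not part of the diff): the per-element behaviour of the HOST compute_log() helpers above can be summarised by a small scalar model. The function name log_reference is illustrative only; it mirrors what the patch does — a zero input is redirected to nextafter(0, 1) so log(0) is never evaluated, i8 inputs are shifted into the u8 range by +128, and other types use the absolute value before the natural log.

#include <cmath>

// Scalar model of the per-element log behaviour in the HOST helpers (illustrative only).
inline float log_reference(float in, bool isI8 = false)
{
    if (in == 0.0f)
        return std::log(std::nextafter(0.0f, 1.0f)); // guard on the raw value: avoid log(0)
    in = isI8 ? in + 128.0f     // i8 inputs are offset into the u8 range before the log
              : std::fabs(in);  // u8/f16/f32 inputs use the absolute value
    return std::log(in);        // natural log, as in std::log on HOST and __logf on HIP
}

Because the HOST helpers guard the raw value while the HIP path applies the +128 offset before its zero check, an i8 input of -128 reaches log(0) only in the HOST code path.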
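A minimal caller-side sketch of the new rppt_log_host() entry point follows, for a single 2D u8 tensor with a full-extent ROI. It is not part of the patch: the wrapper function and the pre-created handle are assumed, and the descriptor fields are filled the way log_generic_host_tensor() above consumes them (numDims counts the batch dimension, dims[0]/strides[0] describe the batch, and each ROI entry holds the begin values followed by the lengths).

#include "rppdefs.h"
#include "rppt_tensor_arithmetic_operations.h"

// Hypothetical wrapper (assumption, for illustration): natural log of one HxW u8
// tensor into an f32 tensor via the new rppt_log_host() API.
RppStatus run_log_u8_2d(Rpp8u *srcHost, Rpp32f *dstHost,
                        Rpp32u height, Rpp32u width, rppHandle_t handle)
{
    RpptGenericDesc srcDesc = {}, dstDesc = {};
    RpptGenericDescPtr srcDescPtr = &srcDesc, dstDescPtr = &dstDesc;

    srcDescPtr->numDims = 3;                    // batch dim + 2 tensor dims
    srcDescPtr->dataType = RpptDataType::U8;
    srcDescPtr->offsetInBytes = 0;
    srcDescPtr->dims[0] = 1;                    // batch size
    srcDescPtr->dims[1] = height;
    srcDescPtr->dims[2] = width;
    srcDescPtr->strides[0] = height * width;    // per-batch stride (in elements)
    srcDescPtr->strides[1] = width;             // per-row stride
    srcDescPtr->strides[2] = 1;

    dstDesc = srcDesc;                          // same shape and strides
    dstDescPtr->dataType = RpptDataType::F32;   // u8 input produces f32 output

    // Per-batch ROI: nDim begin values followed by nDim lengths
    // -> {beginY, beginX, lengthY, lengthX} for this 2D case.
    Rpp32u roi[4] = {0, 0, height, width};

    return rppt_log_host(srcHost, srcDescPtr, dstHost, dstDescPtr, roi, handle);
}

The HIP path is used the same way through rppt_log_gpu(), with the tensors resident in HIP memory and a handle created as described in the doxygen comment above.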