-
Notifications
You must be signed in to change notification settings - Fork 5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Audio HIP PR3 - Downmixing HIP Support #261
Changes from all commits
c33af22
14f6334
95c3272
3973c34
3f08f90
b9c0788
641f653
5568573
a6749ba
a255906
d3df761
ebecb42
fc1410b
cb3d539
7fe2a0e
e80533a
50a743f
fe06106
3ebd7c3
76f31df
b83f910
ebeb131
7c194b2
cb06f7f
0e51993
004e1d6
b8f5c60
1147bfe
9d48447
5e3fc7a
b6b7cc5
e16ad7a
9394c78
06263a5
a7e71a3
77e14ef
34f3f6d
ab52683
ee0d6fe
2decd32
30ce1d6
64ae74f
1a3015c
b926816
7cb3c03
4cb8d4b
8aab10c
9d68c49
195e4a4
807bbe8
b8264f7
5cf7360
d62e9cb
239bf76
fd21921
23072da
3b00c64
0c71365
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
/* | ||
MIT License | ||
|
||
Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. | ||
*/ | ||
|
||
#ifndef HIP_TENSOR_AUDIO_AUGMENTATIONS_HPP | ||
#define HIP_TENSOR_AUDIO_AUGMENTATIONS_HPP | ||
|
||
#include "kernel/down_mixing.hpp" | ||
|
||
#endif // HIP_TENSOR_AUDIO_AUGMENTATIONS_HPP |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
#include <hip/hip_runtime.h> | ||
#include "rpp_hip_common.hpp" | ||
|
||
__global__ void down_mixing_hip_tensor(float *srcPtr, | ||
uint srcStride, | ||
float *dstPtr, | ||
uint dstStride, | ||
int2 *srcDimsTensor) | ||
|
||
{ | ||
int id_x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; | ||
int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z; | ||
int srcLength = srcDimsTensor[id_z].x; | ||
int channels = srcDimsTensor[id_z].y; | ||
|
||
if (id_x >= srcLength) | ||
return; | ||
|
||
float outVal = 0.0f; | ||
uint srcIdx = id_z * srcStride + id_x * channels; | ||
int i = 0; | ||
int alignedChannels = (channels / 8) * 8; | ||
|
||
// do 8 pixel load till alignedChannels value | ||
if (alignedChannels) | ||
{ | ||
d_float8 outVal_f8; | ||
outVal_f8.f4[0] = static_cast<float4>(0.0f); | ||
outVal_f8.f4[1] = outVal_f8.f4[0]; | ||
for(; i < alignedChannels; i += 8, srcIdx += 8) | ||
{ | ||
d_float8 src_f8; | ||
rpp_hip_load8_and_unpack_to_float8(srcPtr + srcIdx, &src_f8); | ||
rpp_hip_math_add8(&outVal_f8, &src_f8, &outVal_f8); | ||
} | ||
outVal_f8.f4[0] += outVal_f8.f4[1]; | ||
outVal += (outVal_f8.f1[0] + outVal_f8.f1[1] + outVal_f8.f1[2] + outVal_f8.f1[3]); | ||
} | ||
// process remaining channels | ||
for(; i < channels; i++, srcIdx++) | ||
outVal += srcPtr[srcIdx]; | ||
outVal *= (1.f / channels); | ||
|
||
uint dstIdx = id_z * dstStride + id_x; | ||
dstPtr[dstIdx] = outVal; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the range of values channels we can have? Any ways to vectorize the write here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see that the RPPT_MAX_AUDIO_CHANNELS = 16 in the Resample PR. If thats the case, possibly running the 1 liner un-vectorized loop on L43 to do a "outVal += srcPtr[srcIdx]" should be fine followed by doing a "outVal *= nomalizedWeight" at the end.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Tried this experiment and it is giving a good 30% improvement in performance Previously Currently we have only 2 channel input in test suite and it goes to non vectorized loop, so it is loading 16 elements per thread and giving a 30% boost compared to unvectorized writes case There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this performance improvement was an anomoly Reran multiple times today and confirmed there was a degradation in performance |
||
} | ||
|
||
RppStatus hip_exec_down_mixing_tensor(Rpp32f *srcPtr, | ||
RpptDescPtr srcDescPtr, | ||
Rpp32f *dstPtr, | ||
RpptDescPtr dstDescPtr, | ||
Rpp32s *srcDimsTensor, | ||
bool normalizeWeights, | ||
rpp::Handle& handle) | ||
{ | ||
Rpp32s globalThreads_x = dstDescPtr->strides.nStride; | ||
Rpp32s globalThreads_y = 1; | ||
Rpp32s globalThreads_z = dstDescPtr->n; | ||
|
||
hipLaunchKernelGGL(down_mixing_hip_tensor, | ||
dim3(ceil((Rpp32f)globalThreads_x/LOCAL_THREADS_X_1DIM), ceil((Rpp32f)globalThreads_y/LOCAL_THREADS_Y_1DIM), ceil((Rpp32f)globalThreads_z/LOCAL_THREADS_Z_1DIM)), | ||
dim3(LOCAL_THREADS_X_1DIM, LOCAL_THREADS_Y_1DIM, LOCAL_THREADS_Z_1DIM), | ||
0, | ||
handle.GetStream(), | ||
srcPtr, | ||
srcDescPtr->strides.nStride, | ||
dstPtr, | ||
dstDescPtr->strides.nStride, | ||
reinterpret_cast<int2 *>(srcDimsTensor)); | ||
|
||
return RPP_SUCCESS; | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add an empty line at the end of the file. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is this not vectorizeed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we can only vectorize the channel code as how it is done in host code
Initially felt it might not be efficient to vectorize the channel loop in HIP
But explored further and vectorized the code now and checked the performance. It looks fine
Could not see improvement in performance though with current inputs
since we are testing with 2 channel input and vectorization only works with at least 8 channel input