Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement urKernelGetSuggestedLocalWorkSize #1385

Merged
merged 63 commits into from
Jun 4, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
63 commits
Select commit Hold shift + click to select a range
4628542
add name to queue.yml
yingcong-wu Feb 26, 2024
27ea151
save
yingcong-wu Feb 27, 2024
9672f70
add impls
yingcong-wu Mar 4, 2024
bb0e10d
fix build problem
yingcong-wu Mar 5, 2024
18d1171
fix a typo
yingcong-wu Mar 6, 2024
941de5a
move the api to kernel.yml
yingcong-wu Mar 8, 2024
82188b3
move impls to kernel.cpp
yingcong-wu Mar 11, 2024
069ceb7
address comment
yingcong-wu Mar 11, 2024
b0a2c83
fix cuda build problem
yingcong-wu Mar 11, 2024
79b086c
fix hip build problem
yingcong-wu Mar 11, 2024
dab069f
Merge branch 'main' into yc/new-api-suggestgroupsize
yingcong-wu Mar 11, 2024
77133ca
fix l0 build problem
yingcong-wu Mar 11, 2024
3fec2ee
address comment
yingcong-wu Mar 12, 2024
03ab9a8
Merge branch 'main' into yc/new-api-suggestgroupsize
yingcong-wu Mar 12, 2024
7ad31b6
address comment
yingcong-wu Mar 12, 2024
e5bcf84
fix l0 build problem
yingcong-wu Mar 12, 2024
276f738
change word in kernel.yml
yingcong-wu Mar 12, 2024
2fd576b
Merge branch 'main' into yc/new-api-suggestgroupsize
yingcong-wu Mar 18, 2024
f174475
add test
yingcong-wu Mar 18, 2024
407ed02
add test
yingcong-wu Mar 18, 2024
a1c9f0f
add test
yingcong-wu Mar 18, 2024
4ce852b
merge
yingcong-wu Mar 19, 2024
d77d7c1
remove unwanted code
yingcong-wu Mar 20, 2024
d9883cd
Merge branch 'main' into yc/new-api-suggestgroupsize
yingcong-wu Mar 20, 2024
6e887ef
adapt the new cuda/hip implement
yingcong-wu Mar 20, 2024
57ff11e
ignore unused var
yingcong-wu Mar 20, 2024
8799d8e
fix head error(is this a bug for other pr?)
yingcong-wu Mar 20, 2024
0248d87
add an error code
yingcong-wu Mar 20, 2024
9850c12
add check
yingcong-wu Mar 20, 2024
0fa8dbc
change error code
yingcong-wu Mar 22, 2024
92c8383
remove comment
yingcong-wu Mar 26, 2024
ee6ec18
unsupported for opencl for now
yingcong-wu Mar 26, 2024
17e4981
Revert "change error code"
yingcong-wu Mar 26, 2024
a39299f
Merge branch 'main' into yc/new-api-suggestgroupsize
yingcong-wu Mar 26, 2024
f8f48c6
Merge branch 'main' into yc/new-api-suggestgroupsize
yingcong-wu Mar 27, 2024
74c9213
remove passed test
yingcong-wu Mar 28, 2024
557009a
Merge branch 'main' into yc/new-api-suggestgroupsize
yingcong-wu Mar 28, 2024
feee43f
catch up on latest changes
yingcong-wu Mar 28, 2024
88d4066
update header
yingcong-wu Mar 28, 2024
d3c5693
Merge branch 'main' into yc/new-api-suggestgroupsize
yingcong-wu Apr 2, 2024
f28510b
switch to maybe_unused
yingcong-wu Apr 7, 2024
6a429b5
remove unwanted check
yingcong-wu Apr 7, 2024
0ab11d2
add check, change error code to more precise one
yingcong-wu Apr 7, 2024
f2d0a59
remove unused code
yingcong-wu Apr 7, 2024
d148c9c
add dummy impl for native_cpu
yingcong-wu Apr 7, 2024
cb03acf
Merge branch 'main' into yc/new-api-suggestgroupsize
yingcong-wu Apr 7, 2024
d9259b3
add native_cpu match lines
yingcong-wu Apr 7, 2024
7bff79d
Merge branch 'main' into yc/new-api-suggestgroupsize
yingcong-wu Apr 11, 2024
b527309
Merge branch 'main' into yc/new-api-suggestgroupsize
yingcong-wu Apr 15, 2024
992fdc3
not use DIE_NO_IMPLEMENTATION
yingcong-wu Apr 17, 2024
f3e11f1
Merge branch 'main' into yc/new-api-suggestgroupsize
yingcong-wu Apr 23, 2024
9a2caa1
address comment
yingcong-wu Apr 23, 2024
7dc7a73
address comment
yingcong-wu Apr 24, 2024
145fcd5
fix wrong skip
yingcong-wu Apr 24, 2024
bbd06fc
fix ci
yingcong-wu Apr 24, 2024
5b8a529
Merge branch 'main' into yc/new-api-suggestgroupsize
yingcong-wu Apr 24, 2024
1ca4949
Merge branch 'main' into yc/new-api-suggestgroupsize
yingcong-wu Apr 30, 2024
540d6ce
Merge branch 'main' into yc/new-api-suggestgroupsize
yingcong-wu May 6, 2024
68eb2ea
Merge remote-tracking branch 'origin/main' into yc/new-api-suggestgro…
yingcong-wu May 13, 2024
4beaad7
Merge branch 'main' into yc/new-api-suggestgroupsize
yingcong-wu May 20, 2024
a04e74d
Merge remote-tracking branch 'origin/main' into yc/new-api-suggestgro…
kbenzie Jun 3, 2024
bbf7931
Fix OpenCL build
kbenzie Jun 3, 2024
5593d84
Fix CUDA build
kbenzie Jun 3, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions include/ur_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,7 @@ typedef enum ur_function_t {
UR_FUNCTION_COMMAND_BUFFER_GET_INFO_EXP = 218, ///< Enumerator for ::urCommandBufferGetInfoExp
UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP = 219, ///< Enumerator for ::urCommandBufferCommandGetInfoExp
UR_FUNCTION_DEVICE_GET_SELECTED = 220, ///< Enumerator for ::urDeviceGetSelected
UR_FUNCTION_QUEUE_GET_SUGGESTED_LOCAL_WORK_SIZE = 223, ///< Enumerator for ::urQueueGetSuggestedLocalWorkSize
/// @cond
UR_FUNCTION_FORCE_UINT32 = 0x7fffffff
/// @endcond
Expand Down Expand Up @@ -5514,6 +5515,49 @@ urQueueFlush(
ur_queue_handle_t hQueue ///< [in] handle of the queue to be flushed.
);

///////////////////////////////////////////////////////////////////////////////
/// @brief Get the suggested local work-item number from runtime implementation.
///
/// @details
/// - pLocalWorkSize can be omitted in urEnqueueKernelLaunch(); aside from
/// OpenCL, LocalWorkSize then needs to be calculated or guessed before
/// enqueueing the kernel.
/// - This function will get the LocalWorkSize value used when enqueueing
/// the kernel.
/// - TODO: find a better place for this function
///
/// @returns
/// - ::UR_RESULT_SUCCESS
/// - ::UR_RESULT_ERROR_UNINITIALIZED
/// - ::UR_RESULT_ERROR_DEVICE_LOST
/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
/// + `NULL == hQueue`
/// + `NULL == hKernel`
/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
/// + `NULL == pGlobalWorkOffset`
/// + `NULL == pGlobalWorkSize`
/// + `NULL == pSuggestedLocalWorkSize`
/// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE
/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE
UR_APIEXPORT ur_result_t UR_APICALL
urQueueGetSuggestedLocalWorkSize(
ur_queue_handle_t hQueue, ///< [in] handle of the queue object
ur_kernel_handle_t hKernel, ///< [in] handle of the kernel.
uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global
///< and work-group work-items
const size_t *pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify
///< the offset used to calculate the global ID of a work-item
const size_t *pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify
///< the number of global work-items in workDim that will execute the
///< kernel function
size_t *pSuggestedLocalWorkSize ///< [out] pointer to an array of workDim unsigned values that specify
///< the number of local work-items forming a work-group that will
///< execute the kernel function.
);

#if !defined(__GNUC__)
#pragma endregion
#endif
Expand Down Expand Up @@ -9768,6 +9812,19 @@ typedef struct ur_queue_flush_params_t {
ur_queue_handle_t *phQueue;
} ur_queue_flush_params_t;

///////////////////////////////////////////////////////////////////////////////
/// @brief Function parameters for urQueueGetSuggestedLocalWorkSize
/// @details Each entry is a pointer to the parameter passed to the function;
/// allowing the callback the ability to modify the parameter's value
typedef struct ur_queue_get_suggested_local_work_size_params_t {
ur_queue_handle_t *phQueue; ///< pointer to the hQueue argument
ur_kernel_handle_t *phKernel; ///< pointer to the hKernel argument
uint32_t *pworkDim; ///< pointer to the workDim argument
const size_t **ppGlobalWorkOffset; ///< pointer to the pGlobalWorkOffset argument
const size_t **ppGlobalWorkSize; ///< pointer to the pGlobalWorkSize argument
size_t **ppSuggestedLocalWorkSize; ///< pointer to the pSuggestedLocalWorkSize argument
} ur_queue_get_suggested_local_work_size_params_t;

///////////////////////////////////////////////////////////////////////////////
/// @brief Function parameters for urSamplerCreate
/// @details Each entry is a pointer to the parameter passed to the function;
Expand Down
11 changes: 11 additions & 0 deletions include/ur_ddi.h
Original file line number Diff line number Diff line change
Expand Up @@ -711,6 +711,16 @@ typedef ur_result_t(UR_APICALL *ur_pfnQueueFinish_t)(
typedef ur_result_t(UR_APICALL *ur_pfnQueueFlush_t)(
ur_queue_handle_t);

///////////////////////////////////////////////////////////////////////////////
/// @brief Function-pointer for urQueueGetSuggestedLocalWorkSize
/// @details The unnamed parameters follow the order of the
/// urQueueGetSuggestedLocalWorkSize declaration; each is labelled below.
typedef ur_result_t(UR_APICALL *ur_pfnQueueGetSuggestedLocalWorkSize_t)(
ur_queue_handle_t, // hQueue
ur_kernel_handle_t, // hKernel
uint32_t, // workDim
const size_t *, // pGlobalWorkOffset
const size_t *, // pGlobalWorkSize
size_t *); // pSuggestedLocalWorkSize

///////////////////////////////////////////////////////////////////////////////
/// @brief Table of Queue functions pointers
typedef struct ur_queue_dditable_t {
Expand All @@ -722,6 +732,7 @@ typedef struct ur_queue_dditable_t {
ur_pfnQueueCreateWithNativeHandle_t pfnCreateWithNativeHandle;
ur_pfnQueueFinish_t pfnFinish;
ur_pfnQueueFlush_t pfnFlush;
ur_pfnQueueGetSuggestedLocalWorkSize_t pfnGetSuggestedLocalWorkSize;
} ur_queue_dditable_t;

///////////////////////////////////////////////////////////////////////////////
Expand Down
8 changes: 8 additions & 0 deletions include/ur_print.h
Original file line number Diff line number Diff line change
Expand Up @@ -1538,6 +1538,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintQueueFinishParams(const struct ur_que
/// - `buff_size < out_size`
UR_APIEXPORT ur_result_t UR_APICALL urPrintQueueFlushParams(const struct ur_queue_flush_params_t *params, char *buffer, const size_t buff_size, size_t *out_size);

///////////////////////////////////////////////////////////////////////////////
/// @brief Print ur_queue_get_suggested_local_work_size_params_t struct
/// @details NOTE(review): buffer / buff_size / out_size are presumably the
/// destination character buffer, its capacity, and the size required by the
/// printed text; confirm against the implementation.
/// @returns
/// - ::UR_RESULT_SUCCESS
/// - ::UR_RESULT_ERROR_INVALID_SIZE
/// - `buff_size < out_size`
UR_APIEXPORT ur_result_t UR_APICALL urPrintQueueGetSuggestedLocalWorkSizeParams(const struct ur_queue_get_suggested_local_work_size_params_t *params, char *buffer, const size_t buff_size, size_t *out_size);

///////////////////////////////////////////////////////////////////////////////
/// @brief Print ur_sampler_create_params_t struct
/// @returns
Expand Down
49 changes: 49 additions & 0 deletions include/ur_print.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -912,6 +912,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) {
case UR_FUNCTION_DEVICE_GET_SELECTED:
os << "UR_FUNCTION_DEVICE_GET_SELECTED";
break;
case UR_FUNCTION_QUEUE_GET_SUGGESTED_LOCAL_WORK_SIZE:
os << "UR_FUNCTION_QUEUE_GET_SUGGESTED_LOCAL_WORK_SIZE";
break;
default:
os << "unknown enumerator";
break;
Expand Down Expand Up @@ -11607,6 +11610,49 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
return os;
}

///////////////////////////////////////////////////////////////////////////////
/// @brief Stream-insertion operator for the ur_queue_get_suggested_local_work_size_params_t type
/// @details Emits each captured argument as `.name = value`, with consecutive
///     fields separated by ", ". Handle and pointer arguments are rendered
///     through ur::details::printPtr; workDim is streamed directly.
/// @returns
///     std::ostream &
inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_queue_get_suggested_local_work_size_params_t *params) {
    os << ".hQueue = ";
    ur::details::printPtr(os, *(params->phQueue));

    os << ", .hKernel = ";
    ur::details::printPtr(os, *(params->phKernel));

    os << ", .workDim = " << *(params->pworkDim);

    os << ", .pGlobalWorkOffset = ";
    ur::details::printPtr(os, *(params->ppGlobalWorkOffset));

    os << ", .pGlobalWorkSize = ";
    ur::details::printPtr(os, *(params->ppGlobalWorkSize));

    os << ", .pSuggestedLocalWorkSize = ";
    ur::details::printPtr(os, *(params->ppSuggestedLocalWorkSize));

    return os;
}

///////////////////////////////////////////////////////////////////////////////
/// @brief Print operator for the ur_sampler_create_params_t type
/// @returns
Expand Down Expand Up @@ -16793,6 +16839,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, ur_function_
case UR_FUNCTION_QUEUE_FLUSH: {
os << (const struct ur_queue_flush_params_t *)params;
} break;
case UR_FUNCTION_QUEUE_GET_SUGGESTED_LOCAL_WORK_SIZE: {
os << (const struct ur_queue_get_suggested_local_work_size_params_t *)params;
} break;
case UR_FUNCTION_SAMPLER_CREATE: {
os << (const struct ur_sampler_create_params_t *)params;
} break;
Expand Down
46 changes: 46 additions & 0 deletions scripts/core/queue.yml
Original file line number Diff line number Diff line change
Expand Up @@ -332,3 +332,49 @@ params:
returns:
- $X_RESULT_ERROR_INVALID_QUEUE
- $X_RESULT_ERROR_OUT_OF_HOST_MEMORY
--- #--------------------------------------------------------------------------
type: function
desc: "Get the suggested local work-item number from runtime implementation."
class: $xQueue
name: GetSuggestedLocalWorkSize
ordinal: "0"
details:
- "pLocalWorkSize can be omitted in urEnqueueKernelLaunch(); aside from OpenCL, LocalWorkSize then needs to be calculated or guessed before enqueueing the kernel."
- "This function will get the LocalWorkSize value used when enqueueing the kernel."
- "TODO: find a better place for this function"
yingcong-wu marked this conversation as resolved.
Show resolved Hide resolved
params:
- type: $x_queue_handle_t
name: hQueue
desc: |
[in] handle of the queue object
yingcong-wu marked this conversation as resolved.
Show resolved Hide resolved
- type: "$x_kernel_handle_t"
name: hKernel
desc: |
[in] handle of the kernel.
- type: uint32_t
name: workDim
desc: |
[in] number of dimensions, from 1 to 3, to specify the global
and work-group work-items
- type: "const size_t*"
name: pGlobalWorkOffset
desc: |
[in] pointer to an array of workDim unsigned values that specify
the offset used to calculate the global ID of a work-item
- type: "const size_t*"
name: pGlobalWorkSize
desc: |
[in] pointer to an array of workDim unsigned values that specify
the number of global work-items in workDim that will execute the
kernel function
- type: "size_t*"
name: pSuggestedLocalWorkSize
desc: |
[out] pointer to an array of workDim unsigned values that specify
the number of local work-items forming a work-group that will
execute the kernel function.
returns:
- $X_RESULT_ERROR_INVALID_WORK_GROUP_SIZE
- $X_RESULT_ERROR_UNSUPPORTED_FEATURE
3 changes: 3 additions & 0 deletions scripts/core/registry.yml
Original file line number Diff line number Diff line change
Expand Up @@ -577,6 +577,9 @@ etors:
- name: DEVICE_GET_SELECTED
desc: Enumerator for $xDeviceGetSelected
value: '220'
- name: QUEUE_GET_SUGGESTED_LOCAL_WORK_SIZE
desc: Enumerator for $xQueueGetSuggestedLocalWorkSize
value: '223'
---
type: enum
desc: Defines structure types
Expand Down
36 changes: 36 additions & 0 deletions source/adapters/cuda/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1756,3 +1756,39 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe(

return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}

/// CUDA implementation of urQueueGetSuggestedLocalWorkSize.
///
/// Runs the adapter's guessLocalWorkSize() heuristic for hKernel on the device
/// of hQueue and copies the first workDim entries of the result into
/// pSuggestedLocalWorkSize.
///
/// @param hQueue                  queue whose context and device are used.
/// @param hKernel                 kernel the local size is guessed for.
/// @param workDim                 number of dimensions; must be 1, 2 or 3.
/// @param pGlobalWorkOffset       not used by this adapter.
/// @param pGlobalWorkSize         global size, forwarded to guessLocalWorkSize().
/// @param pSuggestedLocalWorkSize receives workDim local sizes on success.
///
/// @returns UR_RESULT_SUCCESS on success;
///          UR_RESULT_ERROR_INVALID_WORK_DIMENSION if workDim is not in [1, 3];
///          UR_RESULT_ERROR_INVALID_NULL_POINTER if pGlobalWorkSize or
///          pSuggestedLocalWorkSize is null;
///          UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE if the product of the
///          guessed sizes exceeds the device's maximum work-group size;
///          any ur_result_t thrown by the calls inside the try block.
UR_APIEXPORT ur_result_t UR_APICALL urQueueGetSuggestedLocalWorkSize(
    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
    [[maybe_unused]] const size_t *pGlobalWorkOffset,
    const size_t *pGlobalWorkSize, size_t *pSuggestedLocalWorkSize) {
  // ThreadsPerBlock has exactly three slots, and workDim entries of it are
  // copied out below; reject any other dimensionality before touching memory.
  if (workDim < 1 || workDim > 3) {
    return UR_RESULT_ERROR_INVALID_WORK_DIMENSION;
  }
  if (pGlobalWorkSize == nullptr || pSuggestedLocalWorkSize == nullptr) {
    return UR_RESULT_ERROR_INVALID_NULL_POINTER;
  }

  ur_context_handle_t Context = hQueue->getContext();
  ur_device_handle_t Device = hQueue->Device;
  size_t ThreadsPerBlock[3] = {};
  size_t MaxThreadsPerBlock[3] = {};
  const uint32_t LocalSize = hKernel->getLocalSize();

  try {
    // Set the active context here as guessLocalWorkSize needs an active context
    ScopedContext Active(Context);

    const size_t MaxWorkGroupSize = Device->getMaxWorkGroupSize();
    Device->getMaxWorkItemSizes(sizeof(MaxThreadsPerBlock),
                                MaxThreadsPerBlock);
    guessLocalWorkSize(Device, ThreadsPerBlock, pGlobalWorkSize, workDim,
                       MaxThreadsPerBlock, hKernel, LocalSize);

    // NOTE(review): ThreadsPerBlock starts zero-initialised, so this check is
    // only meaningful if guessLocalWorkSize() writes all three entries even
    // when workDim < 3 — confirm against its implementation.
    const size_t GuessedGroupSize =
        ThreadsPerBlock[0] * ThreadsPerBlock[1] * ThreadsPerBlock[2];
    if (MaxWorkGroupSize < GuessedGroupSize) {
      return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
    }

    std::copy(ThreadsPerBlock, ThreadsPerBlock + workDim,
              pSuggestedLocalWorkSize);
  } catch (ur_result_t Err) {
    return Err;
  }
  return UR_RESULT_SUCCESS;
}
17 changes: 17 additions & 0 deletions source/adapters/hip/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1824,3 +1824,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe(
uint32_t, const ur_event_handle_t *, ur_event_handle_t *) {
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}

/// HIP implementation of urQueueGetSuggestedLocalWorkSize.
///
/// Starts from a default block shape of {32, 1, 1}, lets
/// simpleGuessLocalWorkSize() adjust it using the device's per-dimension block
/// limits, and copies the first workDim entries into pSuggestedLocalWorkSize.
///
/// @param hQueue                  queue whose device supplies the block limits.
/// @param hKernel                 kernel forwarded to simpleGuessLocalWorkSize().
/// @param workDim                 number of dimensions; must be 1, 2 or 3.
/// @param pGlobalWorkOffset       not used by this adapter.
/// @param pGlobalWorkSize         global size, forwarded to the heuristic.
/// @param pSuggestedLocalWorkSize receives workDim local sizes on success.
///
/// @returns UR_RESULT_SUCCESS on success;
///          UR_RESULT_ERROR_INVALID_WORK_DIMENSION if workDim is not in [1, 3];
///          UR_RESULT_ERROR_INVALID_NULL_POINTER if pGlobalWorkSize or
///          pSuggestedLocalWorkSize is null.
UR_APIEXPORT ur_result_t UR_APICALL urQueueGetSuggestedLocalWorkSize(
    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
    [[maybe_unused]] const size_t *pGlobalWorkOffset,
    const size_t *pGlobalWorkSize, size_t *pSuggestedLocalWorkSize) {
  // ThreadsPerBlock has exactly three slots, and workDim entries of it are
  // copied out below; reject any other dimensionality before touching memory.
  if (workDim < 1 || workDim > 3) {
    return UR_RESULT_ERROR_INVALID_WORK_DIMENSION;
  }
  if (pGlobalWorkSize == nullptr || pSuggestedLocalWorkSize == nullptr) {
    return UR_RESULT_ERROR_INVALID_NULL_POINTER;
  }

  size_t MaxThreadsPerBlock[3] = {};
  size_t ThreadsPerBlock[3] = {32u, 1u, 1u};

  MaxThreadsPerBlock[0] = hQueue->Device->getMaxBlockDimX();
  MaxThreadsPerBlock[1] = hQueue->Device->getMaxBlockDimY();
  MaxThreadsPerBlock[2] = hQueue->Device->getMaxBlockDimZ();

  simpleGuessLocalWorkSize(ThreadsPerBlock, pGlobalWorkSize, MaxThreadsPerBlock,
                           hKernel);

  std::copy(ThreadsPerBlock, ThreadsPerBlock + workDim,
            pSuggestedLocalWorkSize);
  return UR_RESULT_SUCCESS;
}
Loading