oneapi-src · kbenzie · Jun 4, 2024 · Feb 26, 2024 · Feb 27, 2024 · Mar 4, 2024
@@ -221,6 +221,7 @@ typedef enum ur_function_t {
     UR_FUNCTION_COMMAND_BUFFER_GET_INFO_EXP = 218,                             ///< Enumerator for ::urCommandBufferGetInfoExp
     UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP = 219,                     ///< Enumerator for ::urCommandBufferCommandGetInfoExp
     UR_FUNCTION_DEVICE_GET_SELECTED = 220,                                     ///< Enumerator for ::urDeviceGetSelected
+    UR_FUNCTION_QUEUE_GET_SUGGESTED_LOCAL_WORK_SIZE = 223,                     ///< Enumerator for ::urQueueGetSuggestedLocalWorkSize
     /// @cond
     UR_FUNCTION_FORCE_UINT32 = 0x7fffffff
     /// @endcond
@@ -5514,6 +5515,49 @@ urQueueFlush(
     ur_queue_handle_t hQueue ///< [in] handle of the queue to be flushed.
 );
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Get the suggested local work-item number from runtime implementation.
+///
+/// @details
+///     - pLocalWorkSize can be omitted in urEnqueueKernelLaunch(), but apart
+///       from OpenCL, LocalWorkSize will need to be calculated or guessed
+///       before enqueueing the kernel.
+///     - This function will get the LocalWorkSize value used when enqueueing
+///       the kernel.
+///     - TODO: find a better place for this function
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_UNINITIALIZED
+///     - ::UR_RESULT_ERROR_DEVICE_LOST
+///     - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
+///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `NULL == hQueue`
+///         + `NULL == hKernel`
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pGlobalWorkOffset`
+///         + `NULL == pGlobalWorkSize`
+///         + `NULL == pSuggestedLocalWorkSize`
+///     - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE
+///     - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE
+UR_APIEXPORT ur_result_t UR_APICALL
+urQueueGetSuggestedLocalWorkSize(
+    ur_queue_handle_t hQueue,        ///< [in] handle of the queue object
+    ur_kernel_handle_t hKernel,      ///< [in] handle of the kernel.
+    uint32_t workDim,                ///< [in] number of dimensions, from 1 to 3, to specify the global
+                                     ///< and work-group work-items
+    const size_t *pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify
+                                     ///< the offset used to calculate the global ID of a work-item
+    const size_t *pGlobalWorkSize,   ///< [in] pointer to an array of workDim unsigned values that specify
+                                     ///< the number of global work-items in workDim that will execute the
+                                     ///< kernel function
+    size_t *pSuggestedLocalWorkSize  ///< [out] pointer to an array of workDim unsigned values that specify
+                                     ///< the number of local work-items forming a work-group that will
+                                     ///< execute the kernel function.
+);
+
 #if !defined(__GNUC__)
 #pragma endregion
 #endif
@@ -9768,6 +9812,19 @@ typedef struct ur_queue_flush_params_t {
     ur_queue_handle_t *phQueue;
 } ur_queue_flush_params_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for urQueueGetSuggestedLocalWorkSize
+/// @details Each entry is a pointer to the parameter passed to the function;
+///     allowing the callback the ability to modify the parameter's value.
+///     Fields appear in the same order as the function's parameters.
+typedef struct ur_queue_get_suggested_local_work_size_params_t {
+    ur_queue_handle_t *phQueue;        ///< pointer to the hQueue argument
+    ur_kernel_handle_t *phKernel;      ///< pointer to the hKernel argument
+    uint32_t *pworkDim;                ///< pointer to the workDim argument
+    const size_t **ppGlobalWorkOffset; ///< pointer to the pGlobalWorkOffset argument
+    const size_t **ppGlobalWorkSize;   ///< pointer to the pGlobalWorkSize argument
+    size_t **ppSuggestedLocalWorkSize; ///< pointer to the pSuggestedLocalWorkSize argument
+} ur_queue_get_suggested_local_work_size_params_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Function parameters for urSamplerCreate
 /// @details Each entry is a pointer to the parameter passed to the function;

@@ -711,6 +711,16 @@ typedef ur_result_t(UR_APICALL *ur_pfnQueueFinish_t)(
 typedef ur_result_t(UR_APICALL *ur_pfnQueueFlush_t)(
     ur_queue_handle_t);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function-pointer for urQueueGetSuggestedLocalWorkSize
+/// @details The positional parameters follow the declaration order of
+///     urQueueGetSuggestedLocalWorkSize; each is labelled below.
+typedef ur_result_t(UR_APICALL *ur_pfnQueueGetSuggestedLocalWorkSize_t)(
+    ur_queue_handle_t,  // hQueue
+    ur_kernel_handle_t, // hKernel
+    uint32_t,           // workDim
+    const size_t *,     // pGlobalWorkOffset
+    const size_t *,     // pGlobalWorkSize
+    size_t *);          // pSuggestedLocalWorkSize
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Table of Queue functions pointers
 typedef struct ur_queue_dditable_t {
@@ -722,6 +732,7 @@ typedef struct ur_queue_dditable_t {
     ur_pfnQueueCreateWithNativeHandle_t pfnCreateWithNativeHandle;
     ur_pfnQueueFinish_t pfnFinish;
     ur_pfnQueueFlush_t pfnFlush;
+    ur_pfnQueueGetSuggestedLocalWorkSize_t pfnGetSuggestedLocalWorkSize;
 } ur_queue_dditable_t;
 
 ///////////////////////////////////////////////////////////////////////////////

@@ -1538,6 +1538,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintQueueFinishParams(const struct ur_que
 ///         - `buff_size < out_size`
 UR_APIEXPORT ur_result_t UR_APICALL urPrintQueueFlushParams(const struct ur_queue_flush_params_t *params, char *buffer, const size_t buff_size, size_t *out_size);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print ur_queue_get_suggested_local_work_size_params_t struct
+/// @details NOTE(review): the implementation is not visible here. Presumably
+///     the formatted parameters are written into `buffer` (capacity
+///     `buff_size`) and the required length is reported through `out_size` —
+///     confirm against the definition.
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_SIZE
+///         - `buff_size < out_size`
+UR_APIEXPORT ur_result_t UR_APICALL urPrintQueueGetSuggestedLocalWorkSizeParams(const struct ur_queue_get_suggested_local_work_size_params_t *params, char *buffer, const size_t buff_size, size_t *out_size);
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Print ur_sampler_create_params_t struct
 /// @returns

@@ -912,6 +912,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) {
     case UR_FUNCTION_DEVICE_GET_SELECTED:
         os << "UR_FUNCTION_DEVICE_GET_SELECTED";
         break;
+    case UR_FUNCTION_QUEUE_GET_SUGGESTED_LOCAL_WORK_SIZE:
+        os << "UR_FUNCTION_QUEUE_GET_SUGGESTED_LOCAL_WORK_SIZE";
+        break;
     default:
         os << "unknown enumerator";
         break;
@@ -11607,6 +11610,49 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
     return os;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print operator for the ur_queue_get_suggested_local_work_size_params_t type
+/// @details Writes every parameter as `.name = value`, separated by ", ", in
+///     declaration order. Handles and pointers are emitted through
+///     ur::details::printPtr; workDim is streamed directly.
+/// @returns
+///     std::ostream &
+inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_queue_get_suggested_local_work_size_params_t *params) {
+    // Queue handle.
+    os << ".hQueue = ";
+    ur::details::printPtr(os, *(params->phQueue));
+
+    // Kernel handle.
+    os << ", " << ".hKernel = ";
+    ur::details::printPtr(os, *(params->phKernel));
+
+    // Number of work dimensions (plain integer, no pointer formatting).
+    os << ", " << ".workDim = " << *(params->pworkDim);
+
+    // Global offset array.
+    os << ", " << ".pGlobalWorkOffset = ";
+    ur::details::printPtr(os, *(params->ppGlobalWorkOffset));
+
+    // Global size array.
+    os << ", " << ".pGlobalWorkSize = ";
+    ur::details::printPtr(os, *(params->ppGlobalWorkSize));
+
+    // Output array receiving the suggestion.
+    os << ", " << ".pSuggestedLocalWorkSize = ";
+    ur::details::printPtr(os, *(params->ppSuggestedLocalWorkSize));
+
+    return os;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Print operator for the ur_sampler_create_params_t type
 /// @returns
@@ -16793,6 +16839,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, ur_function_
     case UR_FUNCTION_QUEUE_FLUSH: {
         os << (const struct ur_queue_flush_params_t *)params;
     } break;
+    case UR_FUNCTION_QUEUE_GET_SUGGESTED_LOCAL_WORK_SIZE: {
+        os << (const struct ur_queue_get_suggested_local_work_size_params_t *)params;
+    } break;
     case UR_FUNCTION_SAMPLER_CREATE: {
         os << (const struct ur_sampler_create_params_t *)params;
     } break;

diff --git a/scripts/core/queue.yml b/scripts/core/queue.yml
@@ -332,3 +332,49 @@ params:
 returns:
     - $X_RESULT_ERROR_INVALID_QUEUE
     - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY
+--- #--------------------------------------------------------------------------
+type: function
+desc: "Get the suggested local work-item number from runtime implementation."
+class: $xQueue
+name: GetSuggestedLocalWorkSize
+ordinal: "0"
+# Each `details` entry is rendered as its own bullet, so every entry must be a
+# complete sentence rather than a fragment of a wrapped paragraph.
+details:
+    - "pLocalWorkSize can be omitted in urEnqueueKernelLaunch(), but apart from OpenCL, LocalWorkSize will need to be calculated or guessed before enqueueing the kernel."
+    - "This function will get the LocalWorkSize value used when enqueueing the kernel."
+    - "TODO: find a better place for this function"
+params:
+    - type: $x_queue_handle_t
+      name: hQueue
+      desc: |
+            [in] handle of the queue object
+    - type: "$x_kernel_handle_t"
+      name: hKernel
+      desc: |
+            [in] handle of the kernel.
+    - type: uint32_t
+      name: workDim
+      desc: |
+            [in] number of dimensions, from 1 to 3, to specify the global
+            and work-group work-items
+    - type: "const size_t*"
+      name: pGlobalWorkOffset
+      desc: |
+            [in] pointer to an array of workDim unsigned values that specify
+            the offset used to calculate the global ID of a work-item
+    - type: "const size_t*"
+      name: pGlobalWorkSize
+      desc: |
+            [in] pointer to an array of workDim unsigned values that specify
+            the number of global work-items in workDim that will execute the
+            kernel function
+    - type: "size_t*"
+      name: pSuggestedLocalWorkSize
+      desc: |
+            [out] pointer to an array of workDim unsigned values that specify
+            the number of local work-items forming a work-group that will
+            execute the kernel function.
+returns:
+    - $X_RESULT_ERROR_INVALID_WORK_GROUP_SIZE
+    - $X_RESULT_ERROR_UNSUPPORTED_FEATURE
@@ -577,6 +577,9 @@ etors:
 - name: DEVICE_GET_SELECTED
   desc: Enumerator for $xDeviceGetSelected
   value: '220'
+- name: QUEUE_GET_SUGGESTED_LOCAL_WORK_SIZE
+  desc: Enumerator for $xQueueGetSuggestedLocalWorkSize
+  value: '223'
 ---
 type: enum
 desc: Defines structure types

diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
@@ -1756,3 +1756,39 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe(
 
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
+
+/// Suggests the local work size the CUDA adapter would pick for launching
+/// hKernel on hQueue with the given global size.
+///
+/// @param hQueue                  queue whose context and device are queried
+/// @param hKernel                 kernel the suggestion is computed for
+/// @param workDim                 number of dimensions; must be in [1, 3]
+/// @param pGlobalWorkOffset       not used by this adapter
+/// @param pGlobalWorkSize         global size per dimension (workDim entries)
+/// @param pSuggestedLocalWorkSize receives workDim suggested local sizes
+///
+/// @returns UR_RESULT_SUCCESS on success;
+///          UR_RESULT_ERROR_INVALID_WORK_DIMENSION if workDim is not 1..3;
+///          UR_RESULT_ERROR_INVALID_NULL_POINTER if pGlobalWorkSize or
+///          pSuggestedLocalWorkSize is null;
+///          UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE if the guessed block
+///          exceeds the device's maximum work-group size;
+///          any ur_result_t thrown while the context is active.
+UR_APIEXPORT ur_result_t UR_APICALL urQueueGetSuggestedLocalWorkSize(
+    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
+    [[maybe_unused]] const size_t *pGlobalWorkOffset,
+    const size_t *pGlobalWorkSize, size_t *pSuggestedLocalWorkSize) {
+  // ThreadsPerBlock has exactly three entries and is copied out using workDim,
+  // so anything outside [1, 3] would read past the end of the array.
+  if (workDim < 1 || workDim > 3) {
+    return UR_RESULT_ERROR_INVALID_WORK_DIMENSION;
+  }
+  // Both pointers are dereferenced below.
+  if (pGlobalWorkSize == nullptr || pSuggestedLocalWorkSize == nullptr) {
+    return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+  }
+
+  ur_context_handle_t Context = hQueue->getContext();
+  ur_device_handle_t Device = hQueue->Device;
+  size_t ThreadsPerBlock[3] = {};
+  size_t MaxThreadsPerBlock[3] = {};
+  const uint32_t LocalSize = hKernel->getLocalSize();
+
+  try {
+    // Set the active context here as guessLocalWorkSize needs an active context
+    ScopedContext Active(Context);
+
+    const size_t MaxWorkGroupSize = Device->getMaxWorkGroupSize();
+    Device->getMaxWorkItemSizes(sizeof(MaxThreadsPerBlock),
+                                MaxThreadsPerBlock);
+    guessLocalWorkSize(Device, ThreadsPerBlock, pGlobalWorkSize, workDim,
+                       MaxThreadsPerBlock, hKernel, LocalSize);
+
+    // The guessed block must fit within the device-wide work-group limit.
+    if (MaxWorkGroupSize <
+        ThreadsPerBlock[0] * ThreadsPerBlock[1] * ThreadsPerBlock[2]) {
+      return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
+    }
+
+    // Only the first workDim entries are meaningful to the caller.
+    std::copy(ThreadsPerBlock, ThreadsPerBlock + workDim,
+              pSuggestedLocalWorkSize);
+  } catch (ur_result_t Err) {
+    return Err;
+  }
+  return UR_RESULT_SUCCESS;
+}
diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp
@@ -1824,3 +1824,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe(
     uint32_t, const ur_event_handle_t *, ur_event_handle_t *) {
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
+
+/// Suggests the local work size the HIP adapter would pick for launching
+/// hKernel on hQueue with the given global size.
+///
+/// @param hQueue                  queue whose device block limits are queried
+/// @param hKernel                 kernel forwarded to simpleGuessLocalWorkSize
+/// @param workDim                 number of dimensions; must be in [1, 3]
+/// @param pGlobalWorkOffset       not used by this adapter
+/// @param pGlobalWorkSize         global size per dimension
+/// @param pSuggestedLocalWorkSize receives workDim suggested local sizes
+///
+/// @returns UR_RESULT_SUCCESS on success;
+///          UR_RESULT_ERROR_INVALID_WORK_DIMENSION if workDim is not 1..3;
+///          UR_RESULT_ERROR_INVALID_NULL_POINTER if pGlobalWorkSize or
+///          pSuggestedLocalWorkSize is null.
+UR_APIEXPORT ur_result_t UR_APICALL urQueueGetSuggestedLocalWorkSize(
+    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
+    [[maybe_unused]] const size_t *pGlobalWorkOffset,
+    const size_t *pGlobalWorkSize, size_t *pSuggestedLocalWorkSize) {
+  // ThreadsPerBlock has exactly three entries and is copied out using workDim,
+  // so anything outside [1, 3] would read past the end of the array.
+  if (workDim < 1 || workDim > 3) {
+    return UR_RESULT_ERROR_INVALID_WORK_DIMENSION;
+  }
+  // Both pointers are dereferenced below.
+  if (pGlobalWorkSize == nullptr || pSuggestedLocalWorkSize == nullptr) {
+    return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+  }
+
+  // Per-dimension block limits reported by the queue's device.
+  size_t MaxThreadsPerBlock[3] = {};
+  MaxThreadsPerBlock[0] = hQueue->Device->getMaxBlockDimX();
+  MaxThreadsPerBlock[1] = hQueue->Device->getMaxBlockDimY();
+  MaxThreadsPerBlock[2] = hQueue->Device->getMaxBlockDimZ();
+
+  // Starting guess handed to simpleGuessLocalWorkSize for refinement.
+  size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
+  simpleGuessLocalWorkSize(ThreadsPerBlock, pGlobalWorkSize, MaxThreadsPerBlock,
+                           hKernel);
+
+  // Only the first workDim entries are meaningful to the caller.
+  std::copy(ThreadsPerBlock, ThreadsPerBlock + workDim,
+            pSuggestedLocalWorkSize);
+  return UR_RESULT_SUCCESS;
+}