Skip to content

Commit

Permalink
Merge branch 'llvm' into review/yang/dsan_nullpointer
Browse files Browse the repository at this point in the history
  • Loading branch information
AllanZyne committed Sep 13, 2024
2 parents 4b19ddc + ab5eb82 commit 3dbb7a2
Show file tree
Hide file tree
Showing 64 changed files with 1,783 additions and 1,106 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/cmake.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
compiler: [{c: gcc, cxx: g++}]
libbacktrace: ['-DVAL_USE_LIBBACKTRACE_BACKTRACE=OFF']
pool_tracking: ['-DUMF_ENABLE_POOL_TRACKING=ON', '-DUMF_ENABLE_POOL_TRACKING=OFF']
latency_tracking: ['-DUMF_ENABLE_LATENCY_TRACKING=OFF']
latency_tracking: ['-DUR_ENABLE_LATENCY_HISTOGRAM=OFF']
include:
- os: 'ubuntu-22.04'
build_type: Release
Expand All @@ -40,7 +40,7 @@ jobs:
- os: 'ubuntu-22.04'
build_type: Release
compiler: {c: clang, cxx: clang++}
latency_tracking: '-DUMF_ENABLE_LATENCY_TRACKING=ON'
latency_tracking: '-DUR_ENABLE_LATENCY_HISTOGRAM=ON'
runs-on: ${{ (matrix.os == 'ubuntu-22.04' && github.repository_owner == 'oneapi-src') && 'intel-ubuntu-22.04' || matrix.os }}

steps:
Expand Down
5 changes: 5 additions & 0 deletions include/ur_print.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17403,6 +17403,11 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
return os;
}

/// @brief Print ur_bool_t as "true"/"false" instead of as an integer.
/// The parameter is always read, so it is not marked [[maybe_unused]].
inline std::ostream &operator<<(std::ostream &os, const ur_bool_t value) {
    os << (value ? "true" : "false");
    return os;
}

namespace ur::details {
///////////////////////////////////////////////////////////////////////////////
// @brief Print pointer value
Expand Down
8 changes: 8 additions & 0 deletions scripts/core/INTRO.rst
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,14 @@ Specific environment variables can be set to control the behavior of unified run

See the Layers_ section for details of the layers currently included in the runtime.

.. envvar:: UR_LOADER_PRELOAD_FILTER

If set, the loader will read `ONEAPI_DEVICE_SELECTOR` before loading the UR Adapters to determine which backends should be loaded.

.. note::

This environment variable is enabled by default on Linux, but disabled by default on Windows.

Service identifiers
---------------------

Expand Down
5 changes: 5 additions & 0 deletions scripts/templates/print.hpp.mako
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,11 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
%endfor
%endfor

/// @brief Print ur_bool_t as "true"/"false" instead of as an integer.
/// The parameter is always read, so it is not marked [[maybe_unused]].
inline std::ostream &operator<<(std::ostream &os, const ur_bool_t value) {
    os << (value ? "true" : "false");
    return os;
}

namespace ${x}::details {
///////////////////////////////////////////////////////////////////////////////
// @brief Print pointer value
Expand Down
7 changes: 1 addition & 6 deletions source/adapters/cuda/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
return ReturnValue(4318u);
}
case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: {
int ComputeUnits = 0;
UR_CHECK_ERROR(cuDeviceGetAttribute(
&ComputeUnits, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
hDevice->get()));
detail::ur::assertion(ComputeUnits >= 0);
return ReturnValue(static_cast<uint32_t>(ComputeUnits));
return ReturnValue(hDevice->getNumComputeUnits());
}
case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: {
return ReturnValue(MaxWorkItemDimensions);
Expand Down
7 changes: 7 additions & 0 deletions source/adapters/cuda/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ struct ur_device_handle_t_ {
int MaxCapacityLocalMem{0};
int MaxChosenLocalMem{0};
bool MaxLocalMemSizeChosen{false};
uint32_t NumComputeUnits{0};

public:
ur_device_handle_t_(native_type cuDevice, CUcontext cuContext, CUevent evBase,
Expand All @@ -54,6 +55,10 @@ struct ur_device_handle_t_ {
sizeof(MaxWorkGroupSize), &MaxWorkGroupSize,
nullptr));

UR_CHECK_ERROR(cuDeviceGetAttribute(
reinterpret_cast<int *>(&NumComputeUnits),
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuDevice));

// Set local mem max size if env var is present
static const char *LocalMemSizePtrUR =
std::getenv("UR_CUDA_MAX_LOCAL_MEM_SIZE");
Expand Down Expand Up @@ -107,6 +112,8 @@ struct ur_device_handle_t_ {
int getMaxChosenLocalMem() const noexcept { return MaxChosenLocalMem; };

bool maxLocalMemSizeChosen() { return MaxLocalMemSizeChosen; };

uint32_t getNumComputeUnits() const noexcept { return NumComputeUnits; };
};

int getAttribute(ur_device_handle_t Device, CUdevice_attribute Attribute);
12 changes: 6 additions & 6 deletions source/adapters/cuda/image.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -759,13 +759,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
cpy_desc.dstZ = pCopyRegion->dstOffset.z;
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
cpy_desc.srcHost = pSrc;
cpy_desc.srcPitch = pCopyRegion->copyExtent.width * PixelSizeBytes;
cpy_desc.srcHeight = pCopyRegion->copyExtent.height;
cpy_desc.srcPitch = pSrcImageDesc->width * PixelSizeBytes;
cpy_desc.srcHeight = std::max(uint64_t{1}, pSrcImageDesc->height);
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
cpy_desc.dstArray = (CUarray)pDst;
cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
cpy_desc.Height = std::max(uint64_t{1}, pCopyRegion->copyExtent.height);
cpy_desc.Depth = pDstImageDesc->arraySize;
cpy_desc.Depth = pCopyRegion->copyExtent.depth;
UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream));
}
} else if (imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_HOST) {
Expand Down Expand Up @@ -855,10 +855,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
cpy_desc.dstHost = pDst;
cpy_desc.dstPitch = pDstImageDesc->width * PixelSizeBytes;
cpy_desc.dstHeight = pDstImageDesc->height;
cpy_desc.dstHeight = std::max(uint64_t{1}, pDstImageDesc->height);
cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
cpy_desc.Height = std::max(uint64_t{1}, pCopyRegion->copyExtent.height);
cpy_desc.Depth = pSrcImageDesc->arraySize;
cpy_desc.Depth = pCopyRegion->copyExtent.depth;
UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream));
}
} else {
Expand Down Expand Up @@ -932,7 +932,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
cpy_desc.dstArray = (CUarray)pDst;
cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
cpy_desc.Height = std::max(uint64_t{1}, pCopyRegion->copyExtent.height);
cpy_desc.Depth = pSrcImageDesc->arraySize;
cpy_desc.Depth = pCopyRegion->copyExtent.depth;
UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream));
}
// Synchronization is required here to handle the case of copying data
Expand Down
44 changes: 40 additions & 4 deletions source/adapters/cuda/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,10 +167,46 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle(
UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
ur_kernel_handle_t hKernel, size_t localWorkSize,
size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) {
(void)hKernel;
(void)localWorkSize;
(void)dynamicSharedMemorySize;
*pGroupCountRet = 1;
UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_KERNEL);

// We need to set the active current device for this kernel explicitly here,
// because the occupancy querying API does not take device parameter.
ur_device_handle_t Device = hKernel->getProgram()->getDevice();
ScopedContext Active(Device);
try {
// We need to calculate max num of work-groups using per-device semantics.

int MaxNumActiveGroupsPerCU{0};
UR_CHECK_ERROR(cuOccupancyMaxActiveBlocksPerMultiprocessor(
&MaxNumActiveGroupsPerCU, hKernel->get(), localWorkSize,
dynamicSharedMemorySize));
detail::ur::assertion(MaxNumActiveGroupsPerCU >= 0);
// Handle the case where we can't have all SMs active with at least 1 group
// per SM. In that case, the device is still able to run 1 work-group, hence
// we will manually check if it is possible with the available HW resources.
if (MaxNumActiveGroupsPerCU == 0) {
size_t MaxWorkGroupSize{};
urKernelGetGroupInfo(
hKernel, Device, UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE,
sizeof(MaxWorkGroupSize), &MaxWorkGroupSize, nullptr);
size_t MaxLocalSizeBytes{};
urDeviceGetInfo(Device, UR_DEVICE_INFO_LOCAL_MEM_SIZE,
sizeof(MaxLocalSizeBytes), &MaxLocalSizeBytes, nullptr);
if (localWorkSize > MaxWorkGroupSize ||
dynamicSharedMemorySize > MaxLocalSizeBytes ||
hasExceededMaxRegistersPerBlock(Device, hKernel, localWorkSize))
*pGroupCountRet = 0;
else
*pGroupCountRet = 1;
} else {
// Multiply by the number of SMs (CUs = compute units) on the device in
// order to retrieve the total number of groups/blocks that can be
// launched.
*pGroupCountRet = Device->getNumComputeUnits() * MaxNumActiveGroupsPerCU;
}
} catch (ur_result_t Err) {
return Err;
}
return UR_RESULT_SUCCESS;
}

Expand Down
6 changes: 6 additions & 0 deletions source/adapters/level_zero/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ if(UR_BUILD_ADAPTER_L0)
${CMAKE_CURRENT_SOURCE_DIR}/queue.hpp
${CMAKE_CURRENT_SOURCE_DIR}/sampler.hpp
${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.hpp
${CMAKE_CURRENT_SOURCE_DIR}/helpers/memory_helpers.hpp
${CMAKE_CURRENT_SOURCE_DIR}/ur_level_zero.cpp
${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
${CMAKE_CURRENT_SOURCE_DIR}/context.cpp
Expand All @@ -136,6 +137,7 @@ if(UR_BUILD_ADAPTER_L0)
${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp
${CMAKE_CURRENT_SOURCE_DIR}/image.cpp
${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.cpp
${CMAKE_CURRENT_SOURCE_DIR}/helpers/memory_helpers.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp
)

Expand Down Expand Up @@ -199,13 +201,15 @@ if(UR_BUILD_ADAPTER_L0_V2)
${CMAKE_CURRENT_SOURCE_DIR}/platform.hpp
${CMAKE_CURRENT_SOURCE_DIR}/program.hpp
${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.hpp
${CMAKE_CURRENT_SOURCE_DIR}/helpers/memory_helpers.hpp
${CMAKE_CURRENT_SOURCE_DIR}/adapter.cpp
${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp
${CMAKE_CURRENT_SOURCE_DIR}/platform.cpp
${CMAKE_CURRENT_SOURCE_DIR}/program.cpp
${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.cpp
${CMAKE_CURRENT_SOURCE_DIR}/helpers/memory_helpers.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp
# v2-only sources
${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_cache.hpp
Expand All @@ -217,6 +221,7 @@ if(UR_BUILD_ADAPTER_L0_V2)
${CMAKE_CURRENT_SOURCE_DIR}/v2/event_provider.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/event.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/kernel.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/memory.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_api.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/usm.hpp
Expand All @@ -229,6 +234,7 @@ if(UR_BUILD_ADAPTER_L0_V2)
${CMAKE_CURRENT_SOURCE_DIR}/v2/event_provider_normal.cpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/event.cpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/kernel.cpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/memory.cpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_api.cpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_create.cpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.cpp
Expand Down
2 changes: 1 addition & 1 deletion source/adapters/level_zero/context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -512,7 +512,7 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool(
// Create one event ZePool per MaxNumEventsPerPool events
if (*ZePool == nullptr) {
ze_event_pool_counter_based_exp_desc_t counterBasedExt = {
ZE_STRUCTURE_TYPE_COUNTER_BASED_EVENT_POOL_EXP_DESC};
ZE_STRUCTURE_TYPE_COUNTER_BASED_EVENT_POOL_EXP_DESC, nullptr, 0};
ZeStruct<ze_event_pool_desc_t> ZeEventPoolDesc;
ZeEventPoolDesc.count = MaxNumEventsPerPool;
ZeEventPoolDesc.flags = 0;
Expand Down
35 changes: 22 additions & 13 deletions source/adapters/level_zero/event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -221,9 +221,8 @@ ur_result_t urEnqueueEventsWaitWithBarrier(
return UR_RESULT_SUCCESS;
}

ur_event_handle_t InternalEvent;
ur_event_handle_t ResultEvent = nullptr;
bool IsInternal = OutEvent == nullptr;
ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent;

// For in-order queue and wait-list which is empty or has events from
// the same queue just use the last command event as the barrier event.
Expand All @@ -234,7 +233,10 @@ ur_result_t urEnqueueEventsWaitWithBarrier(
EventWaitList) &&
Queue->LastCommandEvent && !Queue->LastCommandEvent->IsDiscarded) {
UR_CALL(ur::level_zero::urEventRetain(Queue->LastCommandEvent));
*Event = Queue->LastCommandEvent;
ResultEvent = Queue->LastCommandEvent;
if (OutEvent) {
*OutEvent = ResultEvent;
}
return UR_RESULT_SUCCESS;
}

Expand Down Expand Up @@ -264,16 +266,21 @@ ur_result_t urEnqueueEventsWaitWithBarrier(
EventWaitList, OkToBatch));

// Insert the barrier into the command-list and execute.
UR_CALL(insertBarrierIntoCmdList(CmdList, TmpWaitList, *Event, IsInternal));
UR_CALL(insertBarrierIntoCmdList(CmdList, TmpWaitList, ResultEvent,
IsInternal));

UR_CALL(Queue->executeCommandList(CmdList, false, OkToBatch));

// Because of the dependency between commands in the in-order queue we don't
// need to keep track of any active barriers if we have in-order queue.
if (UseMultipleCmdlistBarriers && !Queue->isInOrderQueue()) {
auto UREvent = reinterpret_cast<ur_event_handle_t>(*Event);
auto UREvent = reinterpret_cast<ur_event_handle_t>(ResultEvent);
Queue->ActiveBarriers.add(UREvent);
}

if (OutEvent) {
*OutEvent = ResultEvent;
}
return UR_RESULT_SUCCESS;
}

Expand Down Expand Up @@ -361,14 +368,14 @@ ur_result_t urEnqueueEventsWaitWithBarrier(
// Insert a barrier with the events from each command-queue into the
// convergence command list. The resulting event signals the convergence of
// all barriers.
UR_CALL(insertBarrierIntoCmdList(ConvergenceCmdList, BaseWaitList, *Event,
IsInternal));
UR_CALL(insertBarrierIntoCmdList(ConvergenceCmdList, BaseWaitList,
ResultEvent, IsInternal));
} else {
// If there is only a single queue then insert a barrier and the single
// result event can be used as our active barrier and used as the return
// event. Take into account whether output event is discarded or not.
UR_CALL(insertBarrierIntoCmdList(CmdLists[0], _ur_ze_event_list_t{}, *Event,
IsInternal));
UR_CALL(insertBarrierIntoCmdList(CmdLists[0], _ur_ze_event_list_t{},
ResultEvent, IsInternal));
}

// Execute each command list so the barriers can be encountered.
Expand All @@ -384,8 +391,10 @@ ur_result_t urEnqueueEventsWaitWithBarrier(
}

UR_CALL(Queue->ActiveBarriers.clear());
auto UREvent = reinterpret_cast<ur_event_handle_t>(*Event);
Queue->ActiveBarriers.add(UREvent);
Queue->ActiveBarriers.add(ResultEvent);
if (OutEvent) {
*OutEvent = ResultEvent;
}
return UR_RESULT_SUCCESS;
}

Expand Down Expand Up @@ -1508,8 +1517,8 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(

std::shared_lock<ur_shared_mutex> Lock(EventList[I]->Mutex);

ur_device_handle_t QueueRootDevice;
ur_device_handle_t CurrentQueueRootDevice;
ur_device_handle_t QueueRootDevice = nullptr;
ur_device_handle_t CurrentQueueRootDevice = nullptr;
if (Queue) {
QueueRootDevice = Queue->Device;
CurrentQueueRootDevice = CurQueueDevice;
Expand Down
33 changes: 33 additions & 0 deletions source/adapters/level_zero/helpers/memory_helpers.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
//===--------- memory_helpers.cpp - Level Zero Adapter -------------------===//
//
// Copyright (C) 2024 Intel Corporation
//
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
// Exceptions. See LICENSE.TXT
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "memory_helpers.hpp"
#include "../common.hpp"

// Query Level Zero for the memory type of the allocation backing `ptr`
// within `hContext` (e.g. host/device/shared, or ZE_MEMORY_TYPE_UNKNOWN
// for a pointer the context does not know about).
// TODO: use UMF once
// https://github.com/oneapi-src/unified-memory-framework/issues/687 is
// implemented
ze_memory_type_t getMemoryType(ze_context_handle_t hContext, void *ptr) {
  ZeStruct<ze_memory_allocation_properties_t> allocProps;
  ZE2UR_CALL_THROWS(zeMemGetAllocProperties,
                    (hContext, ptr, &allocProps, nullptr));
  return allocProps.type;
}

// Import `ptr` into USM host memory if the USM-import feature is enabled
// and the pointer is not already known to the context. Returns true when
// an import was performed, false otherwise.
bool maybeImportUSM(ze_driver_handle_t hTranslatedDriver,
                    ze_context_handle_t hContext, void *ptr, size_t size) {
  // Nothing to do when the feature is off or no host pointer was supplied.
  if (!ZeUSMImport.Enabled || ptr == nullptr)
    return false;
  // Already a USM allocation known to this context - leave it untouched.
  if (getMemoryType(hContext, ptr) != ZE_MEMORY_TYPE_UNKNOWN)
    return false;
  // Promote the host ptr to USM host memory.
  ZeUSMImport.doZeUSMImport(hTranslatedDriver, ptr, size);
  return true;
}
23 changes: 23 additions & 0 deletions source/adapters/level_zero/helpers/memory_helpers.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
//===--------- memory_helpers.hpp - Level Zero Adapter -------------------===//
//
// Copyright (C) 2024 Intel Corporation
//
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
// Exceptions. See LICENSE.TXT
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#pragma once

#include <ur_api.h>
#include <ze_api.h>

// If USM Import feature is enabled and hostptr is supplied,
// import the hostptr if not already imported into USM.
// Data transfer rate is maximized when both source and destination
// are USM pointers. Promotion of the host pointer to USM thus
// optimizes data transfer performance.
// Returns true when the pointer was imported, false otherwise.
bool maybeImportUSM(ze_driver_handle_t hTranslatedDriver,
                    ze_context_handle_t hContext, void *ptr, size_t size);

// Queries the Level Zero allocation properties of `ptr` in `hContext`
// and returns its memory type (ZE_MEMORY_TYPE_UNKNOWN when the pointer
// is not an allocation known to the context).
ze_memory_type_t getMemoryType(ze_context_handle_t hContext, void *ptr);
Loading

0 comments on commit 3dbb7a2

Please sign in to comment.