Skip to content

Commit

Permalink
Merge branch 'llvm' into review/yang/dsan_nullpointer
Browse files Browse the repository at this point in the history
  • Loading branch information
AllanZyne committed Sep 13, 2024
2 parents 4b19ddc + ab5eb82 commit 3dbb7a2
Show file tree
Hide file tree
Showing 64 changed files with 1,783 additions and 1,106 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/cmake.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
compiler: [{c: gcc, cxx: g++}]
libbacktrace: ['-DVAL_USE_LIBBACKTRACE_BACKTRACE=OFF']
pool_tracking: ['-DUMF_ENABLE_POOL_TRACKING=ON', '-DUMF_ENABLE_POOL_TRACKING=OFF']
latency_tracking: ['-DUMF_ENABLE_LATENCY_TRACKING=OFF']
latency_tracking: ['-DUR_ENABLE_LATENCY_HISTOGRAM=OFF']
include:
- os: 'ubuntu-22.04'
build_type: Release
Expand All @@ -40,7 +40,7 @@ jobs:
- os: 'ubuntu-22.04'
build_type: Release
compiler: {c: clang, cxx: clang++}
latency_tracking: '-DUMF_ENABLE_LATENCY_TRACKING=ON'
latency_tracking: '-DUR_ENABLE_LATENCY_HISTOGRAM=ON'
runs-on: ${{ (matrix.os == 'ubuntu-22.04' && github.repository_owner == 'oneapi-src') && 'intel-ubuntu-22.04' || matrix.os }}

steps:
Expand Down
5 changes: 5 additions & 0 deletions include/ur_print.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17403,6 +17403,11 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
return os;
}

/// @brief Print ur_bool_t as "true"/"false" instead of as an integer.
/// The parameter is always read, so it is not marked [[maybe_unused]].
inline std::ostream &operator<<(std::ostream &os, const ur_bool_t value) {
    os << (value ? "true" : "false");
    return os;
}

namespace ur::details {
///////////////////////////////////////////////////////////////////////////////
// @brief Print pointer value
Expand Down
8 changes: 8 additions & 0 deletions scripts/core/INTRO.rst
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,14 @@ Specific environment variables can be set to control the behavior of unified run

See the Layers_ section for details of the layers currently included in the runtime.

.. envvar:: UR_LOADER_PRELOAD_FILTER

If set, the loader will read `ONEAPI_DEVICE_SELECTOR` before loading the UR Adapters to determine which backends should be loaded.

.. note::

This environment variable is enabled by default on Linux, but disabled by default on Windows.

Service identifiers
---------------------

Expand Down
5 changes: 5 additions & 0 deletions scripts/templates/print.hpp.mako
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,11 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
%endfor
%endfor

/// @brief Print ur_bool_t as "true"/"false" instead of as an integer.
/// The parameter is always read, so it is not marked [[maybe_unused]].
inline std::ostream &operator<<(std::ostream &os, const ur_bool_t value) {
    os << (value ? "true" : "false");
    return os;
}

namespace ${x}::details {
///////////////////////////////////////////////////////////////////////////////
// @brief Print pointer value
Expand Down
7 changes: 1 addition & 6 deletions source/adapters/cuda/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
return ReturnValue(4318u);
}
case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: {
int ComputeUnits = 0;
UR_CHECK_ERROR(cuDeviceGetAttribute(
&ComputeUnits, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
hDevice->get()));
detail::ur::assertion(ComputeUnits >= 0);
return ReturnValue(static_cast<uint32_t>(ComputeUnits));
return ReturnValue(hDevice->getNumComputeUnits());
}
case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: {
return ReturnValue(MaxWorkItemDimensions);
Expand Down
7 changes: 7 additions & 0 deletions source/adapters/cuda/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ struct ur_device_handle_t_ {
int MaxCapacityLocalMem{0};
int MaxChosenLocalMem{0};
bool MaxLocalMemSizeChosen{false};
uint32_t NumComputeUnits{0};

public:
ur_device_handle_t_(native_type cuDevice, CUcontext cuContext, CUevent evBase,
Expand All @@ -54,6 +55,10 @@ struct ur_device_handle_t_ {
sizeof(MaxWorkGroupSize), &MaxWorkGroupSize,
nullptr));

UR_CHECK_ERROR(cuDeviceGetAttribute(
reinterpret_cast<int *>(&NumComputeUnits),
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuDevice));

// Set local mem max size if env var is present
static const char *LocalMemSizePtrUR =
std::getenv("UR_CUDA_MAX_LOCAL_MEM_SIZE");
Expand Down Expand Up @@ -107,6 +112,8 @@ struct ur_device_handle_t_ {
int getMaxChosenLocalMem() const noexcept { return MaxChosenLocalMem; };

bool maxLocalMemSizeChosen() { return MaxLocalMemSizeChosen; };

uint32_t getNumComputeUnits() const noexcept { return NumComputeUnits; };
};

int getAttribute(ur_device_handle_t Device, CUdevice_attribute Attribute);
12 changes: 6 additions & 6 deletions source/adapters/cuda/image.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -759,13 +759,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
cpy_desc.dstZ = pCopyRegion->dstOffset.z;
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
cpy_desc.srcHost = pSrc;
cpy_desc.srcPitch = pCopyRegion->copyExtent.width * PixelSizeBytes;
cpy_desc.srcHeight = pCopyRegion->copyExtent.height;
cpy_desc.srcPitch = pSrcImageDesc->width * PixelSizeBytes;
cpy_desc.srcHeight = std::max(uint64_t{1}, pSrcImageDesc->height);
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
cpy_desc.dstArray = (CUarray)pDst;
cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
cpy_desc.Height = std::max(uint64_t{1}, pCopyRegion->copyExtent.height);
cpy_desc.Depth = pDstImageDesc->arraySize;
cpy_desc.Depth = pCopyRegion->copyExtent.depth;
UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream));
}
} else if (imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_HOST) {
Expand Down Expand Up @@ -855,10 +855,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
cpy_desc.dstHost = pDst;
cpy_desc.dstPitch = pDstImageDesc->width * PixelSizeBytes;
cpy_desc.dstHeight = pDstImageDesc->height;
cpy_desc.dstHeight = std::max(uint64_t{1}, pDstImageDesc->height);
cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
cpy_desc.Height = std::max(uint64_t{1}, pCopyRegion->copyExtent.height);
cpy_desc.Depth = pSrcImageDesc->arraySize;
cpy_desc.Depth = pCopyRegion->copyExtent.depth;
UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream));
}
} else {
Expand Down Expand Up @@ -932,7 +932,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
cpy_desc.dstArray = (CUarray)pDst;
cpy_desc.WidthInBytes = PixelSizeBytes * pCopyRegion->copyExtent.width;
cpy_desc.Height = std::max(uint64_t{1}, pCopyRegion->copyExtent.height);
cpy_desc.Depth = pSrcImageDesc->arraySize;
cpy_desc.Depth = pCopyRegion->copyExtent.depth;
UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream));
}
// Synchronization is required here to handle the case of copying data
Expand Down
44 changes: 40 additions & 4 deletions source/adapters/cuda/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,10 +167,46 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle(
UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
ur_kernel_handle_t hKernel, size_t localWorkSize,
size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) {
(void)hKernel;
(void)localWorkSize;
(void)dynamicSharedMemorySize;
*pGroupCountRet = 1;
UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_KERNEL);

// We need to set the active current device for this kernel explicitly here,
// because the occupancy querying API does not take device parameter.
ur_device_handle_t Device = hKernel->getProgram()->getDevice();
ScopedContext Active(Device);
try {
// We need to calculate max num of work-groups using per-device semantics.

int MaxNumActiveGroupsPerCU{0};
UR_CHECK_ERROR(cuOccupancyMaxActiveBlocksPerMultiprocessor(
&MaxNumActiveGroupsPerCU, hKernel->get(), localWorkSize,
dynamicSharedMemorySize));
detail::ur::assertion(MaxNumActiveGroupsPerCU >= 0);
// Handle the case where we can't have all SMs active with at least 1 group
// per SM. In that case, the device is still able to run 1 work-group, hence
// we will manually check if it is possible with the available HW resources.
if (MaxNumActiveGroupsPerCU == 0) {
size_t MaxWorkGroupSize{};
urKernelGetGroupInfo(
hKernel, Device, UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE,
sizeof(MaxWorkGroupSize), &MaxWorkGroupSize, nullptr);
size_t MaxLocalSizeBytes{};
urDeviceGetInfo(Device, UR_DEVICE_INFO_LOCAL_MEM_SIZE,
sizeof(MaxLocalSizeBytes), &MaxLocalSizeBytes, nullptr);
if (localWorkSize > MaxWorkGroupSize ||
dynamicSharedMemorySize > MaxLocalSizeBytes ||
hasExceededMaxRegistersPerBlock(Device, hKernel, localWorkSize))
*pGroupCountRet = 0;
else
*pGroupCountRet = 1;
} else {
// Multiply by the number of SMs (CUs = compute units) on the device in
// order to retrieve the total number of groups/blocks that can be
// launched.
*pGroupCountRet = Device->getNumComputeUnits() * MaxNumActiveGroupsPerCU;
}
} catch (ur_result_t Err) {
return Err;
}
return UR_RESULT_SUCCESS;
}

Expand Down
6 changes: 6 additions & 0 deletions source/adapters/level_zero/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ if(UR_BUILD_ADAPTER_L0)
${CMAKE_CURRENT_SOURCE_DIR}/queue.hpp
${CMAKE_CURRENT_SOURCE_DIR}/sampler.hpp
${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.hpp
${CMAKE_CURRENT_SOURCE_DIR}/helpers/memory_helpers.hpp
${CMAKE_CURRENT_SOURCE_DIR}/ur_level_zero.cpp
${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
${CMAKE_CURRENT_SOURCE_DIR}/context.cpp
Expand All @@ -136,6 +137,7 @@ if(UR_BUILD_ADAPTER_L0)
${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp
${CMAKE_CURRENT_SOURCE_DIR}/image.cpp
${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.cpp
${CMAKE_CURRENT_SOURCE_DIR}/helpers/memory_helpers.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp
)

Expand Down Expand Up @@ -199,13 +201,15 @@ if(UR_BUILD_ADAPTER_L0_V2)
${CMAKE_CURRENT_SOURCE_DIR}/platform.hpp
${CMAKE_CURRENT_SOURCE_DIR}/program.hpp
${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.hpp
${CMAKE_CURRENT_SOURCE_DIR}/helpers/memory_helpers.hpp
${CMAKE_CURRENT_SOURCE_DIR}/adapter.cpp
${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp
${CMAKE_CURRENT_SOURCE_DIR}/platform.cpp
${CMAKE_CURRENT_SOURCE_DIR}/program.cpp
${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.cpp
${CMAKE_CURRENT_SOURCE_DIR}/helpers/memory_helpers.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp
# v2-only sources
${CMAKE_CURRENT_SOURCE_DIR}/v2/command_list_cache.hpp
Expand All @@ -217,6 +221,7 @@ if(UR_BUILD_ADAPTER_L0_V2)
${CMAKE_CURRENT_SOURCE_DIR}/v2/event_provider.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/event.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/kernel.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/memory.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_api.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.hpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/usm.hpp
Expand All @@ -229,6 +234,7 @@ if(UR_BUILD_ADAPTER_L0_V2)
${CMAKE_CURRENT_SOURCE_DIR}/v2/event_provider_normal.cpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/event.cpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/kernel.cpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/memory.cpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_api.cpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_create.cpp
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.cpp
Expand Down
2 changes: 1 addition & 1 deletion source/adapters/level_zero/context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -512,7 +512,7 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool(
// Create one event ZePool per MaxNumEventsPerPool events
if (*ZePool == nullptr) {
ze_event_pool_counter_based_exp_desc_t counterBasedExt = {
ZE_STRUCTURE_TYPE_COUNTER_BASED_EVENT_POOL_EXP_DESC};
ZE_STRUCTURE_TYPE_COUNTER_BASED_EVENT_POOL_EXP_DESC, nullptr, 0};
ZeStruct<ze_event_pool_desc_t> ZeEventPoolDesc;
ZeEventPoolDesc.count = MaxNumEventsPerPool;
ZeEventPoolDesc.flags = 0;
Expand Down
35 changes: 22 additions & 13 deletions source/adapters/level_zero/event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -221,9 +221,8 @@ ur_result_t urEnqueueEventsWaitWithBarrier(
return UR_RESULT_SUCCESS;
}

ur_event_handle_t InternalEvent;
ur_event_handle_t ResultEvent = nullptr;
bool IsInternal = OutEvent == nullptr;
ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent;

// For in-order queue and wait-list which is empty or has events from
// the same queue just use the last command event as the barrier event.
Expand All @@ -234,7 +233,10 @@ ur_result_t urEnqueueEventsWaitWithBarrier(
EventWaitList) &&
Queue->LastCommandEvent && !Queue->LastCommandEvent->IsDiscarded) {
UR_CALL(ur::level_zero::urEventRetain(Queue->LastCommandEvent));
*Event = Queue->LastCommandEvent;
ResultEvent = Queue->LastCommandEvent;
if (OutEvent) {
*OutEvent = ResultEvent;
}
return UR_RESULT_SUCCESS;
}

Expand Down Expand Up @@ -264,16 +266,21 @@ ur_result_t urEnqueueEventsWaitWithBarrier(
EventWaitList, OkToBatch));

// Insert the barrier into the command-list and execute.
UR_CALL(insertBarrierIntoCmdList(CmdList, TmpWaitList, *Event, IsInternal));
UR_CALL(insertBarrierIntoCmdList(CmdList, TmpWaitList, ResultEvent,
IsInternal));

UR_CALL(Queue->executeCommandList(CmdList, false, OkToBatch));

// Because of the dependency between commands in the in-order queue we don't
// need to keep track of any active barriers if we have in-order queue.
if (UseMultipleCmdlistBarriers && !Queue->isInOrderQueue()) {
auto UREvent = reinterpret_cast<ur_event_handle_t>(*Event);
auto UREvent = reinterpret_cast<ur_event_handle_t>(ResultEvent);
Queue->ActiveBarriers.add(UREvent);
}

if (OutEvent) {
*OutEvent = ResultEvent;
}
return UR_RESULT_SUCCESS;
}

Expand Down Expand Up @@ -361,14 +368,14 @@ ur_result_t urEnqueueEventsWaitWithBarrier(
// Insert a barrier with the events from each command-queue into the
// convergence command list. The resulting event signals the convergence of
// all barriers.
UR_CALL(insertBarrierIntoCmdList(ConvergenceCmdList, BaseWaitList, *Event,
IsInternal));
UR_CALL(insertBarrierIntoCmdList(ConvergenceCmdList, BaseWaitList,
ResultEvent, IsInternal));
} else {
// If there is only a single queue then insert a barrier and the single
// result event can be used as our active barrier and used as the return
// event. Take into account whether output event is discarded or not.
UR_CALL(insertBarrierIntoCmdList(CmdLists[0], _ur_ze_event_list_t{}, *Event,
IsInternal));
UR_CALL(insertBarrierIntoCmdList(CmdLists[0], _ur_ze_event_list_t{},
ResultEvent, IsInternal));
}

// Execute each command list so the barriers can be encountered.
Expand All @@ -384,8 +391,10 @@ ur_result_t urEnqueueEventsWaitWithBarrier(
}

UR_CALL(Queue->ActiveBarriers.clear());
auto UREvent = reinterpret_cast<ur_event_handle_t>(*Event);
Queue->ActiveBarriers.add(UREvent);
Queue->ActiveBarriers.add(ResultEvent);
if (OutEvent) {
*OutEvent = ResultEvent;
}
return UR_RESULT_SUCCESS;
}

Expand Down Expand Up @@ -1508,8 +1517,8 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(

std::shared_lock<ur_shared_mutex> Lock(EventList[I]->Mutex);

ur_device_handle_t QueueRootDevice;
ur_device_handle_t CurrentQueueRootDevice;
ur_device_handle_t QueueRootDevice = nullptr;
ur_device_handle_t CurrentQueueRootDevice = nullptr;
if (Queue) {
QueueRootDevice = Queue->Device;
CurrentQueueRootDevice = CurQueueDevice;
Expand Down
33 changes: 33 additions & 0 deletions source/adapters/level_zero/helpers/memory_helpers.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
//===--------- memory_helpers.cpp - Level Zero Adapter -------------------===//
//
// Copyright (C) 2024 Intel Corporation
//
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
// Exceptions. See LICENSE.TXT
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "memory_helpers.hpp"
#include "../common.hpp"

// Query Level Zero for the memory type of the allocation backing `ptr`
// within `hContext` (e.g. host/device/shared, or ZE_MEMORY_TYPE_UNKNOWN
// for a pointer the context does not know about).
// TODO: use UMF once
// https://github.com/oneapi-src/unified-memory-framework/issues/687 is
// implemented
ze_memory_type_t getMemoryType(ze_context_handle_t hContext, void *ptr) {
  ZeStruct<ze_memory_allocation_properties_t> allocProps;
  ZE2UR_CALL_THROWS(zeMemGetAllocProperties,
                    (hContext, ptr, &allocProps, nullptr));
  return allocProps.type;
}

// Import `ptr` into USM host memory if the USM-import feature is enabled
// and the pointer is not already known to the context. Returns true when
// an import was performed, false otherwise.
bool maybeImportUSM(ze_driver_handle_t hTranslatedDriver,
                    ze_context_handle_t hContext, void *ptr, size_t size) {
  // Nothing to do when the feature is off or no host pointer was supplied.
  if (!ZeUSMImport.Enabled || ptr == nullptr)
    return false;
  // Already a USM allocation known to this context - leave it untouched.
  if (getMemoryType(hContext, ptr) != ZE_MEMORY_TYPE_UNKNOWN)
    return false;
  // Promote the host ptr to USM host memory.
  ZeUSMImport.doZeUSMImport(hTranslatedDriver, ptr, size);
  return true;
}
23 changes: 23 additions & 0 deletions source/adapters/level_zero/helpers/memory_helpers.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
//===--------- memory_helpers.hpp - Level Zero Adapter -------------------===//
//
// Copyright (C) 2024 Intel Corporation
//
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
// Exceptions. See LICENSE.TXT
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#pragma once

#include <ur_api.h>
#include <ze_api.h>

// If USM Import feature is enabled and hostptr is supplied,
// import the hostptr if not already imported into USM.
// Data transfer rate is maximized when both source and destination
// are USM pointers. Promotion of the host pointer to USM thus
// optimizes data transfer performance.
// Returns true when the pointer was imported, false otherwise.
bool maybeImportUSM(ze_driver_handle_t hTranslatedDriver,
                    ze_context_handle_t hContext, void *ptr, size_t size);

// Queries the Level Zero allocation properties of `ptr` in `hContext`
// and returns its memory type (ZE_MEMORY_TYPE_UNKNOWN when the pointer
// is not an allocation known to the context).
ze_memory_type_t getMemoryType(ze_context_handle_t hContext, void *ptr);
Loading

0 comments on commit 3dbb7a2

Please sign in to comment.