Skip to content

Commit

Permalink
Merge branch 'ttg-device-support-master-coro-with-stream-tasks' of gi…
Browse files Browse the repository at this point in the history
…thub.com:devreal/ttg into ttg-device-support-master-coro-with-stream-tasks
  • Loading branch information
therault committed Nov 16, 2023
2 parents 97d0bf6 + af6f684 commit c2da1f5
Show file tree
Hide file tree
Showing 5 changed files with 145 additions and 47 deletions.
44 changes: 42 additions & 2 deletions ttg/ttg/device/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,14 @@


namespace ttg::device {

#if defined(TTG_HAVE_CUDA)
constexpr ttg::ExecutionSpace available_execution_space = ttg::ExecutionSpace::CUDA;
#elif defined(TTG_HAVE_HIP)
constexpr ttg::ExecutionSpace available_execution_space = ttg::ExecutionSpace::HIP;
#elif defined(TTG_HAVE_LEVEL_ZERO)
constexpr ttg::ExecutionSpace available_execution_space = ttg::ExecutionSpace::L0;
#endif
class Device {
int m_id = 0;
ttg::ExecutionSpace m_space = ttg::ExecutionSpace::Host;
Expand Down Expand Up @@ -36,11 +44,11 @@ namespace ttg::device {
}

bool is_device() const {
return ((!is_invalid()) && (m_space != ttg::ExecutionSpace::Host));
return !is_host();
}

bool is_host() const {
return (m_space == ttg::ExecutionSpace::Host);
return !is_invalid() && (m_space == ttg::ExecutionSpace::Host);
}

bool is_invalid() const {
Expand Down Expand Up @@ -120,4 +128,36 @@ namespace ttg::device {
}
} // namespace ttg::device

#elif defined(TTG_HAVE_LEVEL_ZERO)

#include <CL/sycl.hpp>

namespace ttg::device {
  namespace detail {
    /// Thread-local record of the device the calling thread's task executes on.
    inline thread_local ttg::device::Device current_device_ts = {};
    /// Thread-local SYCL queue used as the current stream (nullptr = default stream).
    inline thread_local sycl::queue* current_stream_ts = nullptr; // default stream

    /// Reset the calling thread's current device/stream to the defaults
    /// (default-constructed Device, no stream).
    /// Marked inline: this is a header-defined function, a non-inline
    /// definition would violate the ODR when included from multiple TUs.
    inline void reset_current() {
      current_device_ts = {};
      current_stream_ts = nullptr;
    }

    /// Make \p device / \p stream current for the calling thread.
    /// \note This is the Level Zero (SYCL) branch, so the device is tagged
    ///       with ExecutionSpace::L0 (was erroneously tagged HIP).
    inline void set_current(int device, sycl::queue& stream) {
      current_device_ts = ttg::device::Device(device, ttg::ExecutionSpace::L0);
      current_stream_ts = &stream;
    }
  } // namespace detail

  /// \return the device current to the calling thread
  inline
  Device current_device() {
    return detail::current_device_ts;
  }

  /// \return the SYCL queue current to the calling thread
  /// \warning dereferences a null pointer if no stream was set
  ///          (see detail::set_current / detail::reset_current)
  inline
  const sycl::queue& current_stream() {
    return *detail::current_stream_ts;
  }
} // namespace ttg::device


#endif // defined(TTG_HAVE_LEVEL_ZERO)
1 change: 1 addition & 0 deletions ttg/ttg/execution.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ enum class ExecutionSpace {
Host, // a CPU
CUDA, // an NVIDIA CUDA device
HIP, // an AMD HIP device
L0, // an Intel L0 device
Invalid
};

Expand Down
45 changes: 23 additions & 22 deletions ttg/ttg/parsec/buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "ttg/parsec/parsec-ext.h"
#include "ttg/util/iovec.h"
#include "ttg/device/device.h"
#include "ttg/parsec/device.h"

#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)
#include <cuda_runtime.h>
Expand Down Expand Up @@ -75,9 +76,6 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t

public:

/* The device ID of the CPU. */
static constexpr int cpu_device = -2;

buffer() : buffer(nullptr, 0)
{ }

Expand Down Expand Up @@ -161,33 +159,35 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t

/* set the current device, useful when a device
* buffer was modified outside of a TTG */
void set_current_device(int device_id) {
void set_current_device(const ttg::device::Device& device) {
assert(is_valid());
/* make sure it's a valid device */
assert(parsec_nb_devices > device_id);
/* make sure it's a valid copy */
assert(m_data->device_copies[device_id+2] != nullptr);
m_data->owner_device = device_id+2;
int parsec_id = detail::device_to_parsec_device(device);
assert(m_data->device_copies[parsec_id] != nullptr);
m_data->owner_device = parsec_id;
}

/* Get the owner device ID, i.e., the last updated
* device buffer. A value of -2 designates the host
* as the current device. */
int get_owner_device() const {
* device buffer. */
ttg::device::Device get_owner_device() const {
assert(is_valid());
return m_data->owner_device - 2; // 0: host, 1: recursive, 2: first device
return detail::parsec_device_to_device(m_data->owner_device);
}

/* Get the pointer on the currently active device. */
element_type* current_device_ptr() {
assert(is_valid());
return static_cast<element_type*>(m_data->device_copies[ttg::device::current_device()+2]->device_private);
int device_id = ttg::device::current_device()+detail::first_device_id;
return static_cast<element_type*>(m_data->device_copies[device_id]->device_private);
}

/* Get the pointer on the currently active device. */
const element_type* current_device_ptr() const {
assert(is_valid());
return static_cast<element_type*>(m_data->device_copies[ttg::device::current_device()+2]->device_private);
int device_id = ttg::device::current_device()+detail::first_device_id;
return static_cast<element_type*>(m_data->device_copies[device_id]->device_private);
}

/* Get the pointer on the owning device.
Expand All @@ -205,19 +205,19 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t
}

/* get the device pointer at the given device
* \sa cpu_device
*/
element_type* device_ptr_on(int device_id) {
element_type* device_ptr_on(const ttg::device::Device& device) {
assert(is_valid());
return static_cast<element_type*>(parsec_data_get_ptr(m_data.get(), device_id + 2));
int device_id = detail::device_to_parsec_device(device);
return static_cast<element_type*>(parsec_data_get_ptr(m_data.get(), device_id));
}

/* get the device pointer at the given device
* \sa cpu_device
*/
const element_type* device_ptr_on(int device_id) const {
const element_type* device_ptr_on(const ttg::device::Device& device) const {
assert(is_valid());
return static_cast<element_type*>(parsec_data_get_ptr(m_data.get(), device_id + 2)); // GPUs start at 2
int device_id = detail::device_to_parsec_device(device);
return static_cast<element_type*>(parsec_data_get_ptr(m_data.get(), device_id));
}

element_type* host_ptr() {
Expand All @@ -228,12 +228,13 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t
return static_cast<element_type*>(parsec_data_get_ptr(m_data.get(), 0));
}

bool is_valid_on(int device_id) const {
bool is_valid_on(const ttg::device::Device& device) const {
assert(is_valid());
return (parsec_data_get_ptr(m_data.get(), device_id+2) != nullptr);
int device_id = detail::device_to_parsec_device(device);
return (parsec_data_get_ptr(m_data.get(), device_id) != nullptr);
}

void allocate_on(int device_id) {
void allocate_on(const ttg::device::Device& device_id) {
/* TODO: need exposed PaRSEC memory allocator */
throw std::runtime_error("not implemented yet");
}
Expand All @@ -252,7 +253,7 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t
/* Unpin the memory on all devices we currently track. */
void unpin() {
if (!is_valid()) return;
for (int i = 0; i < parsec_nb_devices-2; ++i) {
for (int i = 0; i < parsec_nb_devices-detail::first_device_id; ++i) {
unpin_on(i);
}
}
Expand Down
40 changes: 40 additions & 0 deletions ttg/ttg/parsec/device.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#ifndef TTG_PARSEC_DEVICE_H
#define TTG_PARSEC_DEVICE_H

#include "ttg/device/device.h"

namespace ttg_parsec {

  namespace detail {

    /// PaRSEC device index of the first accelerator; host-side devices occupy
    /// the indices below it in the PaRSEC ID-space. Initialized at startup.
    inline int first_device_id = 0;

    /**
     * Translate a TTG device into the corresponding PaRSEC device index.
     * The host always maps to PaRSEC device 0; accelerators are offset
     * by first_device_id.
     */
    inline
    int device_to_parsec_device(const ttg::device::Device& device) {
      return device.is_host() ? 0 : device.id() + first_device_id;
    }

    /**
     * Translate a PaRSEC device index back into a TTG device.
     * Indices at or above first_device_id are accelerators in the execution
     * space this build was configured for; anything below is host-side.
     */
    inline
    ttg::device::Device parsec_device_to_device(int parsec_id) {
      if (parsec_id >= first_device_id) {
        return ttg::device::Device(parsec_id - first_device_id,
                                   ttg::device::available_execution_space);
      }
      return ttg::device::Device(parsec_id, ttg::ExecutionSpace::Host);
    }
  } // namespace detail

} // namespace ttg_parsec

#endif // TTG_PARSEC_DEVICE_H
62 changes: 39 additions & 23 deletions ttg/ttg/parsec/ttg.h
Original file line number Diff line number Diff line change
Expand Up @@ -974,6 +974,14 @@ namespace ttg_parsec {
std::shared_ptr<ttg::base::WorldImplBase> world_sptr{static_cast<ttg::base::WorldImplBase *>(world_ptr)};
ttg::World world{std::move(world_sptr)};
ttg::detail::set_default_world(std::move(world));

// query the first device ID
for (int i = 0; i < parsec_nb_devices; ++i) {
if (parsec_mca_device_is_gpu(i)) {
detail::first_device_id = i;
break;
}
}
}
inline void ttg_finalize() {
// We need to notify the current taskpool of termination if we are in user termination detection mode
Expand Down Expand Up @@ -1329,19 +1337,28 @@ namespace ttg_parsec {
#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA)
{
parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)gpu_stream;
int device = gpu_device->super.device_index - 2; // 0: host, 1: recursive, 2: first GPU
int device = gpu_device->super.device_index - detail::first_device_id;
ttg::device::detail::set_current(device, cuda_stream->cuda_stream);
}
#endif // defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA)

#if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) && defined(TTG_HAVE_HIP)
{
parsec_hip_exec_stream_t *hip_stream = (parsec_hip_exec_stream_t *)gpu_stream;
int device = gpu_device->super.device_index - 2; // 0: host, 1: recursive, 2: first GPU
int device = gpu_device->super.device_index - detail::first_device_id;
ttg::device::detail::set_current(device, hip_stream->hip_stream);
}
#endif // defined(PARSEC_HAVE_DEV_HIP_SUPPORT) && defined(TTG_HAVE_HIP)

#if defined(PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT) && defined(TTG_HAVE_LEVEL_ZERO)
{
parsec_level_zero_exec_stream_t *stream;
stream = (parsec_level_zero_exec_stream_t *)gpu_stream;
int device = gpu_device->super.device_index - detail::first_device_id;
ttg::device::detail::set_current(device, stream->swq->queue);
}
#endif // defined(PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT) && defined(TTG_HAVE_LEVEL_ZERO)

/* Here we call back into the coroutine again after the transfers have completed */
static_op<Space>(&task->parsec_task);

Expand Down Expand Up @@ -1394,24 +1411,14 @@ namespace ttg_parsec {
}
}
}
#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)
static int
static_cuda_stage_in(parsec_gpu_task_t *gtask,
uint32_t flow_mask,
parsec_gpu_exec_stream_t *gpu_stream) {
static_device_stage_in(gtask, flow_mask, gpu_stream);
return parsec_default_cuda_stage_in(gtask, flow_mask, gpu_stream);
}
#endif // PARSEC_HAVE_DEV_CUDA_SUPPORT
#if defined(PARSEC_HAVE_DEV_HIP_SUPPORT)

static int
static_hip_stage_in(parsec_gpu_task_t *gtask,
uint32_t flow_mask,
parsec_gpu_exec_stream_t *gpu_stream) {
static_device_stage_in_hook(parsec_gpu_task_t *gtask,
uint32_t flow_mask,
parsec_gpu_exec_stream_t *gpu_stream) {
static_device_stage_in(gtask, flow_mask, gpu_stream);
return parsec_default_hip_stage_in(gtask, flow_mask, gpu_stream);
return parsec_default_gpu_stage_in(gtask, flow_mask, gpu_stream);
}
#endif

template <ttg::ExecutionSpace Space>
static parsec_hook_return_t device_static_op(parsec_task_t* parsec_task) {
Expand Down Expand Up @@ -1493,21 +1500,30 @@ namespace ttg_parsec {
if constexpr (Space == ttg::ExecutionSpace::CUDA) {
/* TODO: we need custom staging functions because PaRSEC looks at the
* task-class to determine the number of flows. */
gpu_task->stage_in = static_cuda_stage_in;
gpu_task->stage_out = parsec_default_cuda_stage_out;
return parsec_cuda_kernel_scheduler(es, gpu_task, dev_index);
gpu_task->stage_in = static_device_stage_in_hook;
gpu_task->stage_out = parsec_default_gpu_stage_out;
return parsec_device_kernel_scheduler(device, es, gpu_task);
}
break;
#endif
#if defined(PARSEC_HAVE_DEV_HIP_SUPPORT)
case PARSEC_DEV_HIP:
if constexpr (Space == ttg::ExecutionSpace::HIP) {
gpu_task->stage_in = static_hip_stage_in;
gpu_task->stage_out = parsec_default_hip_stage_out;
return parsec_hip_kernel_scheduler(es, gpu_task, dev_index);
gpu_task->stage_in = static_device_stage_in_hook;
gpu_task->stage_out = parsec_default_gpu_stage_out;
return parsec_device_kernel_scheduler(device, es, gpu_task);
}
break;
#endif // PARSEC_HAVE_DEV_HIP_SUPPORT
#if defined(PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT)
case PARSEC_DEV_LEVEL_ZERO:
if constexpr (Space == ttg::ExecutionSpace::L0) {
gpu_task->stage_in = static_device_stage_in_hook;
gpu_task->stage_out = parsec_default_gpu_stage_out;
return parsec_device_kernel_scheduler(device, es, gpu_task);
}
break;
#endif // PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT
default:
break;
}
Expand Down

0 comments on commit c2da1f5

Please sign in to comment.