Skip to content

Commit

Permalink
UCT/ROCM: add control of ipc cache usage
Browse files Browse the repository at this point in the history
Add a UCX configuration parameter (CACHE_IPC_HANDLES, enabled by default)
that controls whether the ROCm IPC handle cache is used.
  • Loading branch information
edgargabriel committed Feb 17, 2025
1 parent baf8cb2 commit 17166be
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 6 deletions.
9 changes: 9 additions & 0 deletions src/uct/rocm/base/rocm_signal.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ unsigned uct_rocm_base_progress(ucs_queue_head_t *signal_queue)
static const unsigned max_signals = 16;
unsigned count = 0;
uct_rocm_base_signal_desc_t *rocm_signal;
hsa_status_t status;

ucs_queue_for_each_extract(rocm_signal, signal_queue, queue,
(hsa_signal_load_scacquire(rocm_signal->signal) == 0) &&
Expand All @@ -53,6 +54,14 @@ unsigned uct_rocm_base_progress(ucs_queue_head_t *signal_queue)
uct_invoke_completion(rocm_signal->comp, UCS_OK);
}

if (rocm_signal->mapped_addr != NULL) {
status = hsa_amd_ipc_memory_detach(rocm_signal->mapped_addr);
if (status != HSA_STATUS_SUCCESS) {
ucs_warn("failed to detach ipc memory region");
}
rocm_signal->mapped_addr = NULL;
}

ucs_trace_poll("rocm signal done :%p", rocm_signal);
ucs_mpool_put(rocm_signal);
count++;
Expand Down
2 changes: 1 addition & 1 deletion src/uct/rocm/copy/rocm_copy_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ ucs_status_t uct_rocm_copy_ep_zcopy(uct_ep_h tl_ep, uint64_t remote_addr,
ret = UCS_OK;
} else {
rocm_copy_signal->comp = comp;
rocm_copy_signal->mapped_addr = dst_addr;
rocm_copy_signal->mapped_addr = NULL;
ucs_queue_push(&iface->signal_queue, &rocm_copy_signal->queue);
}

Expand Down
25 changes: 20 additions & 5 deletions src/uct/rocm/ipc/rocm_ipc_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -94,11 +94,22 @@ ucs_status_t uct_rocm_ipc_ep_zcopy(uct_ep_h tl_ep,
return UCS_ERR_INVALID_ADDR;
}

ret = uct_rocm_ipc_cache_map_memhandle((void *)ep->remote_memh_cache, key,
if (iface->config.enable_ipc_handle_cache) {
ret = uct_rocm_ipc_cache_map_memhandle((void*)ep->remote_memh_cache,
key, &remote_base_addr);
if (ucs_unlikely(ret != UCS_OK)) {
ucs_error("fail to attach ipc mem %p %d\n", (void*)key->address,
ret);
return ret;
}
} else {
status = hsa_amd_ipc_memory_attach(&key->ipc, key->length, 0, NULL,
&remote_base_addr);
if (ret != UCS_OK) {
ucs_error("fail to attach ipc mem %p %d\n", (void *)key->address, ret);
return ret;
if (ucs_unlikely(status != HSA_STATUS_SUCCESS)) {
ucs_error("failed to open ipc mem handle. addr:%p len:%lu",
(void*)key->address, key->length);
return UCS_ERR_INVALID_ADDR;
}
}

remote_copy_addr = UCS_PTR_BYTE_OFFSET(remote_base_addr,
Expand Down Expand Up @@ -156,7 +167,11 @@ ucs_status_t uct_rocm_ipc_ep_zcopy(uct_ep_h tl_ep,
}

rocm_ipc_signal->comp = comp;
rocm_ipc_signal->mapped_addr = remote_base_addr;
if (iface->config.enable_ipc_handle_cache) {
rocm_ipc_signal->mapped_addr = NULL;
} else {
rocm_ipc_signal->mapped_addr = remote_base_addr;
}
ucs_queue_push(&iface->signal_queue, &rocm_ipc_signal->queue);

ucs_trace("rocm async copy issued :%p remote:%p, local:%p len:%ld",
Expand Down
5 changes: 5 additions & 0 deletions src/uct/rocm/ipc/rocm_ipc_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ static ucs_config_field_t uct_rocm_ipc_iface_config_table[] = {
{"LAT", "1e-7", "Latency",
ucs_offsetof(uct_rocm_ipc_iface_config_t, latency), UCS_CONFIG_TYPE_TIME},

{"CACHE_IPC_HANDLES", "y", "Enable caching IPC handles",
ucs_offsetof(uct_rocm_ipc_iface_config_t, enable_ipc_handle_cache),
UCS_CONFIG_TYPE_BOOL},

{NULL}
};

Expand Down Expand Up @@ -201,6 +205,7 @@ static UCS_CLASS_INIT_FUNC(uct_rocm_ipc_iface_t, uct_md_h md, uct_worker_h worke

self->config.min_zcopy = config->min_zcopy;
self->config.latency = config->latency;
self->config.enable_ipc_handle_cache = config->enable_ipc_handle_cache;

ucs_mpool_params_reset(&mp_params);
mp_params.elem_size = sizeof(uct_rocm_base_signal_desc_t);
Expand Down
2 changes: 2 additions & 0 deletions src/uct/rocm/ipc/rocm_ipc_iface.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ typedef struct uct_rocm_ipc_iface {
struct {
size_t min_zcopy;
double latency;
int enable_ipc_handle_cache;
} config;

} uct_rocm_ipc_iface_t;
Expand All @@ -28,6 +29,7 @@ typedef struct uct_rocm_ipc_iface_config {
uct_iface_config_t super;
size_t min_zcopy;
double latency;
int enable_ipc_handle_cache;
} uct_rocm_ipc_iface_config_t;

#endif

0 comments on commit 17166be

Please sign in to comment.