diff --git a/src/uct/rocm/base/rocm_signal.c b/src/uct/rocm/base/rocm_signal.c index f5273f1a2ae3..68778426d250 100644 --- a/src/uct/rocm/base/rocm_signal.c +++ b/src/uct/rocm/base/rocm_signal.c @@ -45,6 +45,7 @@ unsigned uct_rocm_base_progress(ucs_queue_head_t *signal_queue) static const unsigned max_signals = 16; unsigned count = 0; uct_rocm_base_signal_desc_t *rocm_signal; + hsa_status_t status; ucs_queue_for_each_extract(rocm_signal, signal_queue, queue, (hsa_signal_load_scacquire(rocm_signal->signal) == 0) && @@ -53,6 +54,14 @@ unsigned uct_rocm_base_progress(ucs_queue_head_t *signal_queue) uct_invoke_completion(rocm_signal->comp, UCS_OK); } + if (rocm_signal->mapped_addr != NULL) { + status = hsa_amd_ipc_memory_detach(rocm_signal->mapped_addr); + if (status != HSA_STATUS_SUCCESS) { + ucs_warn("failed to detach ipc memory region"); + } + rocm_signal->mapped_addr = NULL; + } + ucs_trace_poll("rocm signal done :%p", rocm_signal); ucs_mpool_put(rocm_signal); count++; diff --git a/src/uct/rocm/copy/rocm_copy_ep.c b/src/uct/rocm/copy/rocm_copy_ep.c index 7b675afa813c..38babf567805 100644 --- a/src/uct/rocm/copy/rocm_copy_ep.c +++ b/src/uct/rocm/copy/rocm_copy_ep.c @@ -221,7 +221,7 @@ ucs_status_t uct_rocm_copy_ep_zcopy(uct_ep_h tl_ep, uint64_t remote_addr, ret = UCS_OK; } else { rocm_copy_signal->comp = comp; - rocm_copy_signal->mapped_addr = dst_addr; + rocm_copy_signal->mapped_addr = NULL; ucs_queue_push(&iface->signal_queue, &rocm_copy_signal->queue); } diff --git a/src/uct/rocm/ipc/rocm_ipc_ep.c b/src/uct/rocm/ipc/rocm_ipc_ep.c index e22dba06b9e3..9195eecc4084 100644 --- a/src/uct/rocm/ipc/rocm_ipc_ep.c +++ b/src/uct/rocm/ipc/rocm_ipc_ep.c @@ -94,11 +94,21 @@ ucs_status_t uct_rocm_ipc_ep_zcopy(uct_ep_h tl_ep, return UCS_ERR_INVALID_ADDR; } - ret = uct_rocm_ipc_cache_map_memhandle((void *)ep->remote_memh_cache, key, + if (iface->config.enable_ipc_handle_cache) { + ret = uct_rocm_ipc_cache_map_memhandle((void *)ep->remote_memh_cache, key, + &remote_base_addr); + if (ucs_unlikely(ret != UCS_OK)) { + ucs_error("fail to attach ipc mem %p %d\n", (void *)key->address, ret); + return ret; + } + } else { + status = hsa_amd_ipc_memory_attach(&key->ipc, key->length, 0, NULL, &remote_base_addr); - if (ret != UCS_OK) { - ucs_error("fail to attach ipc mem %p %d\n", (void *)key->address, ret); - return ret; + if (ucs_unlikely(status != HSA_STATUS_SUCCESS)) { + ucs_error("failed to open ipc mem handle. addr:%p len:%lu", + (void *)key->address, key->length); + return UCS_ERR_INVALID_ADDR; + } } remote_copy_addr = UCS_PTR_BYTE_OFFSET(remote_base_addr, @@ -156,7 +166,11 @@ ucs_status_t uct_rocm_ipc_ep_zcopy(uct_ep_h tl_ep, } rocm_ipc_signal->comp = comp; - rocm_ipc_signal->mapped_addr = remote_base_addr; + if (iface->config.enable_ipc_handle_cache) { + rocm_ipc_signal->mapped_addr = NULL; + } else { + rocm_ipc_signal->mapped_addr = remote_base_addr; + } ucs_queue_push(&iface->signal_queue, &rocm_ipc_signal->queue); ucs_trace("rocm async copy issued :%p remote:%p, local:%p len:%ld", diff --git a/src/uct/rocm/ipc/rocm_ipc_iface.c b/src/uct/rocm/ipc/rocm_ipc_iface.c index 4e51f04cb33a..5b2ba23f700d 100644 --- a/src/uct/rocm/ipc/rocm_ipc_iface.c +++ b/src/uct/rocm/ipc/rocm_ipc_iface.c @@ -27,7 +27,11 @@ static ucs_config_field_t uct_rocm_ipc_iface_config_table[] = { {"LAT", "1e-7", "Latency", ucs_offsetof(uct_rocm_ipc_iface_config_t, latency), UCS_CONFIG_TYPE_TIME}, - {NULL} + {"CACHE_IPC_HANDLES", "y", "Enable caching IPC handles", + ucs_offsetof(uct_rocm_ipc_iface_config_t, enable_ipc_handle_cache), + UCS_CONFIG_TYPE_BOOL}, + + {NULL} }; static double uct_rocm_ipc_iface_get_bw() @@ -201,6 +205,7 @@ static UCS_CLASS_INIT_FUNC(uct_rocm_ipc_iface_t, uct_md_h md, uct_worker_h worke self->config.min_zcopy = config->min_zcopy; self->config.latency = config->latency; + self->config.enable_ipc_handle_cache = config->enable_ipc_handle_cache; ucs_mpool_params_reset(&mp_params); mp_params.elem_size = sizeof(uct_rocm_base_signal_desc_t); diff --git a/src/uct/rocm/ipc/rocm_ipc_iface.h b/src/uct/rocm/ipc/rocm_ipc_iface.h index 9afada6e6436..946e4def13dd 100644 --- a/src/uct/rocm/ipc/rocm_ipc_iface.h +++ b/src/uct/rocm/ipc/rocm_ipc_iface.h @@ -20,6 +20,7 @@ typedef struct uct_rocm_ipc_iface { struct { size_t min_zcopy; double latency; + int enable_ipc_handle_cache; } config; } uct_rocm_ipc_iface_t; @@ -28,6 +29,7 @@ typedef struct uct_rocm_ipc_iface_config { uct_iface_config_t super; size_t min_zcopy; double latency; + int enable_ipc_handle_cache; } uct_rocm_ipc_iface_config_t; #endif