Skip to content

Commit

Permalink
ofi: follow user specified include/exclude list to select providers
Browse files Browse the repository at this point in the history
This PR addresses #12233

Since 5.0.x, we introduced an optional FI_HMEM capability in ofi provider
selection logic(both mtl and btl) in order to support accelerator memory.
As described in the issue, this introduced a bug that can cause the wrong
ofi provider to be selected, even if the user explicitly includes/excludes
the provider name.

This change refactors the selection logic to correctly handle the
include/exclude list, and therefore fixes the bug.

Signed-off-by: Wenduo Wang <wenduwan@amazon.com>
  • Loading branch information
wenduwan committed Jan 16, 2024
1 parent 5fa32f7 commit 29efcef
Show file tree
Hide file tree
Showing 4 changed files with 121 additions and 22 deletions.
10 changes: 9 additions & 1 deletion ompi/mca/mtl/ofi/mtl_ofi_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -764,12 +764,20 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
"%s:%d: fi_getinfo(): %s\n",
__FILE__, __LINE__, fi_strerror(-ret));

if (FI_ENODATA == -ret) {
if ((FI_ENODATA == -ret)
|| (0 == ret && include_list
&& 0 == opal_common_ofi_count_providers_in_list(providers, include_list))
|| (0 == ret && !include_list && exclude_list
&& opal_common_ofi_providers_subset_of_list(providers, exclude_list))) {
#if defined(FI_HMEM)
/* Attempt selecting a provider without FI_HMEM hints */
if (hints->caps & FI_HMEM) {
hints->caps &= ~FI_HMEM;
hints->domain_attr->mr_mode &= ~FI_MR_HMEM;
if (providers) {
(void) fi_freeinfo(providers);
providers = NULL;
}
goto no_hmem;
}
#endif
Expand Down
47 changes: 33 additions & 14 deletions opal/mca/btl/ofi/btl_ofi_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules,
int rc;
uint64_t progress_mode;
unsigned resource_count = 0;
struct mca_btl_base_module_t **base_modules;
struct mca_btl_base_module_t **base_modules = NULL;
char **include_list = NULL, **exclude_list = NULL;

BTL_VERBOSE(("initializing ofi btl"));
Expand All @@ -275,7 +275,7 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules,
return NULL;
}

struct fi_info *info, *info_list, *selected_info;
struct fi_info *info, *info_list = NULL, *selected_info = NULL;
struct fi_info hints = {0};
struct fi_ep_attr ep_attr = {0};
struct fi_rx_attr rx_attr = {0};
Expand Down Expand Up @@ -366,22 +366,31 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules,
* The earliest version the explictly allow provider to call CUDA API is 1.18 */
rc = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, &hints, &info_list);
if (FI_ENOSYS == -rc) {
rc = fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, &hints, &info_list);
rc = fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, &hints, &info_list);
}
if (0 != rc) {

if ((FI_ENODATA == -rc)
|| (0 == rc && include_list
&& 0 == opal_common_ofi_count_providers_in_list(info_list, include_list))
|| (0 == rc && !include_list && exclude_list
&& opal_common_ofi_providers_subset_of_list(info_list, exclude_list))) {
#if defined(FI_HMEM)
/* Attempt selecting a provider without FI_HMEM hints */
if (hints.caps & FI_HMEM) {
/* Try again without FI_HMEM hints */
hints.caps &= ~FI_HMEM;
hints.domain_attr->mr_mode &= ~FI_MR_HMEM;
if (info_list) {
(void) fi_freeinfo(info_list);
info_list = NULL;
}
goto no_hmem;
}
#endif
/* It is not an error if no information is returned. */
goto out;
} else if (0 != rc) {
BTL_VERBOSE(("fi_getinfo failed with code %d: %s", rc, fi_strerror(-rc)));
if (NULL != include_list) {
opal_argv_free(include_list);
}
return NULL;
goto out;
}

#if defined(FI_HMEM)
Expand Down Expand Up @@ -441,16 +450,15 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules,
info = info->next;
}

/* We are done with the returned info. */
fi_freeinfo(info_list);
if (NULL != include_list) {
opal_argv_free(include_list);
if (NULL == info) {
BTL_VERBOSE(("No provider is selected"));
goto out;
}

/* pass module array back to caller */
base_modules = calloc(mca_btl_ofi_component.module_count, sizeof(*base_modules));
if (NULL == base_modules) {
return NULL;
goto out;
}

memcpy(base_modules, mca_btl_ofi_component.modules,
Expand All @@ -461,6 +469,17 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules,

*num_btl_modules = mca_btl_ofi_component.module_count;

out:
if (include_list) {
opal_argv_free(include_list);
}
if (exclude_list) {
opal_argv_free(exclude_list);
}
if (info_list) {
(void) fi_freeinfo(info_list);
}

return base_modules;
}

Expand Down
54 changes: 47 additions & 7 deletions opal/mca/common/ofi/common_ofi.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,15 @@ static int opal_common_ofi_init_ref_cnt = 0;
static bool opal_common_ofi_installed_memory_monitor = false;
#endif

/* Count providers returns the number of providers present in an fi_info list
* @param (IN) provider_list struct fi_info* list of providers available
*
* @param (OUT) int number of providers present in the list
*
* returns 0 if the list is NULL
*/
static int count_providers(struct fi_info *provider_list);

#ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR

/*
Expand Down Expand Up @@ -272,6 +281,44 @@ int opal_common_ofi_is_in_list(char **list, char *item)
return 0;
}

int opal_common_ofi_count_providers_in_list(struct fi_info *provider_list, char **list)
{
int count = 0, matched = 0;
struct fi_info *prov = provider_list, *prev_prov = NULL;
char *name;

while (prov) {
name = prov->fabric_attr->prov_name;
if (prev_prov && !strncasecmp(prev_prov->fabric_attr->prov_name, name, strlen(name))) {
/**
* Providers are usually sorted by name. We can reuse the previous matching result and
* avoid the potentially expensive list traversal.
*/
count += matched;
} else if (opal_common_ofi_is_in_list(list, prov->fabric_attr->prov_name)) {
matched = 1;
++count;
} else {
matched = 0;
}
prev_prov = prov;
prov = prov->next;
}

return count;
}

int opal_common_ofi_providers_subset_of_list(struct fi_info *provider_list, char **list)
{
int num_prov = count_providers(provider_list);

if (!num_prov) {
return 1;
}

return num_prov == opal_common_ofi_count_providers_in_list(provider_list, list);
}

int opal_common_ofi_mca_register(const mca_base_component_t *component)
{
static int include_index = -1;
Expand Down Expand Up @@ -737,13 +784,6 @@ static struct fi_info *select_provider_round_robin(struct fi_info *provider_list
return current_provider;
}

/* Count providers returns the number of providers present in an fi_info list
* @param (IN) provider_list struct fi_info* list of providers available
*
* @param (OUT) int number of providers present in the list
*
* returns 0 if the list is NULL
*/
static int count_providers(struct fi_info *provider_list)
{
struct fi_info *dev = provider_list;
Expand Down
32 changes: 32 additions & 0 deletions opal/mca/common/ofi/common_ofi.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,38 @@ OPAL_DECLSPEC int opal_common_ofi_export_memory_monitor(void);
*/
OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item);

/**
* Get the number of providers whose names are included in a list
*
* This function takes a list of providers and a list of name strings
* as inputs, and return the number of providers whose names are included
* in the name strings.
*
* @param provider_list (IN) List of providers
* @param list (IN) List of name string
*
* @return Number of matched providers
*
*/
OPAL_DECLSPEC int opal_common_ofi_count_providers_in_list(struct fi_info *provider_list,
char **list);

/**
* Determine whether all providers are included in a list
*
* This function takes a list of providers and a list of name strings
* as inputs, and return whether all provider names are included in the name strings.
*
* @param provider_list (IN) List of providers
* @param list (IN) List of name string
*
* @return 0 At least one provider's name is not included in the name strings.
* @return 1 All provider names are included in the name strings.
*
*/
OPAL_DECLSPEC int opal_common_ofi_providers_subset_of_list(struct fi_info *provider_list,
char **list);

/**
* Selects NIC (provider) based on hardware locality
*
Expand Down

0 comments on commit 29efcef

Please sign in to comment.