From ba0bce4b3c9c942f0aa2e0a0fd598cb6b62d2a25 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Mon, 5 Jun 2023 23:49:30 -0400 Subject: [PATCH] Ask for the modex of all jobs connected. The original code was merging the local modex with the modex of the local processes on the first jobid. This led to incorrect and mismatched information among processes when joining multiple jobid processes (such as on the second merged spawn). This patch iterates over all the jobids on the list of "to connect" processes and adds their information to the local modex. Fixes #11724. Signed-off-by: George Bosilca --- ompi/dpm/dpm.c | 124 +++++++++++++++++++++++++++---------------------- 1 file changed, 69 insertions(+), 55 deletions(-) diff --git a/ompi/dpm/dpm.c b/ompi/dpm/dpm.c index 5dfcd67ce8b..656a45d8a41 100644 --- a/ompi/dpm/dpm.c +++ b/ompi/dpm/dpm.c @@ -259,6 +259,7 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, * fail. */ if (0 >= rportlen) { rc = rportlen; + /* no need to free here, the root has already done it and everyone else has not yet allocated the rport array */ goto exit; } @@ -406,72 +407,85 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, OPAL_LIST_DESTRUCT(&rlist); goto exit; } - if (0 < opal_list_get_size(&ilist)) { - uint32_t *peer_ranks = NULL; + if (!opal_list_is_empty(&ilist)) { int prn, nprn = 0; char *val; - uint16_t u16; opal_process_name_t wildcard_rank; + i = 0; /* start from the beginning */ + /* convert the list of new procs to a proc_t array */ new_proc_list = (ompi_proc_t**)calloc(opal_list_get_size(&ilist), sizeof(ompi_proc_t *)); - /* get the list of local peers for the new procs */ - cd = (ompi_dpm_proct_caddy_t*)opal_list_get_first(&ilist); - proc = cd->p; - wildcard_rank.jobid = proc->super.proc_name.jobid; - wildcard_rank.vpid = OMPI_NAME_WILDCARD->vpid; - /* retrieve the local peers */ - OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCAL_PEERS, - &wildcard_rank, &val, PMIX_STRING); - if (OPAL_SUCCESS == rc 
&& NULL != val) { - char **peers = opal_argv_split(val, ','); - free(val); - nprn = opal_argv_count(peers); - peer_ranks = (uint32_t*)calloc(nprn, sizeof(uint32_t)); - for (prn = 0; NULL != peers[prn]; prn++) { - peer_ranks[prn] = strtoul(peers[prn], NULL, 10); - } - opal_argv_free(peers); - } - - i = 0; - OPAL_LIST_FOREACH(cd, &ilist, ompi_dpm_proct_caddy_t) { + /* Extract the modex info for the first proc on the ilist, and then + * remove all processors in the same jobid from the list by getting + * their connection information and moving them into the proc array. + */ + do { + uint32_t *local_ranks_in_jobid = NULL; + ompi_dpm_proct_caddy_t* next = NULL; + cd = (ompi_dpm_proct_caddy_t*)opal_list_get_first(&ilist); proc = cd->p; - new_proc_list[i] = proc ; - /* ompi_proc_complete_init_single() initializes and optionally retrieves - * OPAL_PMIX_LOCALITY and OPAL_PMIX_HOSTNAME. since we can live without - * them, we are just fine */ - ompi_proc_complete_init_single(proc); - /* if this proc is local, then get its locality */ - if (NULL != peer_ranks) { - for (prn=0; prn < nprn; prn++) { - if (peer_ranks[prn] == proc->super.proc_name.vpid) { - /* get their locality string */ - val = NULL; - OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, PMIX_LOCALITY_STRING, - &proc->super.proc_name, &val, PMIX_STRING); - if (OPAL_SUCCESS == rc && NULL != ompi_process_info.locality) { - u16 = opal_hwloc_compute_relative_locality(ompi_process_info.locality, val); - free(val); - } else { - /* all we can say is that it shares our node */ - u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE; + wildcard_rank.jobid = proc->super.proc_name.jobid; + wildcard_rank.vpid = OMPI_NAME_WILDCARD->vpid; + /* retrieve the local peers for the specified jobid */ + OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCAL_PEERS, + &wildcard_rank, &val, PMIX_STRING); + if (OPAL_SUCCESS == rc && NULL != val) { + char **peers = opal_argv_split(val, ','); + free(val); + nprn = opal_argv_count(peers); + 
local_ranks_in_jobid = (uint32_t*)calloc(nprn, sizeof(uint32_t)); + for (prn = 0; NULL != peers[prn]; prn++) { + local_ranks_in_jobid[prn] = strtoul(peers[prn], NULL, 10); + } + opal_argv_free(peers); + } + + OPAL_LIST_FOREACH_SAFE(cd, next, &ilist, ompi_dpm_proct_caddy_t) { + proc = cd->p; + if( proc->super.proc_name.jobid != wildcard_rank.jobid ) + continue; /* not a proc from this jobid */ + + new_proc_list[i] = proc; + opal_list_remove_item(&ilist, (opal_list_item_t*)cd); // TODO: do we need to release cd ? + OBJ_RELEASE(cd); + /* ompi_proc_complete_init_single() initializes and optionally retrieves + * OPAL_PMIX_LOCALITY and OPAL_PMIX_HOSTNAME. since we can live without + * them, we are just fine */ + ompi_proc_complete_init_single(proc); + /* if this proc is local, then get its locality */ + if (NULL != local_ranks_in_jobid) { + uint16_t u16; + for (prn=0; prn < nprn; prn++) { + if (local_ranks_in_jobid[prn] == proc->super.proc_name.vpid) { + /* get their locality string */ + val = NULL; + OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, PMIX_LOCALITY_STRING, + &proc->super.proc_name, &val, PMIX_STRING); + if (OPAL_SUCCESS == rc && NULL != ompi_process_info.locality) { + u16 = opal_hwloc_compute_relative_locality(ompi_process_info.locality, val); + free(val); + } else { + /* all we can say is that it shares our node */ + u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE; + } + proc->super.proc_flags = u16; + /* save the locality for later */ + OPAL_PMIX_CONVERT_NAME(&pxproc, &proc->super.proc_name); + pval.type = PMIX_UINT16; + pval.data.uint16 = proc->super.proc_flags; + PMIx_Store_internal(&pxproc, PMIX_LOCALITY, &pval); + break; } - proc->super.proc_flags = u16; - /* save the locality for later */ - OPAL_PMIX_CONVERT_NAME(&pxproc, &proc->super.proc_name); - pval.type = PMIX_UINT16; - pval.data.uint16 = proc->super.proc_flags; - PMIx_Store_internal(&pxproc, PMIX_LOCALITY, &pval); - break; } } + ++i; } - ++i; - } - if (NULL != peer_ranks) { - 
free(peer_ranks); - } + if (NULL != local_ranks_in_jobid) { + free(local_ranks_in_jobid); + } + } while (!opal_list_is_empty(&ilist)); + /* call add_procs on the new ones */ - rc = MCA_PML_CALL(add_procs(new_proc_list, opal_list_get_size(&ilist))); + rc = MCA_PML_CALL(add_procs(new_proc_list, i)); /* ilist has been emptied by the loop above; 'i' counts the procs moved into new_proc_list */ free(new_proc_list);