From 2f7f1feca5d4bec127b33c73aa5e19a4e0c93dbc Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 10 Nov 2020 16:41:00 -0800 Subject: [PATCH] Fix confusion between cpuset and locality Ensure we correctly collect and save the cpuset of the process separately from its locality string. Ensure we use the correct one when computing things like relative locality between processes. Signed-off-by: Ralph Castain --- ompi/dpm/dpm.c | 19 +++---------------- ompi/runtime/ompi_rte.c | 16 +++++++++++----- opal/mca/common/ofi/common_ofi.c | 4 ++-- opal/util/proc.c | 1 + opal/util/proc.h | 1 + 5 files changed, 18 insertions(+), 23 deletions(-) diff --git a/ompi/dpm/dpm.c b/ompi/dpm/dpm.c index b77b21586a5..a289c16813a 100644 --- a/ompi/dpm/dpm.c +++ b/ompi/dpm/dpm.c @@ -355,7 +355,7 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, if (0 < opal_list_get_size(&ilist)) { uint32_t *peer_ranks = NULL; int prn, nprn = 0; - char *val, *mycpuset; + char *val; uint16_t u16; opal_process_name_t wildcard_rank; /* convert the list of new procs to a proc_t array */ @@ -380,16 +380,6 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, opal_argv_free(peers); } - /* get my locality string */ - val = NULL; - OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING, - OMPI_PROC_MY_NAME, &val, PMIX_STRING); - if (OPAL_SUCCESS == rc && NULL != val) { - mycpuset = val; - } else { - mycpuset = NULL; - } - i = 0; OPAL_LIST_FOREACH(cd, &ilist, ompi_dpm_proct_caddy_t) { proc = cd->p; @@ -406,8 +396,8 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, val = NULL; OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, PMIX_LOCALITY_STRING, &proc->super.proc_name, &val, OPAL_STRING); - if (OPAL_SUCCESS == rc && NULL != val) { - u16 = opal_hwloc_compute_relative_locality(mycpuset, val); + if (OPAL_SUCCESS == rc && NULL != ompi_process_info.locality) { + u16 = opal_hwloc_compute_relative_locality(ompi_process_info.locality, val); free(val); } else { /* all we can say is that it shares our node */ @@ -425,9 +415,6 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, } ++i; } - if (NULL != mycpuset) { - free(mycpuset); - } if (NULL != peer_ranks) { free(peer_ranks); } diff --git a/ompi/runtime/ompi_rte.c b/ompi/runtime/ompi_rte.c index 1b9a5db4da7..480c2252460 100644 --- a/ompi/runtime/ompi_rte.c +++ b/ompi/runtime/ompi_rte.c @@ -764,7 +764,7 @@ int ompi_rte_init(int *pargc, char ***pargv) /* identify our location */ val = NULL; - OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING, + OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_CPUSET, &opal_process_info.my_name, &val, PMIX_STRING); if (PMIX_SUCCESS == rc && NULL != val) { opal_process_info.cpuset = val; @@ -774,6 +774,15 @@ int ompi_rte_init(int *pargc, char ***pargv) opal_process_info.cpuset = NULL; opal_process_info.proc_is_bound = false; } + val = NULL; + OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING, + &opal_process_info.my_name, &val, PMIX_STRING); + if (PMIX_SUCCESS == rc && NULL != val) { + opal_process_info.locality = val; + val = NULL; // protect the string + } else { + opal_process_info.locality = NULL; + } /* retrieve the local peers - defaults to local node */ val = NULL; @@ -811,7 +820,7 @@ int ompi_rte_init(int *pargc, char ***pargv) OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING, &pname, &val, PMIX_STRING); if (PMIX_SUCCESS == rc && NULL != val) { - u16 = opal_hwloc_compute_relative_locality(opal_process_info.cpuset, val); + u16 = opal_hwloc_compute_relative_locality(opal_process_info.locality, val); free(val); } else { /* all we can say is that it shares our node */ @@ -826,9 +835,6 @@ int ompi_rte_init(int *pargc, char ***pargv) ret = opal_pmix_convert_status(rc); error = "local store of locality"; opal_argv_free(peers); - if (NULL != opal_process_info.cpuset) { - free(opal_process_info.cpuset); - } goto error; } } diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index 1b866560c79..97140b30dbd 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2020 Intel, Inc. All rights reserved. * Copyright (c) 2017 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2020 Triad National Security, LLC. All rights @@ -345,7 +345,7 @@ static uint32_t get_package_rank(opal_process_info_t *process_info) } // compute relative locality - relative_locality = opal_hwloc_compute_relative_locality(process_info->cpuset, locality_string); + relative_locality = opal_hwloc_compute_relative_locality(process_info->locality, locality_string); free(locality_string); if (relative_locality & OPAL_PROC_ON_SOCKET) { diff --git a/opal/util/proc.c b/opal/util/proc.c index 26973fdd619..06f35b8ef54 100644 --- a/opal/util/proc.c +++ b/opal/util/proc.c @@ -41,6 +41,7 @@ opal_process_info_t opal_process_info = { .my_local_rank = 0, /* I'm the only process around here */ .my_node_rank = 0, .cpuset = NULL, + .locality = NULL, .pid = 0, .num_procs = 0, .app_num = 0, diff --git a/opal/util/proc.h b/opal/util/proc.h index 785a6f7ec95..512f23b6bb8 100644 --- a/opal/util/proc.h +++ b/opal/util/proc.h @@ -115,6 +115,7 @@ typedef struct opal_process_info_t { uint16_t my_local_rank; /**< local rank on this node within my job */ uint16_t my_node_rank; char *cpuset; /**< String-representation of bitmap where we are bound */ + char *locality; /**< String-representation of process locality */ pid_t pid; uint32_t num_procs; uint32_t app_num;