From ca66c1ec3b7bda1973f2506c70c12fbc270fe123 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 23 Aug 2022 11:05:37 -0700 Subject: [PATCH] Separate out the runtime controls options We have been overloading the `--map-by` directive with runtime controls such as do-not-launch. This creates confusion as both developers and users lose track of what is shown to the user in help and other cmd line operations vs what PRRTE internally accepts. Reduce the confusion by making runtime options a first class citizen. Define a new `--runtime-options ` cmd line option that takes a comma-delimited list of directives. Update the help files to match. Signed-off-by: Ralph Castain --- examples/bad_exit.c | 2 + src/hwloc/hwloc.c | 68 +++-- src/mca/errmgr/dvm/errmgr_dvm.c | 8 +- src/mca/odls/base/odls_base_default_fns.c | 12 +- src/mca/plm/base/plm_base_launch_support.c | 9 - src/mca/rmaps/base/base.h | 5 + src/mca/rmaps/base/help-prte-rmaps-base.txt | 4 - src/mca/rmaps/base/rmaps_base_frame.c | 316 ++++++++++++-------- src/mca/rmaps/base/rmaps_base_map_job.c | 9 + src/mca/rmaps/rmaps_types.h | 1 + src/mca/schizo/base/help-schizo-cli.txt | 38 ++- src/mca/schizo/base/schizo_base_frame.c | 33 +- src/mca/schizo/prte/help-schizo-prterun.txt | 13 +- src/mca/schizo/prte/help-schizo-prun.txt | 48 +-- src/mca/schizo/prte/schizo_prte.c | 6 + src/mca/schizo/schizo.h | 4 + src/mca/state/base/state_base_fns.c | 35 ++- src/mca/state/prted/state_prted.c | 3 - src/prted/pmix/pmix_server_dyn.c | 20 +- src/runtime/prte_globals.c | 8 +- src/runtime/prte_globals.h | 6 - src/runtime/prte_mca_params.c | 34 --- src/runtime/prte_quit.c | 17 +- src/tools/prte/prte.c | 8 +- src/tools/prun/prun.c | 16 +- src/util/attr.c | 2 + src/util/attr.h | 1 + src/util/prte_cmd_line.h | 38 ++- 28 files changed, 452 insertions(+), 312 deletions(-) diff --git a/examples/bad_exit.c b/examples/bad_exit.c index 76f4cddfc9..440cf60a1a 100644 --- a/examples/bad_exit.c +++ b/examples/bad_exit.c @@ -126,6 +126,8 @@ int main(int argc, char **argv) done: if (0 == myproc.rank) { exit(1); + } else { + sleep(3); } /* finalize us */ fprintf(stderr, "Client ns %s rank %d: Finalizing\n", myproc.nspace, myproc.rank); diff --git a/src/hwloc/hwloc.c b/src/hwloc/hwloc.c index 4bc0a42a45..23811d8c30 100644 --- a/src/hwloc/hwloc.c +++ b/src/hwloc/hwloc.c @@ -24,6 +24,7 @@ #include "src/util/pmix_argv.h" #include "src/util/output.h" #include "src/util/pmix_show_help.h" +#include "src/util/prte_cmd_line.h" /* * Globals @@ -531,7 +532,6 @@ int prte_hwloc_base_set_binding_policy(void *jdat, char *spec) prte_binding_policy_t tmp; char **quals, *myspec, *ptr; prte_job_t *jdata = (prte_job_t *) jdat; - size_t len; /* set default */ tmp = 0; @@ -550,15 +550,17 @@ int prte_hwloc_base_set_binding_policy(void *jdat, char *spec) ++ptr; quals = pmix_argv_split(ptr, ':'); for (i = 0; NULL != quals[i]; i++) { - len = strlen(quals[i]); - if (0 == strncasecmp(quals[i], "if-supported", len)) { + if (PRTE_CHECK_CLI_OPTION(quals[i], PRTE_CLI_IF_SUPP)) { tmp |= PRTE_BIND_IF_SUPPORTED; - } else if (0 == strncasecmp(quals[i], "overload-allowed", len)) { + + } else if (PRTE_CHECK_CLI_OPTION(quals[i], PRTE_CLI_OVERLOAD)) { tmp |= (PRTE_BIND_ALLOW_OVERLOAD | PRTE_BIND_OVERLOAD_GIVEN); - } else if (0 == strncasecmp(quals[i], "no-overload", len)) { + + } else if (PRTE_CHECK_CLI_OPTION(quals[i], PRTE_CLI_NOOVERLOAD)) { tmp = (tmp & ~PRTE_BIND_ALLOW_OVERLOAD); tmp |= PRTE_BIND_OVERLOAD_GIVEN; - } else if (0 == strncasecmp(quals[i], "REPORT", len)) { + + } else if (PRTE_CHECK_CLI_OPTION(quals[i], PRTE_CLI_REPORT)) { if (NULL == jdata) { pmix_show_help("help-prte-rmaps-base.txt", "unsupported-default-modifier", true, "binding policy", quals[i]); @@ -567,6 +569,7 @@ int prte_hwloc_base_set_binding_policy(void *jdat, char *spec) } prte_set_attribute(&jdata->attributes, PRTE_JOB_REPORT_BINDINGS, PRTE_ATTR_GLOBAL, NULL, PMIX_BOOL); + } else { /* unknown option */ pmix_show_help("help-prte-hwloc-base.txt", "unrecognized-modifier", true, spec); @@ -578,30 +581,35 @@ int prte_hwloc_base_set_binding_policy(void *jdat, char *spec) pmix_argv_free(quals); } - len = strlen(myspec); - if (0 < len) { - if (0 == strncasecmp(myspec, "none", len)) { - PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_NONE); - } else if (0 == strncasecmp(myspec, "hwthread", len)) { - PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_HWTHREAD); - } else if (0 == strncasecmp(myspec, "core", len)) { - PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_CORE); - } else if (0 == strncasecmp(myspec, "l1cache", len)) { - PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_L1CACHE); - } else if (0 == strncasecmp(myspec, "l2cache", len)) { - PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_L2CACHE); - } else if (0 == strncasecmp(myspec, "l3cache", len)) { - PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_L3CACHE); - } else if (0 == strncasecmp(myspec, "numa", len)) { - PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_NUMA); - } else if (0 == strncasecmp(myspec, "package", len)) { - PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_PACKAGE); - } else { - pmix_show_help("help-prte-hwloc-base.txt", "invalid binding_policy", true, "binding", - spec); - free(myspec); - return PRTE_ERR_BAD_PARAM; - } + if (PRTE_CHECK_CLI_OPTION(myspec, PRTE_CLI_NONE)) { + PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_NONE); + + } else if (PRTE_CHECK_CLI_OPTION(myspec, PRTE_CLI_HWT)) { + PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_HWTHREAD); + + } else if (PRTE_CHECK_CLI_OPTION(myspec, PRTE_CLI_CORE)) { + PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_CORE); + + } else if (PRTE_CHECK_CLI_OPTION(myspec, PRTE_CLI_L1CACHE)) { + PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_L1CACHE); + + } else if (PRTE_CHECK_CLI_OPTION(myspec, PRTE_CLI_L2CACHE)) { + PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_L2CACHE); + + } else if (PRTE_CHECK_CLI_OPTION(myspec, PRTE_CLI_L3CACHE)) { + PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_L3CACHE); + + } else if (PRTE_CHECK_CLI_OPTION(myspec, PRTE_CLI_NUMA)) { + PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_NUMA); + + } else if (PRTE_CHECK_CLI_OPTION(myspec, PRTE_CLI_PACKAGE)) { + PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_PACKAGE); + + } else { + pmix_show_help("help-prte-hwloc-base.txt", "invalid binding_policy", true, "binding", + spec); + free(myspec); + return PRTE_ERR_BAD_PARAM; } free(myspec); diff --git a/src/mca/errmgr/dvm/errmgr_dvm.c b/src/mca/errmgr/dvm/errmgr_dvm.c index 9e584815a2..c5731dbd2d 100644 --- a/src/mca/errmgr/dvm/errmgr_dvm.c +++ b/src/mca/errmgr/dvm/errmgr_dvm.c @@ -378,6 +378,8 @@ static void proc_errors(int fd, short args, void *cbdata) prte_proc_state_t state = caddy->proc_state; int i; int32_t i32, *i32ptr; + bool flag; + bool *fptr = &flag; PRTE_HIDE_UNUSED_PARAMS(fd, args); PMIX_ACQUIRE_OBJECT(caddy); @@ -679,6 +681,8 @@ static void proc_errors(int fd, short args, void *cbdata) PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(proc), pptr->exit_code)); jdata->exit_code = pptr->exit_code; + PRTE_FLAG_UNSET(pptr, PRTE_PROC_FLAG_ALIVE); + jdata->num_terminated++; /* track the number of non-zero exits */ i32 = 0; i32ptr = &i32; @@ -687,7 +691,9 @@ static void proc_errors(int fd, short args, void *cbdata) ++i32; prte_set_attribute(&jdata->attributes, PRTE_JOB_NUM_NONZERO_EXIT, PRTE_ATTR_LOCAL, i32ptr, PMIX_INT32); - if (prte_get_attribute(&jdata->attributes, PRTE_JOB_TERM_NONZERO_EXIT, NULL, PMIX_BOOL)) { + flag = true; + prte_get_attribute(&jdata->attributes, PRTE_JOB_TERM_NONZERO_EXIT, (void*)&fptr, PMIX_BOOL); + if (flag) { if (!PRTE_FLAG_TEST(jdata, PRTE_JOB_FLAG_ABORTED)) { jdata->state = PRTE_JOB_STATE_NON_ZERO_TERM; /* point to the first rank to cause the problem */ diff --git a/src/mca/odls/base/odls_base_default_fns.c b/src/mca/odls/base/odls_base_default_fns.c index 6f210a0024..601170eb88 100644 --- a/src/mca/odls/base/odls_base_default_fns.c +++ b/src/mca/odls/base/odls_base_default_fns.c @@ -1523,6 +1523,8 @@ void prte_odls_base_default_wait_local_proc(int fd, short sd, void *cbdata) prte_job_t *jobdat; prte_proc_state_t state = PRTE_PROC_STATE_WAITPID_FIRED; prte_proc_t *cptr; + bool flag = false; + bool *fptr = &flag; PRTE_HIDE_UNUSED_PARAMS(fd, sd); prte_output_verbose(5, prte_odls_base_framework.framework_output, @@ -1610,8 +1612,9 @@ void prte_odls_base_default_wait_local_proc(int fd, short sd, void *cbdata) * felt it was non-normal - in this latter case, we do not * require that the proc deregister before terminating */ - if (0 != proc->exit_code && - prte_get_attribute(&jobdat->attributes, PRTE_JOB_TERM_NONZERO_EXIT, NULL, PMIX_BOOL)) { + flag = false; + prte_get_attribute(&jobdat->attributes, PRTE_JOB_TERM_NONZERO_EXIT, (void**)&fptr, PMIX_BOOL); + if (0 != proc->exit_code && flag) { state = PRTE_PROC_STATE_TERM_NON_ZERO; PRTE_OUTPUT_VERBOSE( (5, prte_odls_base_framework.framework_output, @@ -1674,8 +1677,9 @@ void prte_odls_base_default_wait_local_proc(int fd, short sd, void *cbdata) * none of them will. This is considered acceptable. Still * flag it as abnormal if the exit code was non-zero */ - if (0 != proc->exit_code && - prte_get_attribute(&jobdat->attributes, PRTE_JOB_TERM_NONZERO_EXIT, NULL, PMIX_BOOL)) { + flag = false; + prte_get_attribute(&jobdat->attributes, PRTE_JOB_TERM_NONZERO_EXIT, (void**)&fptr, PMIX_BOOL); + if (0 != proc->exit_code && flag) { state = PRTE_PROC_STATE_TERM_NON_ZERO; } else { state = PRTE_PROC_STATE_WAITPID_FIRED; diff --git a/src/mca/plm/base/plm_base_launch_support.c b/src/mca/plm/base/plm_base_launch_support.c index c323f51325..883bdacc00 100644 --- a/src/mca/plm/base/plm_base_launch_support.c +++ b/src/mca/plm/base/plm_base_launch_support.c @@ -1949,15 +1949,6 @@ int prte_plm_base_prted_append_basic_args(int *argc, char ***argv, char *ess, in if (prte_allow_run_as_root) { pmix_argv_append(argc, argv, "--allow-run-as-root"); } - if (prte_map_stddiag_to_stderr) { - pmix_argv_append(argc, argv, "--prtemca"); - pmix_argv_append(argc, argv, "prte_map_stddiag_to_stderr"); - pmix_argv_append(argc, argv, "1"); - } else if (prte_map_stddiag_to_stdout) { - pmix_argv_append(argc, argv, "--prtemca"); - pmix_argv_append(argc, argv, "prte_map_stddiag_to_stdout"); - pmix_argv_append(argc, argv, "1"); - } /* the following is not an mca param */ if (NULL != getenv("PRTE_TEST_PRTED_SUICIDE")) { diff --git a/src/mca/rmaps/base/base.h b/src/mca/rmaps/base/base.h index f8f2269670..32589a6375 100644 --- a/src/mca/rmaps/base/base.h +++ b/src/mca/rmaps/base/base.h @@ -76,6 +76,7 @@ typedef struct { * when the directive comes thru MCA param */ char *file; hwloc_cpuset_t available, baseset; // scratch for binding calculation + bool abort_non_zero_exit; // default setting for aborting on non-zero proc exit } prte_rmaps_base_t; /** @@ -125,6 +126,10 @@ PRTE_EXPORT int prte_rmaps_base_set_default_ranking(prte_job_t *jdata, prte_rmaps_options_t *options); PRTE_EXPORT int prte_rmaps_base_set_ranking_policy(prte_job_t *jdata, char *spec); +PRTE_EXPORT int prte_rmaps_base_set_default_rto(prte_job_t *jdata, + prte_rmaps_options_t *options); +PRTE_EXPORT int prte_rmaps_base_set_runtime_options(prte_job_t *jdata, char *spec); + PRTE_EXPORT void prte_rmaps_base_display_map(prte_job_t *jdata); PRTE_EXPORT void prte_rmaps_base_report_bindings(prte_job_t *jdata, prte_rmaps_options_t *options); diff --git a/src/mca/rmaps/base/help-prte-rmaps-base.txt b/src/mca/rmaps/base/help-prte-rmaps-base.txt index acd7237acc..c9b0aa4e2e 100644 --- a/src/mca/rmaps/base/help-prte-rmaps-base.txt +++ b/src/mca/rmaps/base/help-prte-rmaps-base.txt @@ -677,10 +677,6 @@ to the --map-by option (except where noted): be bound to the first CPU in the list, the second proc shall be bound to the second CPU, etc.) -- DONOTLAUNCH directs PRRTE to map but not launch the specified job. - This is provided to help explore possible process placement patterns - before actually starting execution. - Note that directives and qualifiers are case-insensitive and can be shortened to the minimum number of characters to uniquely identify them. Thus, "L1CACHE" can be given diff --git a/src/mca/rmaps/base/rmaps_base_frame.c b/src/mca/rmaps/base/rmaps_base_frame.c index 5275f67f1a..d25bfe0c8e 100644 --- a/src/mca/rmaps/base/rmaps_base_frame.c +++ b/src/mca/rmaps/base/rmaps_base_frame.c @@ -70,6 +70,7 @@ prte_rmaps_base_t prte_rmaps_base = { static char *rmaps_base_mapping_policy = NULL; static char *rmaps_base_ranking_policy = NULL; static bool rmaps_base_inherit = false; +static bool rmaps_base_abort_non_zero_exit = true; static int prte_rmaps_base_register(prte_mca_base_register_flag_t flags) { @@ -105,6 +106,15 @@ static int prte_rmaps_base_register(prte_mca_base_register_flag_t flags) PRTE_MCA_BASE_VAR_FLAG_NONE, PRTE_INFO_LVL_9, PRTE_MCA_BASE_VAR_SCOPE_READONLY, &rmaps_base_inherit); + /* set some default job controls - since they get resolved in + prte_rmaps_base_map_job, we put them here */ + rmaps_base_abort_non_zero_exit = true; + (void) prte_mca_base_var_register("prte", "prte", NULL, "abort_on_non_zero_status", + "Set default policy for aborting the job if any process returns a non-zero exit status", + PRTE_MCA_BASE_VAR_TYPE_BOOL, NULL, 0, + PRTE_MCA_BASE_VAR_FLAG_NONE, PRTE_INFO_LVL_9, + PRTE_MCA_BASE_VAR_SCOPE_READONLY, &rmaps_base_abort_non_zero_exit); + return PRTE_SUCCESS; } @@ -137,6 +147,7 @@ static int prte_rmaps_base_open(prte_mca_base_open_flag_t flags) prte_rmaps_base.ranking = 0; prte_rmaps_base.inherit = rmaps_base_inherit; prte_rmaps_base.hwthread_cpus = false; + prte_rmaps_base.abort_non_zero_exit = rmaps_base_abort_non_zero_exit; if (NULL == prte_set_slots) { prte_set_slots = strdup("core"); } @@ -221,22 +232,6 @@ static int check_modifiers(char *ck, prte_job_t *jdata, prte_mapping_policy_t *t PRTE_SET_MAPPING_DIRECTIVE(*tmp, PRTE_MAPPING_SUBSCRIBE_GIVEN); nooversubscribe_given = true; - } else if (PRTE_CHECK_CLI_OPTION(ck2[i], PRTE_CLI_NOLAUNCH)) { - if (NULL == jdata) { - pmix_show_help("help-prte-rmaps-base.txt", "unsupported-default-modifier", true, - "mapping policy", ck2[i]); - return PRTE_ERR_SILENT; - } - prte_set_attribute(&jdata->attributes, PRTE_JOB_DO_NOT_LAUNCH, PRTE_ATTR_GLOBAL, NULL, - PMIX_BOOL); - /* if we are not in a persistent DVM, then make sure we don't try to launch - * the daemons either */ - if (!prte_persistent) { - djob = prte_get_job_data_object(PRTE_PROC_MY_NAME->nspace); - prte_set_attribute(&djob->attributes, PRTE_JOB_DO_NOT_LAUNCH, PRTE_ATTR_GLOBAL, - NULL, PMIX_BOOL); - } - } else if (PRTE_CHECK_CLI_OPTION(ck2[i], PRTE_CLI_NOLOCAL)) { PRTE_SET_MAPPING_DIRECTIVE(*tmp, PRTE_MAPPING_NO_USE_LOCAL); @@ -372,17 +367,16 @@ int prte_rmaps_base_set_default_mapping(prte_job_t *jdata, __LINE__); PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYHWTHREAD); } else { - if(PRTE_BIND_TO_NONE != PRTE_GET_BINDING_POLICY(jdata->map->binding)) { - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps[%d] mapping not given - using bycore", __LINE__); - PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYCORE); - } - else { - prte_output_verbose(5, prte_rmaps_base_framework.framework_output, - "mca:rmaps[%d] mapping not given - using byslot (bind = NONE)", - __LINE__); - PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYSLOT); - } + if (PRTE_BIND_TO_NONE != PRTE_GET_BINDING_POLICY(jdata->map->binding)) { + prte_output_verbose(5, prte_rmaps_base_framework.framework_output, + "mca:rmaps[%d] mapping not given - using bycore", __LINE__); + PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYCORE); + } else { + prte_output_verbose(5, prte_rmaps_base_framework.framework_output, + "mca:rmaps[%d] mapping not given - using byslot (bind = NONE)", + __LINE__); + PRTE_SET_MAPPING_POLICY(jdata->map->mapping, PRTE_MAPPING_BYSLOT); + } } } else { /* if NUMA is available, map by that */ @@ -509,113 +503,108 @@ int prte_rmaps_base_set_mapping_policy(prte_job_t *jdata, char *inspec) } } - len = strlen(spec); - if (0 < len) { - if (0 == strncasecmp(spec, "slot", len)) { - PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_BYSLOT); - } else if (0 == strncasecmp(spec, "node", len)) { - PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_BYNODE); - } else if (0 == strncasecmp(spec, "seq", len)) { - /* there are several mechanisms by which the file specifying - * the sequence can be passed, so not really feasible to check - * it here */ - PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_SEQ); - } else if (0 == strncasecmp(spec, "core", len)) { - PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_BYCORE); - } else if (0 == strncasecmp(spec, "l1cache", len)) { - PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_BYL1CACHE); - } else if (0 == strncasecmp(spec, "l2cache", len)) { - PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_BYL2CACHE); - } else if (0 == strncasecmp(spec, "l3cache", len)) { - PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_BYL3CACHE); - } else if (0 == strncasecmp(spec, "numa", len)) { - PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_BYNUMA); - } else if (0 == strncasecmp(spec, "package", len)) { - PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_BYPACKAGE); - } else if (0 == strcasecmp(spec, "rankfile")) { - /* check that the file was given */ - if ((NULL == jdata && NULL == prte_rmaps_base.file) || - (NULL != jdata && !prte_get_attribute(&jdata->attributes, PRTE_JOB_FILE, NULL, PMIX_STRING))) { - pmix_show_help("help-prte-rmaps-base.txt", "rankfile-no-filename", true); - free(spec); - return PRTE_ERR_BAD_PARAM; - } - /* if they asked for rankfile and didn't specify one, but did - * provide one via MCA param, then use it */ - if (NULL != jdata) { - if (!prte_get_attribute(&jdata->attributes, PRTE_JOB_FILE, NULL, PMIX_STRING)) { - if (NULL == prte_rmaps_base.file) { - /* also not allowed */ - pmix_show_help("help-prte-rmaps-base.txt", "rankfile-no-filename", true); - free(spec); - return PRTE_ERR_BAD_PARAM; - } - prte_set_attribute(&jdata->attributes, PRTE_JOB_FILE, PRTE_ATTR_GLOBAL, - prte_rmaps_base.file, PMIX_STRING); - } - } - PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_BYUSER); - } else if (0 == strncasecmp(spec, "hwthread", len)) { - PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_BYHWTHREAD); - /* if we are mapping processes to individual hwthreads, then - * we need to treat those hwthreads as separate cpus - */ - if (NULL == jdata) { - prte_rmaps_base.hwthread_cpus = true; - } else { - prte_set_attribute(&jdata->attributes, PRTE_JOB_HWT_CPUS, PRTE_ATTR_GLOBAL, - NULL, PMIX_BOOL); - } - } else if (0 == strncasecmp(spec, "dist", len)) { - if (NULL == jdata) { - if (NULL == prte_rmaps_base.device) { - pmix_show_help("help-prte-rmaps-base.txt", "device-not-specified", true); - free(spec); - return PRTE_ERR_SILENT; - } - } else if (!prte_get_attribute(&jdata->attributes, PRTE_JOB_DIST_DEVICE, NULL, - PMIX_STRING)) { - pmix_show_help("help-prte-rmaps-base.txt", "device-not-specified", true); - free(spec); - return PRTE_ERR_SILENT; - } - PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_BYDIST); - } else if (0 == strncasecmp(spec, "PE-LIST=", 8)) { - if (NULL == jdata) { - pmix_show_help("help-prte-rmaps-base.txt", "unsupported-default-policy", true, - "mapping policy", spec); - free(spec); - return PRTE_ERR_SILENT; - } - ptr = strchr(spec, '='); // cannot be NULL as we checked for it - ptr++; // move past the equal sign - /* Verify the list is composed of numeric tokens */ - temp_parm = strdup(ptr); - temp_token = strtok(temp_parm, ","); - while (NULL != temp_token) { - u16 = strtol(temp_token, &parm_delimiter, 10); - if ('\0' != *parm_delimiter) { - pmix_show_help("help-prte-rmaps-base.txt", "invalid-value", true, - "mapping policy", "PE-LIST", ptr); + if (PRTE_CHECK_CLI_OPTION(spec, PRTE_CLI_SLOT)) { + PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_BYSLOT); + + } else if (PRTE_CHECK_CLI_OPTION(spec, PRTE_CLI_NODE)) { + PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_BYNODE); + + } else if (PRTE_CHECK_CLI_OPTION(spec, PRTE_CLI_SEQ)) { + /* there are several mechanisms by which the file specifying + * the sequence can be passed, so not really feasible to check + * it here */ + PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_SEQ); + + } else if (PRTE_CHECK_CLI_OPTION(spec, PRTE_CLI_CORE)) { + PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_BYCORE); + + } else if (PRTE_CHECK_CLI_OPTION(spec, PRTE_CLI_L1CACHE)) { + PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_BYL1CACHE); + + } else if (PRTE_CHECK_CLI_OPTION(spec, PRTE_CLI_L2CACHE)) { + PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_BYL2CACHE); + + } else if (PRTE_CHECK_CLI_OPTION(spec, PRTE_CLI_L3CACHE)) { + PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_BYL3CACHE); + + } else if (PRTE_CHECK_CLI_OPTION(spec, PRTE_CLI_NUMA)) { + PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_BYNUMA); + + } else if (PRTE_CHECK_CLI_OPTION(spec, PRTE_CLI_PACKAGE)) { + PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_BYPACKAGE); + + } else if (PRTE_CHECK_CLI_OPTION(spec, PRTE_CLI_RANKFILE)) { + /* check that the file was given */ + if ((NULL == jdata && NULL == prte_rmaps_base.file) || + (NULL != jdata && !prte_get_attribute(&jdata->attributes, PRTE_JOB_FILE, NULL, PMIX_STRING))) { + pmix_show_help("help-prte-rmaps-base.txt", "rankfile-no-filename", true); + free(spec); + return PRTE_ERR_BAD_PARAM; + } + /* if they asked for rankfile and didn't specify one, but did + * provide one via MCA param, then use it */ + if (NULL != jdata) { + if (!prte_get_attribute(&jdata->attributes, PRTE_JOB_FILE, NULL, PMIX_STRING)) { + if (NULL == prte_rmaps_base.file) { + /* also not allowed */ + pmix_show_help("help-prte-rmaps-base.txt", "rankfile-no-filename", true); free(spec); - free(temp_parm); - return PRTE_ERR_SILENT; + return PRTE_ERR_BAD_PARAM; } - temp_token = strtok(NULL, ","); + prte_set_attribute(&jdata->attributes, PRTE_JOB_FILE, PRTE_ATTR_GLOBAL, + prte_rmaps_base.file, PMIX_STRING); } - free(temp_parm); - prte_set_attribute(&jdata->attributes, PRTE_JOB_CPUSET, PRTE_ATTR_GLOBAL, - ptr, PMIX_STRING); - PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_PELIST); - PRTE_SET_MAPPING_DIRECTIVE(tmp, PRTE_MAPPING_GIVEN); + } + PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_BYUSER); + + } else if (PRTE_CHECK_CLI_OPTION(spec, PRTE_CLI_HWT)) { + PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_BYHWTHREAD); + /* if we are mapping processes to individual hwthreads, then + * we need to treat those hwthreads as separate cpus + */ + if (NULL == jdata) { + prte_rmaps_base.hwthread_cpus = true; } else { - pmix_show_help("help-prte-rmaps-base.txt", "unrecognized-policy", - true, "mapping", spec); + prte_set_attribute(&jdata->attributes, PRTE_JOB_HWT_CPUS, PRTE_ATTR_GLOBAL, + NULL, PMIX_BOOL); + } + + } else if (PRTE_CHECK_CLI_OPTION(spec, PRTE_CLI_PELIST)) { + if (NULL == jdata) { + pmix_show_help("help-prte-rmaps-base.txt", "unsupported-default-policy", true, + "mapping policy", spec); free(spec); return PRTE_ERR_SILENT; } + ptr = strchr(spec, '='); // cannot be NULL as we checked for it + ptr++; // move past the equal sign + /* Verify the list is composed of numeric tokens */ + temp_parm = strdup(ptr); + temp_token = strtok(temp_parm, ","); + while (NULL != temp_token) { + u16 = strtol(temp_token, &parm_delimiter, 10); + if ('\0' != *parm_delimiter) { + pmix_show_help("help-prte-rmaps-base.txt", "invalid-value", true, + "mapping policy", "PE-LIST", ptr); + free(spec); + free(temp_parm); + return PRTE_ERR_SILENT; + } + temp_token = strtok(NULL, ","); + } + free(temp_parm); + prte_set_attribute(&jdata->attributes, PRTE_JOB_CPUSET, PRTE_ATTR_GLOBAL, + ptr, PMIX_STRING); + PRTE_SET_MAPPING_POLICY(tmp, PRTE_MAPPING_PELIST); PRTE_SET_MAPPING_DIRECTIVE(tmp, PRTE_MAPPING_GIVEN); + + } else { + pmix_show_help("help-prte-rmaps-base.txt", "unrecognized-policy", + true, "mapping", spec); + free(spec); + return PRTE_ERR_SILENT; } + PRTE_SET_MAPPING_DIRECTIVE(tmp, PRTE_MAPPING_GIVEN); setpolicy: if (NULL != spec) { @@ -646,7 +635,6 @@ int prte_rmaps_base_set_ranking_policy(prte_job_t *jdata, char *spec) { prte_ranking_policy_t tmp; size_t len; - /* set default */ tmp = 0; @@ -654,20 +642,24 @@ int prte_rmaps_base_set_ranking_policy(prte_job_t *jdata, char *spec) /* if mapping by-node, then default to rank-by node */ if (PRTE_MAPPING_BYNODE == PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_NODE); + } else if (PRTE_MAPPING_PPR != PRTE_GET_MAPPING_POLICY(jdata->map->mapping)) { /* default to by-slot */ PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_SLOT); } } else { - len = strlen(spec); - if (0 == strncasecmp(spec, "slot", len)) { + if (PRTE_CHECK_CLI_OPTION(spec, PRTE_CLI_SLOT)) { PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_SLOT); - } else if (0 == strncasecmp(spec, "node", len)) { + + } else if (PRTE_CHECK_CLI_OPTION(spec, PRTE_CLI_NODE)) { PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_NODE); - } else if (0 == strncasecmp(spec, "fill", len)) { + + } else if (PRTE_CHECK_CLI_OPTION(spec, PRTE_CLI_FILL)) { PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_FILL); - } else if (0 == strncasecmp(spec, "span", len)) { + + } else if (PRTE_CHECK_CLI_OPTION(spec, PRTE_CLI_SPAN)) { PRTE_SET_RANKING_POLICY(tmp, PRTE_RANK_BY_SPAN); + } else { pmix_show_help("help-prte-rmaps-base.txt", "unrecognized-policy", true, "ranking", spec); @@ -688,3 +680,65 @@ int prte_rmaps_base_set_ranking_policy(prte_job_t *jdata, char *spec) return PRTE_SUCCESS; } + +int prte_rmaps_base_set_default_rto(prte_job_t *jdata, + prte_rmaps_options_t *options) +{ + int rc; + rc = prte_rmaps_base_set_runtime_options(jdata, NULL); + return rc; +} + +int prte_rmaps_base_set_runtime_options(prte_job_t *jdata, char *spec) +{ + char **options, *ptr; + int n; + bool flag; + prte_job_t *djob; + + if (NULL == spec) { + /* set everything to the defaults */ + prte_add_attribute(&jdata->attributes, PRTE_JOB_TERM_NONZERO_EXIT, PRTE_ATTR_GLOBAL, + &prte_rmaps_base.abort_non_zero_exit, PMIX_BOOL); + } else { + options = pmix_argv_split(spec, ','); + for (n=0; NULL != options[n]; n++) { + /* see if there is an '=' */ + ptr = strchr(options[n], '='); + if (NULL != ptr) { + *ptr = '\0'; + ++ptr; + if ('\0' == *ptr) { + /* missing the value */ + pmix_show_help("help-prte-rmaps-base.txt", "missing-value", true, + "runtime options", options[n], "empty"); + } + pmix_argv_free(options); + return PRTE_ERR_BAD_PARAM; + } + /* check the options */ + if (PRTE_CHECK_CLI_OPTION(options[n], PRTE_CLI_ABORT_NZ)) { + flag = PRTE_CHECK_TRUE(ptr); + prte_add_attribute(&jdata->attributes, PRTE_JOB_TERM_NONZERO_EXIT, PRTE_ATTR_GLOBAL, + &flag, PMIX_BOOL); + } else if (PRTE_CHECK_CLI_OPTION(options[n], PRTE_CLI_NOLAUNCH)) { + flag = PRTE_CHECK_TRUE(ptr); + prte_add_attribute(&jdata->attributes, PRTE_JOB_DO_NOT_LAUNCH, PRTE_ATTR_GLOBAL, + &flag, PMIX_BOOL); + /* if we are not in a persistent DVM, then make sure we also + * apply this to the daemons */ + if (!prte_persistent) { + djob = prte_get_job_data_object(PRTE_PROC_MY_NAME->nspace); + prte_set_attribute(&djob->attributes, PRTE_JOB_DO_NOT_LAUNCH, PRTE_ATTR_GLOBAL, + &flag, PMIX_BOOL); + } + } else { + pmix_show_help("help-prte-rmaps-base.txt", "unrecognized-policy", true, + "runtime options", spec); + return PRTE_ERR_SILENT; + } + } + pmix_argv_free(options); + } + return PRTE_SUCCESS; +} diff --git a/src/mca/rmaps/base/rmaps_base_map_job.c b/src/mca/rmaps/base/rmaps_base_map_job.c index fd543223b7..a61dd0bdcf 100644 --- a/src/mca/rmaps/base/rmaps_base_map_job.c +++ b/src/mca/rmaps/base/rmaps_base_map_job.c @@ -103,6 +103,15 @@ void prte_rmaps_base_map_job(int fd, short args, void *cbdata) memset(&options, 0, sizeof(prte_rmaps_options_t)); options.stream = prte_rmaps_base_framework.framework_output; options.verbosity = 5; // usual value for base-level functions + if (!jdata->map->rtos_set) { + /* set the runtime options first */ + if (NULL != schizo->set_default_rto) { + rc = schizo->set_default_rto(jdata, &options); + } else { + rc = prte_rmaps_base_set_default_rto(jdata, &options); + } + } + /* check and set some general options */ if (prte_get_attribute(&jdata->attributes, PRTE_JOB_DO_NOT_LAUNCH, NULL, PMIX_BOOL)) { options.donotlaunch = true; } diff --git a/src/mca/rmaps/rmaps_types.h b/src/mca/rmaps/rmaps_types.h index 765580f85b..a46af7d67b 100644 --- a/src/mca/rmaps/rmaps_types.h +++ b/src/mca/rmaps/rmaps_types.h @@ -57,6 +57,7 @@ struct prte_job_map_t { prte_mapping_policy_t mapping; prte_ranking_policy_t ranking; prte_binding_policy_t binding; + bool rtos_set; /* *** */ /* number of new daemons required to be launched * to support this job map diff --git a/src/mca/schizo/base/help-schizo-cli.txt b/src/mca/schizo/base/help-schizo-cli.txt index 7eea538d5f..a93394369a 100644 --- a/src/mca/schizo/base/help-schizo-cli.txt +++ b/src/mca/schizo/base/help-schizo-cli.txt @@ -29,7 +29,8 @@ Supported values include: processes in this job that includes local and node ranks, assigned bindings, and other data -- TOPO displays the topology of the nodes allocated to the job +- TOPO=LIST displays the topology of each node in the comma-delimited list + that is allocated to the job No qualifiers are defined for this directive. # @@ -447,3 +448,38 @@ application compiled with Open MPI, "mpich" for one built against the MPICH library, or "oshmem" for an OpenSHMEM application compiled against SUNY's reference library. # +# RUNTIME-OPTIONS +# +[runtime-options] +The "runtime-options" command line directive must be accompanied by a +comma-delimited list of case-insensitive options that control the runtime +behavior of the job. The full directive need not be provided - only enough +characters are required to uniquely identify the directive. + +Runtime options are typically "true" or "false", though this is not a +requirement on developers. Since the value of each option may need to +be set (e.g., to override a default set by MCA parameter), the syntax +of the command line directive includes the use of an '=' character to +allow inclusion of a value for the option. For example, one can set +the ABORT-NONZERO-STATUS option to "true" by specifying it as +"ABORT-NONZERO-STATUS=1". Note that boolean options can be set to "true" +using a non-zero integer or a case-insensitive string of the word "true". +For the latter representation, the user need only provide at least the +'T' character. The same policy applies to setting a boolean option to +"false". + +Note that a boolean option will default to "true" if provided without +a value. Thus, "--runtime-options abort-nonzero" is sufficient to set the +"ABORT-NONZERO-STATUS" option to "true". + +Supported values include: + +- ABORT-NONZERO-STATUS[=(bool)] directs the runtime to not abort a running + job if a process exits with non-zero status if set to true. + +- DONOTLAUNCH directs the runtime to map but not launch the specified + job. This is provided to help explore possible process placement patterns + before actually starting execution. + +The runtime-options command line option has no qualifiers. Note that directives +are case-insensitive. diff --git a/src/mca/schizo/base/schizo_base_frame.c b/src/mca/schizo/base/schizo_base_frame.c index afd880af1d..00e81edb6c 100644 --- a/src/mca/schizo/base/schizo_base_frame.c +++ b/src/mca/schizo/base/schizo_base_frame.c @@ -299,6 +299,12 @@ static int check_ndirs(pmix_cli_item_t *opt) return PRTE_SUCCESS; } +/* the sanity checker is provided for DEVELOPERS as it checks that + * the options contained in the cmd line being passed to PRRTE for + * execution meet PRRTE requirements. Although it does emit + * show_help messages, it really isn't intended for USERS - any + * problems in translating user cmd lines to PRRTE internal + * structs should be worked out by the developers */ int prte_schizo_base_sanity(pmix_cli_result_t *cmd_line) { pmix_cli_item_t *opt, *newopt; @@ -316,7 +322,6 @@ int prte_schizo_base_sanity(pmix_cli_result_t *cmd_line) PRTE_CLI_PACKAGE, PRTE_CLI_NODE, PRTE_CLI_SEQ, -// PRTE_CLI_DIST, PRTE_CLI_PPR, PRTE_CLI_RANKFILE, PRTE_CLI_PELIST, @@ -330,11 +335,9 @@ int prte_schizo_base_sanity(pmix_cli_result_t *cmd_line) PRTE_CLI_NOLOCAL, PRTE_CLI_HWTCPUS, PRTE_CLI_CORECPUS, -// PRTE_CLI_DEVICE, PRTE_CLI_INHERIT, PRTE_CLI_NOINHERIT, PRTE_CLI_QFILE, - PRTE_CLI_NOLAUNCH, PRTE_CLI_ORDERED, NULL }; @@ -393,6 +396,12 @@ int prte_schizo_base_sanity(pmix_cli_result_t *cmd_line) NULL }; + char *rtos[] = { + PRTE_CLI_ABORT_NZ, + PRTE_CLI_NOLAUNCH, + NULL + }; + if (1 < pmix_cmd_line_get_ninsts(cmd_line, PRTE_CLI_MAPBY)) { pmix_show_help("help-schizo-base.txt", "multi-instances", true, PRTE_CLI_MAPBY); return PRTE_ERR_SILENT; @@ -405,6 +414,14 @@ int prte_schizo_base_sanity(pmix_cli_result_t *cmd_line) pmix_show_help("help-schizo-base.txt", "multi-instances", true, PRTE_CLI_BINDTO); return PRTE_ERR_SILENT; } + if (1 < pmix_cmd_line_get_ninsts(cmd_line, PRTE_CLI_DISPLAY)) { + pmix_show_help("help-schizo-base.txt", "multi-instances", true, PRTE_CLI_DISPLAY); + return PRTE_ERR_SILENT; + } + if (1 < pmix_cmd_line_get_ninsts(cmd_line, PRTE_CLI_RTOS)) { + pmix_show_help("help-schizo-base.txt", "multi-instances", true, PRTE_CLI_RTOS); + return PRTE_ERR_SILENT; + } /* check for synonyms */ PMIX_LIST_FOREACH(opt, &cmd_line->instances, pmix_cli_item_t) { @@ -468,7 +485,15 @@ int prte_schizo_base_sanity(pmix_cli_result_t *cmd_line) } } - // check too many directives + opt = pmix_cmd_line_get_param(cmd_line, PRTE_CLI_RTOS); + if (NULL != opt) { + for (n=0; NULL != opt->values[n]; n++) { + if (!prte_schizo_base_check_directives(PRTE_CLI_RTOS, rtos, NULL, opt->values[n])) { + return PRTE_ERR_SILENT; + } + } + } + // check too many values given to a single command line option PMIX_LIST_FOREACH(opt, &cmd_line->instances, pmix_cli_item_t) { rc = check_ndirs(opt); if (PRTE_SUCCESS != rc) { diff --git a/src/mca/schizo/prte/help-schizo-prterun.txt b/src/mca/schizo/prte/help-schizo-prterun.txt index 510f13776e..b03c174770 100644 --- a/src/mca/schizo/prte/help-schizo-prterun.txt +++ b/src/mca/schizo/prte/help-schizo-prterun.txt @@ -79,14 +79,14 @@ option to the help request as "--help