Skip to content

Commit

Permalink
Separate out the runtime control options
Browse files Browse the repository at this point in the history
We have been overloading the `--map-by` directive with runtime
controls such as do-not-launch. This creates confusion because both
developers and users lose track of what is shown to the user
in help output and other command-line operations versus what PRRTE
internally accepts.

Reduce the confusion by making runtime options a first-class
citizen. Define a new `--runtime-options <args>` command-line
option that takes a comma-delimited list of directives. Update
the help files to match.

Signed-off-by: Ralph Castain <rhc@pmix.org>
  • Loading branch information
rhc54 committed Aug 24, 2022
1 parent 2d462be commit ca66c1e
Show file tree
Hide file tree
Showing 28 changed files with 452 additions and 312 deletions.
2 changes: 2 additions & 0 deletions examples/bad_exit.c
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,8 @@ int main(int argc, char **argv)
done:
if (0 == myproc.rank) {
exit(1);
} else {
sleep(3);
}
/* finalize us */
fprintf(stderr, "Client ns %s rank %d: Finalizing\n", myproc.nspace, myproc.rank);
Expand Down
68 changes: 38 additions & 30 deletions src/hwloc/hwloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "src/util/pmix_argv.h"
#include "src/util/output.h"
#include "src/util/pmix_show_help.h"
#include "src/util/prte_cmd_line.h"

/*
* Globals
Expand Down Expand Up @@ -531,7 +532,6 @@ int prte_hwloc_base_set_binding_policy(void *jdat, char *spec)
prte_binding_policy_t tmp;
char **quals, *myspec, *ptr;
prte_job_t *jdata = (prte_job_t *) jdat;
size_t len;

/* set default */
tmp = 0;
Expand All @@ -550,15 +550,17 @@ int prte_hwloc_base_set_binding_policy(void *jdat, char *spec)
++ptr;
quals = pmix_argv_split(ptr, ':');
for (i = 0; NULL != quals[i]; i++) {
len = strlen(quals[i]);
if (0 == strncasecmp(quals[i], "if-supported", len)) {
if (PRTE_CHECK_CLI_OPTION(quals[i], PRTE_CLI_IF_SUPP)) {
tmp |= PRTE_BIND_IF_SUPPORTED;
} else if (0 == strncasecmp(quals[i], "overload-allowed", len)) {

} else if (PRTE_CHECK_CLI_OPTION(quals[i], PRTE_CLI_OVERLOAD)) {
tmp |= (PRTE_BIND_ALLOW_OVERLOAD | PRTE_BIND_OVERLOAD_GIVEN);
} else if (0 == strncasecmp(quals[i], "no-overload", len)) {

} else if (PRTE_CHECK_CLI_OPTION(quals[i], PRTE_CLI_NOOVERLOAD)) {
tmp = (tmp & ~PRTE_BIND_ALLOW_OVERLOAD);
tmp |= PRTE_BIND_OVERLOAD_GIVEN;
} else if (0 == strncasecmp(quals[i], "REPORT", len)) {

} else if (PRTE_CHECK_CLI_OPTION(quals[i], PRTE_CLI_REPORT)) {
if (NULL == jdata) {
pmix_show_help("help-prte-rmaps-base.txt", "unsupported-default-modifier", true,
"binding policy", quals[i]);
Expand All @@ -567,6 +569,7 @@ int prte_hwloc_base_set_binding_policy(void *jdat, char *spec)
}
prte_set_attribute(&jdata->attributes, PRTE_JOB_REPORT_BINDINGS, PRTE_ATTR_GLOBAL,
NULL, PMIX_BOOL);

} else {
/* unknown option */
pmix_show_help("help-prte-hwloc-base.txt", "unrecognized-modifier", true, spec);
Expand All @@ -578,30 +581,35 @@ int prte_hwloc_base_set_binding_policy(void *jdat, char *spec)
pmix_argv_free(quals);
}

len = strlen(myspec);
if (0 < len) {
if (0 == strncasecmp(myspec, "none", len)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_NONE);
} else if (0 == strncasecmp(myspec, "hwthread", len)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_HWTHREAD);
} else if (0 == strncasecmp(myspec, "core", len)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_CORE);
} else if (0 == strncasecmp(myspec, "l1cache", len)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_L1CACHE);
} else if (0 == strncasecmp(myspec, "l2cache", len)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_L2CACHE);
} else if (0 == strncasecmp(myspec, "l3cache", len)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_L3CACHE);
} else if (0 == strncasecmp(myspec, "numa", len)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_NUMA);
} else if (0 == strncasecmp(myspec, "package", len)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_PACKAGE);
} else {
pmix_show_help("help-prte-hwloc-base.txt", "invalid binding_policy", true, "binding",
spec);
free(myspec);
return PRTE_ERR_BAD_PARAM;
}
if (PRTE_CHECK_CLI_OPTION(myspec, PRTE_CLI_NONE)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_NONE);

} else if (PRTE_CHECK_CLI_OPTION(myspec, PRTE_CLI_HWT)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_HWTHREAD);

} else if (PRTE_CHECK_CLI_OPTION(myspec, PRTE_CLI_CORE)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_CORE);

} else if (PRTE_CHECK_CLI_OPTION(myspec, PRTE_CLI_L1CACHE)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_L1CACHE);

} else if (PRTE_CHECK_CLI_OPTION(myspec, PRTE_CLI_L2CACHE)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_L2CACHE);

} else if (PRTE_CHECK_CLI_OPTION(myspec, PRTE_CLI_L3CACHE)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_L3CACHE);

} else if (PRTE_CHECK_CLI_OPTION(myspec, PRTE_CLI_NUMA)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_NUMA);

} else if (PRTE_CHECK_CLI_OPTION(myspec, PRTE_CLI_PACKAGE)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_PACKAGE);

} else {
pmix_show_help("help-prte-hwloc-base.txt", "invalid binding_policy", true, "binding",
spec);
free(myspec);
return PRTE_ERR_BAD_PARAM;
}
free(myspec);

Expand Down
8 changes: 7 additions & 1 deletion src/mca/errmgr/dvm/errmgr_dvm.c
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,8 @@ static void proc_errors(int fd, short args, void *cbdata)
prte_proc_state_t state = caddy->proc_state;
int i;
int32_t i32, *i32ptr;
bool flag;
bool *fptr = &flag;
PRTE_HIDE_UNUSED_PARAMS(fd, args);

PMIX_ACQUIRE_OBJECT(caddy);
Expand Down Expand Up @@ -679,6 +681,8 @@ static void proc_errors(int fd, short args, void *cbdata)
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(proc),
pptr->exit_code));
jdata->exit_code = pptr->exit_code;
PRTE_FLAG_UNSET(pptr, PRTE_PROC_FLAG_ALIVE);
jdata->num_terminated++;
/* track the number of non-zero exits */
i32 = 0;
i32ptr = &i32;
Expand All @@ -687,7 +691,9 @@ static void proc_errors(int fd, short args, void *cbdata)
++i32;
prte_set_attribute(&jdata->attributes, PRTE_JOB_NUM_NONZERO_EXIT, PRTE_ATTR_LOCAL, i32ptr,
PMIX_INT32);
if (prte_get_attribute(&jdata->attributes, PRTE_JOB_TERM_NONZERO_EXIT, NULL, PMIX_BOOL)) {
flag = true;
prte_get_attribute(&jdata->attributes, PRTE_JOB_TERM_NONZERO_EXIT, (void*)&fptr, PMIX_BOOL);
if (flag) {
if (!PRTE_FLAG_TEST(jdata, PRTE_JOB_FLAG_ABORTED)) {
jdata->state = PRTE_JOB_STATE_NON_ZERO_TERM;
/* point to the first rank to cause the problem */
Expand Down
12 changes: 8 additions & 4 deletions src/mca/odls/base/odls_base_default_fns.c
Original file line number Diff line number Diff line change
Expand Up @@ -1523,6 +1523,8 @@ void prte_odls_base_default_wait_local_proc(int fd, short sd, void *cbdata)
prte_job_t *jobdat;
prte_proc_state_t state = PRTE_PROC_STATE_WAITPID_FIRED;
prte_proc_t *cptr;
bool flag = false;
bool *fptr = &flag;
PRTE_HIDE_UNUSED_PARAMS(fd, sd);

prte_output_verbose(5, prte_odls_base_framework.framework_output,
Expand Down Expand Up @@ -1610,8 +1612,9 @@ void prte_odls_base_default_wait_local_proc(int fd, short sd, void *cbdata)
* felt it was non-normal - in this latter case, we do not
* require that the proc deregister before terminating
*/
if (0 != proc->exit_code &&
prte_get_attribute(&jobdat->attributes, PRTE_JOB_TERM_NONZERO_EXIT, NULL, PMIX_BOOL)) {
flag = false;
prte_get_attribute(&jobdat->attributes, PRTE_JOB_TERM_NONZERO_EXIT, (void**)&fptr, PMIX_BOOL);
if (0 != proc->exit_code && flag) {
state = PRTE_PROC_STATE_TERM_NON_ZERO;
PRTE_OUTPUT_VERBOSE(
(5, prte_odls_base_framework.framework_output,
Expand Down Expand Up @@ -1674,8 +1677,9 @@ void prte_odls_base_default_wait_local_proc(int fd, short sd, void *cbdata)
* none of them will. This is considered acceptable. Still
* flag it as abnormal if the exit code was non-zero
*/
if (0 != proc->exit_code &&
prte_get_attribute(&jobdat->attributes, PRTE_JOB_TERM_NONZERO_EXIT, NULL, PMIX_BOOL)) {
flag = false;
prte_get_attribute(&jobdat->attributes, PRTE_JOB_TERM_NONZERO_EXIT, (void**)&fptr, PMIX_BOOL);
if (0 != proc->exit_code && flag) {
state = PRTE_PROC_STATE_TERM_NON_ZERO;
} else {
state = PRTE_PROC_STATE_WAITPID_FIRED;
Expand Down
9 changes: 0 additions & 9 deletions src/mca/plm/base/plm_base_launch_support.c
Original file line number Diff line number Diff line change
Expand Up @@ -1949,15 +1949,6 @@ int prte_plm_base_prted_append_basic_args(int *argc, char ***argv, char *ess, in
if (prte_allow_run_as_root) {
pmix_argv_append(argc, argv, "--allow-run-as-root");
}
if (prte_map_stddiag_to_stderr) {
pmix_argv_append(argc, argv, "--prtemca");
pmix_argv_append(argc, argv, "prte_map_stddiag_to_stderr");
pmix_argv_append(argc, argv, "1");
} else if (prte_map_stddiag_to_stdout) {
pmix_argv_append(argc, argv, "--prtemca");
pmix_argv_append(argc, argv, "prte_map_stddiag_to_stdout");
pmix_argv_append(argc, argv, "1");
}

/* the following is not an mca param */
if (NULL != getenv("PRTE_TEST_PRTED_SUICIDE")) {
Expand Down
5 changes: 5 additions & 0 deletions src/mca/rmaps/base/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ typedef struct {
* when the directive comes thru MCA param */
char *file;
hwloc_cpuset_t available, baseset; // scratch for binding calculation
bool abort_non_zero_exit; // default setting for aborting on non-zero proc exit
} prte_rmaps_base_t;

/**
Expand Down Expand Up @@ -125,6 +126,10 @@ PRTE_EXPORT int prte_rmaps_base_set_default_ranking(prte_job_t *jdata,
prte_rmaps_options_t *options);
PRTE_EXPORT int prte_rmaps_base_set_ranking_policy(prte_job_t *jdata, char *spec);

PRTE_EXPORT int prte_rmaps_base_set_default_rto(prte_job_t *jdata,
prte_rmaps_options_t *options);
PRTE_EXPORT int prte_rmaps_base_set_runtime_options(prte_job_t *jdata, char *spec);

PRTE_EXPORT void prte_rmaps_base_display_map(prte_job_t *jdata);
PRTE_EXPORT void prte_rmaps_base_report_bindings(prte_job_t *jdata,
prte_rmaps_options_t *options);
Expand Down
4 changes: 0 additions & 4 deletions src/mca/rmaps/base/help-prte-rmaps-base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -677,10 +677,6 @@ to the --map-by option (except where noted):
be bound to the first CPU in the list, the second proc shall be
bound to the second CPU, etc.)

- DONOTLAUNCH directs PRRTE to map but not launch the specified job.
This is provided to help explore possible process placement patterns
before actually starting execution.

Note that directives and qualifiers are case-insensitive
and can be shortened to the minimum number of characters
to uniquely identify them. Thus, "L1CACHE" can be given
Expand Down
Loading

0 comments on commit ca66c1e

Please sign in to comment.