Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Separate out the runtime controls options #1468

Merged
merged 1 commit into from
Aug 24, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions examples/bad_exit.c
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,8 @@ int main(int argc, char **argv)
done:
if (0 == myproc.rank) {
exit(1);
} else {
sleep(3);
}
/* finalize us */
fprintf(stderr, "Client ns %s rank %d: Finalizing\n", myproc.nspace, myproc.rank);
Expand Down
68 changes: 38 additions & 30 deletions src/hwloc/hwloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "src/util/pmix_argv.h"
#include "src/util/output.h"
#include "src/util/pmix_show_help.h"
#include "src/util/prte_cmd_line.h"

/*
* Globals
Expand Down Expand Up @@ -531,7 +532,6 @@ int prte_hwloc_base_set_binding_policy(void *jdat, char *spec)
prte_binding_policy_t tmp;
char **quals, *myspec, *ptr;
prte_job_t *jdata = (prte_job_t *) jdat;
size_t len;

/* set default */
tmp = 0;
Expand All @@ -550,15 +550,17 @@ int prte_hwloc_base_set_binding_policy(void *jdat, char *spec)
++ptr;
quals = pmix_argv_split(ptr, ':');
for (i = 0; NULL != quals[i]; i++) {
len = strlen(quals[i]);
if (0 == strncasecmp(quals[i], "if-supported", len)) {
if (PRTE_CHECK_CLI_OPTION(quals[i], PRTE_CLI_IF_SUPP)) {
tmp |= PRTE_BIND_IF_SUPPORTED;
} else if (0 == strncasecmp(quals[i], "overload-allowed", len)) {

} else if (PRTE_CHECK_CLI_OPTION(quals[i], PRTE_CLI_OVERLOAD)) {
tmp |= (PRTE_BIND_ALLOW_OVERLOAD | PRTE_BIND_OVERLOAD_GIVEN);
} else if (0 == strncasecmp(quals[i], "no-overload", len)) {

} else if (PRTE_CHECK_CLI_OPTION(quals[i], PRTE_CLI_NOOVERLOAD)) {
tmp = (tmp & ~PRTE_BIND_ALLOW_OVERLOAD);
tmp |= PRTE_BIND_OVERLOAD_GIVEN;
} else if (0 == strncasecmp(quals[i], "REPORT", len)) {

} else if (PRTE_CHECK_CLI_OPTION(quals[i], PRTE_CLI_REPORT)) {
if (NULL == jdata) {
pmix_show_help("help-prte-rmaps-base.txt", "unsupported-default-modifier", true,
"binding policy", quals[i]);
Expand All @@ -567,6 +569,7 @@ int prte_hwloc_base_set_binding_policy(void *jdat, char *spec)
}
prte_set_attribute(&jdata->attributes, PRTE_JOB_REPORT_BINDINGS, PRTE_ATTR_GLOBAL,
NULL, PMIX_BOOL);

} else {
/* unknown option */
pmix_show_help("help-prte-hwloc-base.txt", "unrecognized-modifier", true, spec);
Expand All @@ -578,30 +581,35 @@ int prte_hwloc_base_set_binding_policy(void *jdat, char *spec)
pmix_argv_free(quals);
}

len = strlen(myspec);
if (0 < len) {
if (0 == strncasecmp(myspec, "none", len)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_NONE);
} else if (0 == strncasecmp(myspec, "hwthread", len)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_HWTHREAD);
} else if (0 == strncasecmp(myspec, "core", len)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_CORE);
} else if (0 == strncasecmp(myspec, "l1cache", len)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_L1CACHE);
} else if (0 == strncasecmp(myspec, "l2cache", len)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_L2CACHE);
} else if (0 == strncasecmp(myspec, "l3cache", len)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_L3CACHE);
} else if (0 == strncasecmp(myspec, "numa", len)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_NUMA);
} else if (0 == strncasecmp(myspec, "package", len)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_PACKAGE);
} else {
pmix_show_help("help-prte-hwloc-base.txt", "invalid binding_policy", true, "binding",
spec);
free(myspec);
return PRTE_ERR_BAD_PARAM;
}
if (PRTE_CHECK_CLI_OPTION(myspec, PRTE_CLI_NONE)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_NONE);

} else if (PRTE_CHECK_CLI_OPTION(myspec, PRTE_CLI_HWT)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_HWTHREAD);

} else if (PRTE_CHECK_CLI_OPTION(myspec, PRTE_CLI_CORE)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_CORE);

} else if (PRTE_CHECK_CLI_OPTION(myspec, PRTE_CLI_L1CACHE)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_L1CACHE);

} else if (PRTE_CHECK_CLI_OPTION(myspec, PRTE_CLI_L2CACHE)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_L2CACHE);

} else if (PRTE_CHECK_CLI_OPTION(myspec, PRTE_CLI_L3CACHE)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_L3CACHE);

} else if (PRTE_CHECK_CLI_OPTION(myspec, PRTE_CLI_NUMA)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_NUMA);

} else if (PRTE_CHECK_CLI_OPTION(myspec, PRTE_CLI_PACKAGE)) {
PRTE_SET_BINDING_POLICY(tmp, PRTE_BIND_TO_PACKAGE);

} else {
pmix_show_help("help-prte-hwloc-base.txt", "invalid binding_policy", true, "binding",
spec);
free(myspec);
return PRTE_ERR_BAD_PARAM;
}
free(myspec);

Expand Down
8 changes: 7 additions & 1 deletion src/mca/errmgr/dvm/errmgr_dvm.c
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,8 @@ static void proc_errors(int fd, short args, void *cbdata)
prte_proc_state_t state = caddy->proc_state;
int i;
int32_t i32, *i32ptr;
bool flag;
bool *fptr = &flag;
PRTE_HIDE_UNUSED_PARAMS(fd, args);

PMIX_ACQUIRE_OBJECT(caddy);
Expand Down Expand Up @@ -679,6 +681,8 @@ static void proc_errors(int fd, short args, void *cbdata)
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(proc),
pptr->exit_code));
jdata->exit_code = pptr->exit_code;
PRTE_FLAG_UNSET(pptr, PRTE_PROC_FLAG_ALIVE);
jdata->num_terminated++;
/* track the number of non-zero exits */
i32 = 0;
i32ptr = &i32;
Expand All @@ -687,7 +691,9 @@ static void proc_errors(int fd, short args, void *cbdata)
++i32;
prte_set_attribute(&jdata->attributes, PRTE_JOB_NUM_NONZERO_EXIT, PRTE_ATTR_LOCAL, i32ptr,
PMIX_INT32);
if (prte_get_attribute(&jdata->attributes, PRTE_JOB_TERM_NONZERO_EXIT, NULL, PMIX_BOOL)) {
flag = true;
prte_get_attribute(&jdata->attributes, PRTE_JOB_TERM_NONZERO_EXIT, (void*)&fptr, PMIX_BOOL);
if (flag) {
if (!PRTE_FLAG_TEST(jdata, PRTE_JOB_FLAG_ABORTED)) {
jdata->state = PRTE_JOB_STATE_NON_ZERO_TERM;
/* point to the first rank to cause the problem */
Expand Down
12 changes: 8 additions & 4 deletions src/mca/odls/base/odls_base_default_fns.c
Original file line number Diff line number Diff line change
Expand Up @@ -1523,6 +1523,8 @@ void prte_odls_base_default_wait_local_proc(int fd, short sd, void *cbdata)
prte_job_t *jobdat;
prte_proc_state_t state = PRTE_PROC_STATE_WAITPID_FIRED;
prte_proc_t *cptr;
bool flag = false;
bool *fptr = &flag;
PRTE_HIDE_UNUSED_PARAMS(fd, sd);

prte_output_verbose(5, prte_odls_base_framework.framework_output,
Expand Down Expand Up @@ -1610,8 +1612,9 @@ void prte_odls_base_default_wait_local_proc(int fd, short sd, void *cbdata)
* felt it was non-normal - in this latter case, we do not
* require that the proc deregister before terminating
*/
if (0 != proc->exit_code &&
prte_get_attribute(&jobdat->attributes, PRTE_JOB_TERM_NONZERO_EXIT, NULL, PMIX_BOOL)) {
flag = false;
prte_get_attribute(&jobdat->attributes, PRTE_JOB_TERM_NONZERO_EXIT, (void**)&fptr, PMIX_BOOL);
if (0 != proc->exit_code && flag) {
state = PRTE_PROC_STATE_TERM_NON_ZERO;
PRTE_OUTPUT_VERBOSE(
(5, prte_odls_base_framework.framework_output,
Expand Down Expand Up @@ -1674,8 +1677,9 @@ void prte_odls_base_default_wait_local_proc(int fd, short sd, void *cbdata)
* none of them will. This is considered acceptable. Still
* flag it as abnormal if the exit code was non-zero
*/
if (0 != proc->exit_code &&
prte_get_attribute(&jobdat->attributes, PRTE_JOB_TERM_NONZERO_EXIT, NULL, PMIX_BOOL)) {
flag = false;
prte_get_attribute(&jobdat->attributes, PRTE_JOB_TERM_NONZERO_EXIT, (void**)&fptr, PMIX_BOOL);
if (0 != proc->exit_code && flag) {
state = PRTE_PROC_STATE_TERM_NON_ZERO;
} else {
state = PRTE_PROC_STATE_WAITPID_FIRED;
Expand Down
9 changes: 0 additions & 9 deletions src/mca/plm/base/plm_base_launch_support.c
Original file line number Diff line number Diff line change
Expand Up @@ -1949,15 +1949,6 @@ int prte_plm_base_prted_append_basic_args(int *argc, char ***argv, char *ess, in
if (prte_allow_run_as_root) {
pmix_argv_append(argc, argv, "--allow-run-as-root");
}
if (prte_map_stddiag_to_stderr) {
pmix_argv_append(argc, argv, "--prtemca");
pmix_argv_append(argc, argv, "prte_map_stddiag_to_stderr");
pmix_argv_append(argc, argv, "1");
} else if (prte_map_stddiag_to_stdout) {
pmix_argv_append(argc, argv, "--prtemca");
pmix_argv_append(argc, argv, "prte_map_stddiag_to_stdout");
pmix_argv_append(argc, argv, "1");
}

/* the following is not an mca param */
if (NULL != getenv("PRTE_TEST_PRTED_SUICIDE")) {
Expand Down
5 changes: 5 additions & 0 deletions src/mca/rmaps/base/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ typedef struct {
* when the directive comes through an MCA param */
char *file;
hwloc_cpuset_t available, baseset; // scratch for binding calculation
bool abort_non_zero_exit; // default setting for aborting on non-zero proc exit
} prte_rmaps_base_t;

/**
Expand Down Expand Up @@ -125,6 +126,10 @@ PRTE_EXPORT int prte_rmaps_base_set_default_ranking(prte_job_t *jdata,
prte_rmaps_options_t *options);
PRTE_EXPORT int prte_rmaps_base_set_ranking_policy(prte_job_t *jdata, char *spec);

/* NOTE(review): "rto" presumably abbreviates "runtime options" (compare the
 * set_runtime_options declaration that follows) - confirm against the
 * definition, which is not visible here. Takes the job object and the
 * mapper options and returns an int status code. */
PRTE_EXPORT int prte_rmaps_base_set_default_rto(prte_job_t *jdata,
prte_rmaps_options_t *options);
/* NOTE(review): presumably parses the runtime-options string `spec` and
 * applies it to `jdata`, mirroring prte_rmaps_base_set_ranking_policy -
 * TODO confirm accepted syntax and whether `jdata` may be NULL. */
PRTE_EXPORT int prte_rmaps_base_set_runtime_options(prte_job_t *jdata, char *spec);

PRTE_EXPORT void prte_rmaps_base_display_map(prte_job_t *jdata);
PRTE_EXPORT void prte_rmaps_base_report_bindings(prte_job_t *jdata,
prte_rmaps_options_t *options);
Expand Down
4 changes: 0 additions & 4 deletions src/mca/rmaps/base/help-prte-rmaps-base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -677,10 +677,6 @@ to the --map-by option (except where noted):
be bound to the first CPU in the list, the second proc shall be
bound to the second CPU, etc.)

- DONOTLAUNCH directs PRRTE to map but not launch the specified job.
This is provided to help explore possible process placement patterns
before actually starting execution.

Note that directives and qualifiers are case-insensitive
and can be shortened to the minimum number of characters
to uniquely identify them. Thus, "L1CACHE" can be given
Expand Down
Loading