Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make slurm execute script customizable #834

Merged
68 changes: 49 additions & 19 deletions lib/ramble/ramble/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -2300,24 +2300,46 @@ def _object_templates(self, workspace):
"""Return templates defined from different objects associated with the app_inst"""
run_dir = self.expander.experiment_run_dir
replacements = workspace.workspace_paths()
expander = self.expander

def _expand_path(path):
return ramble.util.path.substitute_path_variables(
expander.expand_var(path), local_replacements=replacements
)

def _get_template_config(obj, tpl_config, obj_type):
# Search up the object chain to resolve source path
found = False
object_paths = [e[1] for e in ramble.repository.list_object_files(obj, obj_type)]
src_name = tpl_config["src_name"]
for obj_path in object_paths:
src_path = os.path.join(os.path.dirname(obj_path), src_name)
if os.path.isfile(src_path):
found = True
break
if not found:
raise ApplicationError(f"Object {obj.name} is missing template file at {src_path}")
# Resolve the source path
src_path_config = _expand_path(tpl_config["src_path"])
if not os.path.isabs(src_path_config):
# Search up the object chain to resolve source path
found = False
object_paths = [e[1] for e in ramble.repository.list_object_files(obj, obj_type)]
searched_paths = []
for obj_path in object_paths:
src_path = os.path.join(os.path.dirname(obj_path), src_path_config)
if os.path.isfile(src_path):
found = True
break
searched_paths.append(src_path)
if not found:
raise ApplicationError(
f"Object {obj.name} is missing template file {src_path_config}. "
f"Searched paths: {searched_paths}"
)
else:
if not os.path.isfile(src_path_config):
raise ApplicationError(f"Template file {src_path_config} does not exist")
src_path = src_path_config

# Resolve the destination path
dest_path = ramble.util.path.substitute_path_variables(
tpl_config["dest_path"], local_replacements=replacements
)
tpl_ext = ".tpl"
dest_path_config = tpl_config["dest_path"]
if dest_path_config is None:
dest_path = os.path.basename(src_path)
if dest_path.endswith(tpl_ext):
dest_path = dest_path[: -len(tpl_ext)]
else:
dest_path = _expand_path(tpl_config["dest_path"])
if not os.path.isabs(dest_path):
dest_path = os.path.join(run_dir, dest_path)

Expand All @@ -2327,11 +2349,9 @@ def _get_template_config(obj, tpl_config, obj_type):
for tpl_conf in obj.templates.values():
yield _get_template_config(obj, tpl_conf, obj_type=obj_type)

def _render_object_templates(self, extra_vars, workspace):
def _render_object_templates(self, extra_vars_origin, workspace):
for obj, tpl_config in self._object_templates(workspace):
extra_vars = extra_vars.copy()
if callable(getattr(obj, "template_render_vars", None)):
extra_vars.update(obj.template_render_vars())
extra_vars = extra_vars_origin.copy()
src_path = tpl_config["src_path"]
with open(src_path) as f_in:
content = f_in.read()
Expand All @@ -2351,10 +2371,20 @@ def _render_object_templates(self, extra_vars, workspace):
os.chmod(out_path, perm)

def _define_object_template_vars(self, workspace):
for _, tpl_config in self._object_templates(workspace):
var_attr = {
"type": ramble.keywords.key_type.reserved,
"level": ramble.keywords.output_level.variable,
}
for obj, tpl_config in self._object_templates(workspace):
var_name = tpl_config["var_name"]
if var_name is not None:
self.variables[var_name] = tpl_config["dest_path"]
self.keywords.update_keys({var_name: var_attr})
if callable(getattr(obj, "template_render_vars", None)):
render_vars = obj.template_render_vars()
self.variables.update(render_vars)
for name in render_vars.keys():
self.keywords.update_keys({name: var_attr})

def _objects(self):
"""Return a tuple for each object instance associated with the app_inst.
Expand Down
20 changes: 12 additions & 8 deletions lib/ramble/ramble/language/shared_language.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,16 +492,16 @@ def _execute_target_shells(obj):
@shared_directive("templates")
def register_template(
name: str,
src_name: str,
dest_path: str,
src_path: str,
dest_path: Optional[str] = None,
define_var: bool = True,
extra_vars: Optional[dict] = None,
extra_vars_func: Optional[str] = None,
output_perm=None,
):
"""Directive to define an object-specific template to be rendered into experiment run_dir.

For instance, `register_template(name="foo", src_name="foo.tpl", dest_path="foo.sh")`
For instance, `register_template(name="foo", src_path="foo.tpl", dest_path="foo.sh")`
expects a "foo.tpl" template defined alongside the object source, and uses that to
render a file under "{experiment_run_dir}/foo.sh". The rendered path can also be
referenced with the `foo` variable name.
Expand All @@ -510,12 +510,16 @@ def register_template(
name: The name of the template. It is also used as the variable name
that an experiment can use to reference the rendered path, if
`define_var` is true.
src_name: The leaf name of the template. This is used to locate the
template under the containing directory of the object.
dest_path: The location of the rendered output. It can either point
src_path: The location of the template. It can either point
to an absolute or a relative path. It knows how to resolve
workspace paths such as `$workspace_shared`. A relative path
is relative to the containing directory of the object source.
dest_path: If present, the location of the rendered output. It can either point
to an absolute or a relative path. It knows how to resolve
workspace paths such as `$workspace_shared`. A relative path
is relative to the `experiment_run_dir`.
is relative to the `experiment_run_dir`. If not given, the output uses
the template's file name (with a trailing `.tpl` extension dropped, if any)
and is placed under `experiment_run_dir`.
define_var: Controls if a variable named `name` should be defined.
extra_vars: If present, the variable dict is used as extra variables to
render the template.
Expand All @@ -530,7 +534,7 @@ def _define_template(obj):
var_name = name if define_var else None
extra_vars_func_name = f"_{extra_vars_func}" if extra_vars_func is not None else None
obj.templates[name] = {
"src_name": src_name,
"src_path": src_path,
"dest_path": dest_path,
"var_name": var_name,
"extra_vars": extra_vars,
Expand Down
7 changes: 6 additions & 1 deletion lib/ramble/ramble/test/end_to_end/test_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,18 @@ def test_template():
assert "echo not_exist" not in content
execute_path = os.path.join(run_dir, "execute_experiment")
script2_path = os.path.join(ws.shared_dir, "script.sh")
assert os.path.isfile(script2_path)
script3_path = os.path.join(run_dir, "expansion_script.sh")
script4_path = os.path.join(run_dir, "bar")
with open(execute_path) as f:
content = f.read()
assert script_path in content
# The workspace path should be expanded
assert "$workspace_shared" not in content
assert script2_path in content
assert script3_path in content
assert os.path.isfile(script2_path)
assert os.path.isfile(script3_path)
assert os.path.isfile(script4_path)


def test_template_inherited():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ def test_slurm_workflow():
test_{wm_name}_2:
variables:
slurm_partition: h3
test_{wm_name}_3:
variables:
slurm_execute_template_path: $workspace_configs/execute_experiment.tpl
"""
with ramble.workspace.create(workspace_name) as ws:
ws.write()
Expand All @@ -68,7 +71,7 @@ def test_slurm_workflow():
# Assert on no workflow manager
path = os.path.join(ws.experiment_dir, "hostname", "local", "test_None")
files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
assert "slurm_execute_experiment" not in files
assert "slurm_experiment_sbatch" not in files
assert "batch_submit" not in files
assert "batch_query" not in files
assert "batch_cancel" not in files
Expand All @@ -77,16 +80,15 @@ def test_slurm_workflow():
# Assert on slurm workflow manager
path = os.path.join(ws.experiment_dir, "hostname", "local", "test_slurm")
files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
assert "slurm_execute_experiment" in files
assert "batch_submit" in files
assert "batch_query" in files
assert "batch_cancel" in files
assert "batch_wait" in files
with open(os.path.join(path, "batch_submit")) as f:
content = f.read()
assert "slurm_execute_experiment" in content
assert "slurm_experiment_sbatch" in content
assert ".slurm_job" in content
with open(os.path.join(path, "slurm_execute_experiment")) as f:
with open(os.path.join(path, "slurm_experiment_sbatch")) as f:
content = f.read()
assert "scontrol show hostnames" in content
assert "#SBATCH --gpus-per-task=1" in content
Expand All @@ -101,6 +103,15 @@ def test_slurm_workflow():

# Assert on the experiment with non-empty partition variable given
path = os.path.join(ws.experiment_dir, "hostname", "local", "test_slurm_2")
with open(os.path.join(path, "slurm_execute_experiment")) as f:
with open(os.path.join(path, "slurm_experiment_sbatch")) as f:
content = f.read()
assert "#SBATCH -p h3" in content

# Assert on the experiment with custom slurm execute template
path = os.path.join(ws.experiment_dir, "hostname", "local", "test_slurm_3")
assert not os.path.exists(os.path.join(path, "slurm_experiment_sbatch"))
with open(os.path.join(path, "execute_experiment")) as f:
content = f.read()
# Since it uses the default execute_experiment tpl, no slurm content is present
assert "#SBATCH" not in content
assert "scontrol" not in content
28 changes: 25 additions & 3 deletions var/ramble/repos/builtin.mock/applications/template/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@ class Template(ExecutableApplication):

name = "template"

executable("foo", template=["bash {bar}", "echo {test}"])
executable(
"foo",
template=["bash {bar}", "echo {test}", "echo {expansion_test_path}"],
)

workload("test_template", executable="foo")

Expand All @@ -27,7 +30,7 @@ class Template(ExecutableApplication):

register_template(
name="bar",
src_name="bar.tpl",
src_path="bar.tpl",
dest_path="bar.sh",
# The `dynamic_hello_world` will be overridden by `_bar_vars`
extra_vars={
Expand All @@ -42,9 +45,28 @@ def _bar_vars(self):
val = expander.expand_var('"hello {hello_name}"')
return {"dynamic_hello_world": val}

register_template(
name="bar2",
src_path="bar.tpl",
)

register_template(
name="test",
src_name="script.sh",
src_path="script.sh",
dest_path="$workspace_shared/script.sh",
output_perm="755",
)

# Setup to test the path expansion for both src and dest
workload_variable(
"src_script_path",
default="$workspace_configs/execute_experiment.tpl",
description="source path of the template",
workload="test_template",
)

register_template(
name="expansion_test_path",
src_path="{src_script_path}",
dest_path="{experiment_run_dir}/expansion_script.sh",
)
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ class Hpcg(ExecutableApplication):

register_template(
name="hpcg_dat",
src_name="hpcg.dat.tpl",
src_path="hpcg.dat.tpl",
dest_path="hpcg.dat",
define_var=False,
)
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,7 @@ def _isqrt(self, n):

register_template(
"hpl_dat",
src_name="HPL.dat.tpl",
src_path="HPL.dat.tpl",
dest_path="HPL.dat",
define_var=False,
)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
#!/bin/bash
sbatch {slurm_execute_experiment} | tee >(awk '{print $NF}' > {experiment_run_dir}/.slurm_job)
sbatch {slurm_experiment_sbatch} | tee >(awk '{print $NF}' > {experiment_run_dir}/.slurm_job)
Original file line number Diff line number Diff line change
Expand Up @@ -83,35 +83,42 @@ def __init__(self, file_path):
description="partition to submit job to, if unspecified, it uses the default partition",
)

# Lets an experiment point the sbatch job script at its own template file.
# The default is a relative path, so it is looked up next to the workflow
# manager's source (see the `src_path` handling of `register_template`).
workflow_manager_variable(
    name="slurm_execute_template_path",
    default="slurm_experiment_sbatch.tpl",
    description="Path to the custom template for generating the slurm sbatch job script. "
    "If a relative path is given, it is searched under the workflow manager's source directory. "
    "The path can contain workspace path variables such as $workspace_configs.",
)

register_template(
name="batch_submit",
src_name="batch_submit.tpl",
src_path="batch_submit.tpl",
dest_path="batch_submit",
)

register_template(
name="batch_query",
src_name="batch_query.tpl",
src_path="batch_query.tpl",
dest_path="batch_query",
extra_vars={"declare_status_map": _declare_status_map()},
)

register_template(
name="batch_cancel",
src_name="batch_cancel.tpl",
src_path="batch_cancel.tpl",
dest_path="batch_cancel",
)

register_template(
name="batch_wait",
src_name="batch_wait.tpl",
src_path="batch_wait.tpl",
dest_path="batch_wait",
)

register_template(
name="slurm_execute_experiment",
src_name="slurm_execute_experiment.tpl",
dest_path="slurm_execute_experiment",
name="slurm_experiment_sbatch",
src_path="{slurm_execute_template_path}",
)

def template_render_vars(self):
Expand Down
Loading