LLNL · bgunnar5 · Sep 28, 2023 · Sep 21, 2023 · Sep 21, 2023 · Sep 21, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,9 +5,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [unreleased]
+### Added
+- New reserved variables:
+  - `VLAUNCHER`: The same functionality as the `LAUNCHER` variable, but will substitute shell variables `MERLIN_NODES`, `MERLIN_PROCS`, `MERLIN_CORES`, and `MERLIN_GPUS` for nodes, procs, cores per task, and gpus
+
 ### Changed
 - Hardcoded Sphinx v5.3.0 requirement is now removed so we can use latest Sphinx
 
+### Fixed
+- A bug where the filenames in iterative workflows kept appending `.out`, `.partial`, or `.expanded` to the filenames stored in the `merlin_info/` subdirectory
+
 ## [1.10.3]
 ### Added
 - The *.conf regex for the recursive-include of the merlin server directory so that pip will add it to the wheel

diff --git a/docs/source/merlin_variables.rst b/docs/source/merlin_variables.rst
@@ -160,8 +160,9 @@ Reserved variables
 
             $(MERLIN_INFO)/*.expanded.yaml
 
-The ``LAUNCHER`` Variable
-+++++++++++++++++++++++++
+
+The ``LAUNCHER`` and ``VLAUNCHER`` Variables
++++++++++++++++++++++++++++++++++++++++++++++++
 
 ``$(LAUNCHER)`` is a special case of a reserved variable since it's value *can* be changed.
 It serves as an abstraction to launch a job with parallel schedulers like :ref:`slurm<slurm>`,
@@ -187,6 +188,54 @@ We can modify this to use the ``$(LAUNCHER)`` variable like so:
 
 In other words, the ``$(LAUNCHER)`` variable would become ``srun -N 1 -n 3``.
 
+Similarly, the ``$(VLAUNCHER)`` variable behaves similarly to the ``$(LAUNCHER)`` variable.
+The key distinction lies in its source of information. Instead of drawing certain configuration
+options from the ``run`` section of a step, it retrieves specific shell variables. These shell
+variables are automatically generated by Merlin when you include the ``$(VLAUNCHER)`` variable
+in a step command, but they can also be customized by the user. Currently, the following shell
+variables are:
+
+.. list-table:: VLAUNCHER Variables
+   :widths: 25 50 25
+   :header-rows: 1
+
+   * - Variable
+     - Description
+     - Default
+
+   * - ``${MERLIN_NODES}``
+     - The number of nodes
+     - 1
+
+   * - ``${MERLIN_PROCS}``
+     - The number of tasks/procs
+     - 1
+
+   * - ``${MERLIN_CORES}``
+     - The number of cores per task/proc
+     - 1
+
+   * - ``${MERLIN_GPUS}``
+     - The number of gpus per task/proc
+     - 0
+
+Let's say we have the following defined in our yaml file:
+
+.. code:: yaml
+
+    batch:
+        type: flux
+
+    run:
+        cmd: |
+          MERLIN_NODES=4
+          MERLIN_PROCS=2
+          MERLIN_CORES=8
+          MERLIN_GPUS=2
+          $(VLAUNCHER) python script.py
+
+The ``$(VLAUNCHER)`` variable would be substituted to ``flux run -N 4 -n 2 -c 8 -g 2``.
+
 User variables
 -------------------
 Variables defined by a specification file in the ``env`` section, as in this example:

diff --git a/merlin/spec/defaults.py b/merlin/spec/defaults.py
@@ -52,3 +52,11 @@
     "generate": {"cmd": "echo 'Insert sample-generating command here'"},
     "level_max_dirs": 25,
 }
+
+# Values of the form (step key to search for, default value if no step key found)
+VLAUNCHER_VARS = {
+    "MERLIN_NODES": ("nodes", 1),
+    "MERLIN_PROCS": ("procs", 1),
+    "MERLIN_CORES": ("cores per task", 1),
+    "MERLIN_GPUS": ("gpus", 0),
+}
diff --git a/merlin/spec/specification.py b/merlin/spec/specification.py
@@ -45,7 +45,7 @@
 from maestrowf.specification import YAMLSpecification
 
 from merlin.spec import all_keys, defaults
-from merlin.utils import repr_timedelta
+from merlin.utils import find_vlaunch_var, repr_timedelta
 
 
 LOG = logging.getLogger(__name__)
@@ -369,6 +369,17 @@ def process_spec_defaults(self):
         defaults.STUDY_STEP_RUN["shell"] = self.batch["shell"]
         for step in self.study:
             MerlinSpec.fill_missing_defaults(step["run"], defaults.STUDY_STEP_RUN)
+            # Insert VLAUNCHER specific variables if necessary
+            if "$(VLAUNCHER)" in step["run"]["cmd"]:
+                SHSET = ""
+                if "csh" in step["run"]["shell"]:
+                    SHSET = "set "
+                # We need to set default values for VLAUNCHER variables if they're not defined by the user
+                for vlaunch_var, vlaunch_val in defaults.VLAUNCHER_VARS.items():
+                    if not find_vlaunch_var(vlaunch_var.replace("MERLIN_", ""), step["run"]["cmd"], accept_no_matches=True):
+                        # Look for predefined nodes/procs/cores/gpus values in the step and default to those
+                        vlaunch_val = step["run"][vlaunch_val[0]] if vlaunch_val[0] in step["run"] else vlaunch_val[1]
+                        step["run"]["cmd"] = f"{SHSET}{vlaunch_var}={vlaunch_val}\n" + step["run"]["cmd"]
 
         # fill in missing merlin section defaults
         MerlinSpec.fill_missing_defaults(self.merlin, defaults.MERLIN["merlin"])

diff --git a/merlin/study/script_adapter.py b/merlin/study/script_adapter.py
@@ -42,12 +42,36 @@
 from maestrowf.utils import start_process
 
 from merlin.common.abstracts.enums import ReturnCode
-from merlin.utils import convert_timestring
+from merlin.utils import convert_timestring, find_vlaunch_var
 
 
 LOG = logging.getLogger(__name__)
 
 
+def setup_vlaunch(step_run: str, batch_type: str, gpu_config: bool) -> None:
+    """
+    Check for the VLAUNCHER keyword int the step run string, find
+    the MERLIN variables and configure VLAUNCHER.
+
+    :param `step_run`: the step.run command string
+    :param `batch_type`: the batch type string
+    :param `gpu_config`: bool to determin if gpus should be configured
+    :returns: None
+    """
+    if "$(VLAUNCHER)" in step_run["cmd"]:
+        step_run["cmd"] = step_run["cmd"].replace("$(VLAUNCHER)", "$(LAUNCHER)")
+
+        step_run["nodes"] = find_vlaunch_var("NODES", step_run["cmd"])
+        step_run["procs"] = find_vlaunch_var("PROCS", step_run["cmd"])
+        step_run["cores per task"] = find_vlaunch_var("CORES", step_run["cmd"])
+
+        if find_vlaunch_var("GPUS", step_run["cmd"]):
+            if gpu_config:
+                step_run["gpus"] = find_vlaunch_var("GPUS", step_run["cmd"])
+            else:
+                LOG.warning(f"Merlin does not yet have the ability to set GPUs per task with {batch_type}. Coming soon.")
+
+
 class MerlinLSFScriptAdapter(SlurmScriptAdapter):
     """
     A SchedulerScriptAdapter class for slurm blocking parallel launches,
@@ -156,6 +180,23 @@ def get_parallelize_command(self, procs, nodes=None, **kwargs):
 
         return " ".join(args)
 
+    def write_script(self, ws_path, step):
+        """
+        This will overwrite the write_script in method from Maestro's base ScriptAdapter
+        class but will eventually call it. This is necessary for the VLAUNCHER to work.
+
+        :param `ws_path`: the path to the workspace where we'll write the scripts
+        :param `step`: the Maestro StudyStep object containing info for our step
+        :returns: a tuple containing:
+                  - a boolean representing whether this step is to be scheduled or not
+                    - Merlin can ignore this
+                  - a path to the script for the cmd
+                  - a path to the script for the restart cmd
+        """
+        setup_vlaunch(step.run, "lsf", False)
+
+        return super().write_script(ws_path, step)
+
 
 class MerlinSlurmScriptAdapter(SlurmScriptAdapter):
     """
@@ -256,6 +297,23 @@ def get_parallelize_command(self, procs, nodes=None, **kwargs):
 
         return " ".join(args)
 
+    def write_script(self, ws_path, step):
+        """
+        This will overwrite the write_script in method from Maestro's base ScriptAdapter
+        class but will eventually call it. This is necessary for the VLAUNCHER to work.
+
+        :param `ws_path`: the path to the workspace where we'll write the scripts
+        :param `step`: the Maestro StudyStep object containing info for our step
+        :returns: a tuple containing:
+                  - a boolean representing whether this step is to be scheduled or not
+                    - Merlin can ignore this
+                  - a path to the script for the cmd
+                  - a path to the script for the restart cmd
+        """
+        setup_vlaunch(step.run, "slurm", False)
+
+        return super().write_script(ws_path, step)
+
 
 class MerlinFluxScriptAdapter(MerlinSlurmScriptAdapter):
     """
@@ -319,6 +377,23 @@ def time_format(self, val):
         """
         return convert_timestring(val, format_method="FSD")
 
+    def write_script(self, ws_path, step):
+        """
+        This will overwrite the write_script in method from Maestro's base ScriptAdapter
+        class but will eventually call it. This is necessary for the VLAUNCHER to work.
+
+        :param `ws_path`: the path to the workspace where we'll write the scripts
+        :param `step`: the Maestro StudyStep object containing info for our step
+        :returns: a tuple containing:
+                  - a boolean representing whether this step is to be scheduled or not
+                    - Merlin can ignore this
+                  - a path to the script for the cmd
+                  - a path to the script for the restart cmd
+        """
+        setup_vlaunch(step.run, "flux", True)
+
+        return super().write_script(ws_path, step)
+
 
 class MerlinScriptAdapter(LocalScriptAdapter):
     """

diff --git a/merlin/study/study.py b/merlin/study/study.py
@@ -127,7 +127,8 @@ def __init__(  # pylint: disable=R0913
 
     def _set_special_file_vars(self):
         """Setter for the orig, partial, and expanded file paths of a study."""
-        base_name = Path(self.filepath).stem
+        shortened_filepath = self.filepath.replace(".out", "").replace(".partial", "").replace(".expanded", "")
+        base_name = Path(shortened_filepath).stem
         self.special_vars["MERLIN_SPEC_ORIGINAL_TEMPLATE"] = os.path.join(
             self.info,
             base_name + ".orig.yaml",

diff --git a/merlin/utils.py b/merlin/utils.py
@@ -497,6 +497,26 @@ def contains_shell_ref(string):
     return False
 
 
+def find_vlaunch_var(vlaunch_var: str, step_cmd: str, accept_no_matches=False) -> str:
+    """
+    Given a variable used for VLAUNCHER and the step cmd value, find
+    the variable.
+
+    :param `vlaunch_var`: The name of the VLAUNCHER variable (without MERLIN_)
+    :param `step_cmd`: The string for the cmd of a step
+    :param `accept_no_matches`: If True, return None if we couldn't find the variable. Otherwise, raise an error.
+    :returns: the `vlaunch_var` variable or None
+    """
+    matches = list(re.findall(rf"^(?!#).*MERLIN_{vlaunch_var}=\d+", step_cmd, re.MULTILINE))
+
+    if matches:
+        return f"${{MERLIN_{vlaunch_var}}}"
+
+    if accept_no_matches:
+        return None
+    raise ValueError(f"VLAUNCHER used but could not find MERLIN_{vlaunch_var} in the step.")
+
+
 # Time utilities
 def convert_to_timedelta(timestr: Union[str, int]) -> timedelta:
     """Convert a timestring to a timedelta object.

diff --git a/tests/integration/test_definitions.py b/tests/integration/test_definitions.py
@@ -407,13 +407,81 @@ def define_tests():  # pylint: disable=R0914,R0915
         },
         "dry launch flux": {
             "cmds": f"{run} {flux} --dry --local --no-errors --vars N_SAMPLES=2 OUTPUT_PATH=./{OUTPUT_DIR}",
-            "conditions": StepFileHasRegex(
-                "runs",
-                "*/runs.slurm.sh",
-                "flux_test",
-                OUTPUT_DIR,
-                get_flux_cmd("flux", no_errors=True),
-            ),
+            "conditions": [
+                StepFileHasRegex(
+                    "runs",
+                    "*/runs.slurm.sh",
+                    "flux_test",
+                    OUTPUT_DIR,
+                    get_flux_cmd("flux", no_errors=True),
+                ),
+                ##################
+                # VLAUNCHER TESTS
+                ##################
+                StepFileHasRegex(
+                    "vlauncher_test",
+                    "vlauncher_test.slurm.sh",
+                    "flux_test",
+                    OUTPUT_DIR,
+                    r"flux run -n \$\{MERLIN_PROCS\} -N \$\{MERLIN_NODES\} -c \$\{MERLIN_CORES\}",
+                ),
+                StepFileHasRegex(
+                    "vlauncher_test_step_defaults",
+                    "vlauncher_test_step_defaults.slurm.sh",
+                    "flux_test",
+                    OUTPUT_DIR,
+                    r"MERLIN_GPUS=1",
+                ),
+                StepFileHasRegex(
+                    "vlauncher_test_step_defaults",
+                    "vlauncher_test_step_defaults.slurm.sh",
+                    "flux_test",
+                    OUTPUT_DIR,
+                    r"MERLIN_NODES=6",
+                ),
+                StepFileHasRegex(
+                    "vlauncher_test_step_defaults",
+                    "vlauncher_test_step_defaults.slurm.sh",
+                    "flux_test",
+                    OUTPUT_DIR,
+                    r"MERLIN_PROCS=3",
+                ),
+                StepFileHasRegex(
+                    "vlauncher_test_step_defaults",
+                    "vlauncher_test_step_defaults.slurm.sh",
+                    "flux_test",
+                    OUTPUT_DIR,
+                    r"MERLIN_CORES=2",
+                ),
+                StepFileHasRegex(
+                    "vlauncher_test_no_step_defaults",
+                    "vlauncher_test_no_step_defaults.slurm.sh",
+                    "flux_test",
+                    OUTPUT_DIR,
+                    r"MERLIN_GPUS=0",
+                ),
+                StepFileHasRegex(
+                    "vlauncher_test_no_step_defaults",
+                    "vlauncher_test_no_step_defaults.slurm.sh",
+                    "flux_test",
+                    OUTPUT_DIR,
+                    r"MERLIN_NODES=1",
+                ),
+                StepFileHasRegex(
+                    "vlauncher_test_no_step_defaults",
+                    "vlauncher_test_no_step_defaults.slurm.sh",
+                    "flux_test",
+                    OUTPUT_DIR,
+                    r"MERLIN_PROCS=1",
+                ),
+                StepFileHasRegex(
+                    "vlauncher_test_no_step_defaults",
+                    "vlauncher_test_no_step_defaults.slurm.sh",
+                    "flux_test",
+                    OUTPUT_DIR,
+                    r"MERLIN_CORES=1",
+                ),
+            ],
             "run type": "local",
         },
         "dry launch lsf": {

diff --git a/tests/integration/test_specs/flux_test.yaml b/tests/integration/test_specs/flux_test.yaml
@@ -33,6 +33,34 @@ study:
     depends: [runs*]
     task_queue: flux_test
 
+- description: step that uses vlauncher
+  name: vlauncher_test
+  run:
+    cmd: |
+      MERLIN_NODES=6
+      MERLIN_PROCS=3
+      MERLIN_CORES=2
+      $(VLAUNCHER) echo "step that uses vlauncher"
+    task_queue: flux_test
+
+- description: test vlauncher step defaults
+  name: vlauncher_test_step_defaults
+  run:
+    cmd: |
+      $(VLAUNCHER) echo "test vlauncher step defaults"
+    task_queue: flux_test
+    nodes: 6
+    procs: 3
+    cores per task: 2
+    gpus: 1
+
+- description: test vlauncher no step defaults
+  name: vlauncher_test_no_step_defaults
+  run:
+    cmd: |
+      $(VLAUNCHER) echo "test vlauncher no step defaults"
+    task_queue: flux_test
+
 global.parameters:
   STUDY:
     label: STUDY.%%