Merge pull request #460 from DiamondLightSource/issue-458
Add CLI flag to pass an existing output directory to httomo (instead of httomo creating one)
yousefmoazzam authored Oct 9, 2024
2 parents 069117b + 49f83e3 commit 7842bae
Showing 3 changed files with 281 additions and 90 deletions.
111 changes: 95 additions & 16 deletions docs/source/howto/run_httomo.rst
@@ -155,21 +155,37 @@ The :code:`run` command
Run a pipeline defined in YAML on input data.
Options:
--save-all Save intermediate datasets for all tasks in the
pipeline.
--gpu-id INTEGER The GPU ID of the device to use.
--reslice-dir DIRECTORY Directory for temporary files potentially needed
for reslicing (defaults to output dir)
--max-cpu-slices INTEGER Maximum number of slices to use for a block for
CPU-only sections (default: 64)
--max-memory TEXT Limit the amount of memory used by the pipeline
to the given memory (supports strings like 3.2G
or bytes)
--monitor TEXT Add monitor to the runner (can be given multiple
times). Available monitors: bench, summary
--monitor-output FILENAME File to store the monitoring output. Defaults to
'-', which denotes stdout
--help Show this message and exit.
--output-folder-name DIRECTORY Define the name of the output folder created
by HTTomo
--output-folder-path DIRECTORY Provide path to folder in which output
should be stored. This overrides the
`out_dir` argument
--save-all Save intermediate datasets for all tasks in
the pipeline.
--gpu-id INTEGER The GPU ID of the device to use.
--reslice-dir DIRECTORY Directory for temporary files potentially
needed for reslicing (defaults to output
dir)
--max-cpu-slices INTEGER Maximum number of slices to use for a block
for CPU-only sections (default: 64)
--max-memory TEXT Limit the amount of memory used by the
pipeline to the given memory (supports
strings like 3.2G or bytes)
--monitor TEXT Add monitor to the runner (can be given
multiple times). Available monitors: bench,
summary
--monitor-output FILENAME File to store the monitoring output.
Defaults to '-', which denotes stdout
--intermediate-format [hdf5] Write intermediate data in hdf5 format
--compress-intermediate Write intermediate data in chunked format
with BLOSC compression applied
--syslog-host TEXT Host of the syslog server
--syslog-port INTEGER Port on the host the syslog server is
running on
--frames-per-chunk INTEGER RANGE
Number of frames per-chunk in intermediate
data (0 = write as contiguous) [x>=0]
--help Show this message and exit.
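One non-obvious interaction in the options above: per the CLI code in this commit, passing :code:`--compress-intermediate` forces one frame per chunk regardless of any :code:`--frames-per-chunk` value. A minimal sketch of that rule (the helper name :code:`effective_frames_per_chunk` is hypothetical, introduced only for illustration):

```python
def effective_frames_per_chunk(frames_per_chunk: int, compress_intermediate: bool) -> int:
    """Mirror the rule in httomo's CLI: compression forces 1 frame per chunk."""
    if compress_intermediate:
        # BLOSC-compressed intermediate data is always written one frame per chunk
        return 1
    return frames_per_chunk

print(effective_frames_per_chunk(8, True))   # 1
print(effective_frames_per_chunk(8, False))  # 8
```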
Arguments
#########
@@ -210,17 +226,50 @@ directory created by HTTomo would be
Options/flags
#############

The :code:`run` command has 6 options/flags:
The :code:`run` command has 14 options/flags:

- :code:`--output-folder-name`
- :code:`--output-folder-path`
- :code:`--save-all`
- :code:`--gpu-id`
- :code:`--reslice-dir`
- :code:`--max-cpu-slices`
- :code:`--max-memory`
- :code:`--monitor`
- :code:`--monitor-output`
- :code:`--intermediate-format`
- :code:`--compress-intermediate`
- :code:`--syslog-host`
- :code:`--syslog-port`
- :code:`--frames-per-chunk`

.. _httomo-saving:

:code:`--output-folder-name`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

As described in the documentation for the :code:`OUT_DIR` argument, the default name of the
output directory created by HTTomo consists primarily of a timestamp. To give the output
directory a specific name instead of the default timestamp-based name, use the
:code:`--output-folder-name` flag.

For example, if the :code:`OUT_DIR` path provided was :code:`/home/myuser`, and
:code:`--output-folder-name=test-1` was given, then the absolute path of the output directory
created by HTTomo would be :code:`/home/myuser/test-1/`.

:code:`--output-folder-path`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

If one wishes HTTomo to place its output in an existing directory rather than create a new
one, the :code:`--output-folder-path` flag may be used.

For example, if :code:`--output-folder-path=/home/myuser/my-output` was given, then HTTomo
would not create an output directory and would instead place the results in
:code:`/home/myuser/my-output`.

.. note:: This flag overrides the :code:`OUT_DIR` argument and is mutually exclusive with
   :code:`--output-folder-name`. The given directory must exist prior to running HTTomo.

:code:`--save-all`
~~~~~~~~~~~~~~~~~~

@@ -237,6 +286,11 @@ However, there are certain cases such as debugging, where saving the output of
all methods to files in the output directory is beneficial. This flag is a quick
way of doing so.

:code:`--gpu-id`
~~~~~~~~~~~~~~~~

TODO

:code:`--reslice-dir`
~~~~~~~~~~~~~~~~~~~~~

@@ -377,3 +431,28 @@ analysis.
HTTomo writes monitoring results in CSV format, so any filepath given to the
:code:`--monitor-output` flag will produce a file containing the benchmarking
results in CSV.
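Since the monitoring output is plain CSV, it can be loaded for analysis with the standard library. A minimal sketch; the column names below are placeholders, as the actual headers HTTomo emits are not shown in this document:

```python
import csv
from io import StringIO

# Hypothetical monitoring CSV content with placeholder column names
csv_text = "method,total_time\nnormalize,1.25\nrecon,10.5\n"

with StringIO(csv_text) as f:
    rows = list(csv.DictReader(f))

# Each row is a dict keyed by the header line
print(rows[0]["method"])  # normalize
total = sum(float(row["total_time"]) for row in rows)
print(total)  # 11.75
```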

:code:`--intermediate-format`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

TODO

:code:`--compress-intermediate`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

TODO

:code:`--syslog-host`
~~~~~~~~~~~~~~~~~~~~~

TODO

:code:`--syslog-port`
~~~~~~~~~~~~~~~~~~~~~

TODO

:code:`--frames-per-chunk`
~~~~~~~~~~~~~~~~~~~~~~~~~~

TODO
187 changes: 134 additions & 53 deletions httomo/cli.py
@@ -14,6 +14,7 @@
from httomo.cli_utils import is_sweep_pipeline
from httomo.logger import setup_logger
from httomo.monitors import MONITORS_MAP, make_monitors
from httomo.runner.pipeline import Pipeline
from httomo.sweep_runner.param_sweep_runner import ParamSweepRunner
from httomo.transform_layer import TransformLayer
from httomo.yaml_checker import validate_yaml_config
@@ -61,11 +62,20 @@ def check(yaml_config: Path, in_data_file: Optional[Path] = None):
type=click.Path(exists=True, file_okay=False, writable=True, path_type=Path),
)
@click.option(
"--create-folder",
"--output-folder-name",
type=click.Path(exists=False, file_okay=False, writable=True, path_type=Path),
default=None,
help="Define the name of the output folder created by HTTomo",
)
@click.option(
"--output-folder-path",
type=click.Path(exists=True, file_okay=False, writable=True, path_type=Path),
default=None,
help=(
"Provide path to folder in which output should be stored. This overrides "
"the `out_dir` argument"
),
)
@click.option(
"--save-all",
is_flag=True,
@@ -144,7 +154,8 @@ def run(
in_data_file: Path,
yaml_config: Path,
out_dir: Path,
create_folder: Optional[Path],
output_folder_name: Optional[Path],
output_folder_path: Optional[Path],
gpu_id: int,
save_all: bool,
reslice_dir: Union[Path, None],
@@ -159,68 +170,43 @@
frames_per_chunk: int,
):
"""Run a pipeline defined in YAML on input data."""
if compress_intermediate:
frames_per_chunk = 1
httomo.globals.INTERMEDIATE_FORMAT = intermediate_format
httomo.globals.COMPRESS_INTERMEDIATE = compress_intermediate
httomo.globals.FRAMES_PER_CHUNK = frames_per_chunk
set_global_constants(
out_dir,
intermediate_format,
compress_intermediate,
frames_per_chunk,
max_cpu_slices,
syslog_host,
syslog_port,
output_folder_name,
output_folder_path,
)

does_contain_sweep = is_sweep_pipeline(yaml_config)
global_comm = MPI.COMM_WORLD
method_wrapper_comm = global_comm if not does_contain_sweep else MPI.COMM_SELF
httomo.globals.SYSLOG_SERVER = syslog_host
httomo.globals.SYSLOG_PORT = syslog_port

# Define httomo.globals.run_out_dir in all MPI processes
if create_folder is None:
httomo.globals.run_out_dir = out_dir.joinpath(
f"{datetime.now().strftime('%d-%m-%Y_%H_%M_%S')}_output"
)
else:
httomo.globals.run_out_dir = out_dir.joinpath(create_folder)

# Various initialisation tasks
if global_comm.rank == 0:
Path.mkdir(httomo.globals.run_out_dir, exist_ok=True)
copy(yaml_config, httomo.globals.run_out_dir)
setup_logger(Path(httomo.globals.run_out_dir))
initialise_output_directory(yaml_config)

# instantiate UiLayer class for pipeline build
init_UiLayer = UiLayer(yaml_config, in_data_file, comm=method_wrapper_comm)
pipeline = init_UiLayer.build_pipeline()
setup_logger(Path(httomo.globals.run_out_dir))

# perform transformations on pipeline
tr = TransformLayer(comm=method_wrapper_comm, save_all=save_all)
pipeline = tr.transform(pipeline)
pipeline = generate_pipeline(
in_data_file, yaml_config, save_all, method_wrapper_comm
)

if not does_contain_sweep:
# we use half the memory for blocks since we typically have inputs/output
memory_limit = transform_limit_str_to_bytes(max_memory) // 2

if max_cpu_slices < 1:
raise ValueError("max-cpu-slices must be greater or equal to 1")
httomo.globals.MAX_CPU_SLICES = max_cpu_slices

_set_gpu_id(gpu_id)

# Run the pipeline using Taskrunner, with temp dir or reslice dir
mon = make_monitors(monitor, global_comm)
ctx: AbstractContextManager = nullcontext(reslice_dir)
if reslice_dir is None:
ctx = tempfile.TemporaryDirectory()
with ctx as tmp_dir:
runner = TaskRunner(
pipeline,
Path(tmp_dir),
global_comm,
monitor=mon,
memory_limit_bytes=memory_limit,
)
runner.execute()
if mon is not None:
mon.write_results(monitor_output)
execute_high_throughput_run(
pipeline,
global_comm,
gpu_id,
max_memory,
monitor,
monitor_output,
reslice_dir,
)
else:
ParamSweepRunner(pipeline, global_comm).execute()
execute_sweep_run(pipeline, global_comm)


def _check_yaml(yaml_config: Path, in_data: Path):
@@ -261,3 +247,98 @@ def _set_gpu_id(gpu_id: int):

except ImportError:
pass # silently pass and run if the CPU pipeline is given


def set_global_constants(
out_dir: Path,
intermediate_format: str,
compress_intermediate: bool,
frames_per_chunk: int,
max_cpu_slices: int,
syslog_host: str,
syslog_port: int,
output_folder_name: Optional[Path],
output_folder_path: Optional[Path],
) -> None:
if compress_intermediate:
frames_per_chunk = 1
httomo.globals.INTERMEDIATE_FORMAT = intermediate_format
httomo.globals.COMPRESS_INTERMEDIATE = compress_intermediate
httomo.globals.FRAMES_PER_CHUNK = frames_per_chunk
httomo.globals.SYSLOG_SERVER = syslog_host
httomo.globals.SYSLOG_PORT = syslog_port

if output_folder_name is not None and output_folder_path is not None:
msg = (
"The flags `--output-folder-name` and `--output-folder-path` are mutually "
"exclusive, please use only one at most"
)
raise ValueError(msg)

if output_folder_name is None and output_folder_path is None:
httomo.globals.run_out_dir = out_dir.joinpath(
f"{datetime.now().strftime('%d-%m-%Y_%H_%M_%S')}_output"
)
if output_folder_name is not None:
httomo.globals.run_out_dir = out_dir.joinpath(output_folder_name)
if output_folder_path is not None:
httomo.globals.run_out_dir = output_folder_path

if max_cpu_slices < 1:
raise ValueError("max-cpu-slices must be greater or equal to 1")
httomo.globals.MAX_CPU_SLICES = max_cpu_slices


def initialise_output_directory(yaml_config: Path) -> None:
Path.mkdir(httomo.globals.run_out_dir, exist_ok=True)
copy(yaml_config, httomo.globals.run_out_dir)


def generate_pipeline(
in_data_file: Path, yaml_config: Path, save_all: bool, method_wrapper_comm: MPI.Comm
) -> Pipeline:
# instantiate UiLayer class for pipeline build
init_UiLayer = UiLayer(yaml_config, in_data_file, comm=method_wrapper_comm)
pipeline = init_UiLayer.build_pipeline()

# perform transformations on pipeline
tr = TransformLayer(comm=method_wrapper_comm, save_all=save_all)
pipeline = tr.transform(pipeline)

return pipeline


def execute_high_throughput_run(
pipeline: Pipeline,
global_comm: MPI.Comm,
gpu_id: int,
max_memory: str,
monitor: List[str],
monitor_output: TextIO,
reslice_dir: Union[Path, None],
) -> None:
# we use half the memory for blocks since we typically have inputs/output
memory_limit = transform_limit_str_to_bytes(max_memory) // 2

_set_gpu_id(gpu_id)

# Run the pipeline using Taskrunner, with temp dir or reslice dir
mon = make_monitors(monitor, global_comm)
ctx: AbstractContextManager = nullcontext(reslice_dir)
if reslice_dir is None:
ctx = tempfile.TemporaryDirectory()
with ctx as tmp_dir:
runner = TaskRunner(
pipeline,
Path(tmp_dir),
global_comm,
monitor=mon,
memory_limit_bytes=memory_limit,
)
runner.execute()
if mon is not None:
mon.write_results(monitor_output)


def execute_sweep_run(pipeline: Pipeline, global_comm: MPI.Comm) -> None:
ParamSweepRunner(pipeline, global_comm).execute()
