diff --git a/var/ramble/repos/builtin/applications/py-cosmoflow/application.py b/var/ramble/repos/builtin/applications/py-cosmoflow/application.py new file mode 100644 index 000000000..56699d264 --- /dev/null +++ b/var/ramble/repos/builtin/applications/py-cosmoflow/application.py @@ -0,0 +1,337 @@ +# Copyright 2022-2025 The Ramble Authors +# +# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +# https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +# <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +from ramble.appkit import * +import os + +import ruamel.yaml as yaml +import spack.util.spack_yaml as syaml +import ramble.util.yaml_generation + +from spack.util.path import canonicalize_path + + +class PyCosmoflow(ExecutableApplication): + """This is a an implementation of the CosmoFlow 3D convolutional neural + network for benchmarking. It is written in TensorFlow with the Keras API + and uses Horovod for distributed training. + """ + + name = "py-cosmoflow" + + tags("mlperf-hpc") + + default_config_string = "{default_config_value}" + + input_file( + "cosmoUniverse_mini", + url="https://portal.nersc.gov/project/dasrepo/cosmoflow-benchmark/cosmoUniverse_2019_05_4parE_tf_v2_mini.tar", + description="Cosmoflow Universe - Mini input", + ) + + input_file( + "cosmoUniverse", + url="https://portal.nersc.gov/project/dasrepo/cosmoflow-benchmark/cosmoUniverse_2019_05_4parE_tf_v2.tar", + description="Cosmoflow Universe - Main input", + ) + + input_file( + "mlperf-hpc", + url="https://github.com/mlcommons/hpc/archive/refs/heads/main.tar.gz", + description="MLPerf HPC Repo", + ) + + executable( + "execute", + "python {mlperf-hpc}/cosmoflow/train.py -d --rank-gpu {cosmoflow_config}", + use_mpi=True, + ) + + workload( + "cosmoUniverse_mini", + executables=["execute"], + inputs=["cosmoUniverse_mini", "mlperf-hpc"], + ) + + workload( + "cosmoUniverse", + executables=["execute"], + inputs=["cosmoUniverse", "mlperf-hpc"], + ) + + workload_group( + 
"all_workloads", workloads=["cosmoUniverse_mini", "cosmoUniverse"] + ) + + workload_variable( + "dockerfile_path", + default="{mlperf-hpc}/cosmoflow/builds/Dockerfile", + description="Dockerfile for cosmoflow from the MLPerf-HPC repo", + workload_group="all_workloads", + ) + + workload_variable( + "docker_tag_name", + default="cosmoflow", + description="Name of docker image tag", + workload_group="all_workloads", + ) + + workload_variable( + "docker_tag_version", + default="1.0", + description="Version of docker image tag", + workload_group="all_workloads", + ) + + workload_variable( + "cosmoflow_config", + default=os.path.join("{experiment_run_dir}", "cosmo.yaml"), + description="Name of generated input for cosmoflow", + workload_group="all_workloads", + ) + + workload_variable( + "data.data_dir", + default="{cosmoUniverse_mini}", + description="Cosmoflow Data Directory", + workload_group="all_workloads", + ) + + workload_variable( + "data.n_train", + default="1024", + description="Number of training data sets", + workload="cosmoUniverse_mini", + ) + + workload_variable( + "data.n_valid", + default="1024", + description="Number of valid data sets", + workload="cosmoUniverse_mini", + ) + + workload_variable( + "data.n_train", + default="524288", + description="Number of training data sets", + workload="cosmoUniverse", + ) + + workload_variable( + "data.n_valid", + default="65536", + description="Number of valid data sets", + workload="cosmoUniverse", + ) + + workload_variable( + "mlperf.org", + default="ramble", + description="Organization for reporting MLPerf results", + workload_group="all_workloads", + ) + + workload_variable( + "mlperf.division", + default="experiments", + description="Division for reporting MLPerf results", + workload_group="all_workloads", + ) + + workload_variable( + "mlperf.status", + default="unknown", + description="Cluster status for reporting MLPerf results", + workload_group="all_workloads", + ) + + workload_variable( + 
"mlperf.platform", + default="unknown", + description="Platform name for reporting MLPerf results", + workload_group="all_workloads", + ) + + workload_variable( + "output_dir", + default="{experiment_run_dir}", + description="Experiment output directory", + workload_group="all_workloads", + ) + + workload_variable( + "cosmoflow_base_config", + default=os.path.join( + "{mlperf-hpc}", "cosmoflow", "configs", "cosmo.yaml" + ), + description="Base configuration file to generate cosmoflow inputs from", + workload_group="all_workloads", + ) + + figure_of_merit( + "Best Epoch", + fom_regex=r".*INFO\s+epoch: (?P[0-9]+)", + group_name="idx", + units="", + ) + + figure_of_merit( + "Best Epoch Loss", + fom_regex=r".*INFO\s+loss: (?P[0-9\.]+)", + group_name="loss", + units="", + ) + + figure_of_merit( + "Best Epoch LR", + fom_regex=r".*INFO\s+lr: (?P[0-9\.]+)", + group_name="lr", + units="", + ) + + figure_of_merit( + "Best Epoch Mean Absolute Error", + fom_regex=r".*INFO\s+mean_absolute_error: (?P[0-9\.]+)", + group_name="abs_err", + units="", + ) + + figure_of_merit( + "Best Epoch Time", + fom_regex=r".*INFO\s+time: (?P