Skip to content

Commit

Permalink
Merge pull request #847 from douglasjacobsen/cosmoflow-app
Browse files Browse the repository at this point in the history
Add application definition for Cosmoflow
  • Loading branch information
douglasjacobsen authored Jan 29, 2025
2 parents 44914ea + 570562b commit 6d7ca63
Showing 1 changed file with 337 additions and 0 deletions.
337 changes: 337 additions & 0 deletions var/ramble/repos/builtin/applications/py-cosmoflow/application.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,337 @@
# Copyright 2022-2025 The Ramble Authors
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

from ramble.appkit import *
import os

import ruamel.yaml as yaml
import spack.util.spack_yaml as syaml
import ramble.util.yaml_generation

from spack.util.path import canonicalize_path


class PyCosmoflow(ExecutableApplication):
"""This is a an implementation of the CosmoFlow 3D convolutional neural
network for benchmarking. It is written in TensorFlow with the Keras API
and uses Horovod for distributed training.
"""

name = "py-cosmoflow"

tags("mlperf-hpc")

default_config_string = "{default_config_value}"

input_file(
"cosmoUniverse_mini",
url="https://portal.nersc.gov/project/dasrepo/cosmoflow-benchmark/cosmoUniverse_2019_05_4parE_tf_v2_mini.tar",
description="Cosmoflow Universe - Mini input",
)

input_file(
"cosmoUniverse",
url="https://portal.nersc.gov/project/dasrepo/cosmoflow-benchmark/cosmoUniverse_2019_05_4parE_tf_v2.tar",
description="Cosmoflow Universe - Main input",
)

input_file(
"mlperf-hpc",
url="https://github.com/mlcommons/hpc/archive/refs/heads/main.tar.gz",
description="MLPerf HPC Repo",
)

executable(
"execute",
"python {mlperf-hpc}/cosmoflow/train.py -d --rank-gpu {cosmoflow_config}",
use_mpi=True,
)

workload(
"cosmoUniverse_mini",
executables=["execute"],
inputs=["cosmoUniverse_mini", "mlperf-hpc"],
)

workload(
"cosmoUniverse",
executables=["execute"],
inputs=["cosmoUniverse", "mlperf-hpc"],
)

workload_group(
"all_workloads", workloads=["cosmoUniverse_mini", "cosmoUniverse"]
)

workload_variable(
"dockerfile_path",
default="{mlperf-hpc}/cosmoflow/builds/Dockerfile",
description="Dockerfile for cosmoflow from the MLPerf-HPC repo",
workload_group="all_workloads",
)

workload_variable(
"docker_tag_name",
default="cosmoflow",
description="Name of docker image tag",
workload_group="all_workloads",
)

workload_variable(
"docker_tag_version",
default="1.0",
description="Version of docker image tag",
workload_group="all_workloads",
)

workload_variable(
"cosmoflow_config",
default=os.path.join("{experiment_run_dir}", "cosmo.yaml"),
description="Name of generated input for cosmoflow",
workload_group="all_workloads",
)

workload_variable(
"data.data_dir",
default="{cosmoUniverse_mini}",
description="Cosmoflow Data Directory",
workload_group="all_workloads",
)

workload_variable(
"data.n_train",
default="1024",
description="Number of training data sets",
workload="cosmoUniverse_mini",
)

workload_variable(
"data.n_valid",
default="1024",
description="Number of valid data sets",
workload="cosmoUniverse_mini",
)

workload_variable(
"data.n_train",
default="524288",
description="Number of training data sets",
workload="cosmoUniverse",
)

workload_variable(
"data.n_valid",
default="65536",
description="Number of valid data sets",
workload="cosmoUniverse",
)

workload_variable(
"mlperf.org",
default="ramble",
description="Organization for reporting MLPerf results",
workload_group="all_workloads",
)

workload_variable(
"mlperf.division",
default="experiments",
description="Division for reporting MLPerf results",
workload_group="all_workloads",
)

workload_variable(
"mlperf.status",
default="unknown",
description="Cluster status for reporting MLPerf results",
workload_group="all_workloads",
)

workload_variable(
"mlperf.platform",
default="unknown",
description="Platform name for reporting MLPerf results",
workload_group="all_workloads",
)

workload_variable(
"output_dir",
default="{experiment_run_dir}",
description="Experiment output directory",
workload_group="all_workloads",
)

workload_variable(
"cosmoflow_base_config",
default=os.path.join(
"{mlperf-hpc}", "cosmoflow", "configs", "cosmo.yaml"
),
description="Base configuration file to generate cosmoflow inputs from",
workload_group="all_workloads",
)

figure_of_merit(
"Best Epoch",
fom_regex=r".*INFO\s+epoch: (?P<idx>[0-9]+)",
group_name="idx",
units="",
)

figure_of_merit(
"Best Epoch Loss",
fom_regex=r".*INFO\s+loss: (?P<loss>[0-9\.]+)",
group_name="loss",
units="",
)

figure_of_merit(
"Best Epoch LR",
fom_regex=r".*INFO\s+lr: (?P<lr>[0-9\.]+)",
group_name="lr",
units="",
)

figure_of_merit(
"Best Epoch Mean Absolute Error",
fom_regex=r".*INFO\s+mean_absolute_error: (?P<abs_err>[0-9\.]+)",
group_name="abs_err",
units="",
)

figure_of_merit(
"Best Epoch Time",
fom_regex=r".*INFO\s+time: (?P<time>[0-9\.]+)",
group_name="time",
units="s",
)

figure_of_merit(
"Best Epoch Val Loss",
fom_regex=r".*INFO\s+val_loss: (?P<val_loss>[0-9\.]+)",
group_name="val_loss",
units="",
)

figure_of_merit(
"Best Epoch Val Mean Absolute Error",
fom_regex=r".*INFO\s+val_mean_absolute_error: (?P<val_error>[0-9\.]+)",
group_name="val_error",
units="",
)

figure_of_merit(
"Total epoch time",
fom_regex=r".*INFO\s+Total epoch time: (?P<total_time>[0-9\.]+)",
group_name="total_time",
units="s",
)

figure_of_merit(
"Mean epoch time",
fom_regex=r".*INFO\s+Mean epoch time: (?P<mean_time>[0-9\.]+)",
group_name="mean_time",
units="s",
)

register_phase(
"ingest_default_configs", pipeline="setup", run_after=["get_inputs"]
)

def _ingest_default_configs(self, workspace, app_inst):
"""Read config options from nemo_base_config, and define any that were
not defined in the input ramble.yaml or workload definition."""

base_config = get_file_path(
canonicalize_path(
self.expander.expand_var_name("cosmoflow_base_config")
),
workspace,
)

# Avoid problems with missing base config files
if not os.path.exists(base_config):
return

config_data = ramble.util.yaml_generation.read_config_file(base_config)

for option_name in ramble.util.yaml_generation.all_config_options(
config_data
):
if option_name not in self.variables:
value = ramble.util.yaml_generation.get_config_value(
config_data, option_name
)

self.define_variable(option_name, value)

register_phase(
"write_config", pipeline="setup", run_after=["make_experiments"]
)

def _write_config(self, workspace, app_inst):

base_config = get_file_path(
canonicalize_path(
self.expander.expand_var_name("cosmoflow_base_config")
),
workspace,
)

if not os.path.exists(base_config):
return

config_data = ramble.util.yaml_generation.read_config_file(base_config)

ramble.util.yaml_generation.apply_default_config_values(
config_data, self, self.default_config_string
)

# Set config options in config_data
for var_name in self.variables:
if "." in var_name and len(var_name.split(".")) > 1:
var_val = self.expander.expand_var(
self.expander.expansion_str(var_name), typed=True
)

# Convert any invalid tuples back to their default strings.
if isinstance(var_val, tuple):
var_val = self.expander.expand_var(
self.expander.expansion_str(var_name)
)
elif isinstance(var_val, list):
for i in range(0, len(var_val)):
var_val[i] = self.expander.expand_var(
var_val[i], typed=True
)

ramble.util.yaml_generation.set_config_value(
config_data, var_name, var_val, force=True
)

# Set output dir:
ramble.util.yaml_generation.set_config_value(
config_data,
"output_dir",
self.expander.expand_var_name("output_dir"),
)

# Create string of yaml data.
config_str = yaml.dump(
config_data,
default_flow_style=False,
width=syaml.maxint,
Dumper=syaml.OrderedLineDumper,
)

# Write out experiment config
config_path = canonicalize_path(
self.expander.expand_var("{cosmoflow_config}"),
)

with open(config_path, "w+") as f:
f.write(config_str)

0 comments on commit 6d7ca63

Please sign in to comment.