Skip to content

Commit

Permalink
Add slurm system setup
Browse files Browse the repository at this point in the history
  • Loading branch information
satyaog committed Oct 3, 2024
1 parent 3e45407 commit f75e3a5
Show file tree
Hide file tree
Showing 14 changed files with 669 additions and 125 deletions.
7 changes: 6 additions & 1 deletion config/base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ _torchvision:
--loader: pytorch
--data: "{milabench_data}/FakeImageNet"


_torchvision_ddp:
inherits: _defaults
definition: ../benchmarks/torchvision_ddp
Expand Down Expand Up @@ -113,6 +114,7 @@ _timm:
--dataset: "FakeImageNet"
--workers: "auto({n_worker}, 8)"


_accelerate_opt:
inherits: _defaults
tags:
Expand Down Expand Up @@ -149,6 +151,7 @@ _accelerate_opt:
use_deepspeed: true
num_machines: 1


fp16:
inherits: _flops

Expand Down Expand Up @@ -388,6 +391,7 @@ brax:
--num-minibatches: 32
--num-envs: 8192


_diffusion:
inherits: _defaults
definition: ../benchmarks/diffusion
Expand Down Expand Up @@ -530,11 +534,11 @@ _llm:
definition: ../benchmarks/llm
install_group: torch


llm-lora-single:
inherits: _llm
plan:
method: per_gpu

argv:
"{milabench_code}/recipes/lora_finetune_single_device.py": true
--config: "{milabench_code}/configs/llama3_8B_lora_single_device.yaml"
Expand Down Expand Up @@ -596,6 +600,7 @@ llm-lora-ddp-nodes:
requires_capabilities:
- "len(nodes) >= ${num_machines}"


llm-lora-mp-gpus:
inherits: _llm
plan:
Expand Down
13 changes: 13 additions & 0 deletions config/cloud-multinodes-system.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,16 @@ system:
size: Standard_NV72ads_A10_v5
location: eastus2
disk_size: 512
slurm__a100_x2:
address: localhost
bashrc_path: "{bashrc_path}"
remote_workdir: "scratch/cov-{job_uuid}-workdir"
use_srun: null
options:
ntasks-per-node: 1
gpus-per-task: a100l:2
cpus-per-task: 12
time: "3:0:0"
mem: 500000
partition: short-unkillable
nodelist: cn-g[001-029]
24 changes: 24 additions & 0 deletions config/cloud-system.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,27 @@ system:
size: Standard_NV72ads_A10_v5
location: eastus2
disk_size: 512
slurm__a100_x1:
address: localhost
bashrc_path: "{bashrc_path}"
remote_workdir: "scratch/cov-{job_uuid}-workdir"
use_srun: null
options:
ntasks-per-node: 1
gpus-per-task: a100l:1
cpus-per-task: 6
time: "3:0:0"
mem: 32000
partition: unkillable
slurm__a100_x4:
address: localhost
bashrc_path: "{bashrc_path}"
remote_workdir: "scratch/cov-{job_uuid}-workdir"
use_srun: null
options:
ntasks-per-node: 1
gpus-per-task: a100l:4
cpus-per-task: 24
time: "3:0:0"
mem: 1000000
partition: short-unkillable
10 changes: 10 additions & 0 deletions config/examples/cloud-multinodes-system.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,13 @@ system:
volume_size: 8
region: us-east-2
state_id: 71669879043a3864225aabb94f91a2d4
slurm:
address: localhost
bashrc_path: "{bashrc_path}"
remote_workdir: "scratch/cov-{job_uuid}-workdir"
use_srun: null
options:
ntasks-per-node: 1
cpus-per-task: 1
time: "0:30:0"
mem: 1000
11 changes: 11 additions & 0 deletions config/examples/cloud-system.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,14 @@ system:
instance_type: t2.micro
volume_size: 8
region: us-east-2
slurm:
# covalent-slurm-plugin args
address: localhost
bashrc_path: "{bashrc_path}"
remote_workdir: "scratch/cov-{job_uuid}-workdir"
use_srun: null
options:
ntasks-per-node: 1
cpus-per-task: 1
time: "0:30:0"
mem: 1000
89 changes: 85 additions & 4 deletions docs/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ Create a cloud system configuration
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Add a ``cloud_profiles`` section to the ``system`` configuration which lists the
supported cloud profiles.
supported cloud and slurm profiles.

.. note::

Expand Down Expand Up @@ -150,14 +150,95 @@ Run milabench on the cloud
^^^^^^^^^^^^^^^^^^^^^^^^^^

1. | Initialize the cloud instances
| ``milabench cloud --system {{SYSTEM_CONFIG.YAML}} --setup --run-on {{PROFILE}} >{{SYSTEM_CLOUD_CONFIG.YAML}}``
| ``milabench cloud --setup --system {{SYSTEM_CONFIG.YAML}} --run-on {{PROFILE}} >{{SYSTEM_CLOUD_CONFIG.YAML}}``
2. | Prepare, install and run milabench
| ``milabench [prepare|install|run] --system {{SYSTEM_CLOUD_CONFIG.YAML}}``
3. | Destroy the cloud instances
| ``milabench teardown --system {{SYSTEM_CLOUD_CONFIG.YAML}} --run-on {{PROFILE}}``
| ``milabench cloud --teardown --system {{SYSTEM_CLOUD_CONFIG.YAML}} --run-on {{PROFILE}}``
| or
| ``milabench teardown --system {{SYSTEM_CLOUD_CONFIG.YAML}} --run-on {{PLATFORM}} --all``
| ``milabench cloud --teardown --system {{SYSTEM_CLOUD_CONFIG.YAML}} --run-on {{PLATFORM}} --all``
| to destroy not just a single cloud instance but all instances on a
specified platform that were created from the current local machine


Use milabench on slurm
~~~~~~~~~~~~~~~~~~~~~~


Create a slurm system configuration
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Add a ``cloud_profiles`` section to the ``system`` configuration which lists the
supported cloud and slurm profiles.

.. note::

Nodes that should be created on the cloud should use the ``1.1.1.1`` IP
address placeholder. Other IP addresses will be used as-is and no cloud
instance will be created for those nodes.

.. note::

A cloud profile entry needs to start with a covalent plugin name (e.g. ``slurm``). To
define multiple profiles on the same cloud platform, use the form
``{PLATFORM}__{PROFILE_NAME}`` (e.g. ``slurm__profile``). All cloud profile
attributes will be passed as-is as arguments to the target covalent plugin.

.. code-block:: yaml
system:
nodes:
- name: manager
# Use 1.1.1.1 as an ip placeholder
ip: 1.1.1.1
main: true
user: <username>
- name: node1
ip: 1.1.1.1
main: false
user: <username>
# Cloud instances profiles
cloud_profiles:
# The cloud platform to use in the form of {PLATFORM} or
# {PLATFORM}__{PROFILE_NAME}
slurm:
username: username
address: localhost
ssh_key_file: ssh_key_file
# bashrc_path will be replaced by the content of
# milabench/scripts/covalent/covalent_bashrc.sh
bashrc_path: "{bashrc_path}"
# job_uuid will be replaced by the generated job's uuid
remote_workdir: "cov-{job_uuid}-workdir"
use_srun: null
options:
ntasks-per-node: 1
cpus-per-task: 1
time: "0:30:0"
mem: 1000
Run milabench on slurm
^^^^^^^^^^^^^^^^^^^^^^

1. | Initialize the slurm instances
| ``milabench cloud --setup --system {{SYSTEM_CONFIG.YAML}} --run-on {{PROFILE}} >{{SYSTEM_SLURM_CONFIG.YAML}}``
2. | Prepare, install and run milabench
| ``milabench [prepare|install|run] --system {{SYSTEM_SLURM_CONFIG.YAML}}``
3. | Destroy the slurm instances
| ``milabench cloud --teardown --system {{SYSTEM_SLURM_CONFIG.YAML}} --run-on {{PROFILE}}``
.. note::

Because milabench's path is expected to be the same on the local machine and
the remote machine, it's currently necessary to run the commands from the
slurm cluster. As the ``milabench cloud --[setup|teardown]`` commands require
a covalent server to run, and to avoid overloading the login nodes' resources,
it's preferable to request a cpu compute node which will host the covalent
server. An allocation with minimal resources like ``--nodes 1 --cpus-per-task
1 --mem 2000`` should be enough.
51 changes: 29 additions & 22 deletions milabench/cli/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,29 +60,33 @@ def manage_cloud(pack, run_on, action="setup"):
"private_ip":(lambda v: ("internal_ip",v)),
"username":(lambda v: ("user",v)),
"ssh_key_file":(lambda v: ("key",v)),
# "env":(lambda v: ("env",[".", v, ";", "conda", "activate", "milabench", "&&"])),
"env":(lambda v: ("env",[".", v, "milabench", "&&"])),
"slurm_job_id":(lambda v: ("slurm_job_id",v)),
}
plan_params = deepcopy(pack.config["system"]["cloud_profiles"][run_on])
plan_params = pack.config["system"]["cloud_profiles"][run_on]
run_on, *profile = run_on.split("__")
profile = profile[0] if profile else ""
default_state_prefix = profile or run_on
default_state_id = "_".join((pack.config["hash"][:6], blabla()))

local_base = pack.dirs.base.absolute()
local_data_dir = _get_common_dir(ROOT_FOLDER.parent, local_base.parent)
if local_data_dir is None:
local_data_dir = local_base.parent
remote_data_dir = XPath("/data") / local_data_dir.name
plan_params["state_prefix"] = plan_params.get("state_prefix", default_state_prefix)
plan_params["state_id"] = plan_params.get("state_id", default_state_id)
plan_params["keep_alive"] = None

# local_base = pack.dirs.base.absolute()
# local_data_dir = _get_common_dir(ROOT_FOLDER.parent, local_base.parent)
# if local_data_dir is None:
# local_data_dir = local_base.parent
# remote_data_dir = XPath("/data") / local_data_dir.name

plan_params_copy = deepcopy(plan_params)

nodes = iter(enumerate(pack.config["system"]["nodes"]))
for i, n in nodes:
if n["ip"] != "1.1.1.1":
if n["ip"] != "1.1.1.1" and action == _SETUP:
continue

plan_params["state_prefix"] = plan_params.get("state_prefix", default_state_prefix)
plan_params["state_id"] = plan_params.get("state_id", default_state_id)
plan_params["cluster_size"] = max(len(pack.config["system"]["nodes"]), i + 1)
plan_params["keep_alive"] = None
plan_params_copy["cluster_size"] = max(len(pack.config["system"]["nodes"]), i + 1)

import milabench.scripts.covalent as cv

Expand All @@ -101,17 +105,17 @@ def manage_cloud(pack, run_on, action="setup"):
"-m", cv.__name__,
run_on,
f"--{action}",
*_flatten_cli_args(**plan_params)
*_flatten_cli_args(**plan_params_copy)
]
if action == _SETUP:
cmd += [
"--",
"bash", "-c",
_or_sudo(f"mkdir -p '{local_data_dir.parent}'") +
" && " + _or_sudo(f"chmod a+rwX '{local_data_dir.parent}'") +
f" && mkdir -p '{remote_data_dir}'"
f" && ln -sfT '{remote_data_dir}' '{local_data_dir}'"
]
# if action == _SETUP:
# cmd += [
# "--",
# "bash", "-c",
# _or_sudo(f"mkdir -p '{local_data_dir.parent}'") +
# " && " + _or_sudo(f"chmod a+rwX '{local_data_dir.parent}'") +
# f" && mkdir -p '{remote_data_dir}'"
# f" && ln -sfT '{remote_data_dir}' '{local_data_dir}'"
# ]
p = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
Expand Down Expand Up @@ -155,6 +159,9 @@ def manage_cloud(pack, run_on, action="setup"):
stderr
)

if action == _TEARDOWN:
break

return pack.config["system"]


Expand Down
11 changes: 10 additions & 1 deletion milabench/commands/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,11 @@ def _find_node_config(self) -> Dict:
return n
return {}

def _load_env(self, node):
if node.get("env", None):
return node["env"]
return []

def is_local(self):
localnode = self.pack.config["system"]["self"]

Expand Down Expand Up @@ -484,7 +489,7 @@ def _argv(self, **kwargs) -> List:
argv.append(f"-p{self.port}")
argv.append(host)

return argv # + ["env", "-i"]
return argv + self._load_env(node)


class SCPCommand(SSHCommand, CmdCommand):
Expand All @@ -505,6 +510,10 @@ def __init__(
self.src = src
self.dest = dest if dest is not None else self.src

def _load_env(self, node):
del node
return []

def _argv(self, **kwargs) -> List:
argv = super()._argv(**kwargs)

Expand Down
Loading

0 comments on commit f75e3a5

Please sign in to comment.