Skip to content

Commit

Permalink
Promote Build Device Option to Enum (#527)
Browse files Browse the repository at this point in the history
Replacing instances of ["CPU","GPU"] with a `Device` Enum.

[ reviewed by @MattToast ]
[ committed by @amandarichardsonn ]
  • Loading branch information
amandarichardsonn authored Mar 28, 2024
1 parent 13d0302 commit fa0da2c
Show file tree
Hide file tree
Showing 9 changed files with 105 additions and 85 deletions.
3 changes: 3 additions & 0 deletions doc/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ To be released at some future point in time

Description

- Promote device options to an Enum
- Update telemetry monitor, add telemetry collectors
- Add method to specify node features for a Slurm job
- Colo Orchestrator setup now blocks application start until setup finished
Expand All @@ -33,6 +34,7 @@ Description

Detailed Notes

- Promote devices to a dedicated Enum type throughout the SmartSim code base.
- Update the telemetry monitor to enable retrieval of metrics on a scheduled
interval. Switch basic experiment tracking telemetry to default to on. Add
database metric collectors. Improve telemetry monitor logging. Create
Expand Down Expand Up @@ -70,6 +72,7 @@ Detailed Notes
- Remove previously deprecated behavior present in test suite on machines with
Slurm and Open MPI. (SmartSim-PR520_)

.. _SmartSim-PR498: https://github.com/CrayLabs/SmartSim/pull/498
.. _SmartSim-PR460: https://github.com/CrayLabs/SmartSim/pull/460
.. _SmartSim-PR512: https://github.com/CrayLabs/SmartSim/pull/512
.. _SmartSim-PR529: https://github.com/CrayLabs/SmartSim/pull/529
Expand Down
33 changes: 15 additions & 18 deletions smartsim/_core/_cli/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
VersionConflictError,
Versioner,
)
from smartsim._core._install.builder import BuildError
from smartsim._core._install.builder import BuildError, Device
from smartsim._core.config import CONFIG
from smartsim._core.utils.helpers import installed_redisai_backends
from smartsim.error import SSConfigError
Expand All @@ -54,8 +54,6 @@
# NOTE: all smartsim modules need full paths as the smart cli
# may be installed into a different directory.


_TDeviceStr = t.Literal["cpu", "gpu"]
_TPinningStr = t.Literal["==", "!=", ">=", ">", "<=", "<", "~="]


Expand Down Expand Up @@ -134,7 +132,7 @@ def build_database(
def build_redis_ai(
build_env: BuildEnv,
versions: Versioner,
device: _TDeviceStr,
device: Device,
use_torch: bool = True,
use_tf: bool = True,
use_onnx: bool = False,
Expand All @@ -143,7 +141,7 @@ def build_redis_ai(
verbose: bool = False,
) -> None:
# make sure user isn't trying to do something silly on MacOS
if build_env.PLATFORM == "darwin" and device == "gpu":
if build_env.PLATFORM == "darwin" and device == Device.GPU:
raise BuildError("SmartSim does not support GPU on MacOS")

# decide which runtimes to build
Expand All @@ -154,7 +152,7 @@ def build_redis_ai(
["ONNX", versions.ONNX, color_bool(use_onnx)],
]
print(tabulate(backends_table, tablefmt="fancy_outline"), end="\n\n")
print(f"Building for GPU support: {color_bool(device == 'gpu')}\n")
print(f"Building for GPU support: {color_bool(device == Device.GPU)}\n")

if not check_backends_install():
sys.exit(1)
Expand Down Expand Up @@ -195,7 +193,7 @@ def build_redis_ai(
else:
# get the build environment, update with CUDNN env vars
# if present and building for GPU, otherwise warn the user
if device == "gpu":
if device == Device.GPU:
gpu_env = build_env.get_cudnn_env()
cudnn_env_vars = [
"CUDNN_LIBRARY",
Expand Down Expand Up @@ -226,18 +224,16 @@ def build_redis_ai(
logger.info("ML Backends and RedisAI build complete!")


def check_py_torch_version(versions: Versioner, device_in: _TDeviceStr = "cpu") -> None:
def check_py_torch_version(versions: Versioner, device: Device = Device.CPU) -> None:
"""Check Python environment for Torch installation"""

device = device_in.lower()
if BuildEnv.is_macos():
if device == "gpu":
if device == Device.GPU:
raise BuildError("SmartSim does not support GPU on MacOS")
device_suffix = ""
else: # linux
if device == "cpu":
if device == Device.CPU:
device_suffix = versions.TORCH_CPU_SUFFIX
elif device == "gpu":
elif device == Device.GPU:
device_suffix = versions.TORCH_CUDA_SUFFIX
else:
raise BuildError("Unrecognized device requested")
Expand All @@ -261,7 +257,9 @@ def check_py_torch_version(versions: Versioner, device_in: _TDeviceStr = "cpu")
"Torch version not found in python environment. "
"Attempting to install via `pip`"
)
wheel_device = device if device == "cpu" else device_suffix.replace("+", "")
wheel_device = (
device.value if device == Device.CPU else device_suffix.replace("+", "")
)
pip(
"install",
"--extra-index-url",
Expand Down Expand Up @@ -363,8 +361,7 @@ def execute(
) -> int:
verbose = args.v
keydb = args.keydb
device: _TDeviceStr = args.device

device = Device(args.device.lower())
# torch and tf build by default
pt = not args.no_pt # pylint: disable=invalid-name
tf = not args.no_tf # pylint: disable=invalid-name
Expand Down Expand Up @@ -453,8 +450,8 @@ def configure_parser(parser: argparse.ArgumentParser) -> None:
parser.add_argument(
"--device",
type=str.lower,
default="cpu",
choices=["cpu", "gpu"],
default=Device.CPU.value,
choices=[device.value for device in Device],
help="Device to build ML runtimes for",
)
parser.add_argument(
Expand Down
33 changes: 18 additions & 15 deletions smartsim/_core/_cli/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@

from smartsim import Experiment
from smartsim._core._cli.utils import SMART_LOGGER_FORMAT
from smartsim._core._install.builder import Device
from smartsim._core.utils.helpers import installed_redisai_backends
from smartsim.log import get_logger

Expand All @@ -61,9 +62,6 @@
_TemporaryDirectory = tempfile.TemporaryDirectory


_TCapitalDeviceStr = t.Literal["CPU", "GPU"]


class _VerificationTempDir(_TemporaryDirectory):
"""A Temporary directory to be used as a context manager that will only
clean itself up if no error is raised within its context
Expand All @@ -88,7 +86,7 @@ def execute(
simple experiment
"""
backends = installed_redisai_backends()
device: _TCapitalDeviceStr = args.device.upper()
device: Device = Device(args.device)
try:
with contextlib.ExitStack() as ctx:
temp_dir = ctx.enter_context(_VerificationTempDir(dir=os.getcwd()))
Expand All @@ -98,7 +96,7 @@ def execute(
"SR_LOG_FILE", os.path.join(temp_dir, "smartredis.log")
),
}
if device == "GPU":
if device == Device.GPU:
validate_env["CUDA_VISIBLE_DEVICES"] = "0"
ctx.enter_context(_env_vars_set_to(validate_env))
test_install(
Expand Down Expand Up @@ -136,16 +134,16 @@ def configure_parser(parser: argparse.ArgumentParser) -> None:
parser.add_argument(
"--device",
type=str.lower,
default="cpu",
choices=["cpu", "gpu"],
default=Device.CPU.value,
choices=[device.value for device in Device],
help="Device to test the ML backends against",
)


def test_install(
location: str,
port: t.Optional[int],
device: _TCapitalDeviceStr,
device: Device,
with_tf: bool,
with_pt: bool,
with_onnx: bool,
Expand Down Expand Up @@ -214,7 +212,7 @@ def _find_free_port() -> int:
return int(port)


def _test_tf_install(client: Client, tmp_dir: str, device: _TCapitalDeviceStr) -> None:
def _test_tf_install(client: Client, tmp_dir: str, device: Device) -> None:
recv_conn, send_conn = mp.Pipe(duplex=False)
# Build the model in a subproc so that keras does not hog the gpu
proc = mp.Process(target=_build_tf_frozen_model, args=(send_conn, tmp_dir))
Expand All @@ -236,7 +234,12 @@ def _test_tf_install(client: Client, tmp_dir: str, device: _TCapitalDeviceStr) -
) from e

client.set_model_from_file(
"keras-fcn", model_path, "TF", device=device, inputs=inputs, outputs=outputs
"keras-fcn",
model_path,
"TF",
device=device.value.upper(),
inputs=inputs,
outputs=outputs,
)
client.put_tensor("keras-input", np.random.rand(1, 28, 28).astype(np.float32))
client.run_model("keras-fcn", inputs=["keras-input"], outputs=["keras-output"])
Expand Down Expand Up @@ -264,7 +267,7 @@ def _build_tf_frozen_model(conn: "Connection", tmp_dir: str) -> None:
conn.send((model_path, inputs, outputs))


def _test_torch_install(client: Client, device: _TCapitalDeviceStr) -> None:
def _test_torch_install(client: Client, device: Device) -> None:
import torch
from torch import nn

Expand All @@ -276,7 +279,7 @@ def __init__(self) -> None:
def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.conv(x)

if device == "GPU":
if device == Device.GPU:
device_ = torch.device("cuda")
else:
device_ = torch.device("cpu")
Expand All @@ -292,13 +295,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
torch.jit.save(traced, buffer) # type: ignore[no-untyped-call]
model = buffer.getvalue()

client.set_model("torch-nn", model, backend="TORCH", device=device)
client.set_model("torch-nn", model, backend="TORCH", device=device.value.upper())
client.put_tensor("torch-in", torch.rand(1, 1, 3, 3).numpy())
client.run_model("torch-nn", inputs=["torch-in"], outputs=["torch-out"])
client.get_tensor("torch-out")


def _test_onnx_install(client: Client, device: _TCapitalDeviceStr) -> None:
def _test_onnx_install(client: Client, device: Device) -> None:
from skl2onnx import to_onnx
from sklearn.cluster import KMeans

Expand All @@ -311,7 +314,7 @@ def _test_onnx_install(client: Client, device: _TCapitalDeviceStr) -> None:
sample = np.arange(20, dtype=np.float32).reshape(10, 2)

client.put_tensor("onnx-input", sample)
client.set_model("onnx-kmeans", model, "ONNX", device=device)
client.set_model("onnx-kmeans", model, "ONNX", device=device.value.upper())
client.run_model(
"onnx-kmeans", inputs=["onnx-input"], outputs=["onnx-labels", "onnx-transform"]
)
Expand Down
Loading

0 comments on commit fa0da2c

Please sign in to comment.