Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Colocated model fix #237

Merged
merged 3 commits into from
Oct 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions doc/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,26 @@ Jump to :ref:`SmartRedis Changelog <changelog>`
SmartSim
========

Development branch
------------------

To be released at some future date

Note

This section details changes made in the development branch that have not yet been applied to a released version of the SmartSim library.

Description

- Fix bug in colocated database entrypoint when loading PyTorch models

Detailed Notes

- Fix bug in colocated database entrypoint stemming from uninitialized variables. This bug affects PyTorch models being loaded into the database. (PR237_)

.. _PR237: https://github.com/CrayLabs/SmartSim/pull/237


0.4.1
-----

Expand Down
3 changes: 3 additions & 0 deletions smartsim/_core/entrypoints/colocated.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,9 @@ def launch_db_model(client: Client, db_model: List[str]):
parser.add_argument("--min_batch_timeout", type=int, default=None)
args = parser.parse_args(db_model)

inputs = None
outputs = None

if args.inputs:
inputs = list(args.inputs)
if args.outputs:
Expand Down
163 changes: 141 additions & 22 deletions tests/backends/test_dbmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,27 @@
from smartsim._core.utils import installed_redisai_backends
from smartsim.error.errors import SSUnsupportedError

should_run = True
should_run_tf = True
should_run_pt = True

# Check TensorFlow is available for tests
try:
import tensorflow.keras as keras
from tensorflow.keras.layers import Conv2D, Input
except ImportError:
should_run = False
should_run_tf = False

should_run &= "tensorflow" in installed_redisai_backends()
should_run_tf &= "tensorflow" in installed_redisai_backends()

if not should_run:
pytest.skip("Test needs TF to run", allow_module_level=True)
# Check if PyTorch is available for tests
try:
import torch
import torch.nn as nn
import torch.nn.functional as F
except ImportError:
should_run_pt = False

should_run_pt &= "torch" in installed_redisai_backends()

class Net(keras.Model):
def __init__(self):
Expand Down Expand Up @@ -56,15 +65,50 @@ def create_tf_cnn():

return serialize_model(model)


def test_db_model(fileutils, wlmutils):
"""Test DB Models on remote DB"""

exp_name = "test-db-model"
# Simple MNIST in PyTorch
try:
class PyTorchNet(nn.Module):
def __init__(self):
super(PyTorchNet, self).__init__()
self.conv1 = nn.Conv2d(1, 32, 3, 1)
self.conv2 = nn.Conv2d(32, 64, 3, 1)
self.dropout1 = nn.Dropout(0.25)
self.dropout2 = nn.Dropout(0.5)
self.fc1 = nn.Linear(9216, 128)
self.fc2 = nn.Linear(128, 10)

def forward(self, x):
x = self.conv1(x)
x = F.relu(x)
x = self.conv2(x)
x = F.relu(x)
x = F.max_pool2d(x, 2)
x = self.dropout1(x)
x = torch.flatten(x, 1)
x = self.fc1(x)
x = F.relu(x)
x = self.dropout2(x)
x = self.fc2(x)
output = F.log_softmax(x, dim=1)
return output
except Exception:
should_run_pt = False

def save_torch_cnn(path, file_name):
n = PyTorchNet()
example_forward_input = torch.rand(1, 1, 28, 28)
module = torch.jit.trace(n, example_forward_input)
torch.jit.save(module, path+"/"+file_name)

@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run")
def test_tf_db_model(fileutils, wlmutils):
"""Test TensorFlow DB Models on remote DB"""

exp_name = "test-tf-db-model"

# get test setup
test_dir = fileutils.make_test_dir()
sr_test_script = fileutils.get_test_conf_path("run_dbmodel_smartredis.py")
sr_test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py")

exp = Experiment(exp_name, exp_path=test_dir, launcher="local")
# create colocated model
Expand Down Expand Up @@ -110,14 +154,58 @@ def test_db_model(fileutils, wlmutils):
assert all([stat == status.STATUS_COMPLETED for stat in statuses])


@pytest.mark.skipif(not should_run_pt, reason="Test needs PyTorch to run")
def test_pt_db_model(fileutils, wlmutils):
"""Test PyTorch DB Models on remote DB"""

exp_name = "test-pt-db-model"

# get test setup
test_dir = fileutils.make_test_dir()
sr_test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py")

exp = Experiment(exp_name, exp_path=test_dir, launcher="local")
# create colocated model
run_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script)

smartsim_model = exp.create_model("smartsim_model", run_settings)
smartsim_model.set_path(test_dir)

db = exp.create_database(port=wlmutils.get_test_port(), interface="lo")
exp.generate(db)

save_torch_cnn(test_dir, "model1.pt")
model_path = test_dir + "/model1.pt"

smartsim_model.add_ml_model(
"cnn",
"TORCH",
model_path=model_path,
device="CPU",
tag="test",
)

for db_model in smartsim_model._db_models:
print(db_model)

# Assert we have added both models
assert len(smartsim_model._db_models) == 1

exp.start(db, smartsim_model, block=True)
statuses = exp.get_status(smartsim_model)
exp.stop(db)
assert all([stat == status.STATUS_COMPLETED for stat in statuses])


@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run")
def test_db_model_ensemble(fileutils, wlmutils):
"""Test DBModels on remote DB, with an ensemble"""

exp_name = "test-db-model-ensemble"

# get test setup
test_dir = fileutils.make_test_dir()
sr_test_script = fileutils.get_test_conf_path("run_dbmodel_smartredis.py")
sr_test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py")

exp = Experiment(exp_name, exp_path=test_dir, launcher="local")
# create colocated model
Expand Down Expand Up @@ -174,15 +262,15 @@ def test_db_model_ensemble(fileutils, wlmutils):
assert all([stat == status.STATUS_COMPLETED for stat in statuses])


def test_colocated_db_model(fileutils, wlmutils):
"""Test DB Models on colocated DB"""
def test_colocated_db_model_tf(fileutils, wlmutils):
"""Test DB Models on colocated DB (TensorFlow backend)"""

exp_name = "test-colocated-db-model"
exp_name = "test-colocated-db-model-tf"
exp = Experiment(exp_name, launcher="local")

# get test setup
test_dir = fileutils.make_test_dir()
sr_test_script = fileutils.get_test_conf_path("run_dbmodel_smartredis.py")
sr_test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py")

# create colocated model
colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script)
Expand Down Expand Up @@ -215,8 +303,39 @@ def test_colocated_db_model(fileutils, wlmutils):
statuses = exp.get_status(colo_model)
assert all([stat == status.STATUS_COMPLETED for stat in statuses])

@pytest.mark.skipif(not should_run_pt, reason="Test needs PyTorch to run")
def test_colocated_db_model_pytorch(fileutils, wlmutils):
"""Test DB Models on colocated DB (PyTorch backend)"""

exp_name = "test-colocated-db-model-pytorch"
exp = Experiment(exp_name, launcher="local")

# get test setup
test_dir = fileutils.make_test_dir()
sr_test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py")

# create colocated model
colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script)

colo_model = exp.create_model("colocated_model", colo_settings)
colo_model.set_path(test_dir)
colo_model.colocate_db(
port=wlmutils.get_test_port(), db_cpus=1, limit_app_cpus=False, debug=True, ifname="lo"
)

save_torch_cnn(test_dir, "model1.pt")
model_file = test_dir + "/model1.pt"
colo_model.add_ml_model("cnn", "TORCH", model_path=model_file, device="CPU")

# Assert we have added both models
assert len(colo_model._db_models) == 1

exp.start(colo_model, block=True)
statuses = exp.get_status(colo_model)
assert all([stat == status.STATUS_COMPLETED for stat in statuses])


@pytest.mark.skipif(not should_run, reason="Test needs TF to run")
@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run")
def test_colocated_db_model_ensemble(fileutils, wlmutils):
"""Test DBModel on colocated ensembles, first colocating DB,
then adding DBModel.
Expand All @@ -227,7 +346,7 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils):
# get test setup
test_dir = fileutils.make_test_dir()
exp = Experiment(exp_name, launcher="local", exp_path=test_dir)
sr_test_script = fileutils.get_test_conf_path("run_dbmodel_smartredis.py")
sr_test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py")

# create colocated model
colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script)
Expand Down Expand Up @@ -294,7 +413,7 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils):
assert all([stat == status.STATUS_COMPLETED for stat in statuses])


@pytest.mark.skipif(not should_run, reason="Test needs TF to run")
@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run")
def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils):
"""Test DBModel on colocated ensembles, first adding the DBModel to the
ensemble, then colocating DB.
Expand All @@ -305,7 +424,7 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils):
# get test setup
test_dir = fileutils.make_test_dir()
exp = Experiment(exp_name, launcher="local", exp_path=test_dir)
sr_test_script = fileutils.get_test_conf_path("run_dbmodel_smartredis.py")
sr_test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py")

# create colocated model
colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script)
Expand Down Expand Up @@ -363,7 +482,7 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils):
assert all([stat == status.STATUS_COMPLETED for stat in statuses])


@pytest.mark.skipif(not should_run, reason="Test needs TF to run")
@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run")
def test_colocated_db_model_errors(fileutils, wlmutils):
"""Test error when colocated db model has no file."""

Expand All @@ -372,7 +491,7 @@ def test_colocated_db_model_errors(fileutils, wlmutils):

# get test setup
test_dir = fileutils.make_test_dir()
sr_test_script = fileutils.get_test_conf_path("run_dbmodel_smartredis.py")
sr_test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py")

# create colocated model
colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script)
Expand Down
20 changes: 20 additions & 0 deletions tests/test_configs/run_pt_dbmodel_smartredis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import numpy as np
from smartredis import Client

def main():
# Address should be set as we are launching through
# SmartSim.
client = Client(cluster=False)

array = np.ones((1, 1, 28, 28)).astype(np.single)
client.put_tensor("test_array", array)
assert client.poll_model("cnn", 500, 30)
client.run_model("cnn", ["test_array"], ["test_output"])
returned = client.get_tensor("test_output")

assert returned.shape == (1, 10)

print(f"Test worked!")

if __name__ == "__main__":
main()