diff --git a/doc/changelog.rst b/doc/changelog.rst index 9262f5294..b6e7e8e50 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -11,6 +11,26 @@ Jump to :ref:`SmartRedis Changelog ` SmartSim ======== +Development branch +------------------ + +To be released at some future date + +Note + +This section details changes made in the development branch that have not yet been applied to a released version of the SmartSim library. + +Description + +- Fix bug in colocated database entrypoint when loading PyTorch models + +Detailed Notes + +- Fix bug in colocated database entrypoint stemming from uninitialized variables. This bug affects PyTorch models being loaded into the database. (PR237_) + +.. _PR237: https://github.com/CrayLabs/SmartSim/pull/237 + + 0.4.1 ----- diff --git a/smartsim/_core/entrypoints/colocated.py b/smartsim/_core/entrypoints/colocated.py index 6c22708ef..4d75c1e98 100644 --- a/smartsim/_core/entrypoints/colocated.py +++ b/smartsim/_core/entrypoints/colocated.py @@ -81,6 +81,9 @@ def launch_db_model(client: Client, db_model: List[str]): parser.add_argument("--min_batch_timeout", type=int, default=None) args = parser.parse_args(db_model) + inputs = None + outputs = None + if args.inputs: inputs = list(args.inputs) if args.outputs: diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index 47fa72ef3..f57ada80e 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -7,18 +7,27 @@ from smartsim._core.utils import installed_redisai_backends from smartsim.error.errors import SSUnsupportedError -should_run = True +should_run_tf = True +should_run_pt = True +# Check TensorFlow is available for tests try: import tensorflow.keras as keras from tensorflow.keras.layers import Conv2D, Input except ImportError: - should_run = False + should_run_tf = False -should_run &= "tensorflow" in installed_redisai_backends() +should_run_tf &= "tensorflow" in installed_redisai_backends() -if not should_run: - pytest.skip("Test needs TF to run", allow_module_level=True) +# Check if PyTorch is available for tests +try: + import torch + import torch.nn as nn + import torch.nn.functional as F +except ImportError: + should_run_pt = False + +should_run_pt &= "torch" in installed_redisai_backends() class Net(keras.Model): def __init__(self): @@ -56,15 +65,50 @@ def create_tf_cnn(): return serialize_model(model) - -def test_db_model(fileutils, wlmutils): - """Test DB Models on remote DB""" - - exp_name = "test-db-model" +# Simple MNIST in PyTorch +try: + class PyTorchNet(nn.Module): + def __init__(self): + super(PyTorchNet, self).__init__() + self.conv1 = nn.Conv2d(1, 32, 3, 1) + self.conv2 = nn.Conv2d(32, 64, 3, 1) + self.dropout1 = nn.Dropout(0.25) + self.dropout2 = nn.Dropout(0.5) + self.fc1 = nn.Linear(9216, 128) + self.fc2 = nn.Linear(128, 10) + + def forward(self, x): + x = self.conv1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.relu(x) + x = F.max_pool2d(x, 2) + x = self.dropout1(x) + x = torch.flatten(x, 1) + x = self.fc1(x) + x = F.relu(x) + x = self.dropout2(x) + x = self.fc2(x) + output = F.log_softmax(x, dim=1) + return output +except Exception: + should_run_pt = False + +def save_torch_cnn(path, file_name): + n = PyTorchNet() + example_forward_input = torch.rand(1, 1, 28, 28) + module = torch.jit.trace(n, example_forward_input) + torch.jit.save(module, path+"/"+file_name) + +@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") +def test_tf_db_model(fileutils, wlmutils): + """Test TensorFlow DB Models on remote DB""" + + exp_name = "test-tf-db-model" # get test setup test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_dbmodel_smartredis.py") + sr_test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") exp = Experiment(exp_name, exp_path=test_dir, launcher="local") # create colocated model @@ -110,6 +154,50 @@ def test_db_model(fileutils, wlmutils): assert all([stat == status.STATUS_COMPLETED for stat in statuses]) +@pytest.mark.skipif(not should_run_pt, reason="Test needs PyTorch to run") +def test_pt_db_model(fileutils, wlmutils): + """Test PyTorch DB Models on remote DB""" + + exp_name = "test-pt-db-model" + + # get test setup + test_dir = fileutils.make_test_dir() + sr_test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py") + + exp = Experiment(exp_name, exp_path=test_dir, launcher="local") + # create colocated model + run_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + + smartsim_model = exp.create_model("smartsim_model", run_settings) + smartsim_model.set_path(test_dir) + + db = exp.create_database(port=wlmutils.get_test_port(), interface="lo") + exp.generate(db) + + save_torch_cnn(test_dir, "model1.pt") + model_path = test_dir + "/model1.pt" + + smartsim_model.add_ml_model( + "cnn", + "TORCH", + model_path=model_path, + device="CPU", + tag="test", + ) + + for db_model in smartsim_model._db_models: + print(db_model) + + # Assert we have added both models + assert len(smartsim_model._db_models) == 1 + + exp.start(db, smartsim_model, block=True) + statuses = exp.get_status(smartsim_model) + exp.stop(db) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + + +@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") def test_db_model_ensemble(fileutils, wlmutils): """Test DBModels on remote DB, with an ensemble""" @@ -117,7 +205,7 @@ def test_db_model_ensemble(fileutils, wlmutils): # get test setup test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_dbmodel_smartredis.py") + sr_test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") exp = Experiment(exp_name, exp_path=test_dir, launcher="local") # create colocated model @@ -174,15 +262,15 @@ def test_db_model_ensemble(fileutils, wlmutils): assert all([stat == status.STATUS_COMPLETED for stat in statuses]) -def test_colocated_db_model(fileutils, wlmutils): - """Test DB Models on colocated DB""" +def test_colocated_db_model_tf(fileutils, wlmutils): + """Test DB Models on colocated DB (TensorFlow backend)""" - exp_name = "test-colocated-db-model" + exp_name = "test-colocated-db-model-tf" exp = Experiment(exp_name, launcher="local") # get test setup test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_dbmodel_smartredis.py") + sr_test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # create colocated model colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) @@ -215,8 +303,39 @@ def test_colocated_db_model(fileutils, wlmutils): statuses = exp.get_status(colo_model) assert all([stat == status.STATUS_COMPLETED for stat in statuses]) +@pytest.mark.skipif(not should_run_pt, reason="Test needs PyTorch to run") +def test_colocated_db_model_pytorch(fileutils, wlmutils): + """Test DB Models on colocated DB (PyTorch backend)""" + + exp_name = "test-colocated-db-model-pytorch" + exp = Experiment(exp_name, launcher="local") + + # get test setup + test_dir = fileutils.make_test_dir() + sr_test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py") + + # create colocated model + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) + + colo_model = exp.create_model("colocated_model", colo_settings) + colo_model.set_path(test_dir) + colo_model.colocate_db( + port=wlmutils.get_test_port(), db_cpus=1, limit_app_cpus=False, debug=True, ifname="lo" + ) + + save_torch_cnn(test_dir, "model1.pt") + model_file = test_dir + "/model1.pt" + colo_model.add_ml_model("cnn", "TORCH", model_path=model_file, device="CPU") + + # Assert we have added both models + assert len(colo_model._db_models) == 1 + + exp.start(colo_model, block=True) + statuses = exp.get_status(colo_model) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + -@pytest.mark.skipif(not should_run, reason="Test needs TF to run") +@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") def test_colocated_db_model_ensemble(fileutils, wlmutils): """Test DBModel on colocated ensembles, first colocating DB, then adding DBModel. @@ -227,7 +346,7 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils): # get test setup test_dir = fileutils.make_test_dir() exp = Experiment(exp_name, launcher="local", exp_path=test_dir) - sr_test_script = fileutils.get_test_conf_path("run_dbmodel_smartredis.py") + sr_test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # create colocated model colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) @@ -294,7 +413,7 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils): assert all([stat == status.STATUS_COMPLETED for stat in statuses]) -@pytest.mark.skipif(not should_run, reason="Test needs TF to run") +@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils): """Test DBModel on colocated ensembles, first adding the DBModel to the ensemble, then colocating DB. @@ -305,7 +424,7 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils): # get test setup test_dir = fileutils.make_test_dir() exp = Experiment(exp_name, launcher="local", exp_path=test_dir) - sr_test_script = fileutils.get_test_conf_path("run_dbmodel_smartredis.py") + sr_test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # create colocated model colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) @@ -363,7 +482,7 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils): assert all([stat == status.STATUS_COMPLETED for stat in statuses]) -@pytest.mark.skipif(not should_run, reason="Test needs TF to run") +@pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") def test_colocated_db_model_errors(fileutils, wlmutils): """Test error when colocated db model has no file.""" @@ -372,7 +491,7 @@ def test_colocated_db_model_errors(fileutils, wlmutils): # get test setup test_dir = fileutils.make_test_dir() - sr_test_script = fileutils.get_test_conf_path("run_dbmodel_smartredis.py") + sr_test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # create colocated model colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=sr_test_script) diff --git a/tests/test_configs/run_pt_dbmodel_smartredis.py b/tests/test_configs/run_pt_dbmodel_smartredis.py new file mode 100644 index 000000000..c9be4472f --- /dev/null +++ b/tests/test_configs/run_pt_dbmodel_smartredis.py @@ -0,0 +1,20 @@ +import numpy as np +from smartredis import Client + +def main(): + # Address should be set as we are launching through + # SmartSim. + client = Client(cluster=False) + + array = np.ones((1, 1, 28, 28)).astype(np.single) + client.put_tensor("test_array", array) + assert client.poll_model("cnn", 500, 30) + client.run_model("cnn", ["test_array"], ["test_output"]) + returned = client.get_tensor("test_output") + + assert returned.shape == (1, 10) + + print(f"Test worked!") + +if __name__ == "__main__": + main() diff --git a/tests/test_configs/run_dbmodel_smartredis.py b/tests/test_configs/run_tf_dbmodel_smartredis.py similarity index 100% rename from tests/test_configs/run_dbmodel_smartredis.py rename to tests/test_configs/run_tf_dbmodel_smartredis.py