From 9421756a845cde66f8277532bbaa1e3ac3cf2f33 Mon Sep 17 00:00:00 2001 From: Matt Drozt Date: Mon, 31 Jan 2022 17:11:40 -0500 Subject: [PATCH 01/10] update readme with create_run_settings/create_batch_settings --- README.md | 47 +++++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index d450ee8ca..66cfdc805 100644 --- a/README.md +++ b/README.md @@ -122,11 +122,10 @@ program using the local launcher which is designed for laptops and single nodes. ```python from smartsim import Experiment -from smartsim.settings import RunSettings exp = Experiment("simple", launcher="local") -settings = RunSettings("echo", exe_args="Hello World") +settings = exp.create_run_settings("echo", exe_args="Hello World") model = exp.create_model("hello_world", settings) exp.start(model, block=True) @@ -148,13 +147,14 @@ For example, ``MpirunSettings`` can be used to launch MPI programs with openMPI. ```Python from smartsim import Experiment -from smartsim.settings import MpirunSettings exp = Experiment("hello_world", launcher="local") -mpi = MpirunSettings(exe="echo", exe_args="Hello World!") -mpi.set_tasks(4) +mpi_settings = exp.create_run_settings(exe="echo", + exe_args="Hello World!", + run_command="mpirun") +mpi_settings.set_tasks(4) -mpi_model = exp.create_model("hello_world", mpi) +mpi_model = exp.create_model("hello_world", mpi_settings) exp.start(mpi_model, block=True) print(exp.get_status(model)) @@ -185,10 +185,9 @@ salloc -N 1 -n 32 --exclusive -t 00:10:00 ```python # hello_world.py from smartsim import Experiment -from smartsim.settings import SrunSettings exp = Experiment("hello_world_exp", launcher="slurm") -srun = SrunSettings(exe="echo", exe_args="Hello World!") +srun = exp.create_run_settings(exe="echo", exe_args="Hello World!") srun.set_nodes(1) srun.set_tasks(32) @@ -230,16 +229,15 @@ The following launches 4 replicas of the the same ``hello_world`` model. ```python # hello_ensemble.py from smartsim import Experiment -from smartsim.settings import SrunSettings, SbatchSettings exp = Experiment("hello_world_batch", launcher="slurm") # define resources for all ensemble members -sbatch = SbatchSettings(nodes=4, time="00:10:00", account="12345-Cray") +sbatch = exp.create_batch_settings(nodes=4, time="00:10:00", account="12345-Cray") sbatch.set_partition("premium") # define how each member should run -srun = SrunSettings(exe="echo", exe_args="Hello World!") +srun = exp.create_run_settings(exe="echo", exe_args="Hello World!") srun.set_nodes(1) srun.set_tasks(32) @@ -262,16 +260,15 @@ launchers within SmartSim. 
```python # hello_ensemble_pbs.py from smartsim import Experiment -from smartsim.settings import AprunSettings, QsubBatchSettings exp = Experiment("hello_world_batch", launcher="pbs") # define resources for all ensemble members -qsub = QsubBatchSettings(nodes=4, time="00:10:00", - account="12345-Cray", queue="cl40") +qsub = exp.create_batch_settings(nodes=4, time="00:10:00", + account="12345-Cray", queue="cl40") # define how each member should run -aprun = AprunSettings(exe="echo", exe_args="Hello World!") +aprun = exp.create_run_settings(exe="echo", exe_args="Hello World!") aprun.set_tasks(32) ensemble = exp.create_ensemble("hello_world", batch_settings=qsub, @@ -374,7 +371,7 @@ qsub -l select=3:ppn=1 -l walltime=00:10:00 -q cl40 -I from smartsim import Experiment from smartsim.database import PBSOrchestrator -exp = Experiment("db-on-slurm", launcher="slurm") +exp = Experiment("db-on-pbs", launcher="pbs") db_cluster = PBSOrchestrator(db_nodes=3, db_port=6780, batch=False) exp.start(db_cluster) @@ -401,7 +398,7 @@ to be launched. Users can hit CTRL-C to cancel the launch if needed. from smartsim import Experiment from smartsim.database import PBSOrchestrator -exp = Experiment("db-on-slurm", launcher="pbs") +exp = Experiment("batch-db-on-pbs", launcher="pbs") db_cluster = PBSOrchestrator(db_nodes=3, db_port=6780, batch=True, time="00:10:00", account="12345-Cray", queue="cl40") @@ -457,7 +454,7 @@ exp.generate(cluster, overwrite=True) exp.start(cluster, block=False, summary=True) # Connect to the Ray cluster -ctx = ray.init("ray://"+cluster.get_head_address()+":10001") +ctx = ray.init(f"ray://{cluster.get_head_address()}:10001") # ``` @@ -487,7 +484,7 @@ exp.generate(cluster, overwrite=True) exp.start(cluster, block=False, summary=True) # Connect to the ray cluster -ctx = ray.init("ray://"+cluster.get_head_address()+":10001") +ctx = ray.init(f"ray://{cluster.get_head_address()}:10001") # ``` @@ -544,7 +541,7 @@ connect to. ### Python Training code and Model construction are not shown here, but the example below -shows how to take a PyTorch model, sent it to the database, and execute it +shows how to take a PyTorch model, send it to the database, and execute it on data stored within the database. Notably the **GPU** argument is used to ensure that exection of the model @@ -619,7 +616,6 @@ client.unpack_tensor(out_key, result.data(), {10}, You can also load a model from file and put it in the database before you execute it. This example shows how this is done in Fortran. - ```fortran program run_mnist_example @@ -672,7 +668,6 @@ subroutine run_mnist( client, model_name ) end subroutine run_mnist end program run_mnist_example - ``` @@ -784,7 +779,7 @@ for i in range(0, time_steps): exp.stop(db) ``` More details about online anaylsis with SmartSim and the full code examples can be found in the -[SmartSim documentation](https://www.craylabs.org). #fix this +[SmartSim documentation](https://www.craylabs.org). ## Online Processing @@ -847,9 +842,9 @@ from C, C++, Fortran and Python with the SmartRedis Clients: | Library | Supported Version | |-------------------|:-----------------:| -| PyTorch | 1.7.1 | -| TensorFlow\Keras | 2.4.2 | -| ONNX | 1.7.0 | +| PyTorch | 1.7.1 | +| TensorFlow\Keras | 2.5.2 | +| ONNX | 1.7.0 | Note, it's important to remember that SmartSim utilizes a client-server model. 
To run experiments that utilize the above frameworks, you must first start the Orchestrator From 157f10ba31814e6dcfa0bdde4a435400e834d083 Mon Sep 17 00:00:00 2001 From: Matt Drozt Date: Tue, 1 Feb 2022 11:38:45 -0500 Subject: [PATCH 02/10] update ml lib vers --- README.md | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 88 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 66cfdc805..66f69dc8b 100644 --- a/README.md +++ b/README.md @@ -48,12 +48,55 @@ the Orchestrator despite which language the data originated from. SmartSim supports the following ML libraries. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+<table>
+  <tr>
+    <td>RedisAI Version</td>
+    <td>Libraries</td>
+    <td>Supported Version</td>
+  </tr>
+  <tr>
+    <td rowspan="3">1.2.3-1.2.4</td>
+    <td>PyTorch</td>
+    <td>1.7.0</td>
+  </tr>
+  <tr>
+    <td>Tensorlow\Keras</td>
+    <td>2.5.2</td>
+  </tr>
+  <tr>
+    <td>ONNX</td>
+    <td>1.7.0</td>
+  </tr>
+  <tr>
+    <td rowspan="3">1.2.5</td>
+    <td>PyTorch</td>
+    <td>1.9.1</td>
+  </tr>
+  <tr>
+    <td>Tensorlow\Keras</td>
+    <td>2.6.2</td>
+  </tr>
+  <tr>
+    <td>ONNX</td>
+    <td>1.9.0</td>
+  </tr>
+</table>
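For reference, a minimal sketch (not taken from this patch) of how one might confirm that the locally installed libraries line up with a row of the table above; it assumes `torch`, `tensorflow`, and `onnx` are importable in the same environment as SmartSim.

```python
# Illustrative only -- assumes torch, tensorflow, and onnx are installed
# in the environment used with SmartSim.
import torch
import tensorflow as tf
import onnx

# Compare against one row of the compatibility table, e.g. RedisAI 1.2.5
print(f"PyTorch:          {torch.__version__}")   # expected 1.9.1
print(f"TensorFlow/Keras: {tf.__version__}")      # expected 2.6.2
print(f"ONNX:             {onnx.__version__}")    # expected 1.9.0
```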
+ A [number of other libraries](https://github.com/onnx/onnxmltools) are supported through ONNX, like [SciKit-Learn](https://github.com/onnx/sklearn-onnx/) and [XGBoost](https://github.com/onnx/onnxmltools/tree/master/tests/xgboost). @@ -840,11 +883,52 @@ to create your own. SmartSim supports the following frameworks for quering Machine Learning models from C, C++, Fortran and Python with the SmartRedis Clients: + +| PyTorch | 1.7.1 | | TensorFlow\Keras | 2.5.2 | -| ONNX | 1.7.0 | +| ONNX | 1.7.0 | +--> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+<table>
+  <tr>
+    <td>RedisAI Version</td>
+    <td>Libraries</td>
+    <td>Supported Version</td>
+  </tr>
+  <tr>
+    <td rowspan="3">1.2.3-1.2.4</td>
+    <td>PyTorch</td>
+    <td>1.7.0</td>
+  </tr>
+  <tr>
+    <td>Tensorlow\Keras</td>
+    <td>2.5.2</td>
+  </tr>
+  <tr>
+    <td>ONNX</td>
+    <td>1.7.0</td>
+  </tr>
+  <tr>
+    <td rowspan="3">1.2.5</td>
+    <td>PyTorch</td>
+    <td>1.9.1</td>
+  </tr>
+  <tr>
+    <td>Tensorlow\Keras</td>
+    <td>2.6.2</td>
+  </tr>
+  <tr>
+    <td>ONNX</td>
+    <td>1.9.0</td>
+  </tr>
+</table>
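As a complement to the table above, a hedged sketch of how a model from one of these frameworks could be queried through the SmartRedis Python client once an Orchestrator is running; the address, model file, and key names below are illustrative placeholders rather than values taken from this patch.

```python
# Sketch only: assumes an Orchestrator is already reachable at 127.0.0.1:6780
# and that ./mnist_cnn.pt is a TorchScript model saved with torch.jit.save().
import numpy as np
from smartredis import Client

client = Client(address="127.0.0.1:6780", cluster=False)

# place the serialized model in the database and run it on a stored tensor
client.set_model_from_file("mnist_cnn", "./mnist_cnn.pt", "TORCH", device="CPU")
client.put_tensor("mnist_input", np.random.rand(1, 1, 28, 28).astype(np.float32))
client.run_model("mnist_cnn", inputs=["mnist_input"], outputs=["mnist_output"])
prediction = client.get_tensor("mnist_output")
print(prediction)
```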
Note, it's important to remember that SmartSim utilizes a client-server model. To run experiments that utilize the above frameworks, you must first start the Orchestrator @@ -870,7 +954,7 @@ db = Orchestrator(port=6780) class Net(nn.Module): def __init__(self): - super(Net, self).__init__() + super().__init__() self.conv = nn.Conv2d(1, 1, 3) def forward(self, x): From 6278db8760340fb5877a26b229a36d2f557dc546 Mon Sep 17 00:00:00 2001 From: Matt Drozt Date: Tue, 1 Feb 2022 11:42:16 -0500 Subject: [PATCH 03/10] remove old TODO --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 66f69dc8b..25258ad29 100644 --- a/README.md +++ b/README.md @@ -822,7 +822,7 @@ for i in range(0, time_steps): exp.stop(db) ``` More details about online anaylsis with SmartSim and the full code examples can be found in the -[SmartSim documentation](https://www.craylabs.org). +[SmartSim documentation](https://www.craylabs.org). ## Online Processing From 2e0dcc67716280981973a6cb6e6e8c4fca3780e6 Mon Sep 17 00:00:00 2001 From: Matt Drozt Date: Tue, 1 Feb 2022 12:05:57 -0500 Subject: [PATCH 04/10] update ray, fix typos --- README.md | 49 ++++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 25258ad29..59d95fde2 100644 --- a/README.md +++ b/README.md @@ -40,9 +40,9 @@ called Redis. Applications integrated with the SmartRedis clients, written in Fortran, C, C++ and Python, can stream tensors and datasets to and from the Orchestrator. The distributed Client-Server -paradigm allows for data to be seemlessly exchanged between applications at runtime. +paradigm allows for data to be seamlessly exchanged between applications at runtime. -In addition to exchanging data between langauges, any of the SmartRedis clients can +In addition to exchanging data between languages, any of the SmartRedis clients can remotely execute Machine Learning models and TorchScript code on data stored in the Orchestrator despite which language the data originated from. @@ -192,7 +192,7 @@ For example, ``MpirunSettings`` can be used to launch MPI programs with openMPI. from smartsim import Experiment exp = Experiment("hello_world", launcher="local") -mpi_settings = exp.create_run_settings(exe="echo", +mpi_settings = exp.create_run_settings(exe="echo", exe_args="Hello World!", run_command="mpirun") mpi_settings.set_tasks(4) @@ -359,7 +359,7 @@ db = Orchestrator(port=6780) # by default, SmartSim never blocks execution after the database is launched. 
exp.start(db) -# launch models, anaylsis, training, inference sessions, etc +# launch models, analysis, training, inference sessions, etc # that communicate with the database using the SmartRedis clients # stop the database @@ -393,7 +393,7 @@ db_cluster = SlurmOrchestrator(db_nodes=3, db_port=6780, batch=False) exp.start(db_cluster) print(f"Orchestrator launched on nodes: {db_cluster.hosts}") -# launch models, anaylsis, training, inference sessions, etc +# launch models, analysis, training, inference sessions, etc # that communicate with the database using the SmartRedis clients exp.stop(db_cluster) @@ -420,7 +420,7 @@ db_cluster = PBSOrchestrator(db_nodes=3, db_port=6780, batch=False) exp.start(db_cluster) print(f"Orchestrator launched on nodes: {db_cluster.hosts}") -# launch models, anaylsis, training, inference sessions, etc +# launch models, analysis, training, inference sessions, etc # that communicate with the database using the SmartRedis clients exp.stop(db_cluster) @@ -506,10 +506,13 @@ ctx = ray.init(f"ray://{cluster.get_head_address()}:10001") ### Ray on PBS Below is an example of how to launch a Ray cluster on a PBS system and connect to it. -In this example, we set `batch=True`, which means that the cluster will be started -requesting an allocation through Slurm. If this code is run within a sufficiently large -interactive allocation, setting `batch=False` will spin the Ray cluster on the -allocated nodes. +As we can see, only minor tweaks are needed to port our previous example to utilize +a different launcher. + +Once again, we set `batch=True`, which means that the cluster will be started +requesting an allocation, this time through PBS. If this code is run within a +sufficiently large interactive allocation, setting `batch=False` will spin the Ray +cluster on the allocated nodes. ```Python import ray @@ -547,10 +550,10 @@ Users can seamlessly pull and push data from the Orchestrator from different lan Tensors are the fundamental data structure for the SmartRedis clients. The Clients use the native array format of the language. For example, in Python, a tensor is -a NumPy array. The C++/C client accepts nested and contingous arrays. +a NumPy array. The C++/C client accepts nested and contiguous arrays. When stored in the database, all tensors are stored in the same format. Hence, -any language can recieve a tensor from the database no matter what supported language +any language can receive a tensor from the database no matter what supported language the array was sent from. This enables applications in different languages to communicate numerical data with each other at runtime (coupling). @@ -571,10 +574,10 @@ For more information on the API, see the ## Examples Even though the clients rely on the Orchestrator database to be running, it can be helpful -to see examples of how the API is used accross different languages even without the +to see examples of how the API is used across different languages even without the infrastructure code. The following examples provide simple examples of client usage. -For more imformation on the SmartRedis clients, see the +For more information on the SmartRedis clients, see the [API documentation](https://www.craylabs.org/docs/api/smartredis_api.html) and [tutorials](https://www.craylabs.org/docs/tutorials/smartredis.html). @@ -656,7 +659,7 @@ client.unpack_tensor(out_key, result.data(), {10}, ### Fortran -You can also load a model from file and put it in the database before you execute it. 
+You can also load a model from a file and put it in the database before you execute it. This example shows how this is done in Fortran. ```fortran @@ -694,7 +697,7 @@ subroutine run_mnist( client, model_name ) character(len=255), dimension(1) :: inputs character(len=255), dimension(1) :: outputs - ! Construct the keys used for the specifiying inputs and outputs + ! Construct the keys used for the specifying inputs and outputs in_key = "mnist_input" out_key = "mnist_output" @@ -718,7 +721,7 @@ end program run_mnist_example # SmartSim + SmartRedis SmartSim and SmartRedis were designed to work together. When launched through -SmartSim, applcations using the SmartRedis clients are directly connected to +SmartSim, applications using the SmartRedis clients are directly connected to any Orchestrator launched in the same Experiment. In this way, a SmartSim Experiment becomes a driver for coupled ML and Simulation @@ -745,7 +748,7 @@ Using a [Lattice Boltzmann Simulation](https://en.wikipedia.org/wiki/Lattice_Bol this example demonstrates how to use the SmartRedis ``Dataset`` API to stream data to the Orchestrator deployed by SmartSim. -The following code will show the peices of the simulation that are needed to +The following code will show the pieces of the simulation that are needed to transmit the data needed to plot timesteps of the simulation. ```Python @@ -753,7 +756,7 @@ transmit the data needed to plot timesteps of the simulation. from smartredis import Client import numpy as np -# initialization code ommitted +# initialization code omitted # save cylinder location to database cylinder = (X - x_res/4)**2 + (Y - y_res/2)**2 < (y_res/4)**2 # bool array @@ -821,7 +824,7 @@ for i in range(0, time_steps): exp.stop(db) ``` -More details about online anaylsis with SmartSim and the full code examples can be found in the +More details about online analysis with SmartSim and the full code examples can be found in the [SmartSim documentation](https://www.craylabs.org). @@ -871,7 +874,7 @@ V = client.get_tensor("V") print(f"U: {U}, S: {S}, V: {V}") ``` -The processing capabilties make it simple to form computational piplines of +The processing capabilities make it simple to form computational pipelines of functions, scripts, and models. See the full [TorchScript Language Reference](https://pytorch.org/docs/stable/jit.html#torchscript-language) @@ -880,10 +883,10 @@ to create your own. ## Online Inference -SmartSim supports the following frameworks for quering Machine Learning models +SmartSim supports the following frameworks for querying Machine Learning models from C, C++, Fortran and Python with the SmartRedis Clients: - - @@ -74,7 +62,7 @@ TODO: Check if new table is what is expected - + @@ -86,7 +74,7 @@ TODO: Check if new table is what is expected - + @@ -455,7 +443,7 @@ db_cluster = PBSOrchestrator(db_nodes=3, db_port=6780, batch=True, exp.start(db_cluster) print(f"Orchestrator launched on nodes: {db_cluster.hosts}") -# launch models, anaylsis, training, inference sessions, etc +# launch models, analysis, training, inference sessions, etc # that communicate with the database using the SmartRedis clients exp.stop(db_cluster) @@ -598,7 +586,7 @@ scripts if there is no database for them to connect to. The example below shows how to take a PyTorch model, send it to the Orchestrator, and execute it on data stored within the database. 
-Notice that when we set the model in the database, we set the device arguement to +Notice that when we set the model in the database, we set the device argument to **GPU**. By doing this we ensure that execution of the model takes place on a GPU if one is available to the database. @@ -902,16 +890,6 @@ to create your own. SmartSim supports the following frameworks for querying Machine Learning models from C, C++, Fortran and Python with the SmartRedis Clients: - -
1.7.0
-    <td>Tensorlow\Keras</td>
+    <td>TensorFlow\Keras</td>
     <td>2.5.2</td>
1.9.1
-    <td>Tensorlow\Keras</td>
+    <td>TensorFlow\Keras</td>
     <td>2.6.2</td>
@@ -927,7 +905,7 @@ TODO: Again confirm that this should be replaced with new table - + @@ -939,7 +917,7 @@ TODO: Again confirm that this should be replaced with new table - +
1.7.0
-    <td>Tensorlow\Keras</td>
+    <td>TensorFlow\Keras</td>
     <td>2.5.2</td>
1.9.1
-    <td>Tensorlow\Keras</td>
+    <td>TensorFlow\Keras</td>
     <td>2.6.2</td>