From 35413277de2e594ea3b331aebe784c0070d05a29 Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Fri, 11 Oct 2024 19:52:11 +0100 Subject: [PATCH] [SDK] Use torchrun to create PyTorchJob from function (#2276) * [SDK] Use torchrun to create PyTorchJob from function Signed-off-by: Andrey Velichkevich * Update PyTorchJob SDK example Signed-off-by: Andrey Velichkevich * Add consts for entrypoint Signed-off-by: Andrey Velichkevich * Add check for num procs per worker Signed-off-by: Andrey Velichkevich --------- Signed-off-by: Andrey Velichkevich --- .../Train-CNN-with-FashionMNIST.ipynb | 437 ++++++++---------- .../kubeflow/training/api/training_client.py | 58 ++- .../training/api/training_client_test.py | 253 ++++++---- .../kubeflow/training/constants/constants.py | 4 + sdk/python/kubeflow/training/utils/utils.py | 50 +- 5 files changed, 434 insertions(+), 368 deletions(-) diff --git a/examples/pytorch/image-classification/Train-CNN-with-FashionMNIST.ipynb b/examples/pytorch/image-classification/Train-CNN-with-FashionMNIST.ipynb index 7cb5e8fba9..002bd8a3fd 100644 --- a/examples/pytorch/image-classification/Train-CNN-with-FashionMNIST.ipynb +++ b/examples/pytorch/image-classification/Train-CNN-with-FashionMNIST.ipynb @@ -33,8 +33,8 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install torch==1.12.1\n", - "!pip install torchvision==0.13.1\n", + "!pip install torch==2.1.2\n", + "!pip install torchvision==0.19.1\n", "\n", "# TODO (andreyvelich): Change to release version when SDK with the new APIs is published.\n", "!pip install git+https://github.com/kubeflow/training-operator.git#subdirectory=sdk/python" @@ -52,16 +52,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "69f21f33-5c64-452c-90c4-977fc0dadb3b", "metadata": { - "execution": { - "iopub.execute_input": "2024-03-05T21:44:44.851155Z", - "iopub.status.busy": "2024-03-05T21:44:44.850918Z", - "iopub.status.idle": "2024-03-05T21:44:44.862195Z", - "shell.execute_reply": "2024-03-05T21:44:44.860949Z", - "shell.execute_reply.started": "2024-03-05T21:44:44.851138Z" - }, "tags": [] }, "outputs": [], @@ -102,11 +95,6 @@ " x = self.fc2(x)\n", " return F.log_softmax(x, dim=1)\n", "\n", - " # Get dist parameters.\n", - " # Kubeflow Training Operator automatically set appropriate RANK and WORLD_SIZE based on the configuration.\n", - " RANK = int(os.environ[\"RANK\"])\n", - " WORLD_SIZE = int(os.environ[\"WORLD_SIZE\"])\n", - "\n", " # IF GPU is available, nccl dist backend is used. Otherwise, gloo dist backend is used.\n", " if torch.cuda.is_available():\n", " device = \"cuda\"\n", @@ -114,17 +102,26 @@ " else:\n", " device = \"cpu\"\n", " backend = \"gloo\"\n", - "\n", + " \n", " logging.info(f\"Using Device: {device}, Backend: {backend}\")\n", "\n", - " model = Net()\n", - " # Attach model to the device.\n", - " model = model.to(device)\n", + " # Setup PyTorch DDP. 
Distributed environment will be set automatically by Training Operator.\n", + " dist.init_process_group(backend=backend)\n", + " Distributor = torch.nn.parallel.DistributedDataParallel\n", + " local_rank = int(os.getenv(\"LOCAL_RANK\", 0))\n", + " logging.info(\n", + " \"Distributed Training for WORLD_SIZE: {}, RANK: {}, LOCAL_RANK: {}\".format(\n", + " dist.get_world_size(),\n", + " dist.get_rank(),\n", + " local_rank,\n", + " )\n", + " )\n", "\n", - " # Attach model to DistributedDataParallel strategy.\n", - " dist.init_process_group(backend=\"gloo\", rank=RANK, world_size=WORLD_SIZE)\n", - " Distributor = nn.parallel.DistributedDataParallel\n", + " # Attach model to the correct device.\n", + " device = torch.device(f\"{device}:{local_rank}\")\n", + " model = Net().to(device)\n", " model = Distributor(model)\n", + " optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)\n", "\n", " # Get Fashion MNIST Dataset.\n", " dataset = datasets.FashionMNIST(\n", @@ -142,11 +139,10 @@ " )\n", "\n", " # Start Training.\n", - " logging.info(f\"Start training for RANK: {RANK}. WORLD_SIZE: {WORLD_SIZE}\")\n", + " logging.info(f\"Start training for RANK: {dist.get_rank()}. WORLD_SIZE: {dist.get_world_size()}\")\n", "\n", " for epoch in range(int(parameters[\"NUM_EPOCHS\"])):\n", " model.train()\n", - " optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)\n", "\n", " for batch_idx, (data, target) in enumerate(train_loader):\n", " # Attach tensors to the device.\n", @@ -158,7 +154,7 @@ " loss = F.nll_loss(output, target)\n", " loss.backward()\n", " optimizer.step()\n", - " if batch_idx % 10 == 0:\n", + " if batch_idx % 10 == 0 and dist.get_rank() == 0:\n", " logging.info(\n", " \"Train Epoch: {} [{}/{} ({:.0f}%)]\\tloss={:.4f}\".format(\n", " epoch,\n", @@ -168,8 +164,8 @@ " loss.item(),\n", " )\n", " )\n", - "\n", - " logging.info(\"Training is finished\")" + " if dist.get_rank() == 0:\n", + " logging.info(\"Training is finished\")" ] }, { @@ -195,13 +191,6 @@ "execution_count": 3, "id": "9e2c6fd8-d0ba-4bc6-ac90-d4cf09751ace", "metadata": { - "execution": { - "iopub.execute_input": "2024-03-05T21:44:47.071420Z", - "iopub.status.busy": "2024-03-05T21:44:47.071188Z", - "iopub.status.idle": "2024-03-05T21:46:56.033826Z", - "shell.execute_reply": "2024-03-05T21:46:56.032986Z", - "shell.execute_reply.started": "2024-03-05T21:44:47.071404Z" - }, "tags": [] }, "outputs": [ @@ -209,9 +198,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024-03-05T21:44:47Z INFO Using Device: cpu, Backend: gloo\n", - "2024-03-05T21:44:47Z INFO Added key: store_based_barrier_key:1 to store for rank: 0\n", - "2024-03-05T21:44:47Z INFO Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes.\n" + "2024-10-08T13:58:29Z INFO Using Device: cpu, Backend: gloo\n", + "2024-10-08T13:58:29Z INFO Distributed Training for WORLD_SIZE: 1, RANK: 0, LOCAL_RANK: 0\n" ] }, { @@ -223,18 +211,11 @@ ] }, { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f84c269459b842199b83caaee8bee276", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/26421880 [00:00 1 or num_procs_per_worker is not None + ): + entrypoint = constants.ENTRYPOINT_TORCH + else: + entrypoint = constants.ENTRYPOINT_PYTHON + + command, args = utils.get_command_using_train_func( + train_func=train_func, + entrypoint=entrypoint, + train_func_parameters=parameters, + packages_to_install=packages_to_install, + pip_index_url=pip_index_url, + ) + # Get Training 
Container template. container_spec = utils.get_container_spec( name=constants.JOB_PARAMETERS[job_kind]["container"], base_image=base_image, - train_func=train_func, - train_func_parameters=parameters, - packages_to_install=packages_to_install, - pip_index_url=pip_index_url, + command=command, + args=args, resources=resources_per_worker, ) @@ -443,6 +473,10 @@ def create_job( # Configure template for different Jobs. # TODO (andreyvelich): Add support for other kinds (e.g. MPIJob). if job_kind == constants.TFJOB_KIND: + if num_procs_per_worker is not None: + raise ValueError( + f"num_procs_per_worker can't be set for {constants.TFJOB_KIND}" + ) job = utils.get_tfjob_template( name=name, namespace=namespace, @@ -451,12 +485,18 @@ def create_job( num_chief_replicas=num_chief_replicas, num_ps_replicas=num_ps_replicas, ) - elif job_kind == constants.PYTORCHJOB_KIND and num_workers: + elif job_kind == constants.PYTORCHJOB_KIND: + if num_chief_replicas is not None or num_ps_replicas is not None: + raise ValueError( + "num_chief_replicas and num_ps_replicas can't be set for " + f"{constants.PYTORCHJOB_KIND}" + ) job = utils.get_pytorchjob_template( name=name, namespace=namespace, worker_pod_template_spec=pod_template_spec, num_workers=num_workers, + num_procs_per_worker=num_procs_per_worker, ) else: raise ValueError( diff --git a/sdk/python/kubeflow/training/api/training_client_test.py b/sdk/python/kubeflow/training/api/training_client_test.py index ea8c495032..a82d284e6f 100644 --- a/sdk/python/kubeflow/training/api/training_client_test.py +++ b/sdk/python/kubeflow/training/api/training_client_test.py @@ -1,5 +1,4 @@ import multiprocessing -from typing import Optional from unittest.mock import Mock, patch import pytest @@ -10,7 +9,6 @@ KubeflowOrgV1PyTorchJobSpec, KubeflowOrgV1ReplicaSpec, KubeflowOrgV1RunPolicy, - KubeflowOrgV1SchedulingPolicy, TrainingClient, constants, ) @@ -21,10 +19,11 @@ V1ObjectMeta, V1PodSpec, V1PodTemplateSpec, - V1ResourceRequirements, ) TEST_NAME = "test" +TEST_IMAGE = "docker.io/test-training" + TIMEOUT = "timeout" RUNTIME = "runtime" MOCK_POD_OBJ = "mock_pod_obj" @@ -126,41 +125,20 @@ def get(self, timeout): return MockResponse() -def generate_container() -> V1Container: - return V1Container( - name="pytorch", - image="gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0", - args=["--backend", "gloo"], - resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}), - ) - - -def generate_pytorchjob( - job_namespace: str, - master: KubeflowOrgV1ReplicaSpec, - worker: KubeflowOrgV1ReplicaSpec, - scheduling_policy: Optional[KubeflowOrgV1SchedulingPolicy] = None, -) -> KubeflowOrgV1PyTorchJob: - return KubeflowOrgV1PyTorchJob( - api_version=constants.API_VERSION, - kind=constants.PYTORCHJOB_KIND, - metadata=V1ObjectMeta(name="pytorchjob-mnist-ci-test", namespace=job_namespace), - spec=KubeflowOrgV1PyTorchJobSpec( - run_policy=KubeflowOrgV1RunPolicy( - clean_pod_policy="None", - scheduling_policy=scheduling_policy, - ), - pytorch_replica_specs={"Master": master, "Worker": worker}, - ), +def create_job( + command=None, + args=None, + num_workers=2, +): + container = V1Container( + name=constants.PYTORCHJOB_CONTAINER, + image=TEST_IMAGE, + command=command, + args=args, ) - -def create_job(): - job_namespace = TEST_NAME - container = generate_container() master = KubeflowOrgV1ReplicaSpec( replicas=1, - restart_policy="OnFailure", template=V1PodTemplateSpec( metadata=V1ObjectMeta( annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} @@ -169,20 +147,58 @@ def 
create_job(): ), ) - worker = KubeflowOrgV1ReplicaSpec( - replicas=1, - restart_policy="OnFailure", - template=V1PodTemplateSpec( - metadata=V1ObjectMeta( - annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + pytorch_replica_specs = {"Master": master} + + # PyTorchJob always has 1 master and N-1 worker replicas. + if num_workers > 1: + pytorch_replica_specs["Worker"] = KubeflowOrgV1ReplicaSpec( + replicas=num_workers - 1, + template=V1PodTemplateSpec( + metadata=V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), + spec=V1PodSpec(containers=[container]), ), - spec=V1PodSpec(containers=[container]), + ) + + pytorchjob = KubeflowOrgV1PyTorchJob( + api_version=constants.API_VERSION, + kind=constants.PYTORCHJOB_KIND, + metadata=V1ObjectMeta(name=TEST_NAME, namespace=TEST_NAME), + spec=KubeflowOrgV1PyTorchJobSpec( + run_policy=KubeflowOrgV1RunPolicy(clean_pod_policy=None), + pytorch_replica_specs=pytorch_replica_specs, ), ) - pytorchjob = generate_pytorchjob(job_namespace, master, worker) + return pytorchjob +# Check if actual string contains all elements from the expected list. +class AnyStringWithElementsFromList: + def __init__(self, expected): + self.expected = expected + + def __eq__(self, actual): + return all(e in str(actual) for e in self.expected) + + +def create_job_from_func(num_workers, packages_to_install=None, pip_index_url=None): + + command = constants.DEFAULT_COMMAND + if num_workers > 1: + args = [f'{constants.ENTRYPOINT_TORCH} "$program_path/ephemeral_script.py"'] + else: + args = [f'{constants.ENTRYPOINT_PYTHON} "$program_path/ephemeral_script.py"'] + + if pip_index_url and packages_to_install: + args += [f"--index-url {pip_index_url} {packages_to_install[0]}"] + + job = create_job(command, AnyStringWithElementsFromList(args), num_workers) + + return job + + def generate_job_with_status( job: constants.JOB_MODELS_TYPE, condition_type: str = constants.JOB_CONDITION_SUCCEEDED, @@ -204,92 +220,148 @@ def __init__(self, kind) -> None: test_data_create_job = [ + ( + "valid flow", + {"job": create_job(), "namespace": TEST_NAME}, + SUCCESS, + create_job(), + ), + ( + "valid flow to create multi-node job with torchrun", + { + "name": TEST_NAME, + "namespace": TEST_NAME, + "train_func": lambda: print("Test Training Function"), + "base_image": TEST_IMAGE, + "num_workers": 3, + "packages_to_install": ["boto3==1.34.14"], + "pip_index_url": "https://pypi.custom.com/simple", + }, + SUCCESS, + create_job_from_func( + num_workers=3, + packages_to_install=["boto3==1.34.1"], + pip_index_url="https://pypi.custom.com/simple", + ), + ), + ( + "valid flow to create job with 1 worker", + { + "name": TEST_NAME, + "namespace": TEST_NAME, + "train_func": lambda: print("Test Training Function"), + "base_image": TEST_IMAGE, + "num_workers": 1, + }, + SUCCESS, + create_job_from_func(num_workers=1), + ), + ( + "valid flow to create job using image", + { + "name": TEST_NAME, + "namespace": TEST_NAME, + "base_image": TEST_IMAGE, + "num_workers": 2, + }, + SUCCESS, + create_job(num_workers=2), + ), ( "invalid extra parameter", - {"job": create_job(), "namespace": TEST_NAME, "base_image": "test_image"}, + { + "job": create_job(), + "namespace": TEST_NAME, + "base_image": "test_image", + }, ValueError, + None, + ), + ( + "invalid job kind", + {"job_kind": "invalid_job_kind"}, + ValueError, + None, ), - ("invalid job kind", {"job_kind": "invalid_job_kind"}, ValueError), ( - "job name missing ", + "job name missing with train function", {"train_func": lambda: "test train 
function"}, ValueError, + None, + ), + ( + "job name missing with base image", + {"base_image": "test_image"}, + ValueError, + None, ), - ("job name missing", {"base_image": "test_image"}, ValueError), ( "uncallable train function", - {"name": "test job", "train_func": "uncallable train function"}, + { + "name": TEST_NAME, + "train_func": "uncallable train function", + }, ValueError, + None, ), ( - "invalid TFJob replica", + "invalid number of workers", { - "name": "test job", - "train_func": lambda: "test train function", + "name": TEST_NAME, + "num_workers": 0, + }, + ValueError, + None, + ), + ( + "num_procs_per_worker is set for TFJob", + { + "name": TEST_NAME, "job_kind": constants.TFJOB_KIND, + "num_procs_per_worker": 5, + "base_image": "test_image", }, ValueError, + None, ), ( - "invalid PyTorchJob replica", + "num_chief_replicas and num_ps_replicas is set for PyTorchJov", { - "name": "test job", - "train_func": lambda: "test train function", - "job_kind": constants.PYTORCHJOB_KIND, + "name": TEST_NAME, + "num_chief_replicas": 1, + "num_ps_replicas": 1, + "base_image": "test_image", }, ValueError, + None, ), ( "paddle job can't be created using function", { - "name": "test job", + "name": TEST_NAME, "train_func": lambda: "test train function", "job_kind": constants.PADDLEJOB_KIND, }, ValueError, + None, ), ( "invalid job object", {"job": DummyJobClass(constants.TFJOB_KIND)}, ValueError, + None, ), ( "create_namespaced_custom_object timeout error", {"job": create_job(), "namespace": TIMEOUT}, TimeoutError, + None, ), ( "create_namespaced_custom_object runtime error", {"job": create_job(), "namespace": RUNTIME}, RuntimeError, - ), - ( - "valid flow", - {"job": create_job(), "namespace": TEST_NAME}, - SUCCESS, - ), - ( - "valid flow to create job from func", - { - "name": "test-job", - "namespace": TEST_NAME, - "train_func": lambda: print("Test Training Function"), - "base_image": "docker.io/test-training", - "num_workers": 3, - "packages_to_install": ["boto3==1.34.14"], - "pip_index_url": "https://pypi.custom.com/simple", - }, - SUCCESS, - ), - ( - "valid flow to create job using image", - { - "name": "test-job", - "namespace": TEST_NAME, - "base_image": "docker.io/test-training", - "num_workers": 2, - }, - SUCCESS, + None, ), ] @@ -962,15 +1034,26 @@ def training_client(): yield client -@pytest.mark.parametrize("test_name,kwargs,expected_output", test_data_create_job) -def test_create_job(training_client, test_name, kwargs, expected_output): +@pytest.mark.parametrize( + "test_name,kwargs,expected_output,expected_job", test_data_create_job +) +def test_create_job(training_client, test_name, kwargs, expected_output, expected_job): """ test create_job function of training client """ print("Executing test:", test_name) try: training_client.create_job(**kwargs) + assert expected_output == SUCCESS + + training_client.custom_api.create_namespaced_custom_object.assert_called_with( + constants.GROUP, + constants.VERSION, + kwargs["namespace"], + constants.JOB_PARAMETERS[constants.PYTORCHJOB_KIND]["plural"], + expected_job, + ) except Exception as e: assert type(e) is expected_output print("test execution complete") diff --git a/sdk/python/kubeflow/training/constants/constants.py b/sdk/python/kubeflow/training/constants/constants.py index dbbd885baa..dba4d49681 100644 --- a/sdk/python/kubeflow/training/constants/constants.py +++ b/sdk/python/kubeflow/training/constants/constants.py @@ -118,6 +118,10 @@ PYTORCHJOB_REPLICA_TYPES = (REPLICA_TYPE_MASTER.lower(), REPLICA_TYPE_WORKER.lower()) 
PYTORCHJOB_BASE_IMAGE = "docker.io/pytorch/pytorch:2.1.2-cuda11.8-cudnn8-runtime" +ENTRYPOINT_TORCH = "torchrun" +ENTRYPOINT_PYTHON = "python -u" +DEFAULT_COMMAND = ["bash", "-c"] + # XGBoostJob constants XGBOOSTJOB_KIND = "XGBoostJob" XGBOOSTJOB_MODEL = "KubeflowOrgV1XGBoostJob" diff --git a/sdk/python/kubeflow/training/utils/utils.py b/sdk/python/kubeflow/training/utils/utils.py index 56011e94aa..4f2d9c97d7 100644 --- a/sdk/python/kubeflow/training/utils/utils.py +++ b/sdk/python/kubeflow/training/utils/utils.py @@ -120,10 +120,10 @@ def get_script_for_python_packages( script_for_python_packages = textwrap.dedent( f""" if ! [ -x "$(command -v pip)" ]; then - python3 -m ensurepip || python3 -m ensurepip --user || apt-get install python3-pip + python -m ensurepip || python -m ensurepip --user || apt-get install python-pip fi - PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet \ + PIP_DISABLE_PIP_VERSION_CHECK=1 python -m pip install --quiet \ --no-warn-script-location --index-url {pip_index_url} {packages_str} """ ) @@ -132,7 +132,8 @@ def get_script_for_python_packages( def get_command_using_train_func( - train_func: Optional[Callable], + train_func: Callable, + entrypoint: str, train_func_parameters: Optional[Dict[str, Any]] = None, packages_to_install: Optional[List[str]] = None, pip_index_url: str = constants.DEFAULT_PIP_INDEX_URL, @@ -170,11 +171,11 @@ def get_command_using_train_func( {func_code} EOM printf "%s" \"$SCRIPT\" > \"$program_path/ephemeral_script.py\" - python3 -u \"$program_path/ephemeral_script.py\"""" + {entrypoint} \"$program_path/ephemeral_script.py\"""" ) # Add function code to the execute script. - exec_script = exec_script.format(func_code=func_code) + exec_script = exec_script.format(func_code=func_code, entrypoint=entrypoint) # Install Python packages if that is required. if packages_to_install is not None: @@ -184,16 +185,13 @@ def get_command_using_train_func( ) # Return container command and args to execute training function. - return ["bash", "-c"], [exec_script] + return constants.DEFAULT_COMMAND, [exec_script] def get_container_spec( name: str, base_image: str, - train_func: Optional[Callable] = None, - train_func_parameters: Optional[Dict[str, Any]] = None, - packages_to_install: Optional[List[str]] = None, - pip_index_url: str = constants.DEFAULT_PIP_INDEX_URL, + command: Optional[List[str]] = None, args: Optional[List[str]] = None, resources: Union[dict, models.V1ResourceRequirements, None] = None, volume_mounts: Optional[List[models.V1VolumeMount]] = None, @@ -207,18 +205,13 @@ def get_container_spec( # Create initial container spec. container_spec = models.V1Container( - name=name, image=base_image, args=args, volume_mounts=volume_mounts + name=name, + image=base_image, + command=command, + args=args, + volume_mounts=volume_mounts, ) - # If training function is set, override container command and args to execute the function. - if train_func is not None: - container_spec.command, container_spec.args = get_command_using_train_func( - train_func=train_func, - train_func_parameters=train_func_parameters, - packages_to_install=packages_to_install, - pip_index_url=pip_index_url, - ) - # Convert dict to the Kubernetes container resources if that is required. if isinstance(resources, dict): # Convert all keys in resources to lowercase. 
@@ -265,15 +258,10 @@ def get_tfjob_template( name: str, namespace: str, pod_template_spec: models.V1PodTemplateSpec, - num_workers: Optional[int] = None, + num_workers: int, num_chief_replicas: Optional[int] = None, num_ps_replicas: Optional[int] = None, ): - # Check if at least one replica is set. - # TODO (andreyvelich): Remove this check once we have CEL validation. - # Ref: https://github.com/kubeflow/training-operator/issues/1708 - if num_workers is None and num_chief_replicas is None and num_ps_replicas is None: - raise ValueError("At least one replica for TFJob must be set") # Create TFJob template. tfjob = models.KubeflowOrgV1TFJob( @@ -320,14 +308,8 @@ def get_pytorchjob_template( num_workers: int, worker_pod_template_spec: Optional[models.V1PodTemplateSpec], master_pod_template_spec: Optional[models.V1PodTemplateSpec] = None, - num_procs_per_worker: Optional[int] = None, - elastic_policy: Optional[models.KubeflowOrgV1ElasticPolicy] = None, + num_procs_per_worker: Optional[Union[int, str]] = None, ): - # Check if at least one Worker is set. - # TODO (andreyvelich): Remove this check once we have CEL validation. - # Ref: https://github.com/kubeflow/training-operator/issues/1708 - if num_workers is None or num_workers < 1: - raise ValueError("At least one Worker for PyTorchJob must be set") # Create PyTorchJob template. pytorchjob = models.KubeflowOrgV1PyTorchJob( @@ -337,11 +319,9 @@ def get_pytorchjob_template( spec=models.KubeflowOrgV1PyTorchJobSpec( run_policy=models.KubeflowOrgV1RunPolicy(clean_pod_policy=None), pytorch_replica_specs={}, - elastic_policy=elastic_policy, ), ) - # TODO (andreyvelich): Should we make spec.nproc_per_node int ? if num_procs_per_worker: pytorchjob.spec.nproc_per_node = str(num_procs_per_worker)
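
---

Editor's note (not part of the patch): a minimal usage sketch of the reworked `create_job` API after this change, for readers of the diff. The job name, training-function body, and resource values below are illustrative assumptions; only the parameter names and the torchrun/`python -u` entrypoint selection come from the patch itself.

    from kubeflow.training import TrainingClient


    def train_func():
        # torchrun (selected by the SDK when num_workers > 1 or
        # num_procs_per_worker is set) provides RANK, WORLD_SIZE and
        # LOCAL_RANK, so the function only initializes the process group.
        import torch.distributed as dist

        dist.init_process_group(backend="gloo")
        print(f"Rank {dist.get_rank()} of {dist.get_world_size()}")
        dist.destroy_process_group()


    client = TrainingClient()

    # With num_workers > 1 (or num_procs_per_worker set), the serialized
    # function is wrapped with ENTRYPOINT_TORCH ("torchrun") instead of
    # ENTRYPOINT_PYTHON ("python -u"), and num_procs_per_worker is written
    # to the PyTorchJob spec as nproc_per_node.
    client.create_job(
        name="pytorch-ddp",          # illustrative name
        train_func=train_func,
        num_workers=3,               # 1 Master + 2 Worker replicas
        num_procs_per_worker=2,      # becomes spec.nproc_per_node for torchrun
        resources_per_worker={"cpu": "2", "memory": "4Gi"},
    )

Note that num_procs_per_worker is rejected for TFJob, and num_chief_replicas / num_ps_replicas are rejected for PyTorchJob, matching the validation added in this patch.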