torchrun example with cpu version pytorch (kubeflow#1965)

deepanker13 · Dec 18, 2023 · b938905 · b938905
1 parent 1400f1f
commit b938905
Show file tree

Hide file tree

Showing 4 changed files with 55 additions and 0 deletions.
diff --git a/examples/pytorch/cpu-demo/Dockerfile b/examples/pytorch/cpu-demo/Dockerfile
@@ -0,0 +1,7 @@
+# CPU-only PyTorch image used by the torchrun demo job.
+FROM python:3.8
+# --no-cache-dir keeps pip's download cache (large torch wheels) out of the image layer.
+RUN pip3 install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+# demo.py is placed in the working directory so it can be started by relative path.
+WORKDIR /
+COPY demo.py .
diff --git a/examples/pytorch/cpu-demo/README.MD b/examples/pytorch/cpu-demo/README.MD
@@ -0,0 +1,7 @@
+## Demo
+
+This demo presents the usage of `torchrun` with training-operator.
+
+> When running on GPUs, keep `nprocPerNode` consistent with the number of GPUs requested in the container's resource declaration.
+
+The image used in demo.yaml is built from the Dockerfile provided alongside.
diff --git a/examples/pytorch/cpu-demo/demo.py b/examples/pytorch/cpu-demo/demo.py
@@ -0,0 +1,10 @@
+import torch
+torch.distributed.init_process_group(init_method="env://")
+rank = torch.distributed.get_rank()
+world_size = torch.distributed.get_world_size()
+print(f"rank {rank} world_size {world_size}")
+a = torch.tensor([1])
+torch.distributed.all_reduce(a)
+print(f"rank {rank} world_size {world_size} result {a}")
+torch.distributed.barrier()
+print(f"rank {rank} world_size {world_size}")
diff --git a/examples/pytorch/cpu-demo/demo.yaml b/examples/pytorch/cpu-demo/demo.yaml
@@ -0,0 +1,31 @@
+# PyTorchJob that starts demo.py through torchrun on one Master and one Worker replica.
+apiVersion: kubeflow.org/v1
+kind: PyTorchJob
+metadata:
+  name: torchrun-cpu
+spec:
+  # Kept quoted so the value stays a YAML string rather than an integer.
+  nprocPerNode: '2'
+  pytorchReplicaSpecs:
+    Master:
+      replicas: 1
+      restartPolicy: OnFailure
+      template:
+        spec:
+          containers:
+            - name: pytorch
+              # Image built from the Dockerfile that accompanies this manifest.
+              image: pytorch-cpu:py3.8
+              imagePullPolicy: Always
+              command: [torchrun, demo.py]
+    Worker:
+      replicas: 1
+      restartPolicy: OnFailure
+      template:
+        spec:
+          containers:
+            - name: pytorch
+              # Same image and command as the Master replica.
+              image: pytorch-cpu:py3.8
+              imagePullPolicy: Always
+              command: [torchrun, demo.py]