Add kfp pipeline for running a pytorch job #14

Draft · wants to merge 2 commits into base: main
Changes shown from 1 commit.
40 changes: 40 additions & 0 deletions training/component.yaml
@@ -0,0 +1,40 @@
description: Kubeflow PyTorchJob launcher
inputs:
- {name: name, type: String, description: 'PyTorchJob name.'}
- {name: namespace, type: String, default: kubeflow, description: 'PyTorchJob namespace (likely your current namespace).'}
- {name: version, type: String, default: v1, description: 'PyTorchJob version.'}
- {name: master_spec, type: JsonObject, default: '{}', description: 'PyTorchJob Master replicaSpecs.'}
- {name: worker_spec, type: JsonObject, default: '{}', description: 'PyTorchJob Worker replicaSpecs.'}
- {name: job_timeout_minutes, type: Integer, default: 1440, description: 'Time in minutes to wait for the job to complete.'}
- {name: delete_after_done, type: Boolean, default: 'True', description: 'Whether to delete the job after it is finished.'}
- {name: clean_pod_policy, type: String, default: Running, description: 'Defines the policy for cleaning up pods after the PyTorchJob completes.'}
- {name: active_deadline_seconds, type: Integer, optional: true, description: 'Specifies the duration (in seconds) since startTime during which the job can remain active before it is terminated. Must be a positive integer. This setting applies only to pods where restartPolicy is OnFailure or Always.'}
- {name: backoff_limit, type: Integer, optional: true, description: 'Number of retries before marking this job as failed.'}
- {name: ttl_seconds_after_finished, type: Integer, optional: true, description: 'Defines the TTL for cleaning up finished PyTorchJobs.'}
implementation:
container:
image: cascribner/kubeflow-pytorchjob-launcher:v1
command: [python, /ml/launch_pytorchjob.py]
args:
- --name
- {inputValue: name}
- --namespace
- {inputValue: namespace}
- --version
- {inputValue: version}
- --masterSpec
- {inputValue: master_spec}
- --workerSpec
- {inputValue: worker_spec}
- --jobTimeoutMinutes
- {inputValue: job_timeout_minutes}
- --deleteAfterDone
- {inputValue: delete_after_done}
- --cleanPodPolicy
- {inputValue: clean_pod_policy}
- --activeDeadlineSeconds
- {inputValue: active_deadline_seconds}
- --backoffLimit
- {inputValue: backoff_limit}
- --ttlSecondsAfterFinished
- {inputValue: ttl_seconds_after_finished}
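For reference, the `inputValue` placeholders above expand into plain CLI flags passed to `/ml/launch_pytorchjob.py`. A rough illustration of that mapping (this is not the launcher's actual source; flag names and defaults are taken from the YAML above):

```python
# Rough sketch of how the component's args list maps inputs to CLI flags.
def launcher_cli(name, namespace="kubeflow", version="v1",
                 master_spec="{}", worker_spec="{}",
                 job_timeout_minutes=1440, delete_after_done="True",
                 clean_pod_policy="Running"):
    return [
        "python", "/ml/launch_pytorchjob.py",
        "--name", name,
        "--namespace", namespace,
        "--version", version,
        "--masterSpec", master_spec,
        "--workerSpec", worker_spec,
        "--jobTimeoutMinutes", str(job_timeout_minutes),
        "--deleteAfterDone", delete_after_done,
        "--cleanPodPolicy", clean_pod_policy,
    ]
```

Optional inputs such as `active_deadline_seconds` are only appended when a value is supplied, which is why they are marked `optional: true` in the spec.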
148 changes: 148 additions & 0 deletions training/pipeline.py
@@ -0,0 +1,148 @@
from typing import NamedTuple
import kfp.dsl as dsl
from kfp import components

@dsl.component(base_image="python:slim")
def create_worker_spec(worker_num: int = 0) -> NamedTuple(
        "CreatWorkerSpec", [("worker_spec", dict)]):
    """
    Creates pytorch-job worker spec
    """
    from collections import namedtuple

    worker = {}
    if worker_num > 0:
Review comment (Member):
The whole thing can be rewritten as:

    if worker_num <= 0:
        return {}

    return {}

or even better, not as a component at all. After all, it is a single if statement plus setting a single value in a dict; this doesn't have to be a component. Remember that for each component we create, we start a container, which only slows the workflow, especially for simple data formatting.
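The reviewer's alternative can be sketched as a plain helper called directly inside the pipeline function, avoiding the extra container (the helper name `worker_spec_for` is hypothetical):

```python
def worker_spec_for(worker_num: int) -> dict:
    # Early return keeps the happy path flat, as suggested above.
    if worker_num <= 0:
        return {}
    return {
        "replicas": worker_num,
        "restartPolicy": "OnFailure",
        # ...remaining template/container fields as in the code below...
    }

# Inside ilab_train this would replace the create_worker_spec task:
# worker_spec = worker_spec_for(worker_replicas)
```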

        worker = {
            "replicas": worker_num,
            "restartPolicy": "OnFailure",
            "template": {
                "metadata": {
                    "annotations": {
                        "sidecar.istio.io/inject": "false"
                    }
                },
                "spec": {
                    "containers": [
                        {
                            "command": [
                                '/bin/bash',
                                '-c',
                                '--'
                            ],
                            "args": [
                                "python3.11 -u run.py"
                            ],
                            "image": "quay.io/michaelclifford/test-train:0.0.11",
                            "name": "pytorch",
                            "resources": {
                                "requests": {
                                    "memory": "8Gi",
                                    "cpu": "2000m",
                                    # Remove if no GPU is available
                                    "nvidia.com/gpu": 1,
                                },
                                "limits": {
                                    "memory": "8Gi",
                                    "cpu": "2000m",
                                    # Remove if no GPU is available
                                    "nvidia.com/gpu": 1,
                                },
                            },
                        }
                    ]
                },
            },
        }

    worker_spec_output = namedtuple(
        "MyWorkerOutput", ["worker_spec"]
    )
Review comment (Member):
Why do you want to output a named tuple here? This is useful only if you output multiple params. I don't think it's needed here at all.
https://www.kubeflow.org/docs/components/pipelines/user-guides/data-handling/parameters/#output-parameters

    return worker_spec_output(worker)
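Following the comment above, a single-output sketch (assumed shape, not part of the PR) drops the namedtuple wrapper entirely; downstream code would then read `task.output` instead of `task.outputs["worker_spec"]`:

```python
def create_worker_spec_plain(worker_num: int = 0) -> dict:
    # One return value: no namedtuple wrapper needed.
    worker = {}
    if worker_num > 0:
        worker = {"replicas": worker_num, "restartPolicy": "OnFailure"}
    return worker
```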

@dsl.pipeline(
    name="launch-kubeflow-pytorchjob",
    description="An example to launch pytorch.",
)
def ilab_train(
    namespace: str = "mcliffor",
    worker_replicas: int = 1,
    ttl_seconds_after_finished: int = -1,
    job_timeout_minutes: int = 600,
    delete_after_done: bool = False):

    pytorchjob_launcher_op = components.load_component_from_file("component.yaml")

    master = {
        "replicas": 1,
        "restartPolicy": "OnFailure",
        "template": {
            "metadata": {
                "annotations": {
                    # See https://github.com/kubeflow/website/issues/2011
                    "sidecar.istio.io/inject": "false"
                }
            },
            "spec": {
                "containers": [
                    {
                        # To override the image's default command
                        "command": [
                            '/bin/bash',
                            '-c',
                            '--'
                        ],
                        "args": [
                            "python3.11 -u run.py"
                        ],
                        # Or, create your own image from
                        # https://github.com/kubeflow/pytorch-operator/tree/master/examples/mnist
                        "image": "quay.io/michaelclifford/test-train:0.0.11",
                        "name": "pytorch",
                        "resources": {
                            "requests": {
                                "memory": "8Gi",
                                "cpu": "2000m",
                                # Remove if no GPU is available
                                "nvidia.com/gpu": 1,
                            },
                            "limits": {
                                "memory": "8Gi",
                                "cpu": "2000m",
                                # Remove if no GPU is available
                                "nvidia.com/gpu": 1,
                            },
Review comment (Member), on lines +41 to +55:
Should we parametrize this?
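One way the parametrization the reviewer asks about could look, as a sketch with hypothetical helper and parameter names:

```python
def make_resources(memory: str = "8Gi", cpu: str = "2000m", gpus: int = 1) -> dict:
    # Build requests/limits from one set of values; omit the GPU key
    # entirely when gpus == 0 so CPU-only clusters can schedule the pod.
    res = {"memory": memory, "cpu": cpu}
    if gpus > 0:
        res["nvidia.com/gpu"] = gpus
    return {"requests": dict(res), "limits": dict(res)}
```

These could then be exposed as pipeline parameters (e.g. `memory_per_replica`, `gpus_per_replica`) and passed into both the master and worker specs.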

                        },
                    }
                ],
                # If imagePullSecrets required
                # "imagePullSecrets": [
                #     {"name": "image-pull-secret"},
                # ],
            },
        },
    }

    worker_spec_create = create_worker_spec(worker_num=worker_replicas)

    # Launch and monitor the job with the launcher
    pytorchjob_launcher_op(
        name="pytorch-job",
        namespace=namespace,
        master_spec=master,
        worker_spec=worker_spec_create.outputs["worker_spec"],
        ttl_seconds_after_finished=ttl_seconds_after_finished,
        job_timeout_minutes=job_timeout_minutes,
        delete_after_done=delete_after_done,
        active_deadline_seconds=100,
        backoff_limit=1,
    )


if __name__ == "__main__":
    import kfp.compiler as compiler
Review comment (Member):
I don't think you need to nest the import here. This is against https://pylint.readthedocs.io/en/latest/user_guide/messages/convention/import-outside-toplevel.html


    pipeline_file = "pipeline.yaml"
    print(
        f"Compiling pipeline as {pipeline_file}"
    )
    compiler.Compiler().compile(
        ilab_train, pipeline_file
    )
Review comment (Member):
This is somewhat weirdly formatted. I think the simple:

Suggested change:
-    pipeline_file = "pipeline.yaml"
-    print(
-        f"Compiling pipeline as {pipeline_file}"
-    )
-    compiler.Compiler().compile(
-        ilab_train, pipeline_file
-    )
+    pipeline_file = "pipeline.yaml"
+    print(f"Compiling pipeline as {pipeline_file}")
+    compiler.Compiler().compile(ilab_train, pipeline_file)

would be fully PEP 8 compliant.