Skip to content

Commit

Permalink
initial skeleton of train api
Browse files Browse the repository at this point in the history
  • Loading branch information
deepanker13 committed Dec 7, 2023
1 parent 4551c71 commit 43fbe41
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 3 deletions.
70 changes: 70 additions & 0 deletions sdk/python/kubeflow/training/api/training_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import multiprocessing
import logging
import time
import json
from typing import Optional, Callable, List, Dict, Any, Set
import queue
from kubernetes import client, config, watch
Expand Down Expand Up @@ -81,6 +82,65 @@ def __init__(
)
self.job_kind = job_kind

def train(
    self,
    name=None,
    namespace=None,
    workers=1,
    model_args=None,
    dataset_args=None,
    parameters=None,
    resources_per_worker=None,
):
    """Higher level train API: assemble a PyTorchJob and submit it.

    Builds an init container spec and a training container spec, wraps
    them into master and worker pod templates, creates a PyTorchJob from
    those templates and passes it to ``self.create_job``.

    Args:
        name: Job name. Required.
        namespace: Namespace of the job. Required.
        workers: Forwarded as ``num_worker_replicas`` of the job template.
        model_args: Object whose ``__dict__`` is JSON-encoded and passed to
            the init container as ``--model_provider_args``. Required.
        dataset_args: Object whose ``__dict__`` is JSON-encoded and passed
            to the init container as ``--dataset_provider_args``. Required.
        parameters: Object whose ``__dict__`` is JSON-encoded and passed to
            the training container as ``--parameters``. Required.
        resources_per_worker: Dict forwarded as ``resources`` of the
            training container; its ``"gpu"`` entry is also used as
            ``num_procs_per_worker``. Defaults to
            ``{"gpu": 0, "cpu": 0, "memory": "10Gi"}``.

    Raises:
        ValueError: If ``name`` or ``namespace`` is empty, or if
            ``model_args``, ``dataset_args`` or ``parameters`` is None.
    """
    if not name or not namespace:
        raise ValueError("job name or namespace cannot be null")

    # These three are serialized through ``__dict__`` below; reject None
    # here so the caller gets a clear error instead of an AttributeError.
    required_args = (
        ("model_args", model_args),
        ("dataset_args", dataset_args),
        ("parameters", parameters),
    )
    for arg_name, arg_value in required_args:
        if arg_value is None:
            raise ValueError(f"{arg_name} cannot be null")

    # Build the default per call so no dict is shared between invocations.
    if resources_per_worker is None:
        resources_per_worker = {"gpu": 0, "cpu": 0, "memory": "10Gi"}

    pytorch_params = constants.JOB_PARAMETERS[constants.PYTORCHJOB_KIND]

    # create init container spec
    # NOTE(review): `mp` and `dp` (model / dataset provider identifiers) are
    # not defined in the visible part of this module; unless they exist at
    # module level this raises NameError. Confirm where they should come from.
    # NOTE(review): V1VolumeMount()/V1Volume() are constructed without
    # arguments throughout this method; they look like placeholders.
    init_container_spec = utils.get_container_spec(
        name=pytorch_params["init_container"],
        image=pytorch_params["init_container_image"],
        args=[
            "--model_provider",
            mp,
            "--model_provider_args",
            json.dumps(model_args.__dict__),
            "--dataset_provider",
            dp,
            "--dataset_provider_args",
            json.dumps(dataset_args.__dict__),
        ],
        volume_mounts=models.V1VolumeMount(),
    )

    # create app container spec
    container_spec = utils.get_container_spec(
        name=pytorch_params["container"],
        image=pytorch_params["train_container_image"],
        args=["--parameters", json.dumps(parameters.__dict__)],
        volume_mounts=models.V1VolumeMount(),
        resources=resources_per_worker,
    )

    # create worker pod spec
    worker_pod_template_spec = utils.get_pod_template_spec(
        job_kind=constants.PYTORCHJOB_KIND,
        containers_spec=[container_spec],
        volumes_spec=[models.V1Volume()],
    )

    # create master pod spec
    # The master additionally carries the init container spec; it is passed
    # in the same ``containers_spec`` list as the training container.
    master_pod_template_spec = utils.get_pod_template_spec(
        job_kind=constants.PYTORCHJOB_KIND,
        containers_spec=[init_container_spec, container_spec],
        volumes_spec=[models.V1Volume()],
    )

    job = utils.get_pytorchjob_template(
        name=name,
        namespace=namespace,
        master_pod_template_spec=master_pod_template_spec,
        worker_pod_template_spec=worker_pod_template_spec,
        num_worker_replicas=workers,
        num_procs_per_worker=resources_per_worker["gpu"],
        elastic_policy=models.KubeflowOrgV1ElasticPolicy(rdzv_backend="c10d"),
    )

    self.create_job(job)

def create_job(
self,
job: Optional[constants.JOB_MODELS_TYPE] = None,
Expand Down Expand Up @@ -191,9 +251,19 @@ def create_job(
# The pod template is handed to the PyTorchJob template through the
# `worker_pod_template_spec` keyword (leftover merge-conflict markers removed).
job = utils.get_pytorchjob_template(
    name=name,
    namespace=namespace,
    worker_pod_template_spec=pod_template_spec,
    num_worker_replicas=num_worker_replicas,
)
else:
raise ValueError(
f"Job kind {job_kind} can't be created using function or image"
Expand Down
9 changes: 6 additions & 3 deletions sdk/python/kubeflow/training/constants/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,11 @@
PYTORCHJOB_KIND = "PyTorchJob"
PYTORCHJOB_PLURAL = "pytorchjobs"
PYTORCHJOB_CONTAINER = "pytorch"
PYTORCHJOB_STORAGE_CONTAINER = "pytorch-storage"
# Lower-cased names of the master and worker replica types.
PYTORCHJOB_REPLICA_TYPES = (REPLICA_TYPE_MASTER.lower(), REPLICA_TYPE_WORKER.lower())

PYTORCHJOB_BASE_IMAGE = "docker.io/pytorch/pytorch:1.12.1-cuda11.3-cudnn8-runtime"
# NOTE(review): PYTORCHJOB_STORAGE_CONTAINER is assigned a second time here
# with the same value; presumably one of the two lines is the removed side of
# the diff — keep only one.
PYTORCHJOB_STORAGE_CONTAINER = "pytorch-storage"
# NOTE(review): "docker image path" looks like a placeholder, not a pullable
# image reference — replace with the real image names before these are used.
PYTORCHJOB_STORAGE_CONTAINER_IMAGE = "docker image path"
PYTORCHJOB_TRAIN_CONTAINER_IMAGE = "docker image path"

# MXJob constants
MXJOB_KIND = "MXJob"
Expand Down Expand Up @@ -129,7 +130,9 @@
"plural": PYTORCHJOB_PLURAL,
"container": PYTORCHJOB_CONTAINER,
"base_image": PYTORCHJOB_BASE_IMAGE,
"init_container": PYTORCHJOB_STORAGE_CONTAINER
"init_container": PYTORCHJOB_STORAGE_CONTAINER,
"init_container_image": PYTORCHJOB_STORAGE_CONTAINER_IMAGE,
"train_container_image": PYTORCHJOB_TRAIN_CONTAINER_IMAGE
},
MXJOB_KIND: {
"model": models.KubeflowOrgV1MXJob,
Expand Down

0 comments on commit 43fbe41

Please sign in to comment.