Skip to content

Commit

Permalink
initial skeleton of train api
Browse files Browse the repository at this point in the history
  • Loading branch information
deepanker13 committed Dec 7, 2023
1 parent 4551c71 commit 6e92c75
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 3 deletions.
73 changes: 73 additions & 0 deletions sdk/python/kubeflow/training/api/training_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import multiprocessing
import logging
import time
import json
from typing import Optional, Callable, List, Dict, Any, Set
import queue
from kubernetes import client, config, watch
Expand Down Expand Up @@ -81,6 +82,78 @@ def __init__(
)
self.job_kind = job_kind

def train(
    self,
    name=None,
    namespace=None,
    workers=1,
    model_args=None,
    dataset_args=None,
    parameters=None,
    resources_per_worker=None,
):
    """Build a PyTorchJob from high-level arguments and submit it.

    The job's master pod template is given a storage container plus the
    training container; the worker pod template is given the training
    container only. The assembled job is handed to ``self.create_job``.

    Args:
        name: Name of the job. Required.
        namespace: Namespace of the job. Required.
        workers: Passed through as the number of worker replicas.
        model_args: Object whose ``__dict__`` is JSON-serialized into the
            ``--model_provider_args`` argument. Required.
        dataset_args: Object whose ``__dict__`` is JSON-serialized into the
            ``--dataset_provider_args`` argument. Required.
        parameters: Object whose ``__dict__`` is JSON-serialized into the
            ``--parameters`` argument of the training container. Required.
        resources_per_worker: Resource mapping passed to the training
            container; its ``"gpu"`` entry is also used as the number of
            processes per worker. Defaults to
            ``{"gpu": 0, "cpu": 0, "memory": "10Gi"}``.

    Raises:
        ValueError: If ``name`` or ``namespace`` is empty, or if any of
            ``model_args``, ``dataset_args`` or ``parameters`` is None.
    """
    if not name or not namespace:
        raise ValueError("job name or namespace cannot be null")

    # These three are dereferenced via ``__dict__`` below; fail early with a
    # clear message instead of an AttributeError on None.
    missing = [
        label
        for label, value in (
            ("model_args", model_args),
            ("dataset_args", dataset_args),
            ("parameters", parameters),
        )
        if value is None
    ]
    if missing:
        raise ValueError(f"{', '.join(missing)} cannot be null")

    # Build the default per call so that no dict is shared between calls.
    if resources_per_worker is None:
        resources_per_worker = {"gpu": 0, "cpu": 0, "memory": "10Gi"}

    pytorch_params = constants.JOB_PARAMETERS[constants.PYTORCHJOB_KIND]

    # create init container spec
    # NOTE(review): `mp` and `dp` are not defined anywhere in this method.
    # Unless they are module-level names outside the visible hunk, this call
    # raises NameError. Presumably they should be provider identifiers
    # derived from model_args / dataset_args -- TODO confirm and define.
    # NOTE(review): V1VolumeMount() is an empty placeholder here and below;
    # presumably it needs a name and mount path -- TODO confirm.
    init_container_spec = utils.get_container_spec(
        name=pytorch_params["init_container"],
        image=pytorch_params["init_container_image"],
        args=[
            "--model_provider",
            mp,
            "--model_provider_args",
            json.dumps(model_args.__dict__),
            "--dataset_provider",
            dp,
            "--dataset_provider_args",
            json.dumps(dataset_args.__dict__),
        ],
        volume_mounts=models.V1VolumeMount(),
    )

    # create app container spec
    container_spec = utils.get_container_spec(
        name=pytorch_params["container"],
        image=pytorch_params["train_container_image"],
        args=["--parameters", json.dumps(parameters.__dict__)],
        volume_mounts=models.V1VolumeMount(),
        resources=resources_per_worker,
    )

    # create worker pod spec
    worker_pod_template_spec = utils.get_pod_template_spec(
        job_kind=constants.PYTORCHJOB_KIND,
        containers_spec=[container_spec],
        volumes_spec=[models.V1Volume()],
    )

    # create master pod spec
    # The storage container is listed alongside the training container in
    # `containers_spec`.
    master_pod_template_spec = utils.get_pod_template_spec(
        job_kind=constants.PYTORCHJOB_KIND,
        containers_spec=[init_container_spec, container_spec],
        volumes_spec=[models.V1Volume()],
    )

    job = utils.get_pytorchjob_template(
        name=name,
        namespace=namespace,
        master_pod_template_spec=master_pod_template_spec,
        worker_pod_template_spec=worker_pod_template_spec,
        num_worker_replicas=workers,
        num_procs_per_worker=resources_per_worker["gpu"],
        elastic_policy=models.KubeflowOrgV1ElasticPolicy(rdzv_backend="c10d"),
    )

    self.create_job(job)

def create_job(
self,
job: Optional[constants.JOB_MODELS_TYPE] = None,
Expand Down
9 changes: 6 additions & 3 deletions sdk/python/kubeflow/training/constants/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,11 @@
# PyTorchJob constants: kind, plural name, container names, replica types and
# container images.
PYTORCHJOB_KIND = "PyTorchJob"
PYTORCHJOB_PLURAL = "pytorchjobs"
PYTORCHJOB_CONTAINER = "pytorch"
# Container name registered under the "init_container" key of JOB_PARAMETERS.
PYTORCHJOB_STORAGE_CONTAINER = "pytorch-storage"
# Lower-cased master and worker replica type names.
PYTORCHJOB_REPLICA_TYPES = (REPLICA_TYPE_MASTER.lower(), REPLICA_TYPE_WORKER.lower())

PYTORCHJOB_BASE_IMAGE = "docker.io/pytorch/pytorch:1.12.1-cuda11.3-cudnn8-runtime"
# NOTE(review): same assignment as a few lines above -- this looks like the
# old and new position of one moved line; only one should remain.
PYTORCHJOB_STORAGE_CONTAINER = "pytorch-storage"
# NOTE(review): the two values below look like placeholders rather than
# pullable image references -- TODO replace with the real images.
PYTORCHJOB_STORAGE_CONTAINER_IMAGE = "docker image path"
PYTORCHJOB_TRAIN_CONTAINER_IMAGE = "docker image path"

# MXJob constants
MXJOB_KIND = "MXJob"
Expand Down Expand Up @@ -129,7 +130,9 @@
"plural": PYTORCHJOB_PLURAL,
"container": PYTORCHJOB_CONTAINER,
"base_image": PYTORCHJOB_BASE_IMAGE,
"init_container": PYTORCHJOB_STORAGE_CONTAINER
"init_container": PYTORCHJOB_STORAGE_CONTAINER,
"init_container_image": PYTORCHJOB_STORAGE_CONTAINER_IMAGE,
"train_container_image": PYTORCHJOB_TRAIN_CONTAINER_IMAGE,
},
MXJOB_KIND: {
"model": models.KubeflowOrgV1MXJob,
Expand Down

0 comments on commit 6e92c75

Please sign in to comment.