Skip to content

Commit

Permalink
code review changes
Browse files Browse the repository at this point in the history
  • Loading branch information
deepanker13 committed Dec 21, 2023
1 parent e4df199 commit 7802b2d
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 22 deletions.
10 changes: 2 additions & 8 deletions sdk/python/kubeflow/storage_init_container/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,24 +27,18 @@ def dataset_factory(dataset_provider, dataset_provider_parameters):
parser = argparse.ArgumentParser(
description="script for downloading model and datasets to PVC."
)
parser.add_argument(
"--model_provider", type=str, help="name of model provider", required=False
)
parser.add_argument("--model_provider", type=str, help="name of model provider")
parser.add_argument(
"--model_provider_parameters",
type=str,
help="model provider serialised arguments",
required=False,
)

parser.add_argument(
"--dataset_provider", type=str, help="name of dataset provider", required=False
)
parser.add_argument("--dataset_provider", type=str, help="name of dataset provider")
parser.add_argument(
"--dataset_provider_parameters",
type=str,
help="dataset provider serialised arguments",
required=False,
)
args = parser.parse_args()

Expand Down
36 changes: 22 additions & 14 deletions sdk/python/kubeflow/training/api/training_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,6 @@ def train(
"""
if (
not name
or not namespace
or not storage_config
or not model_provider_parameters
or not dataset_provider_parameters
Expand All @@ -115,6 +114,27 @@ def train(
):
raise ValueError("One of the required parameters is None")

namespace = namespace or self.namespace

if "cpu" not in resources_per_worker or "memory" not in resources_per_worker:
raise ValueError("cpu and memory resources not specified")
else:
limits = {
"cpu": resources_per_worker["cpu"],
"memory": resources_per_worker["memory"],
}

if (
resources_per_worker["gpu"] is not None
and num_procs_per_worker > resources_per_worker["gpu"]
) or (resources_per_worker["gpu"] is None and num_procs_per_worker != 0):
raise ValueError("Insufficient gpu resources allocated to the container.")

if "gpu" in resources_per_worker:
limits["nvidia.com/gpu"] = resources_per_worker["gpu"]

requests = limits.copy()

try:
self.core_api.create_namespaced_persistent_volume_claim(
namespace=namespace,
Expand All @@ -128,12 +148,6 @@ def train(
except Exception as e:
print(e)

if (
resources_per_worker["gpu"] is not None
and num_procs_per_worker > resources_per_worker["gpu"]
) or (resources_per_worker["gpu"] is None and num_procs_per_worker != 0):
raise ValueError("Insufficient gpu resources allocated to the container.")

if isinstance(model_provider_parameters, HuggingFaceModelParams):
mp = "hf"

Expand Down Expand Up @@ -169,13 +183,7 @@ def train(
volume_mounts=[
models.V1VolumeMount(name=constants.TRAINER_PV, mount_path="/workspace")
],
resources=models.V1ResourceRequirements(
limits={
"nvidia.com/gpu": resources_per_worker["gpu"],
"cpu": resources_per_worker["cpu"],
"memory": resources_per_worker["memory"],
}
),
resources=models.V1ResourceRequirements(requests=requests, limits=limits),
)

# create worker pod spec
Expand Down

0 comments on commit 7802b2d

Please sign in to comment.