diff --git a/sdk/python/kubeflow/storage_init_container/storage.py b/sdk/python/kubeflow/storage_init_container/storage.py
index cee8e22122..c5ecd1743a 100644
--- a/sdk/python/kubeflow/storage_init_container/storage.py
+++ b/sdk/python/kubeflow/storage_init_container/storage.py
@@ -27,24 +27,18 @@ def dataset_factory(dataset_provider, dataset_provider_parameters):
     parser = argparse.ArgumentParser(
         description="script for downloading model and datasets to PVC."
     )
-    parser.add_argument(
-        "--model_provider", type=str, help="name of model provider", required=False
-    )
+    parser.add_argument("--model_provider", type=str, help="name of model provider")
     parser.add_argument(
         "--model_provider_parameters",
         type=str,
         help="model provider serialised arguments",
-        required=False,
     )
 
-    parser.add_argument(
-        "--dataset_provider", type=str, help="name of dataset provider", required=False
-    )
+    parser.add_argument("--dataset_provider", type=str, help="name of dataset provider")
     parser.add_argument(
         "--dataset_provider_parameters",
         type=str,
         help="dataset provider serialised arguments",
-        required=False,
     )
 
     args = parser.parse_args()
diff --git a/sdk/python/kubeflow/training/api/training_client.py b/sdk/python/kubeflow/training/api/training_client.py
index bdd99c5ad5..38af8d7969 100644
--- a/sdk/python/kubeflow/training/api/training_client.py
+++ b/sdk/python/kubeflow/training/api/training_client.py
@@ -106,7 +106,6 @@ def train(
         """
         if (
             not name
-            or not namespace
             or not storage_config
             or not model_provider_parameters
             or not dataset_provider_parameters
@@ -115,6 +114,27 @@
         ):
             raise ValueError("One of the required parameters is None")
 
+        namespace = namespace or self.namespace
+
+        if "cpu" not in resources_per_worker or "memory" not in resources_per_worker:
+            raise ValueError("cpu and memory resources not specified")
+        else:
+            limits = {
+                "cpu": resources_per_worker["cpu"],
+                "memory": resources_per_worker["memory"],
+            }
+
+        if (
+            resources_per_worker.get("gpu") is not None
+            and num_procs_per_worker > resources_per_worker["gpu"]
+        ) or (resources_per_worker.get("gpu") is None and num_procs_per_worker != 0):
+            raise ValueError("Insufficient gpu resources allocated to the container.")
+
+        if resources_per_worker.get("gpu") is not None:
+            limits["nvidia.com/gpu"] = resources_per_worker["gpu"]
+
+        requests = limits.copy()
+
         try:
             self.core_api.create_namespaced_persistent_volume_claim(
                 namespace=namespace,
@@ -128,12 +148,6 @@
             except Exception as e:
                 print(e)
 
-        if (
-            resources_per_worker["gpu"] is not None
-            and num_procs_per_worker > resources_per_worker["gpu"]
-        ) or (resources_per_worker["gpu"] is None and num_procs_per_worker != 0):
-            raise ValueError("Insufficient gpu resources allocated to the container.")
-
         if isinstance(model_provider_parameters, HuggingFaceModelParams):
             mp = "hf"
 
@@ -169,13 +183,7 @@
             volume_mounts=[
                 models.V1VolumeMount(name=constants.TRAINER_PV, mount_path="/workspace")
             ],
-            resources=models.V1ResourceRequirements(
-                limits={
-                    "nvidia.com/gpu": resources_per_worker["gpu"],
-                    "cpu": resources_per_worker["cpu"],
-                    "memory": resources_per_worker["memory"],
-                }
-            ),
+            resources=models.V1ResourceRequirements(requests=requests, limits=limits),
         )
 
         # create worker pod spec
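To make the new validation in `train()` concrete, below is a minimal standalone sketch of the requests/limits logic the second hunk introduces. `resources_per_worker` and `num_procs_per_worker` mirror the names used in the diff; `build_requests_and_limits` is a hypothetical helper for illustration only and is not part of the SDK.

```python
# Illustrative sketch of the validation added to train() above; not SDK code.

def build_requests_and_limits(resources_per_worker: dict, num_procs_per_worker: int):
    # "cpu" and "memory" are mandatory keys.
    if "cpu" not in resources_per_worker or "memory" not in resources_per_worker:
        raise ValueError("cpu and memory resources not specified")
    limits = {
        "cpu": resources_per_worker["cpu"],
        "memory": resources_per_worker["memory"],
    }

    # "gpu" is optional; .get() avoids a KeyError when it is absent.
    gpu = resources_per_worker.get("gpu")
    if (gpu is not None and num_procs_per_worker > gpu) or (
        gpu is None and num_procs_per_worker != 0
    ):
        raise ValueError("Insufficient gpu resources allocated to the container.")
    if gpu is not None:
        limits["nvidia.com/gpu"] = gpu

    # Requests mirror limits exactly.
    requests = limits.copy()
    return requests, limits

# Example: 2 processes per worker require at least 2 GPUs in the limits.
requests, limits = build_requests_and_limits(
    {"cpu": "4", "memory": "16Gi", "gpu": 2}, num_procs_per_worker=2
)
```

Because `requests` is a copy of `limits`, each worker container requests exactly what it is capped at, which gives the training pods the Guaranteed QoS class in Kubernetes.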