diff --git a/centml/cli/cluster.py b/centml/cli/cluster.py
index bbb55ad..7e8a16c 100644
--- a/centml/cli/cluster.py
+++ b/centml/cli/cluster.py
@@ -7,21 +7,26 @@
 from centml.sdk.api import get_centml_client
 
+# convert a deployment type enum to a user-friendly name
 depl_type_to_name_map = {
-    DeploymentType.INFERENCE: 'inference',
-    DeploymentType.COMPUTE: 'compute',
-    DeploymentType.COMPILATION: 'compilation',
-    DeploymentType.INFERENCE_V2: 'inference',
-    DeploymentType.COMPUTE_V2: 'compute',
-    DeploymentType.CSERVE: 'cserve',
-    DeploymentType.CSERVE_V2: 'cserve',
-    DeploymentType.RAG: 'rag',
+    DeploymentType.INFERENCE: "inference",
+    DeploymentType.COMPUTE: "compute",
+    DeploymentType.COMPILATION: "compilation",
+    DeploymentType.INFERENCE_V2: "inference",
+    DeploymentType.INFERENCE_V3: "inference",
+    DeploymentType.COMPUTE_V2: "compute",
+    # To the user, they are all cserve.
+    DeploymentType.CSERVE: "cserve",
+    DeploymentType.CSERVE_V2: "cserve",
+    DeploymentType.CSERVE_V3: "cserve",
+    DeploymentType.RAG: "rag",
 }
 
+# use the latest type for user requests
 depl_name_to_type_map = {
-    'inference': DeploymentType.INFERENCE_V2,
-    'cserve': DeploymentType.CSERVE_V2,
-    'compute': DeploymentType.COMPUTE_V2,
-    'rag': DeploymentType.RAG,
+    "inference": DeploymentType.INFERENCE_V3,
+    "cserve": DeploymentType.CSERVE_V3,
+    "compute": DeploymentType.COMPUTE_V2,
+    "rag": DeploymentType.RAG,
 }
@@ -56,6 +61,21 @@ def _format_ssh_key(ssh_key):
     return ssh_key[:32] + "..."
 
 
+def _get_replica_info(deployment):
+    """Extract replica information, handling V2/V3 field differences"""
+    # Check the actual deployment object's fields rather than depl_type,
+    # since the unified get_cserve() can return either V2 or V3 objects
+    if hasattr(deployment, 'min_replicas'):
+        # V3 deployment response object
+        return {"min": deployment.min_replicas, "max": deployment.max_replicas}
+    elif hasattr(deployment, 'min_scale'):
+        # V2 deployment response object
+        return {"min": deployment.min_scale, "max": deployment.max_scale}
+    else:
+        # Fallback - shouldn't happen
+        return {"min": "N/A", "max": "N/A"}
+
+
 def _get_ready_status(cclient, deployment):
     api_status = deployment.status
     service_status = (
@@ -121,12 +141,12 @@ def get(type, id):
     with get_centml_client() as cclient:
         depl_type = depl_name_to_type_map[type]
 
-        if depl_type == DeploymentType.INFERENCE_V2:
-            deployment = cclient.get_inference(id)
+        if depl_type in [DeploymentType.INFERENCE_V2, DeploymentType.INFERENCE_V3]:
+            deployment = cclient.get_inference(id)  # handles both V2 and V3
         elif depl_type == DeploymentType.COMPUTE_V2:
             deployment = cclient.get_compute(id)
-        elif depl_type == DeploymentType.CSERVE_V2:
-            deployment = cclient.get_cserve(id)
+        elif depl_type in [DeploymentType.CSERVE_V2, DeploymentType.CSERVE_V3]:
+            deployment = cclient.get_cserve(id)  # handles both V2 and V3
         else:
             sys.exit("Please enter correct deployment type")
@@ -150,21 +170,18 @@ def get(type, id):
         )
 
         click.echo("Additional deployment configurations:")
-        if depl_type == DeploymentType.INFERENCE_V2:
-            click.echo(
-                tabulate(
-                    [
-                        ("Image", deployment.image_url),
-                        ("Container port", deployment.container_port),
-                        ("Healthcheck", deployment.healthcheck or "/"),
-                        ("Replicas", {"min": deployment.min_scale, "max": deployment.max_scale}),
-                        ("Environment variables", deployment.env_vars or "None"),
-                        ("Max concurrency", deployment.concurrency or "None"),
-                    ],
-                    tablefmt="rounded_outline",
-                    disable_numparse=True,
-                )
-            )
+        if depl_type in [DeploymentType.INFERENCE_V2, DeploymentType.INFERENCE_V3]:
+            replica_info = _get_replica_info(deployment)
+            display_rows = [
+                ("Image", deployment.image_url),
+                ("Container port", deployment.container_port),
+                ("Healthcheck", deployment.healthcheck or "/"),
+                ("Replicas", replica_info),
+                ("Environment variables", deployment.env_vars or "None"),
+                ("Max concurrency", deployment.concurrency or "None"),
+            ]
+
+            click.echo(tabulate(display_rows, tablefmt="rounded_outline", disable_numparse=True))
         elif depl_type == DeploymentType.COMPUTE_V2:
             click.echo(
                 tabulate(
@@ -173,25 +190,22 @@ def get(type, id):
                     disable_numparse=True,
                 )
             )
-        elif depl_type == DeploymentType.CSERVE_V2:
-            click.echo(
-                tabulate(
-                    [
-                        ("Hugging face model", deployment.recipe.model),
-                        (
-                            "Parallelism",
-                            {
-                                "tensor": deployment.recipe.additional_properties['tensor_parallel_size'],
-                                "pipeline": deployment.recipe.additional_properties['pipeline_parallel_size'],
-                            },
-                        ),
-                        ("Replicas", {"min": deployment.min_scale, "max": deployment.max_scale}),
-                        ("Max concurrency", deployment.concurrency or "None"),
-                    ],
-                    tablefmt="rounded_outline",
-                    disable_numparse=True,
-                )
-            )
+        elif depl_type in [DeploymentType.CSERVE_V2, DeploymentType.CSERVE_V3]:
+            replica_info = _get_replica_info(deployment)
+            display_rows = [
+                ("Hugging face model", deployment.recipe.model),
+                (
+                    "Parallelism",
+                    {
+                        "tensor": deployment.recipe.additional_properties.get("tensor_parallel_size", "N/A"),
+                        "pipeline": deployment.recipe.additional_properties.get("pipeline_parallel_size", "N/A"),
+                    },
+                ),
+                ("Replicas", replica_info),
+                ("Max concurrency", deployment.concurrency or "None"),
+            ]
+
+            click.echo(tabulate(display_rows, tablefmt="rounded_outline", disable_numparse=True))
 
 
 @click.command(help="Delete a deployment")
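A quick sanity check of the helper above, as a standalone sketch; the SimpleNamespace objects are only stand-ins for the real V2/V3 deployment response types, and the helper body is copied from the hunk:

    # Standalone sketch: exercise _get_replica_info with mock responses.
    from types import SimpleNamespace

    def _get_replica_info(deployment):
        # Copied from centml/cli/cluster.py above.
        if hasattr(deployment, 'min_replicas'):
            return {"min": deployment.min_replicas, "max": deployment.max_replicas}
        elif hasattr(deployment, 'min_scale'):
            return {"min": deployment.min_scale, "max": deployment.max_scale}
        else:
            return {"min": "N/A", "max": "N/A"}

    v3_mock = SimpleNamespace(min_replicas=1, max_replicas=3)  # V3 field names
    v2_mock = SimpleNamespace(min_scale=1, max_scale=1)        # V2 field names

    print(_get_replica_info(v3_mock))  # -> {'min': 1, 'max': 3}
    print(_get_replica_info(v2_mock))  # -> {'min': 1, 'max': 1}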
diff --git a/centml/sdk/api.py b/centml/sdk/api.py
index da7d307..8e77548 100644
--- a/centml/sdk/api.py
+++ b/centml/sdk/api.py
@@ -4,9 +4,10 @@
 from platform_api_python_client import (
     DeploymentType,
     DeploymentStatus,
-    CreateInferenceDeploymentRequest,
+    CreateInferenceV3DeploymentRequest,
     CreateComputeDeploymentRequest,
-    CreateCServeV2DeploymentRequest,
+    CreateCServeV3DeploymentRequest,
+    ApiException,
     Metric,
 )
 
@@ -27,31 +28,59 @@ def get_status(self, id):
         return self._api.get_deployment_status_deployments_status_deployment_id_get(id)
 
     def get_inference(self, id):
-        return self._api.get_inference_deployment_deployments_inference_deployment_id_get(id)
+        """Get inference deployment details - automatically handles both V2 and V3 deployments"""
+        # Try V3 first (recommended); fall back to V2 if the deployment is V2
+        try:
+            return self._api.get_inference_v3_deployment_deployments_inference_v3_deployment_id_get(id)
+        except ApiException as e:
+            # If V3 fails with 404 or similar, try V2
+            if e.status in [404, 400]:  # deployment might be V2, or endpoint not found
+                try:
+                    return self._api.get_inference_deployment_deployments_inference_deployment_id_get(id)
+                except ApiException as v2_error:
+                    # If both fail, raise the original V3 error as it's more likely to be the real issue
+                    raise e from v2_error
+            else:
+                # For other errors (auth, network, etc.), raise immediately
+                raise
 
     def get_compute(self, id):
         return self._api.get_compute_deployment_deployments_compute_deployment_id_get(id)
 
     def get_cserve(self, id):
-        return self._api.get_cserve_v2_deployment_deployments_cserve_v2_deployment_id_get(id)
-
-    def create_inference(self, request: CreateInferenceDeploymentRequest):
-        return self._api.create_inference_deployment_deployments_inference_post(request)
+        """Get CServe deployment details - automatically handles both V2 and V3 deployments"""
+        # Try V3 first (recommended); fall back to V2 if the deployment is V2
+        try:
+            return self._api.get_cserve_v3_deployment_deployments_cserve_v3_deployment_id_get(id)
+        except ApiException as e:
+            # If V3 fails with 404 or similar, try V2
+            if e.status in [404, 400]:  # deployment might be V2, or endpoint not found
+                try:
+                    return self._api.get_cserve_v2_deployment_deployments_cserve_v2_deployment_id_get(id)
+                except ApiException as v2_error:
+                    # If both fail, raise the original V3 error as it's more likely to be the real issue
+                    raise e from v2_error
+            else:
+                # For other errors (auth, network, etc.), raise immediately
+                raise
+
+    def create_inference(self, request: CreateInferenceV3DeploymentRequest):
+        return self._api.create_inference_v3_deployment_deployments_inference_v3_post(request)
 
     def create_compute(self, request: CreateComputeDeploymentRequest):
         return self._api.create_compute_deployment_deployments_compute_post(request)
 
-    def create_cserve(self, request: CreateCServeV2DeploymentRequest):
-        return self._api.create_cserve_v2_deployment_deployments_cserve_v2_post(request)
+    def create_cserve(self, request: CreateCServeV3DeploymentRequest):
+        return self._api.create_cserve_v3_deployment_deployments_cserve_v3_post(request)
 
-    def update_inference(self, deployment_id: int, request: CreateInferenceDeploymentRequest):
-        return self._api.update_inference_deployment_deployments_inference_put(deployment_id, request)
+    def update_inference(self, deployment_id: int, request: CreateInferenceV3DeploymentRequest):
+        return self._api.update_inference_v3_deployment_deployments_inference_v3_put(deployment_id, request)
 
     def update_compute(self, deployment_id: int, request: CreateComputeDeploymentRequest):
         return self._api.update_compute_deployment_deployments_compute_put(deployment_id, request)
 
-    def update_cserve(self, deployment_id: int, request: CreateCServeV2DeploymentRequest):
-        return self._api.update_cserve_v2_deployment_deployments_cserve_v2_put(deployment_id, request)
+    def update_cserve(self, deployment_id: int, request: CreateCServeV3DeploymentRequest):
+        return self._api.update_cserve_v3_deployment_deployments_cserve_v3_put(deployment_id, request)
 
     def _update_status(self, id, new_status):
         status_req = platform_api_python_client.DeploymentStatusRequest(status=new_status)
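From the SDK consumer's side, the V3-first/V2-fallback above is invisible: the same call resolves either deployment generation, and only a genuine failure surfaces. A minimal caller-side sketch, assuming a configured CentML client; the deployment ID is a placeholder:

    from platform_api_python_client import ApiException
    from centml.sdk.api import get_centml_client

    with get_centml_client() as cclient:
        try:
            # Works whether deployment 1234 is a V2 or a V3 CServe deployment.
            deployment = cclient.get_cserve(1234)  # placeholder ID
            print("Deployment details: ", deployment)
        except ApiException as e:
            # After the fallback, a 404 here means neither a V3 nor a V2
            # deployment matched the ID; other errors (auth, network, 5xx)
            # are raised as-is.
            print("Lookup failed with status: ", e.status)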
diff --git a/examples/sdk/create_cserve.py b/examples/sdk/create_cserve.py
index 54e0c9b..086fe4d 100644
--- a/examples/sdk/create_cserve.py
+++ b/examples/sdk/create_cserve.py
@@ -1,18 +1,18 @@
 import centml
 from centml.sdk.api import get_centml_client
-from centml.sdk import DeploymentType, CreateCServeV2DeploymentRequest, CServeV2Recipe
+from centml.sdk import DeploymentType, CreateCServeV3DeploymentRequest, CServeV2Recipe
 
 
 def get_fastest_cserve_config(cclient, name, model):
     fastest = cclient.get_cserve_recipe(model=model)[0].fastest
 
-    return CreateCServeV2DeploymentRequest(
+    return CreateCServeV3DeploymentRequest(
         name=name,
         cluster_id=cclient.get_cluster_id(fastest.hardware_instance_id),
         hardware_instance_id=fastest.hardware_instance_id,
         recipe=fastest.recipe,
-        min_scale=1,
-        max_scale=1,
+        min_replicas=1,
+        max_replicas=1,
         env_vars={},
     )
 
@@ -22,13 +22,13 @@ def get_default_cserve_config(cclient, name, model):
 
     hardware_instance = cclient.get_hardware_instances(cluster_id=1001)[0]
 
-    return CreateCServeV2DeploymentRequest(
+    return CreateCServeV3DeploymentRequest(
         name=name,
         cluster_id=hardware_instance.cluster_id,
         hardware_instance_id=hardware_instance.id,
         recipe=default_recipe,
-        min_scale=1,
-        max_scale=1,
+        min_replicas=1,
+        max_replicas=1,
         env_vars={},
     )
 
@@ -36,27 +36,30 @@ def get_default_cserve_config(cclient, name, model):
 def main():
     with get_centml_client() as cclient:
         ### Get the configurations for the Qwen model
-        qwen_config = get_fastest_cserve_config(cclient, name="qwen-fastest", model="Qwen/Qwen2-VL-7B-Instruct")
-        #qwen_config = get_default_cserve_config(cclient, name="qwen-default", model="Qwen/Qwen2-VL-7B-Instruct")
+        qwen_config = get_fastest_cserve_config(
+            cclient, name="qwen-fastest", model="Qwen/Qwen2-VL-7B-Instruct"
+        )
+        # qwen_config = get_default_cserve_config(cclient, name="qwen-default", model="Qwen/Qwen2-VL-7B-Instruct")
 
         ### Modify the recipe if necessary
         qwen_config.recipe.additional_properties["max_num_seqs"] = 512
 
-        # Create CServeV2 deployment
+        # Create CServeV3 deployment
         response = cclient.create_cserve(qwen_config)
         print("Create deployment response: ", response)
 
         ### Get deployment details
-        deployment = cclient.get_cserve(response.id)
+        deployment = cclient.get_cserve(response.id)  # automatically handles V2 and V3
         print("Deployment details: ", deployment)
 
-        '''
+        """
        ### Pause the deployment
         cclient.pause(deployment.id)
 
         ### Delete the deployment
         cclient.delete(deployment.id)
-        '''
+        """
+
 
 if __name__ == "__main__":
     main()
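For callers migrating their own V2 request code, the only mechanical change the example above needed was the replica field rename (min_scale/max_scale to min_replicas/max_replicas); note the recipe still uses the CServeV2Recipe type. A hypothetical helper, not part of the SDK, illustrating the rename on plain kwargs:

    # Hypothetical migration helper (illustrative only, not in the SDK).
    def migrate_replica_kwargs_v2_to_v3(kwargs: dict) -> dict:
        out = dict(kwargs)
        if "min_scale" in out:
            out["min_replicas"] = out.pop("min_scale")
        if "max_scale" in out:
            out["max_replicas"] = out.pop("max_scale")
        return out

    v2_kwargs = {"name": "qwen-fastest", "min_scale": 1, "max_scale": 1, "env_vars": {}}
    print(migrate_replica_kwargs_v2_to_v3(v2_kwargs))
    # -> {'name': 'qwen-fastest', 'min_replicas': 1, 'max_replicas': 1, 'env_vars': {}}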
diff --git a/examples/sdk/create_inference.py b/examples/sdk/create_inference.py
index 8af4d20..5531c72 100644
--- a/examples/sdk/create_inference.py
+++ b/examples/sdk/create_inference.py
@@ -1,26 +1,32 @@
 import centml
 from centml.sdk.api import get_centml_client
-from centml.sdk import DeploymentType, CreateInferenceDeploymentRequest, UserVaultType
+from centml.sdk import DeploymentType, CreateInferenceV3DeploymentRequest, UserVaultType
 
 
 def main():
     with get_centml_client() as cclient:
         certs = cclient.get_user_vault(UserVaultType.CERTIFICATES)
 
-        request = CreateInferenceDeploymentRequest(
+        request = CreateInferenceV3DeploymentRequest(
             name="nginx",
             cluster_id=1000,
             hardware_instance_id=1000,
             image_url="nginxinc/nginx-unprivileged",
             port=8080,
-            min_scale=1,
-            max_scale=1,
+            min_replicas=1,  # V3 uses min_replicas instead of min_scale
+            max_replicas=3,  # V3 uses max_replicas instead of max_scale
+            initial_replicas=1,  # Optional in V3 - initial number of replicas
             endpoint_certificate_authority=certs["my_cert"],
+            # V3 rollout strategy parameters
+            max_surge=1,  # Allow 1 extra pod during updates
+            max_unavailable=0,  # Keep all pods available during updates
+            healthcheck="/",
+            concurrency=10,
         )
         response = cclient.create_inference(request)
         print("Create deployment response: ", response)
 
-        ### Get deployment details
+        ### Get deployment details (automatically detects V2 or V3)
         deployment = cclient.get_inference(response.id)
         print("Deployment details: ", deployment)
diff --git a/requirements.txt b/requirements.txt
index f022c63..1fe82b6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,4 @@ cryptography==44.0.1
 prometheus-client>=0.20.0
 scipy>=1.6.0
 scikit-learn>=1.5.1
-platform-api-python-client==4.0.12
+platform-api-python-client==4.1.9
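The V3 endpoints and the ApiException import above assume the newer generated client, hence the pin bump. An optional defensive check, a sketch only and not part of this change, that fails fast if an older client is installed:

    # Optional guard: verify the installed client matches the new pin before
    # calling the V3 deployment endpoints.
    from importlib.metadata import version

    pinned = (4, 1, 9)
    installed = tuple(int(p) for p in version("platform-api-python-client").split(".")[:3])
    if installed < pinned:
        raise RuntimeError(
            f"platform-api-python-client {'.'.join(map(str, installed))} is installed; "
            "this code expects the pinned 4.1.9"
        )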