114 changes: 64 additions & 50 deletions centml/cli/cluster.py
@@ -7,21 +7,26 @@
from centml.sdk.api import get_centml_client


# Convert a deployment type enum to a user-friendly name
depl_type_to_name_map = {
DeploymentType.INFERENCE: 'inference',
DeploymentType.COMPUTE: 'compute',
DeploymentType.COMPILATION: 'compilation',
DeploymentType.INFERENCE_V2: 'inference',
DeploymentType.COMPUTE_V2: 'compute',
DeploymentType.CSERVE: 'cserve',
DeploymentType.CSERVE_V2: 'cserve',
DeploymentType.RAG: 'rag',
DeploymentType.INFERENCE: "inference",
DeploymentType.COMPUTE: "compute",
DeploymentType.COMPILATION: "compilation",
DeploymentType.INFERENCE_V2: "inference",
DeploymentType.INFERENCE_V3: "inference",
DeploymentType.COMPUTE_V2: "compute",
# To the user, all CServe versions are just "cserve".
DeploymentType.CSERVE: "cserve",
DeploymentType.CSERVE_V2: "cserve",
DeploymentType.CSERVE_V3: "cserve",
DeploymentType.RAG: "rag",
}
# Use the latest type version for user requests
depl_name_to_type_map = {
'inference': DeploymentType.INFERENCE_V2,
'cserve': DeploymentType.CSERVE_V2,
'compute': DeploymentType.COMPUTE_V2,
'rag': DeploymentType.RAG,
"inference": DeploymentType.INFERENCE_V3,
"cserve": DeploymentType.CSERVE_V3,
"compute": DeploymentType.COMPUTE_V2,
"rag": DeploymentType.RAG,
}


@@ -56,6 +61,21 @@ def _format_ssh_key(ssh_key):
return ssh_key[:32] + "..."


def _get_replica_info(deployment):
"""Extract replica information handling V2/V3 field differences"""
# Check actual deployment object fields rather than depl_type
# since unified get_cserve() can return either V2 or V3 objects
if hasattr(deployment, 'min_replicas'):
# V3 deployment response object
return {"min": deployment.min_replicas, "max": deployment.max_replicas}
elif hasattr(deployment, 'min_scale'):
# V2 deployment response object
return {"min": deployment.min_scale, "max": deployment.max_scale}
else:
# Fallback - shouldn't happen
return {"min": "N/A", "max": "N/A"}


def _get_ready_status(cclient, deployment):
api_status = deployment.status
service_status = (
@@ -121,12 +141,12 @@ def get(type, id):
with get_centml_client() as cclient:
depl_type = depl_name_to_type_map[type]

if depl_type == DeploymentType.INFERENCE_V2:
deployment = cclient.get_inference(id)
if depl_type in [DeploymentType.INFERENCE_V2, DeploymentType.INFERENCE_V3]:
deployment = cclient.get_inference(id) # handles both V2 and V3
elif depl_type == DeploymentType.COMPUTE_V2:
deployment = cclient.get_compute(id)
elif depl_type == DeploymentType.CSERVE_V2:
deployment = cclient.get_cserve(id)
elif depl_type in [DeploymentType.CSERVE_V2, DeploymentType.CSERVE_V3]:
deployment = cclient.get_cserve(id) # handles both V2 and V3
else:
sys.exit("Please enter correct deployment type")

@@ -150,21 +170,18 @@ def get(type, id):
)

click.echo("Additional deployment configurations:")
if depl_type == DeploymentType.INFERENCE_V2:
click.echo(
tabulate(
[
("Image", deployment.image_url),
("Container port", deployment.container_port),
("Healthcheck", deployment.healthcheck or "/"),
("Replicas", {"min": deployment.min_scale, "max": deployment.max_scale}),
("Environment variables", deployment.env_vars or "None"),
("Max concurrency", deployment.concurrency or "None"),
],
tablefmt="rounded_outline",
disable_numparse=True,
)
)
if depl_type in [DeploymentType.INFERENCE_V2, DeploymentType.INFERENCE_V3]:
replica_info = _get_replica_info(deployment)
display_rows = [
("Image", deployment.image_url),
("Container port", deployment.container_port),
("Healthcheck", deployment.healthcheck or "/"),
("Replicas", replica_info),
("Environment variables", deployment.env_vars or "None"),
("Max concurrency", deployment.concurrency or "None"),
]

click.echo(tabulate(display_rows, tablefmt="rounded_outline", disable_numparse=True))
elif depl_type == DeploymentType.COMPUTE_V2:
click.echo(
tabulate(
@@ -173,25 +190,22 @@
disable_numparse=True,
)
)
elif depl_type == DeploymentType.CSERVE_V2:
click.echo(
tabulate(
[
("Hugging face model", deployment.recipe.model),
(
"Parallelism",
{
"tensor": deployment.recipe.additional_properties['tensor_parallel_size'],
"pipeline": deployment.recipe.additional_properties['pipeline_parallel_size'],
},
),
("Replicas", {"min": deployment.min_scale, "max": deployment.max_scale}),
("Max concurrency", deployment.concurrency or "None"),
],
tablefmt="rounded_outline",
disable_numparse=True,
)
)
elif depl_type in [DeploymentType.CSERVE_V2, DeploymentType.CSERVE_V3]:
replica_info = _get_replica_info(deployment)
display_rows = [
("Hugging face model", deployment.recipe.model),
(
"Parallelism",
{
"tensor": deployment.recipe.additional_properties.get("tensor_parallel_size", "N/A"),
"pipeline": deployment.recipe.additional_properties.get("pipeline_parallel_size", "N/A"),
},
),
("Replicas", replica_info),
("Max concurrency", deployment.concurrency or "None"),
]

click.echo(tabulate(display_rows, tablefmt="rounded_outline", disable_numparse=True))


@click.command(help="Delete a deployment")
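Reviewer note: a minimal sanity check of the two maps above (a sketch; it assumes the maps are importable from centml.cli.cluster, which matches the file path but isn't shown in this diff):

from centml.cli.cluster import depl_name_to_type_map, depl_type_to_name_map
from centml.sdk import DeploymentType

# User input resolves to the newest version of each type...
assert depl_name_to_type_map["cserve"] == DeploymentType.CSERVE_V3

# ...while every CServe version displays under the same friendly name.
for t in (DeploymentType.CSERVE, DeploymentType.CSERVE_V2, DeploymentType.CSERVE_V3):
    assert depl_type_to_name_map[t] == "cserve"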
55 changes: 42 additions & 13 deletions centml/sdk/api.py
@@ -4,9 +4,10 @@
from platform_api_python_client import (
DeploymentType,
DeploymentStatus,
CreateInferenceDeploymentRequest,
CreateInferenceV3DeploymentRequest,
CreateComputeDeploymentRequest,
CreateCServeV2DeploymentRequest,
CreateCServeV3DeploymentRequest,
ApiException,
Metric,
)

@@ -27,31 +28,59 @@ def get_status(self, id):
return self._api.get_deployment_status_deployments_status_deployment_id_get(id)

def get_inference(self, id):
return self._api.get_inference_deployment_deployments_inference_deployment_id_get(id)
"""Get Inference deployment details - automatically handles both V2 and V3 deployments"""
# Try V3 first (recommended), fallback to V2 if deployment is V2
try:
return self._api.get_inference_v3_deployment_deployments_inference_v3_deployment_id_get(id)
except ApiException as e:
# If V3 fails with 404 or similar, try V2
if e.status in [404, 400]: # Deployment might be V2 or endpoint not found
try:
return self._api.get_inference_deployment_deployments_inference_deployment_id_get(id)
except ApiException as v2_error:
# If both fail, raise the original V3 error as it's more likely to be the real issue
raise e from v2_error
else:
# For other errors (auth, network, etc.), raise immediately
raise

def get_compute(self, id):
return self._api.get_compute_deployment_deployments_compute_deployment_id_get(id)

def get_cserve(self, id):
return self._api.get_cserve_v2_deployment_deployments_cserve_v2_deployment_id_get(id)

def create_inference(self, request: CreateInferenceDeploymentRequest):
return self._api.create_inference_deployment_deployments_inference_post(request)
"""Get CServe deployment details - automatically handles both V2 and V3 deployments"""
# Try V3 first (recommended), fallback to V2 if deployment is V2
try:
return self._api.get_cserve_v3_deployment_deployments_cserve_v3_deployment_id_get(id)
except ApiException as e:
# If V3 fails with 404 or similar, try V2
if e.status in [404, 400]: # Deployment might be V2 or endpoint not found
try:
return self._api.get_cserve_v2_deployment_deployments_cserve_v2_deployment_id_get(id)
except ApiException as v2_error:
# If both fail, raise the original V3 error as it's more likely to be the real issue
raise e from v2_error
else:
# For other errors (auth, network, etc.), raise immediately
raise

def create_inference(self, request: CreateInferenceV3DeploymentRequest):
return self._api.create_inference_v3_deployment_deployments_inference_v3_post(request)

def create_compute(self, request: CreateComputeDeploymentRequest):
return self._api.create_compute_deployment_deployments_compute_post(request)

def create_cserve(self, request: CreateCServeV2DeploymentRequest):
return self._api.create_cserve_v2_deployment_deployments_cserve_v2_post(request)
def create_cserve(self, request: CreateCServeV3DeploymentRequest):
return self._api.create_cserve_v3_deployment_deployments_cserve_v3_post(request)

def update_inference(self, deployment_id: int, request: CreateInferenceDeploymentRequest):
return self._api.update_inference_deployment_deployments_inference_put(deployment_id, request)
def update_inference(self, deployment_id: int, request: CreateInferenceV3DeploymentRequest):
return self._api.update_inference_v3_deployment_deployments_inference_v3_put(deployment_id, request)

def update_compute(self, deployment_id: int, request: CreateComputeDeploymentRequest):
return self._api.update_compute_deployment_deployments_compute_put(deployment_id, request)

def update_cserve(self, deployment_id: int, request: CreateCServeV2DeploymentRequest):
return self._api.update_cserve_v2_deployment_deployments_cserve_v2_put(deployment_id, request)
def update_cserve(self, deployment_id: int, request: CreateCServeV3DeploymentRequest):
return self._api.update_cserve_v3_deployment_deployments_cserve_v3_put(deployment_id, request)

def _update_status(self, id, new_status):
status_req = platform_api_python_client.DeploymentStatusRequest(status=new_status)
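Reviewer note: get_inference() and get_cserve() now share the same try-V3-first, fall-back-to-V2 shape. A possible follow-up refactor (a sketch, not part of this PR) would hoist the pattern into a single helper:

def _get_with_fallback(primary, fallback, id):
    """Call the V3 getter first; on 404/400, retry with the V2 getter."""
    try:
        return primary(id)
    except ApiException as e:
        if e.status not in (404, 400):
            raise  # auth, network, etc. should surface immediately
        try:
            return fallback(id)
        except ApiException as v2_error:
            raise e from v2_error  # the V3 error is the more informative one

Inside get_cserve(), for example, this would reduce to:

    return _get_with_fallback(
        self._api.get_cserve_v3_deployment_deployments_cserve_v3_deployment_id_get,
        self._api.get_cserve_v2_deployment_deployments_cserve_v2_deployment_id_get,
        id,
    )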
29 changes: 16 additions & 13 deletions examples/sdk/create_cserve.py
@@ -1,18 +1,18 @@
import centml
from centml.sdk.api import get_centml_client
from centml.sdk import DeploymentType, CreateCServeV2DeploymentRequest, CServeV2Recipe
from centml.sdk import DeploymentType, CreateCServeV3DeploymentRequest, CServeV2Recipe


def get_fastest_cserve_config(cclient, name, model):
fastest = cclient.get_cserve_recipe(model=model)[0].fastest

return CreateCServeV2DeploymentRequest(
return CreateCServeV3DeploymentRequest(
name=name,
cluster_id=cclient.get_cluster_id(fastest.hardware_instance_id),
hardware_instance_id=fastest.hardware_instance_id,
recipe=fastest.recipe,
min_scale=1,
max_scale=1,
min_replicas=1,
max_replicas=1,
env_vars={},
)

@@ -22,41 +22,44 @@ def get_default_cserve_config(cclient, name, model):

hardware_instance = cclient.get_hardware_instances(cluster_id=1001)[0]

return CreateCServeV2DeploymentRequest(
return CreateCServeV3DeploymentRequest(
name=name,
cluster_id=hardware_instance.cluster_id,
hardware_instance_id=hardware_instance.id,
recipe=default_recipe,
min_scale=1,
max_scale=1,
min_replicas=1,
max_replicas=1,
env_vars={},
)


def main():
with get_centml_client() as cclient:
### Get the configurations for the Qwen model
qwen_config = get_fastest_cserve_config(cclient, name="qwen-fastest", model="Qwen/Qwen2-VL-7B-Instruct")
#qwen_config = get_default_cserve_config(cclient, name="qwen-default", model="Qwen/Qwen2-VL-7B-Instruct")
qwen_config = get_fastest_cserve_config(
cclient, name="qwen-fastest", model="Qwen/Qwen2-VL-7B-Instruct"
)
# qwen_config = get_default_cserve_config(cclient, name="qwen-default", model="Qwen/Qwen2-VL-7B-Instruct")

### Modify the recipe if necessary
qwen_config.recipe.additional_properties["max_num_seqs"] = 512

# Create CServeV2 deployment
# Create CServeV3 deployment
response = cclient.create_cserve(qwen_config)
print("Create deployment response: ", response)

### Get deployment details
deployment = cclient.get_cserve(response.id)
deployment = cclient.get_cserve(response.id)  # Automatically detects V2 or V3
print("Deployment details: ", deployment)

'''
"""
### Pause the deployment
cclient.pause(deployment.id)

### Delete the deployment
cclient.delete(deployment.id)
'''
"""


if __name__ == "__main__":
main()
16 changes: 11 additions & 5 deletions examples/sdk/create_inference.py
@@ -1,26 +1,32 @@
import centml
from centml.sdk.api import get_centml_client
from centml.sdk import DeploymentType, CreateInferenceDeploymentRequest, UserVaultType
from centml.sdk import DeploymentType, CreateInferenceV3DeploymentRequest, UserVaultType


def main():
with get_centml_client() as cclient:
certs = cclient.get_user_vault(UserVaultType.CERTIFICATES)

request = CreateInferenceDeploymentRequest(
request = CreateInferenceV3DeploymentRequest(
name="nginx",
cluster_id=1000,
hardware_instance_id=1000,
image_url="nginxinc/nginx-unprivileged",
port=8080,
min_scale=1,
max_scale=1,
min_replicas=1, # V3 uses min_replicas instead of min_scale
max_replicas=3, # V3 uses max_replicas instead of max_scale
initial_replicas=1, # Optional in V3 - initial number of replicas
endpoint_certificate_authority=certs["my_cert"],
# V3 rollout strategy parameters
max_surge=1, # Allow 1 extra pod during updates
max_unavailable=0, # Keep all pods available during updates
healthcheck="/",
concurrency=10,
)
response = cclient.create_inference(request)
print("Create deployment response: ", response)

### Get deployment details
### Get deployment details (automatically detects V2 or V3)
deployment = cclient.get_inference(response.id)
print("Deployment details: ", deployment)

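Reviewer note: the surge/unavailability knobs bound how many pods exist mid-rollout. Assuming Kubernetes-style rolling-update semantics (an assumption; the platform docs are authoritative), the request above works out to:

max_replicas    = 3
max_surge       = 1  # extra pods allowed above the desired count during an update
max_unavailable = 0  # pods that may be unready during an update

peak_pods   = max_replicas + max_surge        # at most 4 pods mid-rollout
ready_floor = max_replicas - max_unavailable  # never fewer than 3 ready pods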
2 changes: 1 addition & 1 deletion requirements.txt
@@ -9,4 +9,4 @@ cryptography==44.0.1
prometheus-client>=0.20.0
scipy>=1.6.0
scikit-learn>=1.5.1
platform-api-python-client==4.0.12
platform-api-python-client==4.1.9