feat(components) Extend kserve component (kubeflow#10136)
* add runtime version, resource requests and resource limits

* adjust kservedeployer

* Update components/kserve/src/kservedeployer.py

Co-authored-by: Tommy Li <Tommy.chaoping.li@ibm.com>

* Update components/kserve/src/kservedeployer.py

Co-authored-by: Tommy Li <Tommy.chaoping.li@ibm.com>

* Update components/kserve/src/kservedeployer.py

Co-authored-by: Tommy Li <Tommy.chaoping.li@ibm.com>

---------

Co-authored-by: Tommy Li <Tommy.chaoping.li@ibm.com>
2 people authored and stijntratsaertit committed Feb 16, 2024
1 parent da64789 commit 518affe
Showing 3 changed files with 69 additions and 27 deletions.
4 changes: 3 additions & 1 deletion components/kserve/README.md
@@ -39,6 +39,9 @@ kserve_op = components.load_component_from_url('https://raw.githubusercontent.co
| canary_traffic_percent | `100` | The traffic split percentage between the candidate model and the last ready model |
| namespace | | Kubernetes namespace where the KServe service is deployed. If no namespace is provided, `anonymous` will be used unless a namespace is provided in the `inferenceservice_yaml` argument. |
| framework | | Machine learning framework for model serving. Currently the supported frameworks are `tensorflow`, `pytorch`, `sklearn`, `xgboost`, `onnx`, `triton`, `pmml`, and `lightgbm`. |
+ | runtime_version | `latest` | Runtime Version of Machine Learning Framework |
+ | resource_requests | `{"cpu": "0.5", "memory": "512Mi"}` | CPU and Memory requests for Model Serving |
+ | resource_limits | `{"cpu": "1", "memory": "1Gi"}` | CPU and Memory limits for Model Serving |
| custom_model_spec | `{}` | Custom model runtime container spec in JSON. Sample spec: `{"image": "codait/max-object-detector", "port":5000, "name": "test-container"}` |
| inferenceservice_yaml | `{}` | Raw InferenceService serialized YAML for deployment. Use this if you need additional configurations for your InferenceService. |
| autoscaling_target | `0` | Autoscaling Target Number. If not 0, sets the following annotation on the InferenceService: `autoscaling.knative.dev/target` |
@@ -185,4 +188,3 @@ kserve_op(
inferenceservice_yaml=isvc_yaml
)
```
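
For orientation, here is a minimal sketch of how the three new parameters might be passed from a KFP v1 pipeline. It is not part of this commit; the component URL, pipeline name, and model URI are illustrative placeholders:

```python
import json
from kfp import components, dsl

# Placeholder URL: point this at the component.yaml extended in this commit.
kserve_op = components.load_component_from_url(
    'https://raw.githubusercontent.com/kubeflow/pipelines/master/components/kserve/component.yaml'
)

@dsl.pipeline(name='kserve-resource-demo')
def kserve_pipeline():
    # All component inputs are strings, so the resource dicts are
    # serialized to JSON before being handed to the component.
    kserve_op(
        action='apply',
        model_name='example-sklearn',
        model_uri='gs://example-bucket/model',  # placeholder model location
        framework='sklearn',
        runtime_version='latest',
        resource_requests=json.dumps({'cpu': '0.5', 'memory': '512Mi'}),
        resource_limits=json.dumps({'cpu': '1', 'memory': '1Gi'}),
    )
```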

40 changes: 23 additions & 17 deletions components/kserve/component.yaml
@@ -1,25 +1,28 @@
name: Serve a model with KServe
description: Serve Models using KServe
inputs:
- - {name: Action, type: String, default: 'create', description: 'Action to execute on KServe'}
- - {name: Model Name, type: String, default: '', description: 'Name to give to the deployed model'}
- - {name: Model URI, type: String, default: '', description: 'Path of the S3 or GCS compatible directory containing the model.'}
- - {name: Canary Traffic Percent, type: String, default: '100', description: 'The traffic split percentage between the candidate model and the last ready model'}
- - {name: Namespace, type: String, default: '', description: 'Kubernetes namespace where the KServe service is deployed.'}
- - {name: Framework, type: String, default: '', description: 'Machine Learning Framework for Model Serving.'}
- - {name: Custom Model Spec, type: String, default: '{}', description: 'Custom model runtime container spec in JSON'}
- - {name: Autoscaling Target, type: String, default: '0', description: 'Autoscaling Target Number'}
- - {name: Service Account, type: String, default: '', description: 'ServiceAccount to use to run the InferenceService pod'}
- - {name: Enable Istio Sidecar, type: Bool, default: 'True', description: 'Whether to enable istio sidecar injection'}
- - {name: InferenceService YAML, type: String, default: '{}', description: 'Raw InferenceService serialized YAML for deployment'}
- - {name: Watch Timeout, type: String, default: '300', description: "Timeout seconds for watching until InferenceService becomes ready."}
- - {name: Min Replicas, type: String, default: '-1', description: 'Minimum number of InferenceService replicas'}
- - {name: Max Replicas, type: String, default: '-1', description: 'Maximum number of InferenceService replicas'}
- - {name: Request Timeout, type: String, default: '60', description: "Specifies the number of seconds to wait before timing out a request to the component."}
- - {name: Enable ISVC Status, type: Bool, default: 'True', description: "Specifies whether to store the inference service status as the output parameter"}
+ - {name: Action, type: String, default: 'create', description: 'Action to execute on KServe'}
+ - {name: Model Name, type: String, default: '', description: 'Name to give to the deployed model'}
+ - {name: Model URI, type: String, default: '', description: 'Path of the S3 or GCS compatible directory containing the model.'}
+ - {name: Canary Traffic Percent, type: String, default: '100', description: 'The traffic split percentage between the candidate model and the last ready model'}
+ - {name: Namespace, type: String, default: '', description: 'Kubernetes namespace where the KServe service is deployed.'}
+ - {name: Framework, type: String, default: '', description: 'Machine Learning Framework for Model Serving.'}
+ - {name: Runtime Version, type: String, default: 'latest', description: 'Runtime Version of Machine Learning Framework'}
+ - {name: Resource Requests, type: String, default: '{"cpu": "0.5", "memory": "512Mi"}', description: 'CPU and Memory requests for Model Serving'}
+ - {name: Resource Limits, type: String, default: '{"cpu": "1", "memory": "1Gi"}', description: 'CPU and Memory limits for Model Serving'}
+ - {name: Custom Model Spec, type: String, default: '{}', description: 'Custom model runtime container spec in JSON'}
+ - {name: Autoscaling Target, type: String, default: '0', description: 'Autoscaling Target Number'}
+ - {name: Service Account, type: String, default: '', description: 'ServiceAccount to use to run the InferenceService pod'}
+ - {name: Enable Istio Sidecar, type: Bool, default: 'True', description: 'Whether to enable istio sidecar injection'}
+ - {name: InferenceService YAML, type: String, default: '{}', description: 'Raw InferenceService serialized YAML for deployment'}
+ - {name: Watch Timeout, type: String, default: '300', description: "Timeout seconds for watching until InferenceService becomes ready."}
+ - {name: Min Replicas, type: String, default: '-1', description: 'Minimum number of InferenceService replicas'}
+ - {name: Max Replicas, type: String, default: '-1', description: 'Maximum number of InferenceService replicas'}
+ - {name: Request Timeout, type: String, default: '60', description: "Specifies the number of seconds to wait before timing out a request to the component."}
+ - {name: Enable ISVC Status, type: Bool, default: 'True', description: "Specifies whether to store the inference service status as the output parameter"}

outputs:
- - {name: InferenceService Status, type: String, description: 'Status JSON output of InferenceService'}
+ - {name: InferenceService Status, type: String, description: 'Status JSON output of InferenceService'}
implementation:
container:
image: quay.io/aipipeline/kserve-component:v0.11.1
@@ -32,6 +35,9 @@ implementation:
--canary-traffic-percent, {inputValue: Canary Traffic Percent},
--namespace, {inputValue: Namespace},
--framework, {inputValue: Framework},
+ --runtime-version, {inputValue: Runtime Version},
+ --resource-requests, {inputValue: Resource Requests},
+ --resource-limits, {inputValue: Resource Limits},
--custom-model-spec, {inputValue: Custom Model Spec},
--autoscaling-target, {inputValue: Autoscaling Target},
--service-account, {inputValue: Service Account},
52 changes: 43 additions & 9 deletions components/kserve/src/kservedeployer.py
@@ -21,6 +21,7 @@
import yaml

from kubernetes import client
+ from kubernetes.client.models import V1ResourceRequirements

from kserve import constants
from kserve import KServeClient
@@ -50,8 +51,9 @@
}


- def create_predictor_spec(framework, storage_uri, canary_traffic_percent,
-                            service_account, min_replicas, max_replicas, containers, request_timeout):
+ def create_predictor_spec(framework, runtime_version, resource_requests, resource_limits,
+                            storage_uri, canary_traffic_percent, service_account, min_replicas,
+                            max_replicas, containers, request_timeout):
    """
    Create and return V1beta1PredictorSpec to be used in a V1beta1InferenceServiceSpec
    object.
@@ -81,7 +83,14 @@ def create_predictor_spec(framework, storage_uri, canary_traffic_percent,
        setattr(
            predictor_spec,
            framework,
-             AVAILABLE_FRAMEWORKS[framework](storage_uri=storage_uri)
+             AVAILABLE_FRAMEWORKS[framework](
+                 storage_uri=storage_uri,
+                 resources=V1ResourceRequirements(
+                     requests=resource_requests,
+                     limits=resource_limits
+                 ),
+                 runtime_version=runtime_version
+             )
        )
    return predictor_spec

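For reference, a standalone sketch of the framework spec the call above now builds. `V1beta1SKLearnSpec` stands in for whichever entry of `AVAILABLE_FRAMEWORKS` is selected; the storage URI and version string are placeholders:

```python
from kserve import V1beta1SKLearnSpec
from kubernetes.client.models import V1ResourceRequirements

# Equivalent of AVAILABLE_FRAMEWORKS['sklearn'](...) with the new arguments:
# the framework spec now pins a runtime version and carries explicit
# CPU/memory requests and limits instead of relying on cluster defaults.
sklearn_spec = V1beta1SKLearnSpec(
    storage_uri='gs://example-bucket/model',  # placeholder model location
    runtime_version='latest',
    resources=V1ResourceRequirements(
        requests={'cpu': '0.5', 'memory': '512Mi'},
        limits={'cpu': '1', 'memory': '1Gi'},
    ),
)
```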
@@ -178,10 +187,10 @@ def submit_api_request(kserve_client, action, name, isvc, namespace=None,
    return outputs


- def perform_action(action, model_name, model_uri, canary_traffic_percent, namespace,
-                    framework, custom_model_spec, service_account, inferenceservice_yaml,
-                    request_timeout, autoscaling_target=0, enable_istio_sidecar=True,
-                    watch_timeout=300, min_replicas=0, max_replicas=0):
+ def perform_action(action, model_name, model_uri, canary_traffic_percent, namespace, framework,
+                    runtime_version, resource_requests, resource_limits, custom_model_spec,
+                    service_account, inferenceservice_yaml, request_timeout, autoscaling_target=0,
+                    enable_istio_sidecar=True, watch_timeout=300, min_replicas=0, max_replicas=0):
    """
    Perform the specified action. If the action is not 'delete' and `inferenceService_yaml`
    was provided, the dict representation of the YAML will be sent directly to the
@@ -224,8 +233,9 @@ def perform_action(action, model_name, model_uri, canary_traffic_percent, namespace,

    # Build the V1beta1PredictorSpec.
    predictor_spec = create_predictor_spec(
-         framework, model_uri, canary_traffic_percent, service_account,
-         min_replicas, max_replicas, containers, request_timeout
+         framework, runtime_version, resource_requests, resource_limits,
+         model_uri, canary_traffic_percent, service_account, min_replicas,
+         max_replicas, containers, request_timeout
    )

    isvc = create_inference_service(metadata, predictor_spec)
@@ -287,6 +297,24 @@ def main():
        str(list(AVAILABLE_FRAMEWORKS.keys())),
        default=""
    )
+     parser.add_argument(
+         "--runtime-version",
+         type=str,
+         help="Runtime Version of Machine Learning Framework",
+         default="latest"
+     )
+     parser.add_argument(
+         "--resource-requests",
+         type=json.loads,
+         help="CPU and Memory requests for Model Serving",
+         default='{"cpu": "0.5", "memory": "512Mi"}',
+     )
+     parser.add_argument(
+         "--resource-limits",
+         type=json.loads,
+         help="CPU and Memory limits for Model Serving",
+         default='{"cpu": "1", "memory": "1Gi"}',
+     )
    parser.add_argument(
        "--custom-model-spec",
        type=json.loads,
@@ -342,6 +370,9 @@ def main():
    canary_traffic_percent = int(args.canary_traffic_percent)
    namespace = args.namespace
    framework = args.framework.lower()
+     runtime_version = args.runtime_version.lower()
+     resource_requests = args.resource_requests
+     resource_limits = args.resource_limits
    output_path = args.output_path
    custom_model_spec = args.custom_model_spec
    autoscaling_target = int(args.autoscaling_target)
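
One subtlety behind the assignments above: because the new `--resource-requests` and `--resource-limits` flags use `type=json.loads`, and argparse also runs the `type` converter on string defaults, the values arrive in `main()` as Python dicts whether or not the flags are passed. A minimal sketch of that behavior, with illustrative values:

```python
import argparse
import json

parser = argparse.ArgumentParser()
parser.add_argument(
    '--resource-requests',
    type=json.loads,
    default='{"cpu": "0.5", "memory": "512Mi"}',
)

# No flag passed: argparse runs the string default through json.loads.
args = parser.parse_args([])
assert args.resource_requests == {'cpu': '0.5', 'memory': '512Mi'}

# Explicit flag: the JSON string from the command line is parsed the same way.
args = parser.parse_args(['--resource-requests', '{"cpu": "2", "memory": "1Gi"}'])
assert args.resource_requests == {'cpu': '2', 'memory': '1Gi'}
```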
@@ -381,6 +412,9 @@ def main():
        canary_traffic_percent=canary_traffic_percent,
        namespace=namespace,
        framework=framework,
+         runtime_version=runtime_version,
+         resource_requests=resource_requests,
+         resource_limits=resource_limits,
        custom_model_spec=custom_model_spec,
        autoscaling_target=autoscaling_target,
        service_account=service_account,