feat(components) Extend kserve component #10136

Merged
4 changes: 3 additions & 1 deletion components/kserve/README.md
@@ -39,6 +39,9 @@ kserve_op = components.load_component_from_url('https://raw.githubusercontent.co
| canary_traffic_percent | `100` | The traffic split percentage between the candidate model and the last ready model |
| namespace | | Kubernetes namespace where the KServe service is deployed. If no namespace is provided, `anonymous` will be used unless a namespace is provided in the `inferenceservice_yaml` argument. |
| framework | | Machine learning framework for model serving. Currently the supported frameworks are `tensorflow`, `pytorch`, `sklearn`, `xgboost`, `onnx`, `triton`, `pmml`, and `lightgbm`. |
| runtime_version | `latest` | Runtime Version of Machine Learning Framework |
| resource_requests | `{"cpu": "0.5", "memory": "512Mi"}` | CPU and Memory requests for Model Serving |
| resource_limits | `{"cpu": "1", "memory": "1Gi"}` | CPU and Memory limits for Model Serving |
| custom_model_spec | `{}` | Custom model runtime container spec in JSON. Sample spec: `{"image": "codait/max-object-detector", "port":5000, "name": "test-container"}` |
| inferenceservice_yaml | `{}` | Raw InferenceService serialized YAML for deployment. Use this if you need additional configurations for your InferenceService. |
| autoscaling_target | `0` | Autoscaling Target Number. If not 0, sets the following annotation on the InferenceService: `autoscaling.knative.dev/target` |
@@ -185,4 +188,3 @@ kserve_op(
inferenceservice_yaml=isvc_yaml
)
```
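
For reference, a minimal sketch of how the new inputs might be supplied from a KFP pipeline. The model name, storage URI, and component URL below are placeholders, and the resource values are passed as JSON strings because the component's inputs are string-typed.

```python
import json
from kfp import components, dsl

# Placeholder: substitute the full raw URL of components/kserve/component.yaml
# (the URL is truncated in the README snippet above).
KSERVE_COMPONENT_URL = 'https://raw.githubusercontent.com/.../components/kserve/component.yaml'
kserve_op = components.load_component_from_url(KSERVE_COMPONENT_URL)

@dsl.pipeline(name='serve-model-with-resources')
def serve_pipeline():
    kserve_op(
        action='create',
        model_name='example-model',              # placeholder
        model_uri='gs://example-bucket/model',   # placeholder storage location
        framework='sklearn',
        runtime_version='latest',
        # Dict-like values go in as JSON strings, matching the defaults above.
        resource_requests=json.dumps({"cpu": "0.5", "memory": "512Mi"}),
        resource_limits=json.dumps({"cpu": "1", "memory": "1Gi"}),
    )
```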

40 changes: 23 additions & 17 deletions components/kserve/component.yaml
@@ -1,25 +1,28 @@
name: Serve a model with KServe
description: Serve Models using KServe
inputs:
- {name: Action, type: String, default: 'create', description: 'Action to execute on KServe'}
- {name: Model Name, type: String, default: '', description: 'Name to give to the deployed model'}
- {name: Model URI, type: String, default: '', description: 'Path of the S3 or GCS compatible directory containing the model.'}
- {name: Canary Traffic Percent, type: String, default: '100', description: 'The traffic split percentage between the candidate model and the last ready model'}
- {name: Namespace, type: String, default: '', description: 'Kubernetes namespace where the KServe service is deployed.'}
- {name: Framework, type: String, default: '', description: 'Machine Learning Framework for Model Serving.'}
- {name: Custom Model Spec, type: String, default: '{}', description: 'Custom model runtime container spec in JSON'}
- {name: Autoscaling Target, type: String, default: '0', description: 'Autoscaling Target Number'}
- {name: Service Account, type: String, default: '', description: 'ServiceAccount to use to run the InferenceService pod'}
- {name: Enable Istio Sidecar, type: Bool, default: 'True', description: 'Whether to enable istio sidecar injection'}
- {name: InferenceService YAML, type: String, default: '{}', description: 'Raw InferenceService serialized YAML for deployment'}
- {name: Watch Timeout, type: String, default: '300', description: "Timeout seconds for watching until InferenceService becomes ready."}
- {name: Min Replicas, type: String, default: '-1', description: 'Minimum number of InferenceService replicas'}
- {name: Max Replicas, type: String, default: '-1', description: 'Maximum number of InferenceService replicas'}
- {name: Request Timeout, type: String, default: '60', description: "Specifies the number of seconds to wait before timing out a request to the component."}
- {name: Enable ISVC Status, type: Bool, default: 'True', description: "Specifies whether to store the inference service status as the output parameter"}
- {name: Action, type: String, default: 'create', description: 'Action to execute on KServe'}
- {name: Model Name, type: String, default: '', description: 'Name to give to the deployed model'}
- {name: Model URI, type: String, default: '', description: 'Path of the S3 or GCS compatible directory containing the model.'}
- {name: Canary Traffic Percent, type: String, default: '100', description: 'The traffic split percentage between the candidate model and the last ready model'}
- {name: Namespace, type: String, default: '', description: 'Kubernetes namespace where the KServe service is deployed.'}
- {name: Framework, type: String, default: '', description: 'Machine Learning Framework for Model Serving.'}
- {name: Runtime Version, type: String, default: 'latest', description: 'Runtime Version of Machine Learning Framework'}
- {name: Resource Requests, type: String, default: '{"cpu": "0.5", "memory": "512Mi"}', description: 'CPU and Memory requests for Model Serving'}
- {name: Resource Limits, type: String, default: '{"cpu": "1", "memory": "1Gi"}', description: 'CPU and Memory limits for Model Serving'}
- {name: Custom Model Spec, type: String, default: '{}', description: 'Custom model runtime container spec in JSON'}
- {name: Autoscaling Target, type: String, default: '0', description: 'Autoscaling Target Number'}
- {name: Service Account, type: String, default: '', description: 'ServiceAccount to use to run the InferenceService pod'}
- {name: Enable Istio Sidecar, type: Bool, default: 'True', description: 'Whether to enable istio sidecar injection'}
- {name: InferenceService YAML, type: String, default: '{}', description: 'Raw InferenceService serialized YAML for deployment'}
- {name: Watch Timeout, type: String, default: '300', description: "Timeout seconds for watching until InferenceService becomes ready."}
- {name: Min Replicas, type: String, default: '-1', description: 'Minimum number of InferenceService replicas'}
- {name: Max Replicas, type: String, default: '-1', description: 'Maximum number of InferenceService replicas'}
- {name: Request Timeout, type: String, default: '60', description: "Specifies the number of seconds to wait before timing out a request to the component."}
- {name: Enable ISVC Status, type: Bool, default: 'True', description: "Specifies whether to store the inference service status as the output parameter"}

outputs:
- {name: InferenceService Status, type: String, description: 'Status JSON output of InferenceService'}
- {name: InferenceService Status, type: String, description: 'Status JSON output of InferenceService'}
implementation:
container:
image: quay.io/aipipeline/kserve-component:v0.11.1
@@ -32,6 +35,9 @@ implementation:
--canary-traffic-percent, {inputValue: Canary Traffic Percent},
--namespace, {inputValue: Namespace},
--framework, {inputValue: Framework},
--runtime-version, {inputValue: Runtime Version},
--resource-requests, {inputValue: Resource Requests},
--resource-limits, {inputValue: Resource Limits},
--custom-model-spec, {inputValue: Custom Model Spec},
--autoscaling-target, {inputValue: Autoscaling Target},
--service-account, {inputValue: Service Account},
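
Since every input above is declared as a String, the resource defaults are JSON documents embedded in strings. A quick sketch (values copied from the defaults above) of how they round-trip into the dicts that the deployer script, shown next, works with:

```python
import json

# Defaults exactly as declared in component.yaml (string-typed inputs).
resource_requests_input = '{"cpu": "0.5", "memory": "512Mi"}'
resource_limits_input = '{"cpu": "1", "memory": "1Gi"}'

# kservedeployer.py (below) parses the corresponding CLI flags with json.loads,
# yielding plain dicts suitable for the Kubernetes client.
assert json.loads(resource_requests_input) == {"cpu": "0.5", "memory": "512Mi"}
assert json.loads(resource_limits_input) == {"cpu": "1", "memory": "1Gi"}
```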
52 changes: 43 additions & 9 deletions components/kserve/src/kservedeployer.py
@@ -21,6 +21,7 @@
import yaml

from kubernetes import client
from kubernetes.client.models import V1ResourceRequirements

from kserve import constants
from kserve import KServeClient
@@ -50,8 +51,9 @@
}


def create_predictor_spec(framework, storage_uri, canary_traffic_percent,
service_account, min_replicas, max_replicas, containers, request_timeout):
def create_predictor_spec(framework, runtime_version, resource_requests, resource_limits,
storage_uri, canary_traffic_percent, service_account, min_replicas,
max_replicas, containers, request_timeout):
"""
Create and return V1beta1PredictorSpec to be used in a V1beta1InferenceServiceSpec
object.
@@ -81,7 +83,14 @@ def create_predictor_spec(framework, storage_uri, canary_traffic_percent,
setattr(
predictor_spec,
framework,
AVAILABLE_FRAMEWORKS[framework](storage_uri=storage_uri)
AVAILABLE_FRAMEWORKS[framework](
storage_uri=storage_uri,
resources=V1ResourceRequirements(
requests=resource_requests,
limits=resource_limits
),
runtime_version=runtime_version
)
)
return predictor_spec
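
To make the effect of the new arguments concrete, here is a minimal sketch of the framework spec the `setattr` call above builds for, say, the sklearn predictor. The constructors are from the kserve SDK; the concrete values are illustrative.

```python
from kubernetes.client.models import V1ResourceRequirements
from kserve import V1beta1PredictorSpec, V1beta1SKLearnSpec

# Illustrative values; in the component they arrive via the CLI flags below.
sklearn_spec = V1beta1SKLearnSpec(
    storage_uri='gs://example-bucket/model',     # placeholder
    runtime_version='latest',
    resources=V1ResourceRequirements(
        requests={'cpu': '0.5', 'memory': '512Mi'},
        limits={'cpu': '1', 'memory': '1Gi'},
    ),
)
predictor_spec = V1beta1PredictorSpec(sklearn=sklearn_spec)
```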

@@ -178,10 +187,10 @@ def submit_api_request(kserve_client, action, name, isvc, namespace=None,
return outputs


def perform_action(action, model_name, model_uri, canary_traffic_percent, namespace,
framework, custom_model_spec, service_account, inferenceservice_yaml,
request_timeout, autoscaling_target=0, enable_istio_sidecar=True,
watch_timeout=300, min_replicas=0, max_replicas=0):
def perform_action(action, model_name, model_uri, canary_traffic_percent, namespace, framework,
runtime_version, resource_requests, resource_limits, custom_model_spec,
service_account, inferenceservice_yaml, request_timeout, autoscaling_target=0,
enable_istio_sidecar=True, watch_timeout=300, min_replicas=0, max_replicas=0):
"""
Perform the specified action. If the action is not 'delete' and `inferenceService_yaml`
was provided, the dict representation of the YAML will be sent directly to the
@@ -224,8 +233,9 @@ def perform_action(action, model_name, model_uri, canary_traffic_percent, namesp

# Build the V1beta1PredictorSpec.
predictor_spec = create_predictor_spec(
framework, model_uri, canary_traffic_percent, service_account,
min_replicas, max_replicas, containers, request_timeout
framework, runtime_version, resource_requests, resource_limits,
model_uri, canary_traffic_percent, service_account, min_replicas,
max_replicas, containers, request_timeout
)

isvc = create_inference_service(metadata, predictor_spec)
@@ -287,6 +297,24 @@ def main():
str(list(AVAILABLE_FRAMEWORKS.keys())),
default=""
)
parser.add_argument(
"--runtime-version",
type=str,
help="Runtime Version of Machine Learning Framework",
default="latest"
)
parser.add_argument(
"--resource-requests",
type=json.loads,
help="CPU and Memory requests for Model Serving",
default='{"cpu": "0.5", "memory": "512Mi"}',
)
parser.add_argument(
"--resource-limits",
type=json.loads,
help="CPU and Memory limits for Model Serving",
default='{"cpu": "1", "memory": "1Gi"}',
)
parser.add_argument(
"--custom-model-spec",
type=json.loads,
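
Because the new `--resource-requests` and `--resource-limits` flags use `type=json.loads`, both explicitly passed values and the string defaults arrive on the parsed namespace as dicts (argparse runs string defaults through `type` as well). A small sketch of that behavior; the flag values are illustrative.

```python
import argparse
import json

parser = argparse.ArgumentParser()
parser.add_argument(
    "--resource-requests",
    type=json.loads,
    default='{"cpu": "0.5", "memory": "512Mi"}',
)

# Explicit flag: the JSON string coming from the component input is parsed to a dict.
args = parser.parse_args(["--resource-requests", '{"cpu": "2", "memory": "4Gi"}'])
assert args.resource_requests == {"cpu": "2", "memory": "4Gi"}

# No flag given: the string default is also run through json.loads.
args = parser.parse_args([])
assert args.resource_requests == {"cpu": "0.5", "memory": "512Mi"}
```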
@@ -342,6 +370,9 @@ def main():
canary_traffic_percent = int(args.canary_traffic_percent)
namespace = args.namespace
framework = args.framework.lower()
runtime_version = args.runtime_version.lower()
resource_requests = args.resource_requests
resource_limits = args.resource_limits
output_path = args.output_path
custom_model_spec = args.custom_model_spec
autoscaling_target = int(args.autoscaling_target)
@@ -381,6 +412,9 @@
canary_traffic_percent=canary_traffic_percent,
namespace=namespace,
framework=framework,
runtime_version=runtime_version,
resource_requests=resource_requests,
resource_limits=resource_limits,
custom_model_spec=custom_model_spec,
autoscaling_target=autoscaling_target,
service_account=service_account,