feat(components) Extend kserve component (kubeflow#10136)
* add runtime version, resource requests and resource limits

* adjust kservedeployer

* Update components/kserve/src/kservedeployer.py

Co-authored-by: Tommy Li <Tommy.chaoping.li@ibm.com>

* Update components/kserve/src/kservedeployer.py

Co-authored-by: Tommy Li <Tommy.chaoping.li@ibm.com>

* Update components/kserve/src/kservedeployer.py

Co-authored-by: Tommy Li <Tommy.chaoping.li@ibm.com>

---------

Co-authored-by: Tommy Li <Tommy.chaoping.li@ibm.com>
2 people authored and stijntratsaertit committed Feb 16, 2024
1 parent da64789 commit 518affe
Showing 3 changed files with 69 additions and 27 deletions.
4 changes: 3 additions & 1 deletion components/kserve/README.md
@@ -39,6 +39,9 @@ kserve_op = components.load_component_from_url('https://raw.githubusercontent.co
| canary_traffic_percent | `100` | The traffic split percentage between the candidate model and the last ready model |
| namespace | | Kubernetes namespace where the KServe service is deployed. If no namespace is provided, `anonymous` will be used unless a namespace is provided in the `inferenceservice_yaml` argument. |
| framework | | Machine learning framework for model serving. Currently the supported frameworks are `tensorflow`, `pytorch`, `sklearn`, `xgboost`, `onnx`, `triton`, `pmml`, and `lightgbm`. |
+ | runtime_version | `latest` | Runtime Version of Machine Learning Framework |
+ | resource_requests | `{"cpu": "0.5", "memory": "512Mi"}` | CPU and Memory requests for Model Serving |
+ | resource_limits | `{"cpu": "1", "memory": "1Gi"}` | CPU and Memory limits for Model Serving |
| custom_model_spec | `{}` | Custom model runtime container spec in JSON. Sample spec: `{"image": "codait/max-object-detector", "port":5000, "name": "test-container"}` |
| inferenceservice_yaml | `{}` | Raw InferenceService serialized YAML for deployment. Use this if you need additional configurations for your InferenceService. |
| autoscaling_target | `0` | Autoscaling Target Number. If not 0, sets the following annotation on the InferenceService: `autoscaling.knative.dev/target` |
@@ -185,4 +188,3 @@ kserve_op(
inferenceservice_yaml=isvc_yaml
)
```
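
For orientation, here is a minimal sketch of how the three new parameters might be passed from a KFP v1 pipeline. It is not part of this commit; the component URL, pipeline name, and model URI are illustrative placeholders:

```python
import json
from kfp import components, dsl

# Placeholder URL: point this at the component.yaml extended in this commit.
kserve_op = components.load_component_from_url(
    'https://raw.githubusercontent.com/kubeflow/pipelines/master/components/kserve/component.yaml'
)

@dsl.pipeline(name='kserve-resource-demo')
def kserve_pipeline():
    # All component inputs are strings, so the resource dicts are
    # serialized to JSON before being handed to the component.
    kserve_op(
        action='apply',
        model_name='example-sklearn',
        model_uri='gs://example-bucket/model',  # placeholder model location
        framework='sklearn',
        runtime_version='latest',
        resource_requests=json.dumps({'cpu': '0.5', 'memory': '512Mi'}),
        resource_limits=json.dumps({'cpu': '1', 'memory': '1Gi'}),
    )
```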

40 changes: 23 additions & 17 deletions components/kserve/component.yaml
@@ -1,25 +1,28 @@
name: Serve a model with KServe
description: Serve Models using KServe
inputs:
- - {name: Action, type: String, default: 'create', description: 'Action to execute on KServe'}
- - {name: Model Name, type: String, default: '', description: 'Name to give to the deployed model'}
- - {name: Model URI, type: String, default: '', description: 'Path of the S3 or GCS compatible directory containing the model.'}
- - {name: Canary Traffic Percent, type: String, default: '100', description: 'The traffic split percentage between the candidate model and the last ready model'}
- - {name: Namespace, type: String, default: '', description: 'Kubernetes namespace where the KServe service is deployed.'}
- - {name: Framework, type: String, default: '', description: 'Machine Learning Framework for Model Serving.'}
- - {name: Custom Model Spec, type: String, default: '{}', description: 'Custom model runtime container spec in JSON'}
- - {name: Autoscaling Target, type: String, default: '0', description: 'Autoscaling Target Number'}
- - {name: Service Account, type: String, default: '', description: 'ServiceAccount to use to run the InferenceService pod'}
- - {name: Enable Istio Sidecar, type: Bool, default: 'True', description: 'Whether to enable istio sidecar injection'}
- - {name: InferenceService YAML, type: String, default: '{}', description: 'Raw InferenceService serialized YAML for deployment'}
- - {name: Watch Timeout, type: String, default: '300', description: "Timeout seconds for watching until InferenceService becomes ready."}
- - {name: Min Replicas, type: String, default: '-1', description: 'Minimum number of InferenceService replicas'}
- - {name: Max Replicas, type: String, default: '-1', description: 'Maximum number of InferenceService replicas'}
- - {name: Request Timeout, type: String, default: '60', description: "Specifies the number of seconds to wait before timing out a request to the component."}
- - {name: Enable ISVC Status, type: Bool, default: 'True', description: "Specifies whether to store the inference service status as the output parameter"}
+ - {name: Action, type: String, default: 'create', description: 'Action to execute on KServe'}
+ - {name: Model Name, type: String, default: '', description: 'Name to give to the deployed model'}
+ - {name: Model URI, type: String, default: '', description: 'Path of the S3 or GCS compatible directory containing the model.'}
+ - {name: Canary Traffic Percent, type: String, default: '100', description: 'The traffic split percentage between the candidate model and the last ready model'}
+ - {name: Namespace, type: String, default: '', description: 'Kubernetes namespace where the KServe service is deployed.'}
+ - {name: Framework, type: String, default: '', description: 'Machine Learning Framework for Model Serving.'}
+ - {name: Runtime Version, type: String, default: 'latest', description: 'Runtime Version of Machine Learning Framework'}
+ - {name: Resource Requests, type: String, default: '{"cpu": "0.5", "memory": "512Mi"}', description: 'CPU and Memory requests for Model Serving'}
+ - {name: Resource Limits, type: String, default: '{"cpu": "1", "memory": "1Gi"}', description: 'CPU and Memory limits for Model Serving'}
+ - {name: Custom Model Spec, type: String, default: '{}', description: 'Custom model runtime container spec in JSON'}
+ - {name: Autoscaling Target, type: String, default: '0', description: 'Autoscaling Target Number'}
+ - {name: Service Account, type: String, default: '', description: 'ServiceAccount to use to run the InferenceService pod'}
+ - {name: Enable Istio Sidecar, type: Bool, default: 'True', description: 'Whether to enable istio sidecar injection'}
+ - {name: InferenceService YAML, type: String, default: '{}', description: 'Raw InferenceService serialized YAML for deployment'}
+ - {name: Watch Timeout, type: String, default: '300', description: "Timeout seconds for watching until InferenceService becomes ready."}
+ - {name: Min Replicas, type: String, default: '-1', description: 'Minimum number of InferenceService replicas'}
+ - {name: Max Replicas, type: String, default: '-1', description: 'Maximum number of InferenceService replicas'}
+ - {name: Request Timeout, type: String, default: '60', description: "Specifies the number of seconds to wait before timing out a request to the component."}
+ - {name: Enable ISVC Status, type: Bool, default: 'True', description: "Specifies whether to store the inference service status as the output parameter"}

outputs:
- - {name: InferenceService Status, type: String, description: 'Status JSON output of InferenceService'}
+ - {name: InferenceService Status, type: String, description: 'Status JSON output of InferenceService'}
implementation:
container:
image: quay.io/aipipeline/kserve-component:v0.11.1
@@ -32,6 +35,9 @@ implementation:
--canary-traffic-percent, {inputValue: Canary Traffic Percent},
--namespace, {inputValue: Namespace},
--framework, {inputValue: Framework},
+ --runtime-version, {inputValue: Runtime Version},
+ --resource-requests, {inputValue: Resource Requests},
+ --resource-limits, {inputValue: Resource Limits},
--custom-model-spec, {inputValue: Custom Model Spec},
--autoscaling-target, {inputValue: Autoscaling Target},
--service-account, {inputValue: Service Account},
52 changes: 43 additions & 9 deletions components/kserve/src/kservedeployer.py
@@ -21,6 +21,7 @@
import yaml

from kubernetes import client
+ from kubernetes.client.models import V1ResourceRequirements

from kserve import constants
from kserve import KServeClient
@@ -50,8 +51,9 @@
}


- def create_predictor_spec(framework, storage_uri, canary_traffic_percent,
-                            service_account, min_replicas, max_replicas, containers, request_timeout):
+ def create_predictor_spec(framework, runtime_version, resource_requests, resource_limits,
+                            storage_uri, canary_traffic_percent, service_account, min_replicas,
+                            max_replicas, containers, request_timeout):
    """
    Create and return V1beta1PredictorSpec to be used in a V1beta1InferenceServiceSpec
    object.
@@ -81,7 +83,14 @@ def create_predictor_spec(framework, storage_uri, canary_traffic_percent,
        setattr(
            predictor_spec,
            framework,
-             AVAILABLE_FRAMEWORKS[framework](storage_uri=storage_uri)
+             AVAILABLE_FRAMEWORKS[framework](
+                 storage_uri=storage_uri,
+                 resources=V1ResourceRequirements(
+                     requests=resource_requests,
+                     limits=resource_limits
+                 ),
+                 runtime_version=runtime_version
+             )
        )
    return predictor_spec

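For reference, a standalone sketch of the framework spec the call above now builds. `V1beta1SKLearnSpec` stands in for whichever entry of `AVAILABLE_FRAMEWORKS` is selected; the storage URI and version string are placeholders:

```python
from kserve import V1beta1SKLearnSpec
from kubernetes.client.models import V1ResourceRequirements

# Equivalent of AVAILABLE_FRAMEWORKS['sklearn'](...) with the new arguments:
# the framework spec now pins a runtime version and carries explicit
# CPU/memory requests and limits instead of relying on cluster defaults.
sklearn_spec = V1beta1SKLearnSpec(
    storage_uri='gs://example-bucket/model',  # placeholder model location
    runtime_version='latest',
    resources=V1ResourceRequirements(
        requests={'cpu': '0.5', 'memory': '512Mi'},
        limits={'cpu': '1', 'memory': '1Gi'},
    ),
)
```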
@@ -178,10 +187,10 @@ def submit_api_request(kserve_client, action, name, isvc, namespace=None,
    return outputs


- def perform_action(action, model_name, model_uri, canary_traffic_percent, namespace,
-                    framework, custom_model_spec, service_account, inferenceservice_yaml,
-                    request_timeout, autoscaling_target=0, enable_istio_sidecar=True,
-                    watch_timeout=300, min_replicas=0, max_replicas=0):
+ def perform_action(action, model_name, model_uri, canary_traffic_percent, namespace, framework,
+                    runtime_version, resource_requests, resource_limits, custom_model_spec,
+                    service_account, inferenceservice_yaml, request_timeout, autoscaling_target=0,
+                    enable_istio_sidecar=True, watch_timeout=300, min_replicas=0, max_replicas=0):
    """
    Perform the specified action. If the action is not 'delete' and `inferenceService_yaml`
    was provided, the dict representation of the YAML will be sent directly to the
@@ -224,8 +233,9 @@ def perform_action(action, model_name, model_uri, canary_traffic_percent, namespace,

    # Build the V1beta1PredictorSpec.
    predictor_spec = create_predictor_spec(
-         framework, model_uri, canary_traffic_percent, service_account,
-         min_replicas, max_replicas, containers, request_timeout
+         framework, runtime_version, resource_requests, resource_limits,
+         model_uri, canary_traffic_percent, service_account, min_replicas,
+         max_replicas, containers, request_timeout
    )

    isvc = create_inference_service(metadata, predictor_spec)
@@ -287,6 +297,24 @@ def main():
        str(list(AVAILABLE_FRAMEWORKS.keys())),
        default=""
    )
+     parser.add_argument(
+         "--runtime-version",
+         type=str,
+         help="Runtime Version of Machine Learning Framework",
+         default="latest"
+     )
+     parser.add_argument(
+         "--resource-requests",
+         type=json.loads,
+         help="CPU and Memory requests for Model Serving",
+         default='{"cpu": "0.5", "memory": "512Mi"}',
+     )
+     parser.add_argument(
+         "--resource-limits",
+         type=json.loads,
+         help="CPU and Memory limits for Model Serving",
+         default='{"cpu": "1", "memory": "1Gi"}',
+     )
    parser.add_argument(
        "--custom-model-spec",
        type=json.loads,
@@ -342,6 +370,9 @@ def main():
    canary_traffic_percent = int(args.canary_traffic_percent)
    namespace = args.namespace
    framework = args.framework.lower()
+     runtime_version = args.runtime_version.lower()
+     resource_requests = args.resource_requests
+     resource_limits = args.resource_limits
    output_path = args.output_path
    custom_model_spec = args.custom_model_spec
    autoscaling_target = int(args.autoscaling_target)
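
One subtlety behind the assignments above: because the new `--resource-requests` and `--resource-limits` flags use `type=json.loads`, and argparse also runs the `type` converter on string defaults, the values arrive in `main()` as Python dicts whether or not the flags are passed. A minimal sketch of that behavior, with illustrative values:

```python
import argparse
import json

parser = argparse.ArgumentParser()
parser.add_argument(
    '--resource-requests',
    type=json.loads,
    default='{"cpu": "0.5", "memory": "512Mi"}',
)

# No flag passed: argparse runs the string default through json.loads.
args = parser.parse_args([])
assert args.resource_requests == {'cpu': '0.5', 'memory': '512Mi'}

# Explicit flag: the JSON string from the command line is parsed the same way.
args = parser.parse_args(['--resource-requests', '{"cpu": "2", "memory": "1Gi"}'])
assert args.resource_requests == {'cpu': '2', 'memory': '1Gi'}
```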
@@ -381,6 +412,9 @@ def main():
        canary_traffic_percent=canary_traffic_percent,
        namespace=namespace,
        framework=framework,
+         runtime_version=runtime_version,
+         resource_requests=resource_requests,
+         resource_limits=resource_limits,
        custom_model_spec=custom_model_spec,
        autoscaling_target=autoscaling_target,
        service_account=service_account,