From c930b00d0a642f27155098e0ed04fa3253cd1db7 Mon Sep 17 00:00:00 2001
From: Lucian Carata
Date: Mon, 23 Sep 2024 18:33:09 +0100
Subject: [PATCH] add(operator): Model selector for scale subresource to
 enable HPA-based scaling

- updates the Model CRD to contain a pod selector in the scale subresource
- sets the selector to a label `server=[inference-server-name]` matching no
  actual pods
- docs [to be moved to gitbook before merging]
---
 .../contents/architecture/hpa-autoscaling.md | 287 ++++++++++++++++++
 .../templates/seldon-v2-crds.yaml            |   5 +
 k8s/yaml/crds.yaml                           |   5 +
 k8s/yaml/runtime.yaml                        |   6 +
 k8s/yaml/servers.yaml                        |   2 +
 operator/apis/mlops/v1alpha1/model_types.go  |   5 +-
 .../crd/bases/mlops.seldon.io_models.yaml    |   5 +
 operator/scheduler/model.go                  |   1 +
 8 files changed, 314 insertions(+), 2 deletions(-)
 create mode 100644 docs/source/contents/architecture/hpa-autoscaling.md

diff --git a/docs/source/contents/architecture/hpa-autoscaling.md b/docs/source/contents/architecture/hpa-autoscaling.md
new file mode 100644
index 0000000000..45dfc480e2
--- /dev/null
+++ b/docs/source/contents/architecture/hpa-autoscaling.md
@@ -0,0 +1,287 @@
# Autoscaling single-model serving based on model RPS, using HorizontalPodAutoscaler

The goal is to autoscale model and server replicas based on model inference RPS. This will require:

- Having a Seldon Core 2 install that publishes metrics to Prometheus (the default). In the following, we assume that Prometheus is installed in a separate namespace, `seldon-monitoring`
- Installing and configuring the [Prometheus Adapter](https://github.com/kubernetes-sigs/prometheus-adapter), which allows the results of Prometheus queries on relevant metrics to be published as k8s custom metrics
- Configuring HPA manifests to scale Models and the corresponding Server replicas

### Installing and configuring the Prometheus Adapter

The role of the Prometheus Adapter is to expose queries on metrics in Prometheus as k8s custom or external metrics. HPA can then access those metrics in order to take scaling decisions.

To install via helm:

```bash
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
helm install --set prometheus.url='http://seldon-monitoring-prometheus' hpa-metrics prometheus-community/prometheus-adapter -n seldon-monitoring
```

The commands above install `prometheus-adapter` as a helm release named `hpa-metrics`, in the same namespace as our Prometheus install, and point it to the Prometheus service URL (without the port).

If Prometheus runs on a port other than the default 9090, you can also pass `--set prometheus.port=[custom_port]`. You may inspect all the options available as helm values by running `helm show values prometheus-community/prometheus-adapter`.

We now need to configure the adapter to look for the correct Prometheus metrics and compute per-model RPS values. On install, the adapter has created a `ConfigMap` in the same namespace as itself, named `[helm_release_name]-prometheus-adapter` (in our case, `hpa-metrics-prometheus-adapter`).

We want to overwrite this ConfigMap with the content below (change the name if your helm release uses a different one). The manifest contains embedded documentation, highlighting how we match the `seldon_model_infer_total` metric in Prometheus, compute a rate via a `metricsQuery`, and expose the result to k8s as the `infer_rps` metric, on a per (model, namespace) basis.
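Concretely, when HPA requests `infer_rps` for a Model named `iris0` in the `seldon-mesh` namespace, the adapter ends up running a PromQL query equivalent to the one below. This is an illustrative expansion of the templated `metricsQuery` from the manifest (the substitution rules are documented in the manifest comments); the model and namespace names are example values:

```promql
sum by (model) (
  rate(
    seldon_model_infer_total{model=~"iris0", namespace="seldon-mesh"}[1m]
  )
)
```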
Aggregations per (server, namespace) and per (pod, namespace) are also exposed and may be used in HPA, but we will focus on the (model, namespace) aggregation in the examples below.

You may want to modify some of the settings to match the Prometheus query that you typically use for RPS metrics. For example, the `metricsQuery` below computes the RPS by calling [`rate()`](https://prometheus.io/docs/prometheus/latest/querying/functions/#rate) with a 1-minute window.

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: hpa-metrics-prometheus-adapter
  namespace: seldon-monitoring
data:
  config.yaml: |-
    "rules":
    # Rule matching Seldon inference requests-per-second metrics and exposing aggregations for
    # specific k8s models, servers, pods and namespaces
    #
    # Uses the prometheus-side `seldon_model_(.*)_total` inference request count metrics to
    # compute and expose k8s custom metrics on inference RPS `${1}_rps`. A prometheus metric named
    # `seldon_model_infer_total` will be exposed as multiple `[group-by-k8s-resource]/infer_rps`
    # k8s metrics, for consumption by HPA.
    #
    # One k8s metric is generated for each k8s resource associated with a prometheus metric, as
    # defined in the "Association" section below. Because this association is defined based on
    # labels present in the prometheus metric, the number of generated k8s metrics will vary
    # depending on what labels are available in each discovered prometheus metric.
    #
    # The resources associated through this rule (when available as labels for each of the
    # discovered prometheus metrics) are:
    # - models
    # - servers
    # - pods (inference server pods)
    # - namespaces
    #
    # For example, you will get aggregated metrics for `models.mlops.seldon.io/iris0/infer_rps`,
    # `servers.mlops.seldon.io/mlserver/infer_rps`, `pods/mlserver-0/infer_rps`,
    # `namespaces/seldon-mesh/infer_rps`
    #
    # Metrics associated with any resource except the namespace one (models, servers and pods)
    # need to be requested in the context of a particular namespace.
    #
    # To fetch those k8s metrics manually once the prometheus-adapter is running, you can run:
    #
    # For "namespaced" resources, i.e. models, servers and pods (replace values in brackets):
    # ```
    # kubectl get --raw
    # "/apis/custom.metrics.k8s.io/v1beta1/namespaces/[NAMESPACE]/[RESOURCE_NAME]/[CR_NAME]/infer_rps"
    # ```
    #
    # For example:
    # ```
    # kubectl get --raw
    # "/apis/custom.metrics.k8s.io/v1beta1/namespaces/seldon-mesh/models.mlops.seldon.io/iris0/infer_rps"
    # ```
    #
    # For the namespace resource, you can get the namespace-level aggregation of the metric with:
    # ```
    # kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1/namespaces/*/metrics/infer_rps"
    # ```
    -
      # Metric discovery: selects subset of metrics exposed in Prometheus, based on name and
      # filters
      "seriesQuery": |
        {__name__=~"^seldon_model.*_total",namespace!=""}
      "seriesFilters":
      - "isNot": "^seldon_.*_seconds_total"
      - "isNot": "^seldon_.*_aggregate_.*"
      # Association: maps label values in the Prometheus metric to K8s resources (native or CRs)
      # Below, we associate the "model" prometheus metric label to the corresponding Seldon Model
      # CR, the "server" label to the Seldon Server CR, etc.
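      # The value of each overridden label is interpreted as the name of the associated k8s
      # object of the mapped resource type; a discovered prometheus metric that lacks one of
      # these labels simply does not generate the corresponding k8s metric.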
+ "resources": + "overrides": + "model": {group: "mlops.seldon.io", resource: "model"} + "server": {group: "mlops.seldon.io", resource: "server"} + "pod": {resource: "pod"} + "namespace": {resource: "namespace"} + # Rename prometheus metrics to get k8s metric names that reflect the processing done via + # the query applied to those metrics (actual query below under the "metricsQuery" key) + "name": + "matches": "^seldon_model_(.*)_total" + "as": "${1}_rps" + # The actual query to be executed against Prometheus to retrieve the metric value + # Here: + # - .Series is replaced by the discovered prometheus metric name (e.g. + # `seldon_model_infer_total`) + # - .LabelMatchers, when requesting a metric for a namespaced resource X with name x in + # namespace n, is replaced by `X=~"x",namespace="n"`. For example, `model=~"iris0", + # namespace="seldon-mesh"`. When requesting the namespace resource itself, only the + # `namespace="n"` is kept. + # - .GroupBy is replaced by the resource type of the requested metric (e.g. `model`, + # `server`, `pod` or `namespace`). + "metricsQuery": | + sum by (<<.GroupBy>>) ( + rate ( + <<.Series>>{<<.LabelMatchers>>}[1m] + ) + ) +``` + +Apply the config, and restart the prometheus adapter deployment (this restart is required so that prometheus-adapter picks up the new config): + +```bash +# Apply prometheus adapter config +kubectl apply -f prometheus-adapter.config.yaml +# Restart prom-adapter pods +kubectl rollout restart deployment hpa-metrics-prometheus-adapter -n seldon-monitoring +``` + +In order to test that the prometheus adapter config works and everything is set up correctly, you can issue raw kubectl requests against the custom metrics API, as described below. + +If no inference requests were issued towards any model in the Seldon install, the metrics configured above will not be available in prometheus, and thus will also not appear when checking via the commands below. Therefore, please first run some inference requests towards a sample model to ensure that the metrics are available — this is only required for the testing of the install. + + +**Testing the prometheus-adapter install using the custom metrics API** + +List available metrics + +```bash +kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/ | jq . +``` + +Fetching model RPS metric for specific (namespace, model) pair: + +```bash +kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/seldon-mesh/models.mlops.seldon.io/irisa0/infer_rps +``` + +Fetching model RPS metric aggregated at the (namespace, server) level: + +```bash +kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/seldon-mesh/servers.mlops.seldon.io/mlserver/infer_rps +``` + +Fetching model RPS metric aggregated at the (namespace, pod) level: + +```bash +kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/seldon-mesh/pods/mlserver-0/infer_rps +``` + +Fetching the same metric aggregated at namespace level: + +```bash +kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/*/metrics/infer_rps +``` + +### Configuring HPA manifests + +For every (Model, Server) pair you want to autoscale, you need to apply 2 HPA manifests based on the same metric: one scaling the Model, the other the Server. The example below only works if the mapping between Models and Servers is 1-to-1 (i.e no multi-model serving). + +Consider a model named `irisa0` with the following manifest. 
Consider a model named `irisa0` with the following manifest. Note that `minReplicas`/`maxReplicas` are not set: this disables the Seldon-specific autoscaling for the model, so that it does not interact with HPA.

```yaml
apiVersion: mlops.seldon.io/v1alpha1
kind: Model
metadata:
  name: irisa0
  namespace: seldon-mesh
spec:
  memory: 3M
  replicas: 1
  requirements:
  - sklearn
  storageUri: gs://seldon-models/testing/iris1
```

Let's scale this model when it is deployed on a server named `mlserver`, with a target average of 3 RPS per replica (sustained RPS above the target triggers a scale-up, below it a scale-down):

```yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: irisa0-model-hpa
  namespace: seldon-mesh
spec:
  scaleTargetRef:
    apiVersion: mlops.seldon.io/v1alpha1
    kind: Model
    name: irisa0
  minReplicas: 1
  maxReplicas: 3
  metrics:
  - type: Object
    object:
      metric:
        name: infer_rps
      describedObject:
        apiVersion: mlops.seldon.io/v1alpha1
        kind: Model
        name: irisa0
      target:
        type: AverageValue
        averageValue: 3
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: mlserver-server-hpa
  namespace: seldon-mesh
spec:
  scaleTargetRef:
    apiVersion: mlops.seldon.io/v1alpha1
    kind: Server
    name: mlserver
  minReplicas: 1
  maxReplicas: 3
  metrics:
  - type: Object
    object:
      metric:
        name: infer_rps
      describedObject:
        apiVersion: mlops.seldon.io/v1alpha1
        kind: Model
        name: irisa0
      target:
        type: AverageValue
        averageValue: 3
```

In the two HPA manifests above, the scaling metric is exactly the same, with exactly the same parameters: this ensures that the Model and the Server are scaled up/down at approximately the same time. Similarly, keep `minReplicas` and `maxReplicas` in sync across the (Model, Server) pair.

Please note that you **must** use a `target.type` of `AverageValue`.

Other target types will not work under the current Seldon Core v2 setup, due to the way in which HPA works: it requires a unique mapping between each scaled CR and a set of underlying pods. This is not the case here, as both the Model CR and the Server CR would need to map to the same underlying pods.
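After applying the two manifests, you can check that HPA is able to fetch the custom metric and that the (Model, Server) pair scales together. These are standard kubectl commands, using the HPA names from the manifests above:

```bash
# The TARGETS column shows the current infer_rps value against the target of 3
kubectl get hpa irisa0-model-hpa mlserver-server-hpa -n seldon-mesh

# Shows the fetched metric value, recent scaling events and any errors hit
# when querying the custom metrics API
kubectl describe hpa irisa0-model-hpa -n seldon-mesh
```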

**Advanced settings:**

- Filtering metrics by other labels on the prometheus metric

  The prometheus metric from which the model RPS is computed has the following labels:

  ```
  seldon_model_infer_total{code="200", container="agent", endpoint="metrics", instance="10.244.0.39:9006", job="seldon-mesh/agent", method_type="rest", model="irisa0", model_internal="irisa0_1", namespace="seldon-mesh", pod="mlserver-0", server="mlserver", server_replica="0"}
  ```

  If you want the scaling metric to be computed based only on inferences with particular values for those labels, you can add a label selector to the HPA metric config, as in the example below (targeting `method_type="rest"`):

  ```yaml
  metrics:
  - type: Object
    object:
      describedObject:
        apiVersion: mlops.seldon.io/v1alpha1
        kind: Model
        name: irisa0
      metric:
        name: infer_rps
        selector:
          matchLabels:
            method_type: rest
      target:
        type: AverageValue
        averageValue: "3"
  ```

- Customise the scale-up / scale-down rate & properties by using scaling policies, as described in the [HPA scaling policies docs](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#configurable-scaling-behavior)

For more resources, please consult the [HPA docs](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/) and the [HPA walkthrough](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/)
diff --git a/k8s/helm-charts/seldon-core-v2-crds/templates/seldon-v2-crds.yaml b/k8s/helm-charts/seldon-core-v2-crds/templates/seldon-v2-crds.yaml
index c6e2f2e443..cdccc3775f 100644
--- a/k8s/helm-charts/seldon-core-v2-crds/templates/seldon-v2-crds.yaml
+++ b/k8s/helm-charts/seldon-core-v2-crds/templates/seldon-v2-crds.yaml
@@ -341,12 +341,17 @@ spec:
                 description: Total number of replicas targeted by this model
                 format: int32
                 type: integer
+              selector:
+                type: string
+            required:
+            - selector
             type: object
         type: object
     served: true
     storage: true
     subresources:
       scale:
+        labelSelectorPath: .status.selector
         specReplicasPath: .spec.replicas
         statusReplicasPath: .status.replicas
       status: {}
diff --git a/k8s/yaml/crds.yaml b/k8s/yaml/crds.yaml
index 99ac05ff19..7e6c5c236e 100644
--- a/k8s/yaml/crds.yaml
+++ b/k8s/yaml/crds.yaml
@@ -344,12 +344,17 @@ spec:
                 description: Total number of replicas targeted by this model
                 format: int32
                 type: integer
+              selector:
+                type: string
+            required:
+            - selector
             type: object
         type: object
     served: true
     storage: true
     subresources:
       scale:
+        labelSelectorPath: .status.selector
         specReplicasPath: .spec.replicas
         statusReplicasPath: .status.replicas
       status: {}
diff --git a/k8s/yaml/runtime.yaml b/k8s/yaml/runtime.yaml
index 840617fed5..1951f26745 100644
--- a/k8s/yaml/runtime.yaml
+++ b/k8s/yaml/runtime.yaml
@@ -24,22 +24,28 @@ spec:
   - name: hodometer
     disable: false
     replicas: 1
+    podSpec: null
   - name: seldon-scheduler
     disable: false
     serviceType: LoadBalancer
+    podSpec: null
   - name: seldon-envoy
     disable: false
     replicas: 1
     serviceType: LoadBalancer
+    podSpec: null
   - name: seldon-dataflow-engine
     disable: false
     replicas: 1
+    podSpec: null
   - name: seldon-modelgateway
     disable: false
     replicas: 1
+    podSpec: null
   - name: seldon-pipelinegateway
     disable: false
     replicas: 1
+    podSpec: null
   config:
     agentConfig:
       rclone:
diff --git a/k8s/yaml/servers.yaml b/k8s/yaml/servers.yaml
index acc11ee2c9..b33488932c 100644
--- a/k8s/yaml/servers.yaml
+++ b/k8s/yaml/servers.yaml
@@ -5,6 +5,7 @@ kind: Server
 metadata:
   name: mlserver
 spec:
+  podSpec: null
   replicas: 1
   serverConfig: mlserver
 ---
@@ -14,5 +15,6 @@ kind: Server
 metadata:
   name: triton
 spec:
+  podSpec: null
   replicas: 1
   serverConfig: triton
diff --git a/operator/apis/mlops/v1alpha1/model_types.go b/operator/apis/mlops/v1alpha1/model_types.go
index cf66e11b1c..0340497b20 100644
--- a/operator/apis/mlops/v1alpha1/model_types.go
+++ b/operator/apis/mlops/v1alpha1/model_types.go
@@ -106,14 +106,15 @@ type InferenceArtifactSpec struct {
 // ModelStatus defines the observed state of Model
 type ModelStatus struct {
 	// Total number of replicas targeted by this model
-	Replicas int32 `json:"replicas,omitempty"`
+	Replicas int32  `json:"replicas,omitempty"`
+	Selector string `json:"selector"`
 	duckv1.Status `json:",inline"`
 }

 //+kubebuilder:object:root=true
 //+kubebuilder:subresource:status
 //+kubebuilder:resource:shortName=mlm
-//+kubebuilder:subresource:scale:specpath=.spec.replicas,statuspath=.status.replicas
+//+kubebuilder:subresource:scale:specpath=.spec.replicas,statuspath=.status.replicas,selectorpath=.status.selector
 //+kubebuilder:printcolumn:name="Ready",type=string,JSONPath=`.status.conditions[?(@.type=="ModelReady")].status`,description="Model ready status"
 //+kubebuilder:printcolumn:name="Replicas",type=integer,JSONPath=`.status.replicas`, description="Number of replicas"
 //+kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp`
diff --git a/operator/config/crd/bases/mlops.seldon.io_models.yaml b/operator/config/crd/bases/mlops.seldon.io_models.yaml
index fdc2435556..72b2825f96 100644
--- a/operator/config/crd/bases/mlops.seldon.io_models.yaml
+++ b/operator/config/crd/bases/mlops.seldon.io_models.yaml
@@ -193,12 +193,17 @@ spec:
                 description: Total number of replicas targeted by this model
                 format: int32
                 type: integer
+              selector:
+                type: string
+            required:
+            - selector
             type: object
         type: object
     served: true
     storage: true
     subresources:
       scale:
+        labelSelectorPath: .status.selector
         specReplicasPath: .spec.replicas
         statusReplicasPath: .status.replicas
       status: {}
diff --git a/operator/scheduler/model.go b/operator/scheduler/model.go
index 55fcd0db9d..72c4ba935d 100644
--- a/operator/scheduler/model.go
+++ b/operator/scheduler/model.go
@@ -259,6 +259,7 @@ func (s *SchedulerClient) SubscribeModelEvents(ctx context.Context, grpcClient s
 				modelStatus.GetAvailableReplicas() + modelStatus.GetUnavailableReplicas(),
 			)
+			latestModel.Status.Selector = "server=" + latestVersionStatus.ServerName
 			return s.updateModelStatus(latestModel)
 		})
 		if retryErr != nil {