Skip to content

Commit

Permalink
Add HPA support to embedding, reranking, tgi services
Browse files Browse the repository at this point in the history
Signed-off-by: Alexey Fomenko <alexey.fomenko@intel.com>
  • Loading branch information
byako committed Aug 21, 2024
1 parent b1182c4 commit a6b42bc
Show file tree
Hide file tree
Showing 20 changed files with 403 additions and 0 deletions.
53 changes: 53 additions & 0 deletions helm-charts/chatqna/templates/customMetrics.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
# PrometheusAdapter custom-metrics rules used by the ChatQnA HPA objects.
# Note: this intentionally replaces the cluster's existing "adapter-config"
# ConfigMap in the "monitoring" namespace (see values.yaml for details).
apiVersion: v1
kind: ConfigMap
metadata:
  name: adapter-config
  namespace: monitoring
data:
  config.yaml: |
    rules:
      # Average TGI request latency over 1 min, computed from the
      # inference-duration histogram sum/count pair.  The added 0.001 in the
      # denominator keeps the expression valid when the count rate is zero.
      - seriesQuery: '{__name__="tgi_request_inference_duration_sum"}'
        metricsQuery: 'rate(tgi_request_inference_duration_sum{<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{<<.LabelMatchers>>}[1m]))'
        name:
          matches: ^tgi_request_inference_duration_sum
          as: "tgi_request_latency"
        resources:
          # HPA needs both namespace + a suitable object resource for its query paths:
          #   /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_request_latency
          # (pod is not a suitable object type for matching, as each instance has a different name)
          overrides:
            namespace:
              resource: namespace
            service:
              resource: service
      # Average TEI reranking request latency over 1 min.
      - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-reranking-svc"}'
        metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]))'
        name:
          matches: ^te_request_inference_duration_sum
          as: "reranking_request_latency"
        resources:
          overrides:
            namespace:
              resource: namespace
            service:
              resource: service
      # Average TEI embedding request latency over 1 min.
      - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-embedding-svc"}'
        metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]))'
        name:
          matches: ^te_request_inference_duration_sum
          as: "embedding_request_latency"
        resources:
          overrides:
            namespace:
              resource: namespace
            service:
              resource: service
{{- end }}
8 changes: 8 additions & 0 deletions helm-charts/chatqna/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@

replicaCount: 1

# Enabling HPA will:
# - Overwrite existing PrometheusAdapter "adapter-config" configMap with ChatQnA specific custom metric queries
# for embedding, reranking, tgi services
# Upstream default configMap:
# - https://github.com/kubernetes-sigs/prometheus-adapter/blob/master/deploy/manifests/config-map.yaml
horizontalPodAutoscaler:
enabled: false

image:
repository: opea/chatqna
pullPolicy: IfNotPresent
Expand Down
28 changes: 28 additions & 0 deletions helm-charts/common/embedding-usvc/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,34 @@ helm dependency update
helm install embedding-usvc . --set autodependency.enabled=true
```

## HorizontalPodAutoscaler (HPA) support

`horizontalPodAutoscaler` option enables HPA scaling for the deployment:
https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/

Autoscaling is based on custom application metrics provided through [Prometheus](https://prometheus.io/).

### Pre-conditions

If cluster does not run [Prometheus operator](https://github.com/prometheus-operator/kube-prometheus)
yet, it SHOULD be installed before enabling HPA, e.g. by using:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack

### Gotchas

Why HPA is opt-in:
* Enabling chart `horizontalPodAutoscaler` option will _overwrite_ cluster's current
`PrometheusAdapter` configuration with its own custom metrics configuration.
Take copy of the existing one before install, if that matters:
`kubectl -n monitoring get cm/adapter-config -o yaml > adapter-config.yaml`
* `PrometheusAdapter` needs to be restarted after install, for it to read the new configuration:
`ns=monitoring; kubectl -n $ns delete $(kubectl -n $ns get pod --selector app.kubernetes.io/name=prometheus-adapter -o name)`
* By default Prometheus adds [k8s RBAC rules](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/prometheus-roleBindingSpecificNamespaces.yaml)
for accessing metrics from `default`, `kube-system` and `monitoring` namespaces. If Helm is
asked to install OPEA services to some other namespace, those rules need to be updated accordingly
* Provided HPA rules are examples for Xeon; for efficient scaling they need to be fine-tuned for the given setup
(underlying HW, used models, OPEA version, etc.)

## Verify

To verify the installation, run the command `kubectl get pod` to make sure all pods are running.
Expand Down
1 change: 1 addition & 0 deletions helm-charts/common/embedding-usvc/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ helm.sh/chart: {{ include "embedding-usvc.chart" . }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
svc: {{ include "embedding-usvc.fullname" . }}
{{- end }}

{{/*
Expand Down
7 changes: 7 additions & 0 deletions helm-charts/common/embedding-usvc/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@ metadata:
labels:
{{- include "embedding-usvc.labels" . | nindent 4 }}
spec:
# use explicit replica counts only if HorizontalPodAutoscaler is disabled
{{- if not .Values.horizontalPodAutoscaler.enabled }}
replicas: {{ .Values.replicaCount }}
{{- end }}
selector:
matchLabels:
{{- include "embedding-usvc.selectorLabels" . | nindent 6 }}
Expand Down Expand Up @@ -62,6 +65,10 @@ spec:
{{- end }}
resources:
{{- toYaml .Values.resources | nindent 12 }}
{{- if .Values.horizontalPodAutoscaler.enabled }}
# extra time to finish processing buffered requests before HPA forcibly terminates pod
terminationGracePeriodSeconds: 60
{{- end }}
volumes:
- name: tmp
emptyDir: {}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
# Scale the embedding deployment on the custom "embedding_request_latency"
# metric exposed through PrometheusAdapter (see the app chart's customMetrics.yaml).
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: {{ include "embedding-usvc.fullname" . }}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: {{ include "embedding-usvc.fullname" . }}
  minReplicas: 1
  maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }}
  metrics:
    - type: Object
      object:
        metric:
          # tei-embedding time metrics are in seconds
          name: embedding_request_latency
        # Metric is read for a named object of the given type, in the same namespace.
        describedObject:
          apiVersion: v1
          kind: Service
          name: tei-embedding-svc
        target:
          # embedding_request_latency is an average over all TEI pods.  To avoid
          # replica fluctuations when TEI startup + request processing outlasts
          # the HPA evaluation period, use "Value" (replicas = metric.value /
          # target.value) rather than the "AverageValue" target type:
          # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
          type: Value
          value: 4
  behavior:
    # Scale down slowly, with a longer stabilization window.
    scaleDown:
      stabilizationWindowSeconds: 180
      policies:
        - type: Percent
          value: 25
          periodSeconds: 15
    # Scale up promptly, taking the larger of the two policies each period.
    scaleUp:
      selectPolicy: Max
      stabilizationWindowSeconds: 0
      policies:
        - type: Percent
          value: 50
          periodSeconds: 15
        - type: Pods
          value: 2
          periodSeconds: 15
{{- end }}
17 changes: 17 additions & 0 deletions helm-charts/common/embedding-usvc/templates/servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
# Tell the Prometheus operator to scrape this chart's service, so the
# latency metrics backing the HPA rules are available.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: {{ include "embedding-usvc.fullname" . }}
spec:
  # Matches the "svc" label added by the chart's common labels helper.
  selector:
    matchLabels:
      svc: {{ include "embedding-usvc.fullname" . }}
  endpoints:
    - port: service
      scheme: http
      # Scrape often enough for the HPA to react quickly to load changes.
      interval: 4s
{{- end }}
8 changes: 8 additions & 0 deletions helm-charts/common/embedding-usvc/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,14 @@ autodependency:

replicaCount: 1

# Enabling HPA will:
# - Ignore above replica count, as it will be controlled by HPA
# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
# - Require custom metrics ConfigMap available in the main application chart
horizontalPodAutoscaler:
enabled: false
maxReplicas: 2

TEI_EMBEDDING_ENDPOINT: ""
image:
repository: opea/embedding-tei
Expand Down
28 changes: 28 additions & 0 deletions helm-charts/common/teirerank/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,34 @@ MODELDIR=/mnt/opea-models

MODELNAME="/data/BAAI/bge-reranker-base"

## HorizontalPodAutoscaler (HPA) support

`horizontalPodAutoscaler` option enables HPA scaling for the deployment:
https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/

Autoscaling is based on custom application metrics provided through [Prometheus](https://prometheus.io/).

### Pre-conditions

If cluster does not run [Prometheus operator](https://github.com/prometheus-operator/kube-prometheus)
yet, it SHOULD be installed before enabling HPA, e.g. by using:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack

### Gotchas

Why HPA is opt-in:
* Enabling chart `horizontalPodAutoscaler` option will _overwrite_ cluster's current
`PrometheusAdapter` configuration with its own custom metrics configuration.
Take copy of the existing one before install, if that matters:
`kubectl -n monitoring get cm/adapter-config -o yaml > adapter-config.yaml`
* `PrometheusAdapter` needs to be restarted after install, for it to read the new configuration:
`ns=monitoring; kubectl -n $ns delete $(kubectl -n $ns get pod --selector app.kubernetes.io/name=prometheus-adapter -o name)`
* By default Prometheus adds [k8s RBAC rules](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/prometheus-roleBindingSpecificNamespaces.yaml)
for accessing metrics from `default`, `kube-system` and `monitoring` namespaces. If Helm is
asked to install OPEA services to some other namespace, those rules need to be updated accordingly
* Provided HPA rules are examples for Xeon; for efficient scaling they need to be fine-tuned for the given setup
(underlying HW, used models, OPEA version, etc.)

## Verify

To verify the installation, run the command `kubectl get pod` to make sure all pods are running.
Expand Down
1 change: 1 addition & 0 deletions helm-charts/common/teirerank/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ helm.sh/chart: {{ include "teirerank.chart" . }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
svc: {{ include "teirerank.fullname" . }}
{{- end }}

{{/*
Expand Down
7 changes: 7 additions & 0 deletions helm-charts/common/teirerank/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@ metadata:
labels:
{{- include "teirerank.labels" . | nindent 4 }}
spec:
# use explicit replica counts only if HorizontalPodAutoscaler is disabled
{{- if not .Values.horizontalPodAutoscaler.enabled }}
replicas: {{ .Values.replicaCount }}
{{- end }}
selector:
matchLabels:
{{- include "teirerank.selectorLabels" . | nindent 6 }}
Expand Down Expand Up @@ -72,6 +75,10 @@ spec:
{{- end }}
resources:
{{- toYaml .Values.resources | nindent 12 }}
{{- if .Values.horizontalPodAutoscaler.enabled }}
# extra time to finish processing buffered requests before HPA forcibly terminates pod
terminationGracePeriodSeconds: 60
{{- end }}
volumes:
- name: model-volume
{{- if .Values.global.modelUsePVC }}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
# Scale the reranking deployment on the custom "reranking_request_latency"
# metric exposed through PrometheusAdapter (see the app chart's customMetrics.yaml).
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: {{ include "teirerank.fullname" . }}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: {{ include "teirerank.fullname" . }}
  minReplicas: 1
  maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }}
  metrics:
    - type: Object
      object:
        metric:
          # tei-reranking time metrics are in seconds
          name: reranking_request_latency
        # Metric is read for a named object of the given type, in the same namespace.
        describedObject:
          apiVersion: v1
          kind: Service
          name: tei-reranking-svc
        target:
          # reranking_request_latency is an average over all TEI pods.  To avoid
          # replica fluctuations when TEI startup + request processing outlasts
          # the HPA evaluation period, use "Value" (replicas = metric.value /
          # target.value) rather than the "AverageValue" target type:
          # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
          type: Value
          value: 4
  behavior:
    # Scale down slowly, with a longer stabilization window.
    scaleDown:
      stabilizationWindowSeconds: 180
      policies:
        - type: Percent
          value: 25
          periodSeconds: 15
    # Scale up promptly, taking the larger of the two policies each period.
    scaleUp:
      selectPolicy: Max
      stabilizationWindowSeconds: 0
      policies:
        - type: Percent
          value: 50
          periodSeconds: 15
        - type: Pods
          value: 2
          periodSeconds: 15
{{- end }}
17 changes: 17 additions & 0 deletions helm-charts/common/teirerank/templates/servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
# Tell the Prometheus operator to scrape this chart's service, so the
# latency metrics backing the HPA rules are available.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: {{ include "teirerank.fullname" . }}
spec:
  # Matches the "svc" label added by the chart's common labels helper.
  selector:
    matchLabels:
      svc: {{ include "teirerank.fullname" . }}
  endpoints:
    - port: service
      scheme: http
      # Scrape often enough for the HPA to react quickly to load changes.
      interval: 4s
{{- end }}
9 changes: 9 additions & 0 deletions helm-charts/common/teirerank/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@

replicaCount: 1


# Enabling HPA will:
# - Ignore above replica count, as it will be controlled by HPA
# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
# - Require custom metrics ConfigMap available in the main application chart
horizontalPodAutoscaler:
enabled: false
maxReplicas: 3

port: 2082
shmSize: 1Gi
RERANK_MODEL_ID: "BAAI/bge-reranker-base"
Expand Down
Loading

0 comments on commit a6b42bc

Please sign in to comment.