Skip to content

Commit

Permalink
Add HPA support to embedding, reranking, tgi services
Browse files Browse the repository at this point in the history
Signed-off-by: Alexey Fomenko <alexey.fomenko@intel.com>
  • Loading branch information
byako committed Aug 21, 2024
1 parent b1182c4 commit a6b42bc
Show file tree
Hide file tree
Showing 20 changed files with 403 additions and 0 deletions.
53 changes: 53 additions & 0 deletions helm-charts/chatqna/templates/customMetrics.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
# PrometheusAdapter custom-metrics rules used by the ChatQnA HPA objects.
# Note: this intentionally replaces the cluster's existing "adapter-config"
# ConfigMap in the "monitoring" namespace (see values.yaml for details).
apiVersion: v1
kind: ConfigMap
metadata:
  name: adapter-config
  namespace: monitoring
data:
  config.yaml: |
    rules:
      # Average TGI request latency over 1 min, computed from the
      # inference-duration histogram sum/count pair.  The added 0.001 in the
      # denominator keeps the expression valid when the count rate is zero.
      - seriesQuery: '{__name__="tgi_request_inference_duration_sum"}'
        metricsQuery: 'rate(tgi_request_inference_duration_sum{<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{<<.LabelMatchers>>}[1m]))'
        name:
          matches: ^tgi_request_inference_duration_sum
          as: "tgi_request_latency"
        resources:
          # HPA needs both namespace + a suitable object resource for its query paths:
          #   /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_request_latency
          # (pod is not a suitable object type for matching, as each instance has a different name)
          overrides:
            namespace:
              resource: namespace
            service:
              resource: service
      # Average TEI reranking request latency over 1 min.
      - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-reranking-svc"}'
        metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]))'
        name:
          matches: ^te_request_inference_duration_sum
          as: "reranking_request_latency"
        resources:
          overrides:
            namespace:
              resource: namespace
            service:
              resource: service
      # Average TEI embedding request latency over 1 min.
      - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-embedding-svc"}'
        metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]))'
        name:
          matches: ^te_request_inference_duration_sum
          as: "embedding_request_latency"
        resources:
          overrides:
            namespace:
              resource: namespace
            service:
              resource: service
{{- end }}
8 changes: 8 additions & 0 deletions helm-charts/chatqna/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@

replicaCount: 1

# Enabling HPA will:
# - Overwrite existing PrometheusAdapter "adapter-config" configMap with ChatQnA specific custom metric queries
# for embedding, reranking, tgi services
# Upstream default configMap:
# - https://github.com/kubernetes-sigs/prometheus-adapter/blob/master/deploy/manifests/config-map.yaml
horizontalPodAutoscaler:
enabled: false

image:
repository: opea/chatqna
pullPolicy: IfNotPresent
Expand Down
28 changes: 28 additions & 0 deletions helm-charts/common/embedding-usvc/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,34 @@ helm dependency update
helm install embedding-usvc . --set autodependency.enabled=true
```

## HorizontalPodAutoscaler (HPA) support

`horizontalPodAutoscaler` option enables HPA scaling for the deployment:
https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/

Autoscaling is based on custom application metrics provided through [Prometheus](https://prometheus.io/).

### Pre-conditions

If cluster does not run [Prometheus operator](https://github.com/prometheus-operator/kube-prometheus)
yet, it SHOULD be installed before enabling HPA, e.g. by using:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack

### Gotchas

Why HPA is opt-in:
* Enabling chart `horizontalPodAutoscaler` option will _overwrite_ cluster's current
`PrometheusAdapter` configuration with its own custom metrics configuration.
Take copy of the existing one before install, if that matters:
`kubectl -n monitoring get cm/adapter-config -o yaml > adapter-config.yaml`
* `PrometheusAdapter` needs to be restarted after install, for it to read the new configuration:
`ns=monitoring; kubectl -n $ns delete $(kubectl -n $ns get pod --selector app.kubernetes.io/name=prometheus-adapter -o name)`
* By default Prometheus adds [k8s RBAC rules](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/prometheus-roleBindingSpecificNamespaces.yaml)
for accessing metrics from `default`, `kube-system` and `monitoring` namespaces. If Helm is
asked to install OPEA services to some other namespace, those rules need to be updated accordingly
* Provided HPA rules are examples for Xeon; for efficient scaling they need to be fine-tuned for the given setup
(underlying HW, used models, OPEA version, etc.)

## Verify

To verify the installation, run the command `kubectl get pod` to make sure all pods are running.
Expand Down
1 change: 1 addition & 0 deletions helm-charts/common/embedding-usvc/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ helm.sh/chart: {{ include "embedding-usvc.chart" . }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
svc: {{ include "embedding-usvc.fullname" . }}
{{- end }}

{{/*
Expand Down
7 changes: 7 additions & 0 deletions helm-charts/common/embedding-usvc/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@ metadata:
labels:
{{- include "embedding-usvc.labels" . | nindent 4 }}
spec:
# use explicit replica counts only if HorizontalPodAutoscaler is disabled
{{- if not .Values.horizontalPodAutoscaler.enabled }}
replicas: {{ .Values.replicaCount }}
{{- end }}
selector:
matchLabels:
{{- include "embedding-usvc.selectorLabels" . | nindent 6 }}
Expand Down Expand Up @@ -62,6 +65,10 @@ spec:
{{- end }}
resources:
{{- toYaml .Values.resources | nindent 12 }}
{{- if .Values.horizontalPodAutoscaler.enabled }}
# extra time to finish processing buffered requests before HPA forcibly terminates pod
terminationGracePeriodSeconds: 60
{{- end }}
volumes:
- name: tmp
emptyDir: {}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
# Scale the embedding deployment on the custom "embedding_request_latency"
# metric exposed through PrometheusAdapter (see the app chart's customMetrics.yaml).
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: {{ include "embedding-usvc.fullname" . }}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: {{ include "embedding-usvc.fullname" . }}
  minReplicas: 1
  maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }}
  metrics:
    - type: Object
      object:
        metric:
          # tei-embedding time metrics are in seconds
          name: embedding_request_latency
        # Metric is read for a named object of the given type, in the same namespace.
        describedObject:
          apiVersion: v1
          kind: Service
          name: tei-embedding-svc
        target:
          # embedding_request_latency is an average over all TEI pods.  To avoid
          # replica fluctuations when TEI startup + request processing outlasts
          # the HPA evaluation period, use "Value" (replicas = metric.value /
          # target.value) rather than the "AverageValue" target type:
          # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
          type: Value
          value: 4
  behavior:
    # Scale down slowly, with a longer stabilization window.
    scaleDown:
      stabilizationWindowSeconds: 180
      policies:
        - type: Percent
          value: 25
          periodSeconds: 15
    # Scale up promptly, taking the larger of the two policies each period.
    scaleUp:
      selectPolicy: Max
      stabilizationWindowSeconds: 0
      policies:
        - type: Percent
          value: 50
          periodSeconds: 15
        - type: Pods
          value: 2
          periodSeconds: 15
{{- end }}
17 changes: 17 additions & 0 deletions helm-charts/common/embedding-usvc/templates/servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
# Tell the Prometheus operator to scrape this chart's service, so the
# latency metrics backing the HPA rules are available.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: {{ include "embedding-usvc.fullname" . }}
spec:
  # Matches the "svc" label added by the chart's common labels helper.
  selector:
    matchLabels:
      svc: {{ include "embedding-usvc.fullname" . }}
  endpoints:
    - port: service
      scheme: http
      # Scrape often enough for the HPA to react quickly to load changes.
      interval: 4s
{{- end }}
8 changes: 8 additions & 0 deletions helm-charts/common/embedding-usvc/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,14 @@ autodependency:

replicaCount: 1

# Enabling HPA will:
# - Ignore above replica count, as it will be controlled by HPA
# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
# - Require custom metrics ConfigMap available in the main application chart
horizontalPodAutoscaler:
enabled: false
maxReplicas: 2

TEI_EMBEDDING_ENDPOINT: ""
image:
repository: opea/embedding-tei
Expand Down
28 changes: 28 additions & 0 deletions helm-charts/common/teirerank/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,34 @@ MODELDIR=/mnt/opea-models

MODELNAME="/data/BAAI/bge-reranker-base"

## HorizontalPodAutoscaler (HPA) support

`horizontalPodAutoscaler` option enables HPA scaling for the deployment:
https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/

Autoscaling is based on custom application metrics provided through [Prometheus](https://prometheus.io/).

### Pre-conditions

If cluster does not run [Prometheus operator](https://github.com/prometheus-operator/kube-prometheus)
yet, it SHOULD be installed before enabling HPA, e.g. by using:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack

### Gotchas

Why HPA is opt-in:
* Enabling chart `horizontalPodAutoscaler` option will _overwrite_ cluster's current
`PrometheusAdapter` configuration with its own custom metrics configuration.
Take copy of the existing one before install, if that matters:
`kubectl -n monitoring get cm/adapter-config -o yaml > adapter-config.yaml`
* `PrometheusAdapter` needs to be restarted after install, for it to read the new configuration:
`ns=monitoring; kubectl -n $ns delete $(kubectl -n $ns get pod --selector app.kubernetes.io/name=prometheus-adapter -o name)`
* By default Prometheus adds [k8s RBAC rules](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/prometheus-roleBindingSpecificNamespaces.yaml)
for accessing metrics from `default`, `kube-system` and `monitoring` namespaces. If Helm is
asked to install OPEA services to some other namespace, those rules need to be updated accordingly
* Provided HPA rules are examples for Xeon; for efficient scaling they need to be fine-tuned for the given setup
(underlying HW, used models, OPEA version, etc.)

## Verify

To verify the installation, run the command `kubectl get pod` to make sure all pods are running.
Expand Down
1 change: 1 addition & 0 deletions helm-charts/common/teirerank/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ helm.sh/chart: {{ include "teirerank.chart" . }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
svc: {{ include "teirerank.fullname" . }}
{{- end }}

{{/*
Expand Down
7 changes: 7 additions & 0 deletions helm-charts/common/teirerank/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@ metadata:
labels:
{{- include "teirerank.labels" . | nindent 4 }}
spec:
# use explicit replica counts only if HorizontalPodAutoscaler is disabled
{{- if not .Values.horizontalPodAutoscaler.enabled }}
replicas: {{ .Values.replicaCount }}
{{- end }}
selector:
matchLabels:
{{- include "teirerank.selectorLabels" . | nindent 6 }}
Expand Down Expand Up @@ -72,6 +75,10 @@ spec:
{{- end }}
resources:
{{- toYaml .Values.resources | nindent 12 }}
{{- if .Values.horizontalPodAutoscaler.enabled }}
# extra time to finish processing buffered requests before HPA forcibly terminates pod
terminationGracePeriodSeconds: 60
{{- end }}
volumes:
- name: model-volume
{{- if .Values.global.modelUsePVC }}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
# Scale the reranking deployment on the custom "reranking_request_latency"
# metric exposed through PrometheusAdapter (see the app chart's customMetrics.yaml).
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: {{ include "teirerank.fullname" . }}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: {{ include "teirerank.fullname" . }}
  minReplicas: 1
  maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }}
  metrics:
    - type: Object
      object:
        metric:
          # tei-reranking time metrics are in seconds
          name: reranking_request_latency
        # Metric is read for a named object of the given type, in the same namespace.
        describedObject:
          apiVersion: v1
          kind: Service
          name: tei-reranking-svc
        target:
          # reranking_request_latency is an average over all TEI pods.  To avoid
          # replica fluctuations when TEI startup + request processing outlasts
          # the HPA evaluation period, use "Value" (replicas = metric.value /
          # target.value) rather than the "AverageValue" target type:
          # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
          type: Value
          value: 4
  behavior:
    # Scale down slowly, with a longer stabilization window.
    scaleDown:
      stabilizationWindowSeconds: 180
      policies:
        - type: Percent
          value: 25
          periodSeconds: 15
    # Scale up promptly, taking the larger of the two policies each period.
    scaleUp:
      selectPolicy: Max
      stabilizationWindowSeconds: 0
      policies:
        - type: Percent
          value: 50
          periodSeconds: 15
        - type: Pods
          value: 2
          periodSeconds: 15
{{- end }}
17 changes: 17 additions & 0 deletions helm-charts/common/teirerank/templates/servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
# Tell the Prometheus operator to scrape this chart's service, so the
# latency metrics backing the HPA rules are available.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: {{ include "teirerank.fullname" . }}
spec:
  # Matches the "svc" label added by the chart's common labels helper.
  selector:
    matchLabels:
      svc: {{ include "teirerank.fullname" . }}
  endpoints:
    - port: service
      scheme: http
      # Scrape often enough for the HPA to react quickly to load changes.
      interval: 4s
{{- end }}
9 changes: 9 additions & 0 deletions helm-charts/common/teirerank/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@

replicaCount: 1


# Enabling HPA will:
# - Ignore above replica count, as it will be controlled by HPA
# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
# - Require custom metrics ConfigMap available in the main application chart
horizontalPodAutoscaler:
enabled: false
maxReplicas: 3

port: 2082
shmSize: 1Gi
RERANK_MODEL_ID: "BAAI/bge-reranker-base"
Expand Down
Loading

0 comments on commit a6b42bc

Please sign in to comment.