-
Notifications
You must be signed in to change notification settings - Fork 52
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Add HPA support to ChatQnA #327
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
{{- if .Values.global.horizontalPodAutoscaler.enabled }} | ||
apiVersion: v1 | ||
data: | ||
config.yaml: | | ||
rules: | ||
- seriesQuery: '{__name__="tgi_request_inference_duration_sum",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}' | ||
# Average request latency from TGI histograms, over 1 min | ||
# (0.001 divider add is to make sure there's always a valid value) | ||
metricsQuery: 'rate(tgi_request_inference_duration_sum{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]))' | ||
name: | ||
matches: ^tgi_request_inference_duration_sum | ||
as: "tgi_request_latency" | ||
resources: | ||
# HPA needs both namespace + suitable object resource for its query paths: | ||
# /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_request_latency | ||
# (pod is not suitable object type for matching as each instance has different name) | ||
overrides: | ||
namespace: | ||
resource: namespace | ||
service: | ||
resource: service | ||
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}' | ||
# Average request latency from TEI histograms, over 1 min | ||
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]))' | ||
name: | ||
matches: ^te_request_inference_duration_sum | ||
as: "reranking_request_latency" | ||
resources: | ||
overrides: | ||
namespace: | ||
resource: namespace | ||
service: | ||
resource: service | ||
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "tei.fullname" .Subcharts.tei }}"}' | ||
# Average request latency from TEI histograms, over 1 min | ||
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]))' | ||
name: | ||
matches: ^te_request_inference_duration_sum | ||
as: "embedding_request_latency" | ||
resources: | ||
overrides: | ||
namespace: | ||
resource: namespace | ||
service: | ||
resource: service | ||
kind: ConfigMap | ||
metadata: | ||
name: adapter-config | ||
namespace: monitoring | ||
{{- end }} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -48,3 +48,11 @@ global: | |
modelUseHostPath: "" | ||
# modelUseHostPath: /mnt/opea-models | ||
# modelUsePVC: model-volume | ||
|
||
# Enabling HorizontalPodAutoscaler (HPA) will: | ||
# - Overwrite existing PrometheusAdapter "adapter-config" configMap with ChatQnA specific custom metric queries | ||
# for embedding, reranking, tgi services | ||
# Upstream default configMap: | ||
# - https://github.com/kubernetes-sigs/prometheus-adapter/blob/master/deploy/manifests/config-map.yaml | ||
horizontalPodAutoscaler: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This will need "HorizontalPodAutoscaler (HPA) support" section in
Verification section can be omitted I think; it's enough to have it in the TGI & TEI READMEs. The options table entry description could be, e.g.:
|
||
enabled: false |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
{{- if .Values.global.horizontalPodAutoscaler.enabled }} | ||
apiVersion: autoscaling/v2 | ||
kind: HorizontalPodAutoscaler | ||
metadata: | ||
name: {{ include "tei.fullname" . }} | ||
spec: | ||
scaleTargetRef: | ||
apiVersion: apps/v1 | ||
kind: Deployment | ||
name: {{ include "tei.fullname" . }} | ||
minReplicas: 1 | ||
maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }} | ||
metrics: | ||
- type: Object | ||
object: | ||
metric: | ||
# tei-embedding time metrics are in seconds | ||
name: embedding_request_latency | ||
describedObject: | ||
apiVersion: v1 | ||
# get metric for named object of given type (in same namespace) | ||
kind: Service | ||
name: {{ include "tei.fullname" . }} | ||
target: | ||
# embedding_request_latency is average for all TEI pods. To avoid replica fluctuations when | ||
# TEI startup + request processing takes longer than HPA evaluation period, this uses | ||
# "Value" (replicas = metric.value / target.value), instead of "averageValue" type: | ||
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details | ||
type: Value | ||
value: 4 | ||
behavior: | ||
scaleDown: | ||
stabilizationWindowSeconds: 180 | ||
policies: | ||
- type: Percent | ||
value: 25 | ||
periodSeconds: 15 | ||
scaleUp: | ||
selectPolicy: Max | ||
stabilizationWindowSeconds: 0 | ||
policies: | ||
- type: Percent | ||
value: 50 | ||
periodSeconds: 15 | ||
- type: Pods | ||
value: 2 | ||
periodSeconds: 15 | ||
{{- end }} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
{{- if .Values.global.horizontalPodAutoscaler.enabled }} | ||
apiVersion: monitoring.coreos.com/v1 | ||
kind: ServiceMonitor | ||
metadata: | ||
name: {{ include "tei.fullname" . }} | ||
spec: | ||
selector: | ||
matchLabels: | ||
{{- include "tei.selectorLabels" . | nindent 6 }} | ||
endpoints: | ||
- interval: 4s | ||
port: tei | ||
scheme: http | ||
{{- end }} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This HPA support section is generic enough. I think maybe we can put it in one place instead of copy/pasting it:
https://github.com/opea-project/GenAIInfra/blob/main/helm-charts/README.md
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done. Thanks, @eero-t for preparing patch.