Support alternative metrics on accelerated TGI / TEI instances #454

Merged · 2 commits · Sep 27, 2024
121 changes: 71 additions & 50 deletions helm-charts/chatqna/templates/custom-metrics-configmap.yaml
@@ -12,54 +12,75 @@ metadata:
app.kubernetes.io/name: prometheus-adapter
data:
config.yaml: |
rules:
{{- if .Values.tgi.horizontalPodAutoscaler.enabled }}
# check metric with:
# kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/{{ include "tgi.metricPrefix" .Subcharts.tgi }}_request_latency | jq
#
- seriesQuery: '{__name__="tgi_request_inference_duration_sum",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
# Average request latency from TGI histograms, over 1 min
# (0.001 divider add is to make sure there's always a valid value)
metricsQuery: 'rate(tgi_request_inference_duration_sum{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]))'
name:
matches: ^tgi_request_inference_duration_sum
as: "{{ include "tgi.metricPrefix" .Subcharts.tgi }}_request_latency"
resources:
# HPA needs both namespace + suitable object resource for its query paths:
# /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_request_latency
# (pod is not suitable object type for matching as each instance has different name)
overrides:
namespace:
resource: namespace
service:
resource: service
{{- end }}
{{- if .Values.teirerank.horizontalPodAutoscaler.enabled }}
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}'
# Average request latency from TEI histograms, over 1 min
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]))'
name:
matches: ^te_request_inference_duration_sum
as: "{{ include "teirerank.metricPrefix" .Subcharts.teirerank }}_request_latency"
resources:
overrides:
namespace:
resource: namespace
service:
resource: service
{{- end }}
{{- if .Values.tei.horizontalPodAutoscaler.enabled }}
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "tei.fullname" .Subcharts.tei }}"}'
# Average request latency from TEI histograms, over 1 min
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]))'
name:
matches: ^te_request_inference_duration_sum
as: "{{ include "tei.metricPrefix" .Subcharts.tei }}_request_latency"
resources:
overrides:
namespace:
resource: namespace
service:
resource: service
{{- end }}
rules:
{{- if .Values.tgi.horizontalPodAutoscaler.enabled }}
# check metric with:
# kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/<metric> | jq
#
{{- if .Values.tgi.accelDevice }}
- seriesQuery: '{__name__="tgi_queue_size",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
# TGI instances queue_size sum
metricsQuery: 'sum by (namespace,service) (tgi_queue_size{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>})'
name:
matches: ^tgi_queue_size
as: "{{ include "tgi.metricPrefix" .Subcharts.tgi }}_queue_size_sum"
{{- else }}
- seriesQuery: '{__name__="tgi_request_inference_duration_sum",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
# Average request latency from TGI histograms, over 1 min
      # (the 0.001 added to the divisor ensures there is always a valid value)
metricsQuery: 'rate(tgi_request_inference_duration_sum{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]))'
name:
matches: ^tgi_request_inference_duration_sum
as: "{{ include "tgi.metricPrefix" .Subcharts.tgi }}_request_latency"
{{- end }}
resources:
# HPA needs both namespace + suitable object resource for its query paths:
# /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/<metric>
# (pod is not suitable object type for matching as each instance has different name)
overrides:
namespace: {resource: "namespace"}
service: {resource: "service"}
{{- end }}
{{- if .Values.teirerank.horizontalPodAutoscaler.enabled }}
{{- if .Values.teirerank.accelDevice }}
- seriesQuery: '{__name__="te_queue_size",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}'
# TEI instances queue_size sum
metricsQuery: 'sum by (namespace,service) (te_queue_size{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>})'
name:
matches: ^te_queue_size
as: "{{ include "teirerank.metricPrefix" .Subcharts.teirerank }}_queue_size_sum"
{{- else }}
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}'
# Average request latency from TEI histograms, over 1 min
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]))'
name:
matches: ^te_request_inference_duration_sum
as: "{{ include "teirerank.metricPrefix" .Subcharts.teirerank }}_request_latency"
{{- end }}
resources:
overrides:
namespace: {resource: "namespace"}
service: {resource: "service"}
{{- end }}
{{- if .Values.tei.horizontalPodAutoscaler.enabled }}
{{- if .Values.tei.accelDevice }}
- seriesQuery: '{__name__="te_queue_size",service="{{ include "tei.fullname" .Subcharts.tei }}"}'
# TEI instances queue_size sum
metricsQuery: 'sum by (namespace,service) (te_queue_size{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>})'
name:
matches: ^te_queue_size
as: "{{ include "tei.metricPrefix" .Subcharts.tei }}_queue_size_sum"
{{- else }}
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "tei.fullname" .Subcharts.tei }}"}'
# Average request latency from TEI histograms, over 1 min
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]))'
name:
matches: ^te_request_inference_duration_sum
as: "{{ include "tei.metricPrefix" .Subcharts.tei }}_request_latency"
{{- end }}
resources:
overrides:
namespace: {resource: "namespace"}
service: {resource: "service"}
{{- end }}
{{- end }}
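
As a quick sanity check after deploying with accelDevice set and HPA enabled, the new queue-size metric can be queried through the custom metrics API, following the same pattern as the comment in the rule above. The "chatqna_tgi" metric prefix below is an assumed example rendering of tgi.metricPrefix, not guaranteed:

# Assumed example; substitute the metric prefix your release actually renders.
kubectl get --raw \
  /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/chatqna_tgi_queue_size_sum | jq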
1 change: 1 addition & 0 deletions helm-charts/codegen/gaudi-values.yaml
@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.1"
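
For context, a minimal sketch of how this value is exercised at install time, assuming the Prometheus / prometheus-adapter prerequisites for these HPA rules are already deployed (the "codegen" release name is illustrative):

# Illustrative install; release name and paths are assumptions.
helm install codegen ./helm-charts/codegen \
  -f ./helm-charts/codegen/gaudi-values.yaml \
  --set tgi.horizontalPodAutoscaler.enabled=true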
1 change: 1 addition & 0 deletions helm-charts/codetrans/gaudi-values.yaml
@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.1"
23 changes: 16 additions & 7 deletions helm-charts/common/tei/templates/horizontal-pod-autoscaler.yaml
@@ -16,21 +16,30 @@ spec:
metrics:
- type: Object
object:
metric:
# TEI time metrics are in seconds
name: {{ include "tei.metricPrefix" . }}_request_latency
describedObject:
apiVersion: v1
# get metric for named object of given type (in same namespace)
kind: Service
name: {{ include "tei.fullname" . }}
target:
# embedding_request_latency is average for all TEI pods. To avoid replica fluctuations when
# TEI startup + request processing takes longer than HPA evaluation period, this uses
# "Value" (replicas = metric.value / target.value), instead of "averageValue" type:
{{- if .Values.accelDevice }}
# Metric is sum from all pods. "AverageValue" divides value returned from
# the custom metrics API by the number of Pods before comparing to the target:
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
type: AverageValue
averageValue: 15
metric:
name: {{ include "tei.metricPrefix" . }}_queue_size_sum
{{- else }}
# Metric is average for all the pods. To avoid replica fluctuation when pod
# startup + request processing takes longer than HPA evaluation period, this uses
# "Value" (replicas = metric.value / target.value), instead of "AverageValue" type.
type: Value
value: 4
value: 4 # seconds
metric:
name: {{ include "tei.metricPrefix" . }}_request_latency
{{- end }}
behavior:
scaleDown:
stabilizationWindowSeconds: 180
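
A worked example of the AverageValue arithmetic described above, with illustrative numbers:

# Suppose 3 TEI replicas and the custom metrics API returns queue_size_sum = 90.
# AverageValue divides by the pod count first: 90 / 3 = 30 per pod, vs. target 15.
# desiredReplicas = ceil(3 * (30 / 15)) = 6, i.e. effectively ceil(90 / 15) = 6.
# With the non-accelerated Value type, the average latency metric is compared
# to the 4-second target directly, without the per-pod division.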
23 changes: 16 additions & 7 deletions helm-charts/common/teirerank/templates/horizontal-pod-autoscaler.yaml
@@ -16,21 +16,30 @@ spec:
metrics:
- type: Object
object:
metric:
# TEI time metrics are in seconds
name: {{ include "teirerank.metricPrefix" . }}_request_latency
describedObject:
apiVersion: v1
# get metric for named object of given type (in same namespace)
kind: Service
name: {{ include "teirerank.fullname" . }}
target:
# reranking_request_latency is average for all TEI pods. To avoid replica fluctuations when
# TEI startup + request processing takes longer than HPA evaluation period, this uses
# "Value" (replicas = metric.value / target.value), instead of "averageValue" type:
{{- if .Values.accelDevice }}
# Metric is sum from all pods. "AverageValue" divides value returned from
# the custom metrics API by the number of Pods before comparing to the target:
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
type: AverageValue
averageValue: 15
metric:
name: {{ include "teirerank.metricPrefix" . }}_queue_size_sum
{{- else }}
# Metric is average for all the pods. To avoid replica fluctuation when pod
# startup + request processing takes longer than HPA evaluation period, this uses
# "Value" (replicas = metric.value / target.value), instead of "AverageValue" type.
type: Value
value: 4
value: 4 # seconds
metric:
name: {{ include "teirerank.metricPrefix" . }}_request_latency
{{- end }}
behavior:
scaleDown:
stabilizationWindowSeconds: 180
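
For reference, a sketch of what the accelerated branch of this template renders to; the service name and metric prefix are assumed example renderings, not guaranteed:

metrics:
- type: Object
  object:
    metric:
      name: chatqna_teirerank_queue_size_sum  # assumed rendered metricPrefix
    describedObject:
      apiVersion: v1
      kind: Service
      name: chatqna-teirerank                 # assumed rendered fullname
    target:
      type: AverageValue
      averageValue: 15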
23 changes: 16 additions & 7 deletions helm-charts/common/tgi/templates/horizontal-pod-autoscaler.yaml
@@ -16,21 +16,30 @@ spec:
metrics:
- type: Object
object:
metric:
# TGI time metrics are in seconds
name: {{ include "tgi.metricPrefix" . }}_request_latency
describedObject:
apiVersion: v1
# get metric for named object of given type (in same namespace)
kind: Service
name: {{ include "tgi.fullname" . }}
target:
# tgi_request_latency is average for all the TGI pods. To avoid replica fluctuations when
# TGI startup + request processing takes longer than HPA evaluation period, this uses
# "Value" (replicas = metric.value / target.value), instead of "averageValue" type:
{{- if .Values.accelDevice }}
# Metric is sum from all pods. "AverageValue" divides value returned from
# the custom metrics API by the number of Pods before comparing to the target:
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
type: AverageValue
averageValue: 15
metric:
name: {{ include "tgi.metricPrefix" . }}_queue_size_sum
{{- else }}
# Metric is average for all the pods. To avoid replica fluctuation when pod
# startup + request processing takes longer than HPA evaluation period, this uses
# "Value" (replicas = metric.value / target.value), instead of "AverageValue" type.
type: Value
value: 4
value: 4 # seconds
metric:
name: {{ include "tgi.metricPrefix" . }}_request_latency
{{- end }}
behavior:
scaleDown:
stabilizationWindowSeconds: 180
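
Once deployed, scaling against either metric can be observed with standard HPA tooling (the HPA object name below is illustrative):

kubectl get hpa
# Shows current vs. target metric values plus recent scaling events:
kubectl describe hpa chatqna-tgi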
1 change: 1 addition & 0 deletions helm-charts/docsum/gaudi-values.yaml
@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

tgi:
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.0.1"