From b49bdeace87495a92bf03ef3a69e0fa34479aef4 Mon Sep 17 00:00:00 2001
From: Alexey Fomenko
Date: Tue, 20 Aug 2024 19:44:17 +0300
Subject: [PATCH] Add HPA support to tei, teirerank, tgi services

Signed-off-by: Alexey Fomenko
---
 .../chatqna/templates/customMetrics.yaml      |  53 +++++
 helm-charts/chatqna/values.yaml               |   8 +
 helm-charts/common/tei/README.md              |  74 ++++++-
 .../common/tei/templates/deployment.yaml      |   7 +
 .../templates/horizontalPodAutoscaler.yaml    |  51 +++++
 .../common/tei/templates/servicemonitor.yaml  |  17 ++
 helm-charts/common/tei/values.yaml            |   9 +
 helm-charts/common/teirerank/README.md        |  74 ++++++-
 .../teirerank/templates/deployment.yaml       |   7 +
 .../templates/horizontalPodAutoscaler.yaml    |  51 +++++
 .../teirerank/templates/servicemonitor.yaml   |  17 ++
 helm-charts/common/teirerank/values.yaml      |  10 +
 helm-charts/common/tgi/README.md              |  62 ++++++
 .../common/tgi/templates/deployment.yaml      |   7 +
 .../templates/horizontalPodAutoscaler.yaml    |  51 +++++
 .../common/tgi/templates/servicemonitor.yaml  |  22 ++
 helm-charts/common/tgi/values.yaml            |   9 +
 .../config/HPA/customMetrics.yaml             |  51 +++++
 microservices-connector/config/HPA/tei.yaml   | 205 ++++++++++++++++++
 .../config/HPA/teirerank.yaml                 | 204 +++++++++++++++++
 microservices-connector/config/HPA/tgi.yaml   | 201 +++++++++++++++++
 21 files changed, 1178 insertions(+), 12 deletions(-)
 create mode 100644 helm-charts/chatqna/templates/customMetrics.yaml
 create mode 100644 helm-charts/common/tei/templates/horizontalPodAutoscaler.yaml
 create mode 100644 helm-charts/common/tei/templates/servicemonitor.yaml
 create mode 100644 helm-charts/common/teirerank/templates/horizontalPodAutoscaler.yaml
 create mode 100644 helm-charts/common/teirerank/templates/servicemonitor.yaml
 create mode 100644 helm-charts/common/tgi/templates/horizontalPodAutoscaler.yaml
 create mode 100644 helm-charts/common/tgi/templates/servicemonitor.yaml
 create mode 100644 microservices-connector/config/HPA/customMetrics.yaml
 create mode 100644 microservices-connector/config/HPA/tei.yaml
 create mode 100644 microservices-connector/config/HPA/teirerank.yaml
 create mode 100644 microservices-connector/config/HPA/tgi.yaml

diff --git a/helm-charts/chatqna/templates/customMetrics.yaml b/helm-charts/chatqna/templates/customMetrics.yaml
new file mode 100644
index 00000000..64123df0
--- /dev/null
+++ b/helm-charts/chatqna/templates/customMetrics.yaml
@@ -0,0 +1,53 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.global.horizontalPodAutoscaler.enabled }}
+apiVersion: v1
+data:
+  config.yaml: |
+    rules:
+    - seriesQuery: '{__name__="tgi_request_inference_duration_sum",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
+      # Average request latency from TGI histograms, over 1 min
+      # (the 0.001 added to the divisor ensures there is always a valid value)
+      metricsQuery: 'rate(tgi_request_inference_duration_sum{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]))'
+      name:
+        matches: ^tgi_request_inference_duration_sum
+        as: "tgi_request_latency"
+      resources:
+        # HPA needs both namespace + a suitable object resource for its query paths:
+        #   /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_request_latency
+        # (pod is not a suitable object type for matching, as each instance has a different name)
+        overrides:
+          namespace:
+            resource: namespace
+          service:
+            resource: service
'{__name__="te_request_inference_duration_sum",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}' + # Average request latency from TEI histograms, over 1 min + metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]))' + name: + matches: ^te_request_inference_duration_sum + as: "reranking_request_latency" + resources: + overrides: + namespace: + resource: namespace + service: + resource: service + - seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "tei.fullname" .Subcharts.tei }}"}' + # Average request latency from TEI histograms, over 1 min + metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]))' + name: + matches: ^te_request_inference_duration_sum + as: "embedding_request_latency" + resources: + overrides: + namespace: + resource: namespace + service: + resource: service +kind: ConfigMap +metadata: + name: adapter-config + namespace: monitoring +{{- end }} diff --git a/helm-charts/chatqna/values.yaml b/helm-charts/chatqna/values.yaml index f848b209..a7a115f9 100644 --- a/helm-charts/chatqna/values.yaml +++ b/helm-charts/chatqna/values.yaml @@ -48,3 +48,11 @@ global: modelUseHostPath: "" # modelUseHostPath: /mnt/opea-models # modelUsePVC: model-volume + + # Enabling HorizontalPodAutoscaler (HPA) will: + # - Overwrite existing PrometheusAdapter "adapter-config" configMap with ChatQnA specific custom metric queries + # for embedding, reranking, tgi services + # Upstream default configMap: + # - https://github.com/kubernetes-sigs/prometheus-adapter/blob/master/deploy/manifests/config-map.yaml + horizontalPodAutoscaler: + enabled: false diff --git a/helm-charts/common/tei/README.md b/helm-charts/common/tei/README.md index 14d647f4..89174634 100644 --- a/helm-charts/common/tei/README.md +++ b/helm-charts/common/tei/README.md @@ -21,6 +21,38 @@ MODELDIR=/mnt/opea-models MODELNAME="/data/BAAI/bge-base-en-v1.5" +## HorizontalPodAutoscaler (HPA) support + +`horizontalPodAutoscaler` option enables HPA scaling for the deployment: +https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/ + +Autoscaling is based on custom application metrics provided through [Prometheus](https://prometheus.io/). + +### Pre-conditions + +If cluster does not run [Prometheus operator](https://github.com/prometheus-operator/kube-prometheus) +yet, it SHOULD be be installed before enabling HPA, e.g. by using: +https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack + +`horizontalPodAutoscaler` enabled in top level Helm chart depending on this component (e.g. `chatqna`), +so that relevant custom metric queries are configured for PrometheusAdapter. + +### Gotchas + +Why HPA is opt-in: + +- Enabling chart `horizontalPodAutoscaler` option will _overwrite_ cluster's current + `PrometheusAdapter` configuration with its own custom metrics configuration. 
+
+### Gotchas
+
+Why HPA is opt-in:
+
+- Enabling the chart `horizontalPodAutoscaler` option will _overwrite_ the cluster's current
+  `PrometheusAdapter` configuration with its own custom metrics configuration.
+  Take a copy of the existing one before install, if that matters:
+  `kubectl -n monitoring get cm/adapter-config -o yaml > adapter-config.yaml`
+- `PrometheusAdapter` needs to be restarted after install, for it to read the new configuration:
+  `ns=monitoring; kubectl -n $ns delete $(kubectl -n $ns get pod --selector app.kubernetes.io/name=prometheus-adapter -o name)`
+- By default, Prometheus adds [k8s RBAC rules](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/prometheus-roleBindingSpecificNamespaces.yaml)
+  for accessing metrics from the `default`, `kube-system` and `monitoring` namespaces. If Helm is
+  asked to install OPEA services into some other namespace, those rules need to be updated accordingly
+- Provided HPA rules are examples for Xeon; for efficient scaling, they need to be fine-tuned for
+  the given setup (underlying HW, used models, OPEA version, etc.)
+
 ## Verify
 
 To verify the installation, run the command `kubectl get pod` to make sure all pods are running.
@@ -33,11 +65,41 @@ Open another terminal and run the following command to verify the service if wor
 curl http://localhost:2081/embed -X POST -d '{"inputs":"What is Deep Learning?"}' -H 'Content-Type: application/json'
 ```
 
+### Verify HPA metrics
+
+To verify that the metrics required by the `horizontalPodAutoscaler` option work, check the following.
+
+Prometheus has found the metric endpoints, i.e. the last number on the line is non-zero:
+
+```console
+prom_url=http://$(kubectl -n monitoring get -o jsonpath="{.spec.clusterIP}:{.spec.ports[0].port}" svc/prometheus-k8s)
+curl --no-progress-meter $prom_url/metrics | grep scrape_pool_targets.*tei
+```
+
+PrometheusAdapter provides custom metrics based on their data:
+
+```console
+kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 | jq .resources[].name
+```
+
+And those custom metrics have valid values for the HPA rules:
+
+```console
+ns=default;  # OPEA namespace
+url=/apis/custom.metrics.k8s.io/v1beta1;
+for m in $(kubectl get --raw $url | jq .resources[].name | cut -d/ -f2 | sort -u | tr -d '"'); do
+  kubectl get --raw $url/namespaces/$ns/metrics/$m | jq;
+done | grep -e metricName -e value
+```
+
+NOTE: HuggingFace TGI and TEI services provide a metrics endpoint only after they have processed their first request!
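+
+After that first request, you can also confirm that the HPA object itself sees the metric, i.e.
+reports a value instead of `<unknown>` (the `tei` object name below assumes the chart's default
+fullname; adjust it for your release):
+
+```console
+kubectl get hpa
+kubectl describe hpa tei
+```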
+
 ## Values
 
-| Key                     | Type   | Default                                           | Description |
-| ----------------------- | ------ | ------------------------------------------------- | ----------- |
-| EMBEDDING_MODEL_ID      | string | `"BAAI/bge-base-en-v1.5"`                         | Models id from https://huggingface.co/, or predownloaded model directory |
-| global.modelUseHostPath | string | `"/mnt/opea-models"`                              | Cached models directory, tei will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Set this to null/empty will force it to download model. |
-| image.repository        | string | `"ghcr.io/huggingface/text-embeddings-inference"` | |
-| image.tag               | string | `"cpu-1.5"`                                       | |
+| Key                             | Type   | Default                                           | Description |
+| ------------------------------- | ------ | ------------------------------------------------- | ----------- |
+| EMBEDDING_MODEL_ID              | string | `"BAAI/bge-base-en-v1.5"`                         | Model id from https://huggingface.co/, or a pre-downloaded model directory |
+| global.modelUseHostPath         | string | `"/mnt/opea-models"`                              | Cached models directory; tei will not download the model if it is already cached here. The host path "modelUseHostPath" will be mounted into the container as the /data directory. Setting this to null/empty will force it to download the model. |
+| image.repository                | string | `"ghcr.io/huggingface/text-embeddings-inference"` | |
+| image.tag                       | string | `"cpu-1.5"`                                       | |
+| horizontalPodAutoscaler.enabled | bool   | false                                             | Enable HPA autoscaling for the service deployment, based on the metrics it provides. See #pre-conditions and #gotchas before enabling! |
diff --git a/helm-charts/common/tei/templates/deployment.yaml b/helm-charts/common/tei/templates/deployment.yaml
index 7467b9ab..fe56355f 100644
--- a/helm-charts/common/tei/templates/deployment.yaml
+++ b/helm-charts/common/tei/templates/deployment.yaml
@@ -8,7 +8,10 @@ metadata:
   labels:
     {{- include "tei.labels" . | nindent 4 }}
 spec:
+  # use explicit replica counts only if HorizontalPodAutoscaler is disabled
+  {{- if not .Values.global.horizontalPodAutoscaler.enabled }}
   replicas: {{ .Values.replicaCount }}
+  {{- end }}
   selector:
     matchLabels:
       {{- include "tei.selectorLabels" . | nindent 6 }}
@@ -102,3 +105,7 @@ spec:
       tolerations:
         {{- toYaml . | nindent 8 }}
     {{- end }}
+    {{- if .Values.global.horizontalPodAutoscaler.enabled }}
+    # extra time to finish processing buffered requests before HPA forcibly terminates the pod
+    terminationGracePeriodSeconds: 60
+    {{- end }}
diff --git a/helm-charts/common/tei/templates/horizontalPodAutoscaler.yaml b/helm-charts/common/tei/templates/horizontalPodAutoscaler.yaml
new file mode 100644
index 00000000..a448b96c
--- /dev/null
+++ b/helm-charts/common/tei/templates/horizontalPodAutoscaler.yaml
@@ -0,0 +1,51 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.global.horizontalPodAutoscaler.enabled }}
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ include "tei.fullname" . }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "tei.fullname" . }}
+  minReplicas: 1
+  maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }}
+  metrics:
+  - type: Object
+    object:
+      metric:
+        # tei-embedding time metrics are in seconds
+        name: embedding_request_latency
+      describedObject:
+        apiVersion: v1
+        # get metric for named object of given type (in same namespace)
+        kind: Service
+        name: {{ include "tei.fullname" . }}
+      target:
+        # embedding_request_latency is an average over all TEI pods. To avoid replica fluctuations when
+        # TEI startup + request processing takes longer than the HPA evaluation period, this uses
+        # "Value" (replicas = metric.value / target.value), instead of the "averageValue" type:
+        # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
+        type: Value
+        value: 4
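+        # An illustrative calculation (hypothetical numbers, not chart defaults): per the
+        # algorithm documentation linked above, desiredReplicas = ceil(currentReplicas *
+        # metricValue / targetValue), so one replica seeing a 9s average latency against
+        # this 4s target scales to ceil(1 * 9 / 4) = 3 replicas (capped by maxReplicas).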
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 180
+      policies:
+      - type: Percent
+        value: 25
+        periodSeconds: 15
+    scaleUp:
+      selectPolicy: Max
+      stabilizationWindowSeconds: 0
+      policies:
+      - type: Percent
+        value: 50
+        periodSeconds: 15
+      - type: Pods
+        value: 2
+        periodSeconds: 15
+{{- end }}
diff --git a/helm-charts/common/tei/templates/servicemonitor.yaml b/helm-charts/common/tei/templates/servicemonitor.yaml
new file mode 100644
index 00000000..05c25528
--- /dev/null
+++ b/helm-charts/common/tei/templates/servicemonitor.yaml
@@ -0,0 +1,17 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.global.horizontalPodAutoscaler.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "tei.fullname" . }}
+spec:
+  selector:
+    matchLabels:
+      {{- include "tei.selectorLabels" . | nindent 6 }}
+  endpoints:
+  - interval: 4s
+    port: tei
+    scheme: http
+{{- end }}
diff --git a/helm-charts/common/tei/values.yaml b/helm-charts/common/tei/values.yaml
index a9edda93..387de250 100644
--- a/helm-charts/common/tei/values.yaml
+++ b/helm-charts/common/tei/values.yaml
@@ -7,6 +7,9 @@
 
 replicaCount: 1
 
+horizontalPodAutoscaler:
+  maxReplicas: 2
+
 port: 2081
 shmSize: 1Gi
 EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5"
@@ -92,3 +95,9 @@ global:
   # By default, both var are set to empty, the model will be downloaded and saved to a tmp volume.
   modelUseHostPath: ""
   modelUsePVC: ""
+  # Enabling HPA will:
+  # - Ignore the above replica count, as it will be controlled by HPA
+  # - Add example HPA scaling rules with thresholds suitable for Xeon deployments
+  # - Require the custom metrics ConfigMap available in the main application chart
+  horizontalPodAutoscaler:
+    enabled: false
diff --git a/helm-charts/common/teirerank/README.md b/helm-charts/common/teirerank/README.md
index b3cb2f19..a74079e0 100644
--- a/helm-charts/common/teirerank/README.md
+++ b/helm-charts/common/teirerank/README.md
@@ -21,6 +21,38 @@ MODELDIR=/mnt/opea-models
 
 MODELNAME="/data/BAAI/bge-reranker-base"
 
+## HorizontalPodAutoscaler (HPA) support
+
+The `horizontalPodAutoscaler` option enables HPA scaling for the deployment:
+https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/
+
+Autoscaling is based on custom application metrics provided through [Prometheus](https://prometheus.io/).
+
+### Pre-conditions
+
+If the cluster does not yet run the [Prometheus operator](https://github.com/prometheus-operator/kube-prometheus),
+it SHOULD be installed before enabling HPA, e.g. by using:
+https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack
+
+`horizontalPodAutoscaler` needs to be enabled in the top-level Helm chart depending on this
+component (e.g. `chatqna`), so that the relevant custom metric queries are configured for PrometheusAdapter.
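+
+One way to set that stack up (a sketch; the release name `prometheus-stack` and the `monitoring`
+namespace below are illustrative choices, not requirements of this chart):
+
+```console
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+helm repo update
+helm install prometheus-stack prometheus-community/kube-prometheus-stack -n monitoring --create-namespace
+```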
+
+### Gotchas
+
+Why HPA is opt-in:
+
+- Enabling the chart `horizontalPodAutoscaler` option will _overwrite_ the cluster's current
+  `PrometheusAdapter` configuration with its own custom metrics configuration.
+  Take a copy of the existing one before install, if that matters:
+  `kubectl -n monitoring get cm/adapter-config -o yaml > adapter-config.yaml`
+- `PrometheusAdapter` needs to be restarted after install, for it to read the new configuration:
+  `ns=monitoring; kubectl -n $ns delete $(kubectl -n $ns get pod --selector app.kubernetes.io/name=prometheus-adapter -o name)`
+- By default, Prometheus adds [k8s RBAC rules](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/prometheus-roleBindingSpecificNamespaces.yaml)
+  for accessing metrics from the `default`, `kube-system` and `monitoring` namespaces. If Helm is
+  asked to install OPEA services into some other namespace, those rules need to be updated accordingly
+- Provided HPA rules are examples for Xeon; for efficient scaling, they need to be fine-tuned for
+  the given setup (underlying HW, used models, OPEA version, etc.)
+
 ## Verify
 
 To verify the installation, run the command `kubectl get pod` to make sure all pods are running.
@@ -36,11 +68,41 @@ curl http://localhost:2082/rerank \
     -H 'Content-Type: application/json'
 ```
 
+### Verify HPA metrics
+
+To verify that the metrics required by the `horizontalPodAutoscaler` option work, check the following.
+
+Prometheus has found the metric endpoints, i.e. the last number on the line is non-zero:
+
+```console
+prom_url=http://$(kubectl -n monitoring get -o jsonpath="{.spec.clusterIP}:{.spec.ports[0].port}" svc/prometheus-k8s)
+curl --no-progress-meter $prom_url/metrics | grep scrape_pool_targets.*rerank
+```
+
+PrometheusAdapter provides custom metrics based on their data:
+
+```console
+kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 | jq .resources[].name
+```
+
+And those custom metrics have valid values for the HPA rules:
+
+```console
+ns=default;  # OPEA namespace
+url=/apis/custom.metrics.k8s.io/v1beta1;
+for m in $(kubectl get --raw $url | jq .resources[].name | cut -d/ -f2 | sort -u | tr -d '"'); do
+  kubectl get --raw $url/namespaces/$ns/metrics/$m | jq;
+done | grep -e metricName -e value
+```
+
+NOTE: HuggingFace TGI and TEI services provide a metrics endpoint only after they have processed their first request!
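+
+After that first request, you can also confirm that the HPA object itself sees the metric, i.e.
+reports a value instead of `<unknown>` (the `teirerank` object name below assumes the chart's
+default fullname; adjust it for your release):
+
+```console
+kubectl get hpa
+kubectl describe hpa teirerank
+```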
+
 ## Values
 
-| Key                     | Type   | Default                                           | Description |
-| ----------------------- | ------ | ------------------------------------------------- | ----------- |
-| RERANK_MODEL_ID         | string | `"BAAI/bge-reranker-base"`                        | Models id from https://huggingface.co/, or predownloaded model directory |
-| global.modelUseHostPath | string | `"/mnt/opea-models"`                              | Cached models directory, teirerank will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Set this to null/empty will force it to download model. |
-| image.repository        | string | `"ghcr.io/huggingface/text-embeddings-inference"` | |
-| image.tag               | string | `"cpu-1.5"`                                       | |
+| Key                             | Type   | Default                                           | Description |
+| ------------------------------- | ------ | ------------------------------------------------- | ----------- |
+| RERANK_MODEL_ID                 | string | `"BAAI/bge-reranker-base"`                        | Model id from https://huggingface.co/, or a pre-downloaded model directory |
+| global.modelUseHostPath         | string | `"/mnt/opea-models"`                              | Cached models directory; teirerank will not download the model if it is already cached here. The host path "modelUseHostPath" will be mounted into the container as the /data directory. Setting this to null/empty will force it to download the model. |
+| image.repository                | string | `"ghcr.io/huggingface/text-embeddings-inference"` | |
+| image.tag                       | string | `"cpu-1.5"`                                       | |
+| horizontalPodAutoscaler.enabled | bool   | false                                             | Enable HPA autoscaling for the service deployment, based on the metrics it provides. See #pre-conditions and #gotchas before enabling! |
diff --git a/helm-charts/common/teirerank/templates/deployment.yaml b/helm-charts/common/teirerank/templates/deployment.yaml
index 4a85b7fc..45d2cc95 100644
--- a/helm-charts/common/teirerank/templates/deployment.yaml
+++ b/helm-charts/common/teirerank/templates/deployment.yaml
@@ -8,7 +8,10 @@ metadata:
   labels:
     {{- include "teirerank.labels" . | nindent 4 }}
 spec:
+  # use explicit replica counts only if HorizontalPodAutoscaler is disabled
+  {{- if not .Values.global.horizontalPodAutoscaler.enabled }}
   replicas: {{ .Values.replicaCount }}
+  {{- end }}
   selector:
     matchLabels:
       {{- include "teirerank.selectorLabels" . | nindent 6 }}
@@ -102,3 +105,7 @@ spec:
       tolerations:
         {{- toYaml . | nindent 8 }}
     {{- end }}
+    {{- if .Values.global.horizontalPodAutoscaler.enabled }}
+    # extra time to finish processing buffered requests before HPA forcibly terminates the pod
+    terminationGracePeriodSeconds: 60
+    {{- end }}
diff --git a/helm-charts/common/teirerank/templates/horizontalPodAutoscaler.yaml b/helm-charts/common/teirerank/templates/horizontalPodAutoscaler.yaml
new file mode 100644
index 00000000..bb249305
--- /dev/null
+++ b/helm-charts/common/teirerank/templates/horizontalPodAutoscaler.yaml
@@ -0,0 +1,51 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.global.horizontalPodAutoscaler.enabled }}
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ include "teirerank.fullname" . }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "teirerank.fullname" . }}
+  minReplicas: 1
+  maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }}
+  metrics:
+  - type: Object
+    object:
+      metric:
+        # tei-reranking time metrics are in seconds
+        name: reranking_request_latency
+      describedObject:
+        apiVersion: v1
+        # get metric for named object of given type (in same namespace)
+        kind: Service
+        name: {{ include "teirerank.fullname" . }}
+      target:
+        # reranking_request_latency is an average over all TEI pods. To avoid replica fluctuations when
+        # TEI startup + request processing takes longer than the HPA evaluation period, this uses
+        # "Value" (replicas = metric.value / target.value), instead of the "averageValue" type:
+        # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
+        type: Value
+        value: 4
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 180
+      policies:
+      - type: Percent
+        value: 25
+        periodSeconds: 15
+    scaleUp:
+      selectPolicy: Max
+      stabilizationWindowSeconds: 0
+      policies:
+      - type: Percent
+        value: 50
+        periodSeconds: 15
+      - type: Pods
+        value: 2
+        periodSeconds: 15
+{{- end }}
diff --git a/helm-charts/common/teirerank/templates/servicemonitor.yaml b/helm-charts/common/teirerank/templates/servicemonitor.yaml
new file mode 100644
index 00000000..52d355a7
--- /dev/null
+++ b/helm-charts/common/teirerank/templates/servicemonitor.yaml
@@ -0,0 +1,17 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.global.horizontalPodAutoscaler.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "teirerank.fullname" . }}
+spec:
+  selector:
+    matchLabels:
+      {{- include "teirerank.selectorLabels" . | nindent 6 }}
+  endpoints:
+  - interval: 4s
+    port: teirerank
+    scheme: http
+{{- end }}
diff --git a/helm-charts/common/teirerank/values.yaml b/helm-charts/common/teirerank/values.yaml
index 80a4cf73..01537c70 100644
--- a/helm-charts/common/teirerank/values.yaml
+++ b/helm-charts/common/teirerank/values.yaml
@@ -7,6 +7,10 @@
 
 replicaCount: 1
 
+
+horizontalPodAutoscaler:
+  maxReplicas: 3
+
 port: 2082
 shmSize: 1Gi
 RERANK_MODEL_ID: "BAAI/bge-reranker-base"
@@ -92,3 +96,9 @@ global:
   # By default, both var are set to empty, the model will be downloaded and saved to a tmp volume.
   modelUseHostPath: ""
   modelUsePVC: ""
+  # Enabling HPA will:
+  # - Ignore the above replica count, as it will be controlled by HPA
+  # - Add example HPA scaling rules with thresholds suitable for Xeon deployments
+  # - Require the custom metrics ConfigMap available in the main application chart
+  horizontalPodAutoscaler:
+    enabled: false
diff --git a/helm-charts/common/tgi/README.md b/helm-charts/common/tgi/README.md
index 62e4d70c..03f73f74 100644
--- a/helm-charts/common/tgi/README.md
+++ b/helm-charts/common/tgi/README.md
@@ -24,6 +24,38 @@ MODELDIR=/mnt/opea-models
 
 MODELNAME="/data/models--bigscience--bloom-560m"
 
+## HorizontalPodAutoscaler (HPA) support
+
+The `horizontalPodAutoscaler` option enables HPA scaling for the deployment:
+https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/
+
+Autoscaling is based on custom application metrics provided through [Prometheus](https://prometheus.io/).
+
+### Pre-conditions
+
+If the cluster does not yet run the [Prometheus operator](https://github.com/prometheus-operator/kube-prometheus),
+it SHOULD be installed before enabling HPA, e.g. by using:
+https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack
+
+`horizontalPodAutoscaler` needs to be enabled in the top-level Helm chart depending on this
+component (e.g. `chatqna`), so that the relevant custom metric queries are configured for PrometheusAdapter.
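+
+One way to set that stack up (a sketch; the release name `prometheus-stack` and the `monitoring`
+namespace below are illustrative choices, not requirements of this chart):
+
+```console
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+helm repo update
+helm install prometheus-stack prometheus-community/kube-prometheus-stack -n monitoring --create-namespace
+```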
+
+### Gotchas
+
+Why HPA is opt-in:
+
+- Enabling the chart `horizontalPodAutoscaler` option will _overwrite_ the cluster's current
+  `PrometheusAdapter` configuration with its own custom metrics configuration.
+  Take a copy of the existing one before install, if that matters:
+  `kubectl -n monitoring get cm/adapter-config -o yaml > adapter-config.yaml`
+- `PrometheusAdapter` needs to be restarted after install, for it to read the new configuration:
+  `ns=monitoring; kubectl -n $ns delete $(kubectl -n $ns get pod --selector app.kubernetes.io/name=prometheus-adapter -o name)`
+- By default, Prometheus adds [k8s RBAC rules](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/prometheus-roleBindingSpecificNamespaces.yaml)
+  for accessing metrics from the `default`, `kube-system` and `monitoring` namespaces. If Helm is
+  asked to install OPEA services into some other namespace, those rules need to be updated accordingly
+- Provided HPA rules are examples for Xeon; for efficient scaling, they need to be fine-tuned for
+  the given setup (underlying HW, used models, OPEA version, etc.)
+
 ## Verify
 
 To verify the installation, run the command `kubectl get pod` to make sure all pods are running.
@@ -39,6 +71,35 @@ curl http://localhost:2080/generate \
     -H 'Content-Type: application/json'
 ```
 
+### Verify HPA metrics
+
+To verify that the metrics required by the `horizontalPodAutoscaler` option work, check the following.
+
+Prometheus has found the metric endpoints, i.e. the last number on the line is non-zero:
+
+```console
+prom_url=http://$(kubectl -n monitoring get -o jsonpath="{.spec.clusterIP}:{.spec.ports[0].port}" svc/prometheus-k8s)
+curl --no-progress-meter $prom_url/metrics | grep scrape_pool_targets.*tgi
+```
+
+PrometheusAdapter provides custom metrics based on their data:
+
+```console
+kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 | jq .resources[].name
+```
+
+And those custom metrics have valid values for the HPA rules:
+
+```console
+ns=default;  # OPEA namespace
+url=/apis/custom.metrics.k8s.io/v1beta1;
+for m in $(kubectl get --raw $url | jq .resources[].name | cut -d/ -f2 | sort -u | tr -d '"'); do
+  kubectl get --raw $url/namespaces/$ns/metrics/$m | jq;
+done | grep -e metricName -e value
+```
+
+NOTE: HuggingFace TGI and TEI services provide a metrics endpoint only after they have processed their first request!
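+
+After that first request, you can also confirm that the HPA object itself sees the metric, i.e.
+reports a value instead of `<unknown>` (the `tgi` object name below assumes the chart's default
+fullname; adjust it for your release):
+
+```console
+kubectl get hpa
+kubectl describe hpa tgi
+```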
+
 ## Values
 
 | Key                     | Type   | Default                                           | Description |
@@ -48,3 +109,4 @@ curl http://localhost:2080/generate \
 | global.modelUseHostPath | string | `"/mnt/opea-models"`                              | Cached models directory, tgi will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Set this to null/empty will force it to download model. |
 | image.repository        | string | `"ghcr.io/huggingface/text-generation-inference"` | |
 | image.tag               | string | `"1.4"`                                           | |
+| horizontalPodAutoscaler.enabled | bool | false                                         | Enable HPA autoscaling for the service deployment, based on the metrics it provides. See #pre-conditions and #gotchas before enabling! |
diff --git a/helm-charts/common/tgi/templates/deployment.yaml b/helm-charts/common/tgi/templates/deployment.yaml
index 2ef224b5..9587bcae 100644
--- a/helm-charts/common/tgi/templates/deployment.yaml
+++ b/helm-charts/common/tgi/templates/deployment.yaml
@@ -8,7 +8,10 @@ metadata:
   labels:
     {{- include "tgi.labels" . | nindent 4 }}
 spec:
+  # use explicit replica counts only if HorizontalPodAutoscaler is disabled
+  {{- if not .Values.global.horizontalPodAutoscaler.enabled }}
   replicas: {{ .Values.replicaCount }}
+  {{- end }}
   selector:
     matchLabels:
       {{- include "tgi.selectorLabels" . | nindent 6 }}
@@ -94,3 +97,7 @@ spec:
      tolerations:
        {{- toYaml . | nindent 8 }}
    {{- end }}
+    {{- if .Values.global.horizontalPodAutoscaler.enabled }}
+    # extra time to finish processing buffered requests before HPA forcibly terminates the pod
+    terminationGracePeriodSeconds: 120
+    {{- end }}
diff --git a/helm-charts/common/tgi/templates/horizontalPodAutoscaler.yaml b/helm-charts/common/tgi/templates/horizontalPodAutoscaler.yaml
new file mode 100644
index 00000000..1131bbdc
--- /dev/null
+++ b/helm-charts/common/tgi/templates/horizontalPodAutoscaler.yaml
@@ -0,0 +1,51 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if .Values.global.horizontalPodAutoscaler.enabled }}
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ include "tgi.fullname" . }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "tgi.fullname" . }}
+  minReplicas: 1
+  maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }}
+  metrics:
+  - type: Object
+    object:
+      metric:
+        # TGI time metrics are in seconds
+        name: tgi_request_latency
+      describedObject:
+        apiVersion: v1
+        # get metric for named object of given type (in same namespace)
+        kind: Service
+        name: {{ include "tgi.fullname" . }}
+      target:
+        # tgi_request_latency is an average over all the TGI pods. To avoid replica fluctuations when
+        # TGI startup + request processing takes longer than the HPA evaluation period, this uses
+        # "Value" (replicas = metric.value / target.value), instead of the "averageValue" type:
+        # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
+        type: Value
+        value: 4
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 180
+      policies:
+      - type: Percent
+        value: 25
+        periodSeconds: 15
+    scaleUp:
+      selectPolicy: Max
+      stabilizationWindowSeconds: 0
+      policies:
+      - type: Percent
+        value: 50
+        periodSeconds: 15
+      - type: Pods
+        value: 2
+        periodSeconds: 15
+{{- end }}
diff --git a/helm-charts/common/tgi/templates/servicemonitor.yaml b/helm-charts/common/tgi/templates/servicemonitor.yaml
new file mode 100644
index 00000000..0d7d6ffb
--- /dev/null
+++ b/helm-charts/common/tgi/templates/servicemonitor.yaml
@@ -0,0 +1,22 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+# Dashboard for the exposed TGI metrics:
+# - https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/
+# Metric descriptions:
+# - https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527
+
+{{- if .Values.global.horizontalPodAutoscaler.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "tgi.fullname" . }}
+spec:
+  selector:
+    matchLabels:
+      {{- include "tgi.selectorLabels" . | nindent 6 }}
+  endpoints:
+  - interval: 4s
+    port: tgi
+    scheme: http
+{{- end }}
diff --git a/helm-charts/common/tgi/values.yaml b/helm-charts/common/tgi/values.yaml
index 659f0746..9aa6bae5 100644
--- a/helm-charts/common/tgi/values.yaml
+++ b/helm-charts/common/tgi/values.yaml
@@ -7,6 +7,9 @@
 
 replicaCount: 1
 
+horizontalPodAutoscaler:
+  maxReplicas: 6
+
 port: 2080
 
 image:
@@ -117,3 +120,9 @@ global:
   # By default, both var are set to empty, the model will be downloaded and saved to a tmp volume.
   modelUseHostPath: ""
   modelUsePVC: ""
+  # Enabling HPA will:
+  # - Ignore the above replica count, as it will be controlled by HPA
+  # - Add example HPA scaling rules with thresholds suitable for Xeon deployments
+  # - Require the custom metrics ConfigMap available in the main application chart
+  horizontalPodAutoscaler:
+    enabled: false
diff --git a/microservices-connector/config/HPA/customMetrics.yaml b/microservices-connector/config/HPA/customMetrics.yaml
new file mode 100644
index 00000000..c3b5de05
--- /dev/null
+++ b/microservices-connector/config/HPA/customMetrics.yaml
@@ -0,0 +1,51 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v1
+data:
+  config.yaml: |
+    rules:
+    - seriesQuery: '{__name__="tgi_request_inference_duration_sum"}'
+      # Average request latency from TGI histograms, over 1 min
+      # (the 0.001 added to the divisor ensures there is always a valid value)
+      metricsQuery: 'rate(tgi_request_inference_duration_sum{<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{<<.LabelMatchers>>}[1m]))'
+      name:
+        matches: ^tgi_request_inference_duration_sum
+        as: "tgi_request_latency"
+      resources:
+        # HPA needs both namespace + a suitable object resource for its query paths:
+        #   /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_request_latency
+        # (pod is not a suitable object type for matching, as each instance has a different name)
+        overrides:
+          namespace:
+            resource: namespace
+          service:
+            resource: service
+    - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-reranking-svc"}'
+      # Average request latency from TEI histograms, over 1 min
+      metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]))'
+      name:
+        matches: ^te_request_inference_duration_sum
+        as: "reranking_request_latency"
+      resources:
+        overrides:
+          namespace:
+            resource: namespace
+          service:
+            resource: service
+    - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-embedding-svc"}'
+      # Average request latency from TEI histograms, over 1 min
+      metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]))'
+      name:
+        matches: ^te_request_inference_duration_sum
+        as: "embedding_request_latency"
+      resources:
+        overrides:
+          namespace:
+            resource: namespace
+          service:
+            resource: service
+kind: ConfigMap
+metadata:
+  name: adapter-config
+  namespace: monitoring
diff --git a/microservices-connector/config/HPA/tei.yaml b/microservices-connector/config/HPA/tei.yaml
new file mode 100644
index 00000000..54c830e6
--- /dev/null
+++ b/microservices-connector/config/HPA/tei.yaml
@@ -0,0 +1,205 @@
+---
+# Source: tei/templates/configmap.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: tei-config
+  labels:
+    helm.sh/chart: tei-0.8.0
+    app.kubernetes.io/name: tei
+    app.kubernetes.io/instance: tei
+    app.kubernetes.io/version: "cpu-1.5"
+    app.kubernetes.io/managed-by: Helm
+data:
+  MODEL_ID: "BAAI/bge-base-en-v1.5"
+  PORT: "2081"
+  http_proxy: ""
+  https_proxy: ""
+  no_proxy: ""
+  NUMBA_CACHE_DIR: "/tmp"
+  TRANSFORMERS_CACHE: "/tmp/transformers_cache"
+  HF_HOME: "/tmp/.cache/huggingface"
+  MAX_WARMUP_SEQUENCE_LENGTH: "512"
+---
+# Source: tei/templates/service.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: tei
+  labels:
+    helm.sh/chart: tei-0.8.0
+    app.kubernetes.io/name: tei
+    app.kubernetes.io/instance: tei
+    app.kubernetes.io/version: "cpu-1.5"
+    app.kubernetes.io/managed-by: Helm
+spec:
+  type: ClusterIP
+  ports:
+    - port: 80
+      targetPort: 2081
+      protocol: TCP
+      name: tei
+  selector:
+    app.kubernetes.io/name: tei
+    app.kubernetes.io/instance: tei
+---
+# Source: tei/templates/deployment.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: tei
+  labels:
+    helm.sh/chart: tei-0.8.0
+    app.kubernetes.io/name: tei
+    app.kubernetes.io/instance: tei
+    app.kubernetes.io/version: "cpu-1.5"
+    app.kubernetes.io/managed-by: Helm
+spec:
+  # use explicit replica counts only if HorizontalPodAutoscaler is disabled
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: tei
+      app.kubernetes.io/instance: tei
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: tei
+        app.kubernetes.io/instance: tei
+    spec:
+      securityContext:
+        {}
+      containers:
+        - name: tei
+          envFrom:
+            - configMapRef:
+                name: tei-config
+            - configMapRef:
+                name: extra-env-config
+                optional: true
+          securityContext:
+            {}
+          image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5"
+          imagePullPolicy: IfNotPresent
+          args:
+            - "--auto-truncate"
+          volumeMounts:
+            - mountPath: /data
+              name: model-volume
+            - mountPath: /dev/shm
+              name: shm
+            - mountPath: /tmp
+              name: tmp
+          ports:
+            - name: http
+              containerPort: 2081
+              protocol: TCP
+          livenessProbe:
+            failureThreshold: 24
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 5
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 5
+          startupProbe:
+            failureThreshold: 120
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 5
+          resources:
+            {}
+      volumes:
+        - name: model-volume
+          hostPath:
+            path: /mnt/opea-models
+            type: Directory
+        - name: shm
+          emptyDir:
+            medium: Memory
+            sizeLimit: 1Gi
+        - name: tmp
+          emptyDir: {}
+      # extra time to finish processing buffered requests before HPA forcibly terminates the pod
+      terminationGracePeriodSeconds: 60
+---
+# Source: tei/templates/horizontalPodAutoscaler.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: tei
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: tei
+  minReplicas: 1
+  maxReplicas: 2
+  metrics:
+  - type: Object
+    object:
+      metric:
+        # tei-embedding time metrics are in seconds
+        name: embedding_request_latency
+      describedObject:
+        apiVersion: v1
+        # get metric for named object of given type (in same namespace)
+        kind: Service
+        name: tei
+      target:
+        # embedding_request_latency is an average over all TEI pods. To avoid replica fluctuations when
+        # TEI startup + request processing takes longer than the HPA evaluation period, this uses
+        # "Value" (replicas = metric.value / target.value), instead of the "averageValue" type:
+        # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
+        type: Value
+        value: 4
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 180
+      policies:
+      - type: Percent
+        value: 25
+        periodSeconds: 15
+    scaleUp:
+      selectPolicy: Max
+      stabilizationWindowSeconds: 0
+      policies:
+      - type: Percent
+        value: 50
+        periodSeconds: 15
+      - type: Pods
+        value: 2
+        periodSeconds: 15
+---
+# Source: tei/templates/servicemonitor.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: tei
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: tei
+  endpoints:
+  - interval: 4s
+    port: tei
+    scheme: http
diff --git a/microservices-connector/config/HPA/teirerank.yaml b/microservices-connector/config/HPA/teirerank.yaml
new file mode 100644
index 00000000..3cd33c14
--- /dev/null
+++ b/microservices-connector/config/HPA/teirerank.yaml
@@ -0,0 +1,204 @@
+---
+# Source: teirerank/templates/configmap.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: teirerank-config
+  labels:
+    helm.sh/chart: teirerank-0.8.0
+    app.kubernetes.io/name: teirerank
+    app.kubernetes.io/instance: teirerank
+    app.kubernetes.io/version: "cpu-1.5"
+    app.kubernetes.io/managed-by: Helm
+data:
+  MODEL_ID: "BAAI/bge-reranker-base"
+  PORT: "2082"
+  http_proxy: ""
+  https_proxy: ""
+  no_proxy: ""
+  NUMBA_CACHE_DIR: "/tmp"
+  TRANSFORMERS_CACHE: "/tmp/transformers_cache"
+  HF_HOME: "/tmp/.cache/huggingface"
+---
+# Source: teirerank/templates/service.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: teirerank
+  labels:
+    helm.sh/chart: teirerank-0.8.0
+    app.kubernetes.io/name: teirerank
+    app.kubernetes.io/instance: teirerank
+    app.kubernetes.io/version: "cpu-1.5"
+    app.kubernetes.io/managed-by: Helm
+spec:
+  type: ClusterIP
+  ports:
+    - port: 80
+      targetPort: 2082
+      protocol: TCP
+      name: teirerank
+  selector:
+    app.kubernetes.io/name: teirerank
+    app.kubernetes.io/instance: teirerank
+---
+# Source: teirerank/templates/deployment.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: teirerank
+  labels:
+    helm.sh/chart: teirerank-0.8.0
+    app.kubernetes.io/name: teirerank
+    app.kubernetes.io/instance: teirerank
+    app.kubernetes.io/version: "cpu-1.5"
+    app.kubernetes.io/managed-by: Helm
+spec:
+  # use explicit replica counts only if HorizontalPodAutoscaler is disabled
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: teirerank
+      app.kubernetes.io/instance: teirerank
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: teirerank
+        app.kubernetes.io/instance: teirerank
+    spec:
+      securityContext:
+        {}
+      containers:
+        - name: teirerank
+          envFrom:
+            - configMapRef:
+                name: teirerank-config
+            - configMapRef:
+                name: extra-env-config
+                optional: true
+          securityContext:
+            {}
+          image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5"
+          imagePullPolicy: IfNotPresent
+          args:
+            - "--auto-truncate"
+          volumeMounts:
+            - mountPath: /data
+              name: model-volume
+            - mountPath: /dev/shm
+              name: shm
+            - mountPath: /tmp
+              name: tmp
+          ports:
+            - name: http
+              containerPort: 2082
+              protocol: TCP
+          livenessProbe:
+            failureThreshold: 24
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 5
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 5
+          startupProbe:
+            failureThreshold: 120
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 5
+          resources:
+            {}
+      volumes:
+        - name: model-volume
+          hostPath:
+            path: /mnt/opea-models
+            type: Directory
+        - name: shm
+          emptyDir:
+            medium: Memory
+            sizeLimit: 1Gi
+        - name: tmp
+          emptyDir: {}
+      # extra time to finish processing buffered requests before HPA forcibly terminates the pod
+      terminationGracePeriodSeconds: 60
+---
+# Source: teirerank/templates/horizontalPodAutoscaler.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: teirerank
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: teirerank
+  minReplicas: 1
+  maxReplicas: 3
+  metrics:
+  - type: Object
+    object:
+      metric:
+        # tei-reranking time metrics are in seconds
+        name: reranking_request_latency
+      describedObject:
+        apiVersion: v1
+        # get metric for named object of given type (in same namespace)
+        kind: Service
+        name: teirerank
+      target:
+        # reranking_request_latency is an average over all TEI pods. To avoid replica fluctuations when
+        # TEI startup + request processing takes longer than the HPA evaluation period, this uses
+        # "Value" (replicas = metric.value / target.value), instead of the "averageValue" type:
+        # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
+        type: Value
+        value: 4
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 180
+      policies:
+      - type: Percent
+        value: 25
+        periodSeconds: 15
+    scaleUp:
+      selectPolicy: Max
+      stabilizationWindowSeconds: 0
+      policies:
+      - type: Percent
+        value: 50
+        periodSeconds: 15
+      - type: Pods
+        value: 2
+        periodSeconds: 15
+---
+# Source: teirerank/templates/servicemonitor.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: teirerank
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: teirerank
+  endpoints:
+  - interval: 4s
+    port: teirerank
+    scheme: http
diff --git a/microservices-connector/config/HPA/tgi.yaml b/microservices-connector/config/HPA/tgi.yaml
new file mode 100644
index 00000000..97aedc76
--- /dev/null
+++ b/microservices-connector/config/HPA/tgi.yaml
@@ -0,0 +1,201 @@
+---
+# Source: tgi/templates/configmap.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: tgi-config
+  labels:
+    helm.sh/chart: tgi-0.8.0
+    app.kubernetes.io/name: tgi
+    app.kubernetes.io/instance: tgi
+    app.kubernetes.io/version: "2.1.0"
+    app.kubernetes.io/managed-by: Helm
+data:
+  MODEL_ID: "Intel/neural-chat-7b-v3-3"
+  PORT: "2080"
+  HF_TOKEN: "insert-your-huggingface-token-here"
+  http_proxy: ""
+  https_proxy: ""
+  no_proxy: ""
+  HABANA_LOGS: "/tmp/habana_logs"
+  NUMBA_CACHE_DIR: "/tmp"
+  TRANSFORMERS_CACHE: "/tmp/transformers_cache"
+  HF_HOME: "/tmp/.cache/huggingface"
+  CUDA_GRAPHS: "0"
+---
+# Source: tgi/templates/service.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: tgi
+  labels:
+    helm.sh/chart: tgi-0.8.0
+    app.kubernetes.io/name: tgi
+    app.kubernetes.io/instance: tgi
+    app.kubernetes.io/version: "2.1.0"
+    app.kubernetes.io/managed-by: Helm
+spec:
+  type: ClusterIP
+  ports:
+    - port: 80
+      targetPort: 2080
+      protocol: TCP
+      name: tgi
+  selector:
+    app.kubernetes.io/name: tgi
+    app.kubernetes.io/instance: tgi
+---
+# Source: tgi/templates/deployment.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: tgi
+  labels:
+    helm.sh/chart: tgi-0.8.0
+    app.kubernetes.io/name: tgi
+    app.kubernetes.io/instance: tgi
+    app.kubernetes.io/version: "2.1.0"
+    app.kubernetes.io/managed-by: Helm
+spec:
+  # use explicit replica counts only if HorizontalPodAutoscaler is disabled
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: tgi
+      app.kubernetes.io/instance: tgi
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: tgi
+        app.kubernetes.io/instance: tgi
+    spec:
+      securityContext:
+        {}
+      containers:
+        - name: tgi
+          envFrom:
+            - configMapRef:
+                name: tgi-config
+            - configMapRef:
+                name: extra-env-config
+                optional: true
+          securityContext:
+            {}
+          image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
+          imagePullPolicy: IfNotPresent
+          volumeMounts:
+            - mountPath: /data
+              name: model-volume
+            - mountPath: /tmp
+              name: tmp
+          ports:
+            - name: http
+              containerPort: 2080
+              protocol: TCP
+          livenessProbe:
+            failureThreshold: 24
+            initialDelaySeconds: 5
+            periodSeconds: 5
+            tcpSocket:
+              port: http
+          readinessProbe:
+            initialDelaySeconds: 5
+            periodSeconds: 5
+            tcpSocket:
+              port: http
+          startupProbe:
+            failureThreshold: 120
+            initialDelaySeconds: 5
+            periodSeconds: 5
+            tcpSocket:
+              port: http
+          resources:
+            {}
+      volumes:
+        - name: model-volume
+          hostPath:
+            path: /mnt/opea-models
+            type: Directory
+        - name: tmp
+          emptyDir: {}
+      # extra time to finish processing buffered requests before HPA forcibly terminates the pod
+      terminationGracePeriodSeconds: 120
+---
+# Source: tgi/templates/horizontalPodAutoscaler.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: tgi
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: tgi
+  minReplicas: 1
+  maxReplicas: 6
+  metrics:
+  - type: Object
+    object:
+      metric:
+        # TGI time metrics are in seconds
+        name: tgi_request_latency
+      describedObject:
+        apiVersion: v1
+        # get metric for named object of given type (in same namespace)
+        kind: Service
+        name: tgi
+      target:
+        # tgi_request_latency is an average over all the TGI pods. To avoid replica fluctuations when
+        # TGI startup + request processing takes longer than the HPA evaluation period, this uses
+        # "Value" (replicas = metric.value / target.value), instead of the "averageValue" type:
+        # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
+        type: Value
+        value: 4
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 180
+      policies:
+      - type: Percent
+        value: 25
+        periodSeconds: 15
+    scaleUp:
+      selectPolicy: Max
+      stabilizationWindowSeconds: 0
+      policies:
+      - type: Percent
+        value: 50
+        periodSeconds: 15
+      - type: Pods
+        value: 2
+        periodSeconds: 15
+---
+# Source: tgi/templates/servicemonitor.yaml
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+# Dashboard for the exposed TGI metrics:
+# - https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/
+# Metric descriptions:
+# - https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: tgi
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: tgi
+  endpoints:
+  - interval: 4s
+    port: tgi
+    scheme: http